Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
5cf5156
update isarn-sketches dep to 0.2.2
erikerlandson Jun 26, 2020
d073d0d
test design with thin shim class for new fast TDigest to clean up the…
erikerlandson Jun 29, 2020
017b508
update initial commands
erikerlandson Jun 29, 2020
1075c65
scope t digest shim class, TDigestAggregator companion obj
erikerlandson Jun 29, 2020
0f16a5d
bump isarn-sketches to 0.3.0
erikerlandson Jun 29, 2020
efddd13
example of a java/python binding
erikerlandson Jun 30, 2020
fd6d381
modify python tdigest UDT and a test UDF
erikerlandson Jun 30, 2020
ce94d46
ScalarNumeric, data type functions, python renaming, commenting out o…
erikerlandson Jun 30, 2020
298655e
spark 3.0 supports scala 2.12 only
erikerlandson Jul 10, 2020
877fc35
http -> https
erikerlandson Jul 10, 2020
e93ef96
TDigestArrayAggregator
erikerlandson Jul 10, 2020
ecc91fa
array function overloadings
erikerlandson Jul 10, 2020
4cc73b6
add instructions for cleaning out ivy on local publish
erikerlandson Jul 11, 2020
d56e23d
spark vector aggregations
erikerlandson Jul 11, 2020
63cc7a0
no longer need UDT for tdigest array
erikerlandson Jul 11, 2020
bbe94c1
old tdigest UDTs are obsolete
erikerlandson Jul 11, 2020
1ffd763
remove package object
erikerlandson Jul 11, 2020
6bc35b0
sketches.spark.tdigest._
erikerlandson Jul 11, 2020
507c658
tdigest.scala
erikerlandson Jul 11, 2020
a289972
TDigestReduceAggregator
erikerlandson Jul 11, 2020
9d980f9
TDigestArrayReduceAggregator
erikerlandson Jul 11, 2020
8f0ff8a
TDigestUDAF.scala is obsolete
erikerlandson Jul 11, 2020
82f36da
TDigestArrayReduceAggregator inherit from TDigestArrayAggregatorBase
erikerlandson Jul 11, 2020
97f7e5e
factor out compression and maxdiscrete from TDigestArrayAggregatorBase
erikerlandson Jul 11, 2020
ee48162
/udaf/ -> /spark/
erikerlandson Jul 11, 2020
d35a297
/udaf/ -> /spark/
erikerlandson Jul 12, 2020
4c8482a
move python TDigestUDT into spark/tdigest.py
erikerlandson Jul 12, 2020
ab4a4fd
update sbt build mappings for python refactor
erikerlandson Jul 12, 2020
8ee5cbf
update readme python for new organization
erikerlandson Jul 12, 2020
75ab5d1
copyright
erikerlandson Jul 12, 2020
62bd118
unused imports
erikerlandson Jul 12, 2020
80f69b3
more unused imports
erikerlandson Jul 12, 2020
670f912
switch to fast java TDigest
erikerlandson Jul 12, 2020
10dbee2
explicit import of JavaPredictionModel
erikerlandson Jul 12, 2020
087903b
/pipelines/ -> /pipelines/spark/
erikerlandson Jul 13, 2020
c0b1e4c
python/isarnproject/pipelines/__init__.py
erikerlandson Jul 13, 2020
31617db
update build mappings for new python organization
erikerlandson Jul 13, 2020
7c34139
update package paths for new organization
erikerlandson Jul 13, 2020
3db403c
fix package object path
erikerlandson Jul 13, 2020
58f100f
update copyright
erikerlandson Jul 13, 2020
d0788eb
update pyspark tdigest to be cleaner and analogous to java implementa…
erikerlandson Jul 15, 2020
5ddbfa8
spark pipeline param delta -> compression
erikerlandson Jul 16, 2020
23968b7
fi.scala
erikerlandson Jul 16, 2020
b30b001
update assembly dep and move it into plugins
erikerlandson Jul 16, 2020
14a37fa
add scaladoc
erikerlandson Jul 18, 2020
aa1a7a5
move ScalarNumeric out of tdigest specific package
erikerlandson Jul 18, 2020
eaa5d4b
update README examples for scala
erikerlandson Jul 19, 2020
f68c841
spark.sparkContext
erikerlandson Jul 19, 2020
6859030
update python tdigest examples
erikerlandson Jul 19, 2020
8fe1bee
update feature importance examples
erikerlandson Jul 19, 2020
b8e6e42
isarn-sketches-java
erikerlandson Jul 19, 2020
8875266
utest harness for spark testing
erikerlandson Jul 19, 2020
ae67bb9
TDigestAggregator test
erikerlandson Jul 19, 2020
129c8bf
TDigestAggregator test
erikerlandson Jul 19, 2020
864076c
KSD cumulative distribution divergence measure for unit testing
erikerlandson Jul 20, 2020
3f842bc
test counts
erikerlandson Jul 20, 2020
696b2ce
BigDecimal range
erikerlandson Jul 21, 2020
a430a9a
local build against spark-3.0.1-SNAPSHOT
erikerlandson Jul 21, 2020
39903a9
test TDigestArrayAggregator
erikerlandson Jul 21, 2020
a077cf3
tests for spark ML vector types
erikerlandson Jul 21, 2020
a2c0f6d
cache test data sets
erikerlandson Jul 22, 2020
fa4e45c
test tdigest reducing aggregators
erikerlandson Jul 23, 2020
c98a56a
epsD
erikerlandson Jul 23, 2020
2b3053c
move approx to base class
erikerlandson Jul 23, 2020
8e4c83a
disable parallel test execution to prevent spark cluster teardown rac…
erikerlandson Jul 24, 2020
5dc5c34
feature importance unit test
erikerlandson Jul 24, 2020
03801e9
build against spark 3.0.1
erikerlandson Sep 15, 2020
60d4fde
xsbt -> sbt
erikerlandson Sep 15, 2020
43284a2
0.5.0
erikerlandson Sep 15, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
514 changes: 211 additions & 303 deletions README.md

Large diffs are not rendered by default.

47 changes: 28 additions & 19 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -1,26 +1,28 @@
// xsbt clean unidoc previewSite
// xsbt clean unidoc ghpagesPushSite
// xsbt +publish
// sbt clean unidoc previewSite
// sbt clean unidoc ghpagesPushSite
// sbt +publish
// https://oss.sonatype.org
// make sure sparkVersion is set as you want prior to +publish
// when doing localPublish, also do:
// rm -rf /home/eje/.ivy2/local/org.isarnproject /home/eje/.ivy2/cache/org.isarnproject

import scala.sys.process._

name := "isarn-sketches-spark"

organization := "org.isarnproject"

val packageVersion = "0.4.1-SNAPSHOT"
val packageVersion = "0.5.0"

val sparkVersion = "3.0.0"
val sparkVersion = "3.0.1"

val sparkSuffix = s"""sp${sparkVersion.split('.').take(2).mkString(".")}"""

version := s"${packageVersion}-${sparkSuffix}"

scalaVersion := "2.12.11"

crossScalaVersions := Seq("2.12.11") // scala 2.12.11 when spark supports it
crossScalaVersions := Seq("2.12.11")

pomIncludeRepository := { _ => false }

Expand Down Expand Up @@ -54,14 +56,22 @@ developers := List(
)
)

resolvers += Resolver.mavenLocal

libraryDependencies ++= Seq(
"org.isarnproject" %% "isarn-sketches" % "0.1.2",
"org.isarnproject" % "isarn-sketches-java" % "0.3.0",
"org.apache.spark" %% "spark-core" % sparkVersion % Provided,
"org.apache.spark" %% "spark-sql" % sparkVersion % Provided,
"org.apache.spark" %% "spark-mllib" % sparkVersion % Provided,
"org.isarnproject" %% "isarn-scalatest" % "0.0.3" % Test,
"org.scalatest" %% "scalatest" % "3.0.5" % Test,
"org.apache.commons" % "commons-math3" % "3.6.1" % Test)
"com.lihaoyi" %% "utest" % "0.7.4" % Test)

// tell sbt about utest
testFrameworks += new TestFramework("utest.runner.Framework")

// default is to run tests in parallel, asynchronously, but
// that breaks both spark-cluster setup and teardown, and also breaks
// repeatability of the random data generation
parallelExecution in Test := false

initialCommands in console := """
|import org.apache.spark.SparkConf
Expand All @@ -70,10 +80,10 @@ initialCommands in console := """
|import org.apache.spark.SparkContext._
|import org.apache.spark.rdd.RDD
|import org.apache.spark.ml.linalg.Vectors
|import org.isarnproject.sketches.TDigest
|import org.isarnproject.sketches.udaf._
|import org.apache.spark.isarnproject.sketches.udt._
|val initialConf = new SparkConf().setAppName("repl").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").set("spark.kryoserializer.buffer", "16mb")
|import org.apache.spark.sql.functions._
|import org.isarnproject.sketches.java.TDigest
|import org.isarnproject.sketches.spark._
|val initialConf = new SparkConf().setAppName("repl")
|val spark = SparkSession.builder.config(initialConf).master("local[2]").getOrCreate()
|import spark._, spark.implicits._
|val sc = spark.sparkContext
Expand All @@ -90,12 +100,11 @@ scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature")
mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.py") -> "isarnproject/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "pipelines" / "__init__.py") -> "isarnproject/pipelines/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "pipelines" / "fi.py") -> "isarnproject/pipelines/fi.py",
(baseDirectory.value / "python" / "isarnproject" / "pipelines" / "spark" / "__init__.py") -> "isarnproject/pipelines/spark/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "pipelines" / "spark" / "fi.py") -> "isarnproject/pipelines/spark/fi.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.py") -> "isarnproject/sketches/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.py") -> "isarnproject/sketches/udaf/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.py") -> "isarnproject/sketches/udaf/tdigest.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.py") -> "isarnproject/sketches/udt/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.py") -> "isarnproject/sketches/udt/tdigest.py"
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "spark" / "__init__.py") -> "isarnproject/sketches/spark/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "spark" / "tdigest.py") -> "isarnproject/sketches/spark/tdigest.py",
)

test in assembly := {}
Expand Down
1 change: 0 additions & 1 deletion project/assembly.sbt

This file was deleted.

6 changes: 4 additions & 2 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
resolvers += Resolver.url(
"bintray-sbt-plugin-releases",
url("http://dl.bintray.com/content/sbt/sbt-plugin-releases"))(
url("https://dl.bintray.com/content/sbt/sbt-plugin-releases"))(
Resolver.ivyStylePatterns)

resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"

resolvers += "jgit-repo" at "http://download.eclipse.org/jgit/maven"
resolvers += "jgit-repo" at "https://download.eclipse.org/jgit/maven"

addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3")

Expand All @@ -15,6 +15,8 @@ addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.2.1")

addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.2")

addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")

// scoverage and coveralls deps are at old versions to avoid a bug in the current versions
// update these when this fix is released: https://github.com/scoverage/sbt-coveralls/issues/73
//addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.0.4")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pyspark import since, keyword_only
from pyspark.ml.param.shared import *
from pyspark.ml.util import *
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper, JavaPredictionModel
from pyspark.ml.common import inherit_doc
from pyspark.sql import DataFrame

Expand All @@ -14,19 +14,19 @@ def toPredictionModel(value):
raise TypeError("object %s was not a JavaPredictionModel" % (value))

class TDigestParams(Params):
delta = Param(Params._dummy(), "delta", "tdigest compression parameter",
compression = Param(Params._dummy(), "compression", "tdigest compression parameter",
typeConverter=TypeConverters.toFloat)
maxDiscrete = Param(Params._dummy(), "maxDiscrete", "maximum discrete values",
typeConverter=TypeConverters.toInt)

def __init__(self):
super(TDigestParams, self).__init__()

def setDelta(self, value):
return self._set(delta=value)
def setCompression(self, value):
return self._set(compression=value)

def getDelta(self):
return self.getOrDefault(self.delta)
def getCompression(self):
return self.getOrDefault(self.compression)

def setMaxDiscrete(self, value):
return self._set(maxDiscrete=value)
Expand Down Expand Up @@ -90,15 +90,15 @@ class TDigestFI(JavaEstimator, TDigestFIParams, JavaMLWritable, JavaMLReadable):
"""

@keyword_only
def __init__(self, delta = 0.5, maxDiscrete = 0, featuresCol = "features"):
def __init__(self, compression = 0.5, maxDiscrete = 0, featuresCol = "features"):
super(TDigestFI, self).__init__()
self._java_obj = self._new_java_obj("org.isarnproject.pipelines.TDigestFI", self.uid)
self._setDefault(delta = 0.5, maxDiscrete = 0, featuresCol = "features")
self._java_obj = self._new_java_obj("org.isarnproject.pipelines.spark.fi.TDigestFI", self.uid)
self._setDefault(compression = 0.5, maxDiscrete = 0, featuresCol = "features")
kwargs = self._input_kwargs
self.setParams(**kwargs)

@keyword_only
def setParams(self, delta = 0.5, maxDiscrete = 0, featuresCol = "features"):
def setParams(self, compression = 0.5, maxDiscrete = 0, featuresCol = "features"):
kwargs = self._input_kwargs
return self._set(**kwargs)

Expand Down
Loading