-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-22915][MLlib] Streaming tests for spark.ml.feature, from N to Z #20686
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
4099c85
bc7946c
836a173
4944c62
7a14154
80b9c8b
a5375bc
bf713b5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,94 +17,72 @@ | |
|
|
||
| package org.apache.spark.ml.feature | ||
|
|
||
| import org.apache.spark.SparkFunSuite | ||
| import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} | ||
| import org.apache.spark.ml.util.DefaultReadWriteTest | ||
| import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} | ||
| import org.apache.spark.ml.util.TestingUtils._ | ||
| import org.apache.spark.mllib.util.MLlibTestSparkContext | ||
| import org.apache.spark.sql.{DataFrame, Row} | ||
|
|
||
|
|
||
| class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { | ||
| class NormalizerSuite extends MLTest with DefaultReadWriteTest { | ||
|
|
||
| import testImplicits._ | ||
|
|
||
| @transient var data: Array[Vector] = _ | ||
| @transient var dataFrame: DataFrame = _ | ||
| @transient var normalizer: Normalizer = _ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I will say, though, that I'm happy with moving Normalizer into individual tests. It's weird how it is shared here, since it's mutated within tests.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done |
||
| @transient var l1Normalized: Array[Vector] = _ | ||
| @transient var l2Normalized: Array[Vector] = _ | ||
| @transient val data: Seq[Vector] = Seq( | ||
| Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), | ||
| Vectors.dense(0.0, 0.0, 0.0), | ||
| Vectors.dense(0.6, -1.1, -3.0), | ||
| Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), | ||
| Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), | ||
| Vectors.sparse(3, Seq())) | ||
|
||
|
|
||
| override def beforeAll(): Unit = { | ||
| super.beforeAll() | ||
|
|
||
| data = Array( | ||
| Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), | ||
| Vectors.dense(0.0, 0.0, 0.0), | ||
| Vectors.dense(0.6, -1.1, -3.0), | ||
| Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), | ||
| Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), | ||
| Vectors.sparse(3, Seq()) | ||
| ) | ||
| l1Normalized = Array( | ||
| Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), | ||
| Vectors.dense(0.0, 0.0, 0.0), | ||
| Vectors.dense(0.12765957, -0.23404255, -0.63829787), | ||
| Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), | ||
| Vectors.dense(0.625, 0.07894737, 0.29605263), | ||
| Vectors.sparse(3, Seq()) | ||
| ) | ||
| l2Normalized = Array( | ||
| Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), | ||
| Vectors.dense(0.0, 0.0, 0.0), | ||
| Vectors.dense(0.184549876, -0.3383414, -0.922749378), | ||
| Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), | ||
| Vectors.dense(0.897906166, 0.113419726, 0.42532397), | ||
| Vectors.sparse(3, Seq()) | ||
| ) | ||
|
|
||
| dataFrame = data.map(NormalizerSuite.FeatureData).toSeq.toDF() | ||
| normalizer = new Normalizer() | ||
| .setInputCol("features") | ||
| .setOutputCol("normalized_features") | ||
| } | ||
|
|
||
| def collectResult(result: DataFrame): Array[Vector] = { | ||
| result.select("normalized_features").collect().map { | ||
| case Row(features: Vector) => features | ||
| } | ||
| } | ||
|
|
||
| def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = { | ||
| assert((lhs, rhs).zipped.forall { | ||
| def assertTypeOfVector(lhs: Vector, rhs: Vector): Unit = { | ||
| assert((lhs, rhs) match { | ||
| case (v1: DenseVector, v2: DenseVector) => true | ||
| case (v1: SparseVector, v2: SparseVector) => true | ||
| case _ => false | ||
| }, "The vector type should be preserved after normalization.") | ||
| } | ||
|
|
||
| def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = { | ||
| assert((lhs, rhs).zipped.forall { (vector1, vector2) => | ||
| vector1 ~== vector2 absTol 1E-5 | ||
| }, "The vector value is not correct after normalization.") | ||
| def assertValues(lhs: Vector, rhs: Vector): Unit = { | ||
| assert(lhs ~== rhs absTol 1E-5, "The vector value is not correct after normalization.") | ||
| } | ||
|
|
||
| test("Normalization with default parameter") { | ||
| val result = collectResult(normalizer.transform(dataFrame)) | ||
|
|
||
| assertTypeOfVector(data, result) | ||
| val expected = Seq( | ||
| Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), | ||
| Vectors.dense(0.0, 0.0, 0.0), | ||
| Vectors.dense(0.184549876, -0.3383414, -0.922749378), | ||
| Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), | ||
| Vectors.dense(0.897906166, 0.113419726, 0.42532397), | ||
| Vectors.sparse(3, Seq()) | ||
| ) | ||
| val dataFrame: DataFrame = data.zip(expected).seq.toDF("features", "expected") | ||
| val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized") | ||
|
|
||
| assertValues(result, l2Normalized) | ||
| testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") { | ||
| case Row(features: Vector, normalized: Vector, expected: Vector) => | ||
| assertTypeOfVector(normalized, features) | ||
| assertValues(normalized, expected) | ||
| } | ||
| } | ||
|
|
||
| test("Normalization with setter") { | ||
| normalizer.setP(1) | ||
|
|
||
| val result = collectResult(normalizer.transform(dataFrame)) | ||
|
|
||
| assertTypeOfVector(data, result) | ||
| val expected = Seq( | ||
| Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), | ||
| Vectors.dense(0.0, 0.0, 0.0), | ||
| Vectors.dense(0.12765957, -0.23404255, -0.63829787), | ||
| Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), | ||
| Vectors.dense(0.625, 0.07894737, 0.29605263), | ||
| Vectors.sparse(3, Seq()) | ||
| ) | ||
| val dataFrame: DataFrame = data.zip(expected).seq.toDF("features", "expected") | ||
| val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized").setP(1) | ||
|
|
||
| assertValues(result, l1Normalized) | ||
| testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") { | ||
| case Row(features: Vector, normalized: Vector, expected: Vector) => | ||
| assertTypeOfVector(normalized, features) | ||
| assertValues(normalized, expected) | ||
| } | ||
| } | ||
|
|
||
| test("read/write") { | ||
|
|
@@ -115,7 +93,3 @@ class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa | |
| testDefaultReadWrite(t) | ||
| } | ||
| } | ||
|
|
||
| private object NormalizerSuite { | ||
| case class FeatureData(features: Vector) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These kinds of changes are not necessary and make the PR a lot longer. Would you mind reverting them?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok