39 changes: 16 additions & 23 deletions mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala
@@ -19,61 +19,59 @@ package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.sql.{DataFrame, Row}


@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
class NGramSuite extends MLTest with DefaultReadWriteTest {

import org.apache.spark.ml.feature.NGramSuite._
import testImplicits._

test("default behavior yields bigram features") {
val nGram = new NGram()
.setInputCol("inputTokens")
.setOutputCol("nGrams")
val dataset = Seq(NGramTestData(
val dataFrame = Seq(NGramTestData(
Member: These kinds of changes are not necessary and make the PR a lot longer. Would you mind reverting them?

Contributor Author: ok

Array("Test", "for", "ngram", "."),
Array("Test for", "for ngram", "ngram .")
)).toDF()
testNGram(nGram, dataset)
testNGram(nGram, dataFrame)
}

test("NGramLength=4 yields length 4 n-grams") {
val nGram = new NGram()
.setInputCol("inputTokens")
.setOutputCol("nGrams")
.setN(4)
val dataset = Seq(NGramTestData(
val dataFrame = Seq(NGramTestData(
Array("a", "b", "c", "d", "e"),
Array("a b c d", "b c d e")
)).toDF()
testNGram(nGram, dataset)
testNGram(nGram, dataFrame)
}

test("empty input yields empty output") {
val nGram = new NGram()
.setInputCol("inputTokens")
.setOutputCol("nGrams")
.setN(4)
val dataset = Seq(NGramTestData(Array(), Array())).toDF()
testNGram(nGram, dataset)
val dataFrame = Seq(NGramTestData(Array(), Array())).toDF()
testNGram(nGram, dataFrame)
}

test("input array < n yields empty output") {
val nGram = new NGram()
.setInputCol("inputTokens")
.setOutputCol("nGrams")
.setN(6)
val dataset = Seq(NGramTestData(
val dataFrame = Seq(NGramTestData(
Array("a", "b", "c", "d", "e"),
Array()
)).toDF()
testNGram(nGram, dataset)
testNGram(nGram, dataFrame)
}

test("read/write") {
@@ -83,16 +81,11 @@ class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRe
.setN(3)
testDefaultReadWrite(t)
}
}

object NGramSuite extends SparkFunSuite {

def testNGram(t: NGram, dataset: Dataset[_]): Unit = {
t.transform(dataset)
.select("nGrams", "wantedNGrams")
.collect()
.foreach { case Row(actualNGrams, wantedNGrams) =>
def testNGram(t: NGram, dataFrame: DataFrame): Unit = {
testTransformer[(Seq[String], Seq[String])](dataFrame, t, "nGrams", "wantedNGrams") {
case Row(actualNGrams : Seq[String], wantedNGrams: Seq[String]) =>
assert(actualNGrams === wantedNGrams)
}
}
}
}
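For context, the change both suites make is from a collect-and-foreach assertion to MLTest's testTransformer helper. Below is a rough, batch-only sketch of the kind of check that helper performs; the helper name checkTransformerOutput and the simplified signature are illustrative rather than Spark's actual API, and the real MLTest helper is also expected to replay the same rows through a streaming Dataset, which is the extra coverage these suites gain.

```scala
import org.apache.spark.ml.Transformer
import org.apache.spark.sql.{Dataset, Row}

// Hypothetical, batch-only stand-in for MLTest.testTransformer: run the
// transformer, project the columns of interest, and hand each Row to the
// caller's assertion. The real helper is believed to additionally feed the
// same rows through a streaming Dataset before applying the same checks.
def checkTransformerOutput(
    dataset: Dataset[_],
    transformer: Transformer,
    firstResultCol: String,
    otherResultCols: String*)(
    rowChecker: Row => Unit): Unit = {
  transformer
    .transform(dataset)
    .select(firstResultCol, otherResultCols: _*)
    .collect()
    .foreach(rowChecker)
}
```

With a helper of this shape, the NGram check above reduces to passing a pattern-matching function that destructures Row(actualNGrams, wantedNGrams) and asserts equality, which is exactly the form the new testNGram takes.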
112 changes: 43 additions & 69 deletions mllib/src/test/scala/org/apache/spark/ml/feature/NormalizerSuite.scala
@@ -17,94 +17,72 @@

package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}


class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
class NormalizerSuite extends MLTest with DefaultReadWriteTest {

import testImplicits._

@transient var data: Array[Vector] = _
@transient var dataFrame: DataFrame = _
@transient var normalizer: Normalizer = _
Member: I will say, though, that I'm happy with moving Normalizer into individual tests. It's weird how it is shared here since it's mutated within tests.

Contributor Author: done
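The coupling being described is concrete: the old suite's "Normalization with setter" test calls setP(1) on the suite-level normalizer, so any test that happened to run after it would silently see the L1 norm instead of the default L2. Nothing actually fails with today's ordering; the sketch below (illustrative names, plain Scala assertions) just shows the difference in isolation.

```scala
import org.apache.spark.ml.feature.Normalizer

// Shared instance, as in the old suite: one test mutates it in place...
val shared = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normalized_features")
shared.setP(1)                 // done by the "setter" test
assert(shared.getP == 1.0)     // ...and later tests would now see p = 1

// Per-test construction, as in the new suite: no cross-test coupling,
// a fresh instance keeps the default p-norm regardless of other tests.
val perTest = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normalized")
assert(perTest.getP == 2.0)    // default p untouched
```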

@transient var l1Normalized: Array[Vector] = _
@transient var l2Normalized: Array[Vector] = _
@transient val data: Seq[Vector] = Seq(
Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
Vectors.dense(0.0, 0.0, 0.0),
Vectors.dense(0.6, -1.1, -3.0),
Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
Vectors.sparse(3, Seq()))
Contributor: I prefer to put initializing the data var into beforeAll.

Contributor Author: First of all, thanks for the review. Regarding this specific comment: this way it can be a 'val', and the variable name and value are close to each other. What is the advantage of separating them?

Contributor: My only doubt is that when the test suite object is serialized and then deserialized, the data will be lost. But I am not sure in which case serialization would occur.

Contributor Author (attilapiros, Mar 2, 2018): But '@transient' is about skipping serialization for this field.

Contributor: OK, it's a minor issue; let's ignore it.

Member: I'd prefer to revert these changes. As far as I know, nothing is broken, and this is a common pattern used in many parts of MLlib tests. I think the main reason to move data around would be to have actual + expected values side-by-side for easier reading.
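Since the thread above weighs beforeAll initialization against inline @transient vals, here is a minimal, self-contained illustration of the two styles. The suite and field names are hypothetical, and it uses plain ScalaTest rather than SparkFunSuite; it is a sketch of the trade-off, not code from this PR.

```scala
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.scalatest.{BeforeAndAfterAll, FunSuite}

// Hypothetical suite contrasting the two initialization styles debated above.
class DataInitStylesSuite extends FunSuite with BeforeAndAfterAll {

  // Old style: a mutable field filled in once per suite inside beforeAll.
  // It has to be a var, and its value lives far from its declaration.
  @transient var sharedData: Array[Vector] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    sharedData = Array(Vectors.dense(0.6, -1.1, -3.0))
  }

  // New style: an immutable val initialized where it is declared.
  // @transient still excludes the field if the suite instance is ever
  // serialized, which is the concern raised and then dropped above.
  @transient val inlineData: Seq[Vector] = Seq(Vectors.dense(0.6, -1.1, -3.0))

  test("both styles expose the same data") {
    assert(sharedData.head === inlineData.head)
  }
}
```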


override def beforeAll(): Unit = {
super.beforeAll()

data = Array(
Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
Vectors.dense(0.0, 0.0, 0.0),
Vectors.dense(0.6, -1.1, -3.0),
Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
Vectors.sparse(3, Seq())
)
l1Normalized = Array(
Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
Vectors.dense(0.0, 0.0, 0.0),
Vectors.dense(0.12765957, -0.23404255, -0.63829787),
Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
Vectors.dense(0.625, 0.07894737, 0.29605263),
Vectors.sparse(3, Seq())
)
l2Normalized = Array(
Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
Vectors.dense(0.0, 0.0, 0.0),
Vectors.dense(0.184549876, -0.3383414, -0.922749378),
Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
Vectors.dense(0.897906166, 0.113419726, 0.42532397),
Vectors.sparse(3, Seq())
)

dataFrame = data.map(NormalizerSuite.FeatureData).toSeq.toDF()
normalizer = new Normalizer()
.setInputCol("features")
.setOutputCol("normalized_features")
}

def collectResult(result: DataFrame): Array[Vector] = {
result.select("normalized_features").collect().map {
case Row(features: Vector) => features
}
}

def assertTypeOfVector(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
assert((lhs, rhs).zipped.forall {
def assertTypeOfVector(lhs: Vector, rhs: Vector): Unit = {
assert((lhs, rhs) match {
case (v1: DenseVector, v2: DenseVector) => true
case (v1: SparseVector, v2: SparseVector) => true
case _ => false
}, "The vector type should be preserved after normalization.")
}

def assertValues(lhs: Array[Vector], rhs: Array[Vector]): Unit = {
assert((lhs, rhs).zipped.forall { (vector1, vector2) =>
vector1 ~== vector2 absTol 1E-5
}, "The vector value is not correct after normalization.")
def assertValues(lhs: Vector, rhs: Vector): Unit = {
assert(lhs ~== rhs absTol 1E-5, "The vector value is not correct after normalization.")
}

test("Normalization with default parameter") {
val result = collectResult(normalizer.transform(dataFrame))

assertTypeOfVector(data, result)
val expected = Seq(
Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
Vectors.dense(0.0, 0.0, 0.0),
Vectors.dense(0.184549876, -0.3383414, -0.922749378),
Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
Vectors.dense(0.897906166, 0.113419726, 0.42532397),
Vectors.sparse(3, Seq())
)
val dataFrame: DataFrame = data.zip(expected).seq.toDF("features", "expected")
val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized")

assertValues(result, l2Normalized)
testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") {
case Row(features: Vector, normalized: Vector, expected: Vector) =>
assertTypeOfVector(normalized, features)
assertValues(normalized, expected)
}
}

test("Normalization with setter") {
normalizer.setP(1)

val result = collectResult(normalizer.transform(dataFrame))

assertTypeOfVector(data, result)
val expected = Seq(
Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
Vectors.dense(0.0, 0.0, 0.0),
Vectors.dense(0.12765957, -0.23404255, -0.63829787),
Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
Vectors.dense(0.625, 0.07894737, 0.29605263),
Vectors.sparse(3, Seq())
)
val dataFrame: DataFrame = data.zip(expected).seq.toDF("features", "expected")
val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized").setP(1)

assertValues(result, l1Normalized)
testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") {
case Row(features: Vector, normalized: Vector, expected: Vector) =>
assertTypeOfVector(normalized, features)
assertValues(normalized, expected)
}
}

test("read/write") {
@@ -115,7 +93,3 @@ class NormalizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
testDefaultReadWrite(t)
}
}

private object NormalizerSuite {
case class FeatureData(features: Vector)
}