Commit 37d4705

[SPARK-5726] [MLLIB] Incorporated feedback.
1 parent: 1dffeee

4 files changed: +37 -63 lines changed

docs/mllib-feature-extraction.md

Lines changed: 4 additions & 23 deletions
@@ -479,7 +479,7 @@ sc.stop();
 
 ## ElementwiseProduct
 
-ElementwiseProduct multiplies individual vector samples by a provided weighting vector component-wise. This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector, `v` and transforming vector, `w`, to yield a result vector.
+ElementwiseProduct multiplies each input vector by a provided "weight" vector, using element-wise multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector, `v` and transforming vector, `w`, to yield a result vector.
 
 `\[ \begin{pmatrix}
 v_1 \\
@@ -499,7 +499,7 @@ v_N
 
 [`ElementwiseProduct`](api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct) has the following parameter in the constructor:
 
-* `w` Vector, the transforming vector.
+* `w`: the transforming vector.
 
 `ElementwiseProduct` implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer) which can apply the weighting on a `Vector` to produce a transformed `Vector` or on an `RDD[Vector]` to produce a transformed `RDD[Vector]`.
 
@@ -515,36 +515,17 @@ import org.apache.spark.SparkContext._
 import org.apache.spark.mllib.feature.ElementwiseProduct
 import org.apache.spark.mllib.linalg.Vectors
 
-//load and parse the data
+// Load and parse the data:
 val data = sc.textFile("data/mllib/kmeans_data.txt")
 val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
 
 val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
 val transformer = new ElementwiseProduct(transformingVector)
 
-//same results:
+// Batch transform and per-row transform give the same results:
 val transformedData = transformer.transform(parsedData)
 val transformedData2 = parsedData.map(x => transformer.transform(x))
 
-{% endhighlight %}
-</div>
-
-<div data-lang="python">
-{% highlight python %}
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.feature import ElementwiseProduct
-
-# Load and parse the data
-data = sc.textFile("data/mllib/kmeans_data.txt")
-parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
-
-transformingVector = Vectors.dense(0.0, 1.0, 2.0)
-transformer = ElementwiseProduct(transformingVector)
-
-# Same results:
-transformedData = transformer.transform(parsedData)
-transformedData2 = parsedData.map(lambda x: transformer.transform(x))
-
 {% endhighlight %}
 </div>
 </div>
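
Concretely, the transform the docs describe is just per-component multiplication. A minimal plain-Scala sketch (no Spark needed; the input row here is made up, while the weight vector is the `transformingVector` from the snippet above):

```scala
// Plain-Scala sketch of the element-wise (Hadamard) product: result(i) = v(i) * w(i).
val v = Array(1.0, 2.0, 3.0)  // a hypothetical input row
val w = Array(0.0, 1.0, 2.0)  // the transformingVector from the docs example
val result = v.zip(w).map { case (vi, wi) => vi * wi }
// Column 0 is zeroed out, column 1 is kept, column 2 is doubled:
println(result.mkString("(", ", ", ")"))  // (0.0, 2.0, 6.0)
```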

mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala

Lines changed: 12 additions & 4 deletions
@@ -25,18 +25,26 @@ import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
 import org.apache.spark.sql.types.DataType
 
 /**
- * :: AlphaComponent
- * Maps a vector to the hadamard product of it and a reference vector.
+ * :: AlphaComponent ::
+ * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a provided "weight" vector. In other words, it scales each column of the dataset by a scalar multiplier.
  */
 @AlphaComponent
 class ElementwiseProduct extends UnaryTransformer[Vector, Vector, ElementwiseProduct] {
 
-  /** the vector to multiply with input vectors */
-  val scalingVec : Param[Vector] = new Param(this, "scalingVector", "vector for hadamard product")
+  /**
+   * the vector to multiply with input vectors
+   * @group param
+   */
+  val scalingVec: Param[Vector] = new Param(this, "scalingVector", "vector for hadamard product")
+
+  /** @group setParam */
   def setScalingVec(value: Vector): this.type = set(scalingVec, value)
+
+  /** @group getParam */
   def getScalingVec: Vector = getOrDefault(scalingVec)
 
   override protected def createTransformFunc(paramMap: ParamMap): Vector => Vector = {
+    require(paramMap.contains(scalingVec), s"transformation requires a weight vector: $scalingVec")
     val elemScaler = new feature.ElementwiseProduct(paramMap(scalingVec))
     elemScaler.transform
   }
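
For context, a sketch of how this spark.ml transformer might be driven, assuming the Spark 1.4-era Pipeline API (`UnaryTransformer` supplies `setInputCol`/`setOutputCol`; the DataFrame contents and column names here are invented for illustration):

```scala
// Hypothetical usage sketch, not part of the commit. Assumes a live SparkContext `sc`
// and the Spark 1.4-era spark.ml API.
import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)
val df = sqlContext.createDataFrame(Seq(
  (0, Vectors.dense(1.0, 4.0, 1.9, -9.0))
)).toDF("id", "features")

val ep = new ElementwiseProduct()
  .setScalingVec(Vectors.dense(2.0, 0.5, 0.0, 0.25))  // without this, the new
  .setInputCol("features")                            // require(...) guard fails fast
  .setOutputCol("scaledFeatures")

ep.transform(df).show()  // scaledFeatures = [2.0, 2.0, 0.0, -2.25]
```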

mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala

Lines changed: 9 additions & 8 deletions
@@ -17,16 +17,16 @@
 
 package org.apache.spark.mllib.feature
 
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.mllib.linalg._
 
 /**
- * :: Experimental ::
- * Element-wise product of dense vectors by a provided vector's components.
+ * :: AlphaComponent ::
+ * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a provided "weight" vector. In other words, it scales each column of the dataset by a scalar multiplier.
  *
  * @param scalingVector The values used to scale the reference vector's individual components.
  */
-@Experimental
+@AlphaComponent
 class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer {
 
   /**
@@ -36,23 +36,24 @@ class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer {
    * @return transformed vector.
    */
   override def transform(vector: Vector): Vector = {
-    require(vector.size == scalingVector.size)
+    require(vector.size == scalingVector.size,
+      s"vector sizes do not match: ${scalingVector.size} ${vector.size}")
     vector match {
       case dv: DenseVector =>
         val values: Array[Double] = dv.values.clone()
         val dim = scalingVector.size
         var i = 0
-        while(i < dim) {
+        while (i < dim) {
           values(i) *= scalingVector(i)
-          i+=1
+          i += 1
         }
         Vectors.dense(values)
       case SparseVector(size, indices, vs) =>
         val values = vs.clone()
         val dim = values.size
         var i = 0
         while (i < dim) {
-          values(i) *= scalingVector.apply(indices(i))
+          values(i) *= scalingVector(indices(i))
           i += 1
         }
         Vectors.sparse(size, indices, values)
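
One behavioral detail worth noting in the sparse branch: only stored entries are multiplied, so a zero in the scaling vector produces an explicitly stored 0.0 rather than re-sparsifying the result. A small sketch, assuming spark-mllib on the classpath (values chosen to match the test suite below):

```scala
// Sketch of both transform branches of mllib's ElementwiseProduct.
import org.apache.spark.mllib.feature.ElementwiseProduct
import org.apache.spark.mllib.linalg.Vectors

val scaler = new ElementwiseProduct(Vectors.dense(1.0, 0.0, 0.5))

// Dense branch: every component is multiplied.
println(scaler.transform(Vectors.dense(2.0, 3.0, 4.0)))  // [2.0,0.0,2.0]

// Sparse branch: only stored entries (indices 1 and 2) are touched; index 1
// becomes an explicitly stored 0.0, and the index set is unchanged.
println(scaler.transform(Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))))  // (3,[1,2],[0.0,-1.5])
```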

mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala

Lines changed: 12 additions & 28 deletions
@@ -17,55 +17,40 @@
 
 package org.apache.spark.mllib.feature
 
-import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, Vectors}
-import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext{
+class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext {
 
-  val denseData = Array(
-    Vectors.dense(1.0, 1.0, 0.0, 0.0),
-    Vectors.dense(1.0, 2.0, -3.0, 0.0),
-    Vectors.dense(1.0, 3.0, 0.0, 0.0),
-    Vectors.dense(1.0, 4.0, 1.9, -9.0),
-    Vectors.dense(1.0, 5.0, 0.0, 0.0)
+  val denseData = Array(
+    Vectors.dense(1.0, 4.0, 1.9, -9.0)
   )
 
   val sparseData = Array(
-    Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
-    Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))),
-    Vectors.sparse(3, Seq((1, -5.1))),
-    Vectors.sparse(3, Seq((0, 3.8), (2, 1.9))),
-    Vectors.sparse(3, Seq((0, 1.7), (1, -0.6))),
-    Vectors.sparse(3, Seq((1, 1.9)))
+    Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
   )
 
   val scalingVector = Vectors.dense(2.0, 0.5, 0.0, 0.25)
 
   test("elementwise (hadamard) product should properly apply vector to dense data set") {
-
     val transformer = new ElementwiseProduct(scalingVector)
     val transformedData = transformer.transform(sc.makeRDD(denseData))
-
     val transformedVecs = transformedData.collect()
+    val transformedVec = transformedVecs(0).toArray
 
-    val fourthVec = transformedVecs.apply(3).toArray
-
-    assert(fourthVec.apply(0) === 2.0, "product by 2.0 should have been applied")
-    assert(fourthVec.apply(1) === 2.0, "product by 0.5 should have been applied")
-    assert(fourthVec.apply(2) === 0.0, "product by 0.0 should have been applied")
-    assert(fourthVec.apply(3) === -2.25, "product by 0.25 should have been applied")
+    assert(transformedVec(0) === 2.0, "product by 2.0 should have been applied")
+    assert(transformedVec(1) === 2.0, "product by 0.5 should have been applied")
+    assert(transformedVec(2) === 0.0, "product by 0.0 should have been applied")
+    assert(transformedVec(3) === -2.25, "product by 0.25 should have been applied")
   }
 
   test("elementwise (hadamard) product should properly apply vector to sparse data set") {
-
     val dataRDD = sc.parallelize(sparseData, 3)
-
     val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
-
     val transformer = new ElementwiseProduct(scalingVec)
-
     val data2 = sparseData.map(transformer.transform)
     val data2RDD = transformer.transform(dataRDD)
 
@@ -76,7 +61,6 @@ class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext{
     }, "The vector type should be preserved after hadamard product")
 
     assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
-
     assert(data2(0) ~== Vectors.sparse(3, Seq((0, -2.0), (1, 0.0))) absTol 1E-5)
     assert(data2(1) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
   }
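
The dense test's expected values follow directly from the scaling vector; a quick hand check, runnable in any Scala REPL without Spark:

```scala
// Hand check of the dense test's expected values (not part of the suite):
// (1.0, 4.0, 1.9, -9.0) ⊙ (2.0, 0.5, 0.0, 0.25) = (2.0, 2.0, 0.0, -2.25)
val product = Array(1.0, 4.0, 1.9, -9.0)
  .zip(Array(2.0, 0.5, 0.0, 0.25))
  .map { case (x, s) => x * s }
assert(product.sameElements(Array(2.0, 2.0, 0.0, -2.25)))
```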
