Commit 37d4705

[SPARK-5726] [MLLIB] Incorporated feedback.
1 parent: 1dffeee

4 files changed: +37 -63 lines changed

docs/mllib-feature-extraction.md

Lines changed: 4 additions & 23 deletions
@@ -479,7 +479,7 @@ sc.stop();
 
 ## ElementwiseProduct
 
-ElementwiseProduct multiplies individual vector samples by a provided weighting vector component-wise. This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector, `v` and transforming vector, `w`, to yield a result vector.
+ElementwiseProduct multiplies each input vector by a provided "weight" vector, using element-wise multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector, `v` and transforming vector, `w`, to yield a result vector.
 
 `\[ \begin{pmatrix}
 v_1 \\
@@ -499,7 +499,7 @@ v_N
 
 [`ElementwiseProduct`](api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct) has the following parameter in the constructor:
 
-* `w` Vector, the transforming vector.
+* `w`: the transforming vector.
 
 `ElementwiseProduct` implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer) which can apply the weighting on a `Vector` to produce a transformed `Vector` or on an `RDD[Vector]` to produce a transformed `RDD[Vector]`.
 
@@ -515,36 +515,17 @@ import org.apache.spark.SparkContext._
 import org.apache.spark.mllib.feature.ElementwiseProduct
 import org.apache.spark.mllib.linalg.Vectors
 
-//load and parse the data
+// Load and parse the data:
 val data = sc.textFile("data/mllib/kmeans_data.txt")
 val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
 
 val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
 val transformer = new ElementwiseProduct(transformingVector)
 
-//same results:
+// Batch transform and per-row transform give the same results:
 val transformedData = transformer.transform(parsedData)
 val transformedData2 = parsedData.map(x => transformer.transform(x))
 
-{% endhighlight %}
-</div>
-
-<div data-lang="python">
-{% highlight python %}
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.feature import ElementwiseProduct
-
-# Load and parse the data
-data = sc.textFile("data/mllib/kmeans_data.txt")
-parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
-
-transformingVector = Vectors.dense(0.0, 1.0, 2.0)
-transformer = ElementwiseProduct(transformingVector)
-
-# Same results:
-transformedData = transformer.transform(parsedData)
-transformedData2 = parsedData.map(lambda x: transformer.transform(x))
-
 {% endhighlight %}
 </div>
 </div>
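
Concretely, the transform the docs describe is just per-component multiplication. A minimal plain-Scala sketch (no Spark needed; the input row here is made up, while the weight vector is the `transformingVector` from the snippet above):

```scala
// Plain-Scala sketch of the element-wise (Hadamard) product: result(i) = v(i) * w(i).
val v = Array(1.0, 2.0, 3.0)  // a hypothetical input row
val w = Array(0.0, 1.0, 2.0)  // the transformingVector from the docs example
val result = v.zip(w).map { case (vi, wi) => vi * wi }
// Column 0 is zeroed out, column 1 is kept, column 2 is doubled:
println(result.mkString("(", ", ", ")"))  // (0.0, 2.0, 6.0)
```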

mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala

Lines changed: 12 additions & 4 deletions
@@ -25,18 +25,26 @@ import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
 import org.apache.spark.sql.types.DataType
 
 /**
- * :: AlphaComponent
- * Maps a vector to the hadamard product of it and a reference vector.
+ * :: AlphaComponent ::
+ * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a provided "weight" vector. In other words, it scales each column of the dataset by a scalar multiplier.
  */
 @AlphaComponent
 class ElementwiseProduct extends UnaryTransformer[Vector, Vector, ElementwiseProduct] {
 
-  /** the vector to multiply with input vectors */
-  val scalingVec : Param[Vector] = new Param(this, "scalingVector", "vector for hadamard product")
+  /**
+   * the vector to multiply with input vectors
+   * @group param
+   */
+  val scalingVec: Param[Vector] = new Param(this, "scalingVector", "vector for hadamard product")
+
+  /** @group setParam */
   def setScalingVec(value: Vector): this.type = set(scalingVec, value)
+
+  /** @group getParam */
   def getScalingVec: Vector = getOrDefault(scalingVec)
 
   override protected def createTransformFunc(paramMap: ParamMap): Vector => Vector = {
+    require(paramMap.contains(scalingVec), s"transformation requires a weight vector: $scalingVec")
     val elemScaler = new feature.ElementwiseProduct(paramMap(scalingVec))
     elemScaler.transform
   }
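
For context, a sketch of how this spark.ml transformer might be driven, assuming the Spark 1.4-era Pipeline API (`UnaryTransformer` supplies `setInputCol`/`setOutputCol`; the DataFrame contents and column names here are invented for illustration):

```scala
// Hypothetical usage sketch, not part of the commit. Assumes a live SparkContext `sc`
// and the Spark 1.4-era spark.ml API.
import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)
val df = sqlContext.createDataFrame(Seq(
  (0, Vectors.dense(1.0, 4.0, 1.9, -9.0))
)).toDF("id", "features")

val ep = new ElementwiseProduct()
  .setScalingVec(Vectors.dense(2.0, 0.5, 0.0, 0.25))  // without this, the new
  .setInputCol("features")                            // require(...) guard fails fast
  .setOutputCol("scaledFeatures")

ep.transform(df).show()  // scaledFeatures = [2.0, 2.0, 0.0, -2.25]
```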

mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala

Lines changed: 9 additions & 8 deletions
@@ -17,16 +17,16 @@
 
 package org.apache.spark.mllib.feature
 
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.mllib.linalg._
 
 /**
- * :: Experimental ::
- * Element-wise product of dense vectors by a provided vector's components.
+ * :: AlphaComponent ::
+ * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a provided "weight" vector. In other words, it scales each column of the dataset by a scalar multiplier.
  *
  * @param scalingVector The values used to scale the reference vector's individual components.
  */
-@Experimental
+@AlphaComponent
 class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer {
 
   /**
@@ -36,23 +36,24 @@ class ElementwiseProduct(val scalingVector: Vector) extends VectorTransformer {
    * @return transformed vector.
    */
   override def transform(vector: Vector): Vector = {
-    require(vector.size == scalingVector.size)
+    require(vector.size == scalingVector.size,
+      s"vector sizes do not match: ${scalingVector.size} ${vector.size}")
     vector match {
       case dv: DenseVector =>
         val values: Array[Double] = dv.values.clone()
         val dim = scalingVector.size
         var i = 0
-        while(i < dim) {
+        while (i < dim) {
           values(i) *= scalingVector(i)
-          i+=1
+          i += 1
         }
         Vectors.dense(values)
       case SparseVector(size, indices, vs) =>
         val values = vs.clone()
         val dim = values.size
         var i = 0
         while (i < dim) {
-          values(i) *= scalingVector.apply(indices(i))
+          values(i) *= scalingVector(indices(i))
           i += 1
         }
         Vectors.sparse(size, indices, values)
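
One behavioral detail worth noting in the sparse branch: only stored entries are multiplied, so a zero in the scaling vector produces an explicitly stored 0.0 rather than re-sparsifying the result. A small sketch, assuming spark-mllib on the classpath (values chosen to match the test suite below):

```scala
// Sketch of both transform branches of mllib's ElementwiseProduct.
import org.apache.spark.mllib.feature.ElementwiseProduct
import org.apache.spark.mllib.linalg.Vectors

val scaler = new ElementwiseProduct(Vectors.dense(1.0, 0.0, 0.5))

// Dense branch: every component is multiplied.
println(scaler.transform(Vectors.dense(2.0, 3.0, 4.0)))  // [2.0,0.0,2.0]

// Sparse branch: only stored entries (indices 1 and 2) are touched; index 1
// becomes an explicitly stored 0.0, and the index set is unchanged.
println(scaler.transform(Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))))  // (3,[1,2],[0.0,-1.5])
```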

mllib/src/test/scala/org/apache/spark/mllib/feature/ElementwiseProductSuite.scala

Lines changed: 12 additions & 28 deletions
@@ -17,55 +17,40 @@
 
 package org.apache.spark.mllib.feature
 
-import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, Vectors}
-import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
+import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
-class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext{
+class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext {
 
-  val denseData = Array(
-    Vectors.dense(1.0, 1.0, 0.0, 0.0),
-    Vectors.dense(1.0, 2.0, -3.0, 0.0),
-    Vectors.dense(1.0, 3.0, 0.0, 0.0),
-    Vectors.dense(1.0, 4.0, 1.9, -9.0),
-    Vectors.dense(1.0, 5.0, 0.0, 0.0)
+  val denseData = Array(
+    Vectors.dense(1.0, 4.0, 1.9, -9.0)
   )
 
   val sparseData = Array(
-    Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
-    Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))),
-    Vectors.sparse(3, Seq((1, -5.1))),
-    Vectors.sparse(3, Seq((0, 3.8), (2, 1.9))),
-    Vectors.sparse(3, Seq((0, 1.7), (1, -0.6))),
-    Vectors.sparse(3, Seq((1, 1.9)))
+    Vectors.sparse(3, Seq((1, -1.0), (2, -3.0)))
   )
 
   val scalingVector = Vectors.dense(2.0, 0.5, 0.0, 0.25)
 
   test("elementwise (hadamard) product should properly apply vector to dense data set") {
-
     val transformer = new ElementwiseProduct(scalingVector)
     val transformedData = transformer.transform(sc.makeRDD(denseData))
-
     val transformedVecs = transformedData.collect()
+    val transformedVec = transformedVecs(0).toArray
 
-    val fourthVec = transformedVecs.apply(3).toArray
-
-    assert(fourthVec.apply(0) === 2.0, "product by 2.0 should have been applied")
-    assert(fourthVec.apply(1) === 2.0, "product by 0.5 should have been applied")
-    assert(fourthVec.apply(2) === 0.0, "product by 0.0 should have been applied")
-    assert(fourthVec.apply(3) === -2.25, "product by 0.25 should have been applied")
+    assert(transformedVec(0) === 2.0, "product by 2.0 should have been applied")
+    assert(transformedVec(1) === 2.0, "product by 0.5 should have been applied")
+    assert(transformedVec(2) === 0.0, "product by 0.0 should have been applied")
+    assert(transformedVec(3) === -2.25, "product by 0.25 should have been applied")
   }
 
   test("elementwise (hadamard) product should properly apply vector to sparse data set") {
-
     val dataRDD = sc.parallelize(sparseData, 3)
-
     val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
-
     val transformer = new ElementwiseProduct(scalingVec)
-
     val data2 = sparseData.map(transformer.transform)
     val data2RDD = transformer.transform(dataRDD)
 
@@ -76,7 +61,6 @@ class ElementwiseProductSuite extends FunSuite with MLlibTestSparkContext{
     }, "The vector type should be preserved after hadamard product")
 
     assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))
-
     assert(data2(0) ~== Vectors.sparse(3, Seq((0, -2.0), (1, 0.0))) absTol 1E-5)
     assert(data2(1) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
   }
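
The dense test's expected values follow directly from the scaling vector; a quick hand check, runnable in any Scala REPL without Spark:

```scala
// Hand check of the dense test's expected values (not part of the suite):
// (1.0, 4.0, 1.9, -9.0) ⊙ (2.0, 0.5, 0.0, 0.25) = (2.0, 2.0, 0.0, -2.25)
val product = Array(1.0, 4.0, 1.9, -9.0)
  .zip(Array(2.0, 0.5, 0.0, 0.25))
  .map { case (x, s) => x * s }
assert(product.sameElements(Array(2.0, 2.0, 0.0, -2.25)))
```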
