update StandardScaler to use SimpleTransformer

mengxr · mengxr · commit a0e005436acb · 2014-11-09T12:18:00.000-08:00
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
@@ -19,11 +19,12 @@ package org.apache.spark.examples.ml
 
 import scala.beans.BeanInfo
 
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.{Pipeline, SimpleTransformer}
 import org.apache.spark.ml.classification.LogisticRegression
 import org.apache.spark.ml.feature.HashingTF
-import org.apache.spark.ml.{Pipeline, SimpleTransformer}
+import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.sql.SQLContext
-import org.apache.spark.{SparkConf, SparkContext}
 
 @BeanInfo
 case class LabeledDocument(id: Long, text: String, label: Double)
@@ -36,7 +37,8 @@ case class Document(id: Long, text: String)
  */
 class SimpleTokenizer extends SimpleTransformer[String, Seq[String], SimpleTokenizer]
     with Serializable {
-  override def createTransformFunc: String => Seq[String] = _.toLowerCase.split("\\s")
+  override def createTransformFunc(paramMap: ParamMap): String => Seq[String] =
+    _.toLowerCase.split("\\s")
 }
 
 /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala b/mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
@@ -74,12 +74,17 @@ abstract class SimpleTransformer[IN, OUT: TypeTag, SELF <: SimpleTransformer[IN,
   def setInputCol(value: String): SELF = { set(inputCol, value); this.asInstanceOf[SELF] }
   def setOutputCol(value: String): SELF = { set(outputCol, value); this.asInstanceOf[SELF] }
 
-  def createTransformFunc: IN => OUT
+  /**
+   * Creates the transform function using the given param map. The input param map has already
+   * been merged with this transformer's embedded param map, so param values should be read
+   * solely from the input param map.
+   */
+  protected def createTransformFunc(paramMap: ParamMap): IN => OUT
 
   override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = {
     import dataset.sqlContext._
     val map = this.paramMap ++ paramMap
-    val udf: IN => OUT = this.createTransformFunc
+    val udf: IN => OUT = this.createTransformFunc(map)
     dataset.select(Star(None), udf.call(map(inputCol).attr) as map(outputCol))
   }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -1,29 +1,18 @@
 package org.apache.spark.ml.feature
 
-import org.apache.spark.ml.Transformer
-import org.apache.spark.ml.param.{HasInputCol, HasOutputCol, IntParam, ParamMap}
+import org.apache.spark.ml.SimpleTransformer
+import org.apache.spark.ml.param.{IntParam, ParamMap}
 import org.apache.spark.mllib.feature
 import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.sql.SchemaRDD
-import org.apache.spark.sql.catalyst.analysis.Star
-import org.apache.spark.sql.catalyst.dsl._
 
-class HashingTF extends Transformer with HasInputCol with HasOutputCol {
-
-  def setInputCol(value: String) = { set(inputCol, value); this }
-  def setOutputCol(value: String) = { set(outputCol, value); this }
+class HashingTF extends SimpleTransformer[Iterable[_], Vector, HashingTF] {
 
   val numFeatures = new IntParam(this, "numFeatures", "number of features", Some(1 << 18))
   def setNumFeatures(value: Int) = { set(numFeatures, value); this }
   def getNumFeatures: Int = get(numFeatures)
 
-  override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = {
-    import dataset.sqlContext._
-    val map = this.paramMap ++ paramMap
-    val hashingTF = new feature.HashingTF(map(numFeatures))
-    val t: Iterable[_] => Vector = (doc) => {
-      hashingTF.transform(doc)
-    }
-    dataset.select(Star(None), t.call(map(inputCol).attr) as map(outputCol))
+  override protected def createTransformFunc(paramMap: ParamMap): Iterable[_] => Vector = {
+    val hashingTF = new feature.HashingTF(paramMap(numFeatures))
+    hashingTF.transform
   }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/package.scala b/mllib/src/main/scala/org/apache/spark/ml/package.scala
@@ -1,4 +1,19 @@
-// to fail jenkins
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 package org.apache.spark