apache · narahari92 · Sep 15, 2017 · Sep 15, 2017 · Sep 15, 2017 · srowen
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/GenFuncTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/GenFuncTransformer.scala
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import scala.util.Random
+
+import org.apache.spark.annotation.Since
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.param.Param
+import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.param.shared.HasInputCols
+import org.apache.spark.ml.param.shared.HasOutputCol
+import org.apache.spark.ml.util.DefaultParamsReadable
+import org.apache.spark.ml.util.DefaultParamsWritable
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.Dataset
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.RowEncoder
+import org.apache.spark.sql.types.DoubleType
+import org.apache.spark.sql.types.NumericType
+import org.apache.spark.sql.types.StructField
+import org.apache.spark.sql.types.StructType
+
+import javax.script.ScriptEngineManager
+
+
+/**
+ * A feature transformer that executes a given javascript function on dataframe columns.
+ */
+@Since("2.3.0")
+class GenFuncTransformer(override val uid: String)
+  extends Transformer with HasInputCols with HasOutputCol with DefaultParamsWritable {
+
+  def this() = this(Identifiable.randomUID("mathTransformer"))
+
+  final val function: Param[String] = new Param[String](this, "function", "Evaulation function written in javascript which return numerical value")
+
+  /** @group setParam */
+  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  /** @group setParam */
+  def setFunction(value: String): this.type = set(function, value)
+
+  @Since("2.3.0")
+  def transform(dataset: Dataset[_]): DataFrame = {
+    val schema = dataset.schema
+    implicit val rowEncoder = RowEncoder(transformSchema(schema))
+    dataset.toDF.mapPartitions {
+      rows =>
+        val engine = new ScriptEngineManager().getEngineByName("JavaScript")
+        val func = {
+          val alphabets = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+          (for(i <- 0 until 10) yield {
+            val position = Random.nextInt(52)
+            alphabets.charAt(position)
+          }).mkString
+        }
+        engine.eval("var " + func + " = " + $(function).stripMargin)
+        rows.map {
+          row =>
+            $(inputCols).foreach {
+              col =>
+                val datatype = schema(col).dataType
+                datatype match {
+                  case _: NumericType => engine.eval(col + "=" + row.get(row.fieldIndex(col)))
+                  case _ => engine.eval(col + "=\"" + row.get(row.fieldIndex(col)).toString + "\"")
+                }
+            }
+            val args = $(inputCols).mkString("(", ",", ")")
+            val resObj = engine.eval(func + args)
+            val res = if(resObj == null) Double.NaN else resObj.toString.toDouble
+            Row.fromSeq(row.toSeq :+ res)
+        }
+    }
+  }
+
+  @Since("2.3.0")
+  def transformSchema(schema: StructType): StructType = {
+    schema.add(StructField($(outputCol), DoubleType, true))
+  }
+
+  @Since("2.3.0")
+  def copy(extra: ParamMap): GenFuncTransformer = defaultCopy(extra)
+}
+
+@Since("2.3.0")
+object GenFuncTransformer extends DefaultParamsReadable[GenFuncTransformer] {
+
+  @Since("2.3.0")
+  override def load(path: String): GenFuncTransformer = super.load(path)
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/GenFuncTransformerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/GenFuncTransformerSuite.scala
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.sql.types.DoubleType
+import org.apache.spark.sql.types.StructField
+import org.apache.spark.sql.types.StructType
+
+
+class GenFuncTransformerSuite 
+  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
+
+  import testImplicits._
+
+  test("params") {
+    ParamsSuite.checkParams(new GenFuncTransformer)
+  }
+
+  test("execute simple add function") {
+    val function = "function(a, b) { return a + b;}"
+    val original = Seq((1.0, 2.0), (3.0, 4.0)).toDF("v1", "v2")
+    val transformer = new GenFuncTransformer().setInputCols(Array("v1", "v2")).setOutputCol("result").setFunction(function)
+    val result = transformer.transform(original)
+    val resultSchema = transformer.transformSchema(original.schema)
+    val expected = Seq((1.0, 2.0, 3.0), (3.0, 4.0, 7.0)).toDF("v1", "v2", "result")
+    val expectedSchema = StructType(original.schema.fields :+ StructField("result", DoubleType, true))
+    assert(result.schema.toString == resultSchema.toString)
+    assert(resultSchema == expectedSchema)
+    assert(result.collect().toSeq == expected.collect().toSeq)
+    assert(original.sparkSession.catalog.listTables().count() == 0)
+  }
+
+  test("execute function when input column is non numeric") {
+    val function = "function(a) { return a.length; }"
+    val original = Seq((1, "hello"), (2, "sparkml")).toDF("id", "message")
+    val transformer = new GenFuncTransformer().setInputCols(Array("message")).setOutputCol("length").setFunction(function)
+    val result = transformer.transform(original)
+    val expected = Seq((1, "hello", 5.0), (2, "sparkml", 7.0)).toDF("id", "message", "length")
+    assert(result.collect().toSeq == expected.collect().toSeq)
+  }
+
+  test("read/write") {
+    val t = new GenFuncTransformer()
+              .setInputCols(Array("v1", "v2"))
+              .setOutputCol("result")
+              .setFunction("function(a, b) { return a + b;}")
+    testDefaultReadWrite(t)
+  }
+}