[SPARK-8455][ML] Implement n-gram feature transformer #6887
Changes from 1 commit
NGram.scala (new file)
@@ -0,0 +1,67 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

/**
 * :: Experimental ::
 * A feature transformer that converts the input array of strings into an array of n-grams. Null
 * values in the input array are ignored.
 * It returns an array of n-grams where each n-gram is represented by a space-separated string of
 * words.
 */
@Experimental
class NGram(override val uid: String)
  extends UnaryTransformer[Seq[String], Seq[String], NGram] {

  def this() = this(Identifiable.randomUID("ngram"))

  /**
   * Minimum n-gram length, >= 1.
   * Default: 2, bigram features
   * @group param
   */
  val NGramLength: IntParam = new IntParam(this, "NGramLength", "number elements per n-gram (>=1)",
    ParamValidators.gtEq(1))

  /** @group setParam */
  def setNGramLength(value: Int): this.type = set(NGramLength, value)

  /** @group getParam */
  def getNGramLength: Int = $(NGramLength)

  setDefault(NGramLength -> 2)

  override protected def createTransformFunc: Seq[String] => Seq[String] = {
    val minLength = $(NGramLength)
    _.sliding(minLength).map(_.mkString(" ")).toSeq
  }

Contributor: Document the behavior when the input is empty or the input only contains a single element.
Member: That should probably be tested too.
Author: OK.
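For reference, here is a minimal standalone sketch (not part of this change; the object name is made up) of what the sliding call above does for the empty and single-token inputs raised in this thread, assuming the default NGramLength of 2:

```scala
// Hypothetical sketch, not part of the PR: isolates the sliding-based transform
// from createTransformFunc above and shows its edge-case behavior for n = 2.
object SlidingBehaviorSketch extends App {
  val toBigrams: Seq[String] => Seq[String] =
    _.sliding(2).map(_.mkString(" ")).toSeq

  assert(toBigrams(Seq("a", "b", "c")) == Seq("a b", "b c"))
  // An input shorter than the window size yields one truncated window, not an empty result.
  assert(toBigrams(Seq("a")) == Seq("a"))
  // An empty input yields an empty result.
  assert(toBigrams(Seq.empty) == Seq.empty)
  println("sliding edge cases behave as described")
}
```

In other words, with the implementation as written an empty input produces an empty output, while an input with fewer than NGramLength tokens comes back as a single shorter-than-n "n-gram".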

  override protected def validateInputType(inputType: DataType): Unit = {
    require(
      inputType.sameType(ArrayType(StringType)),
      s"Input type must be ArrayType(StringType) but got $inputType.")
  }

Member: no need to chop down here (Put line below on this line.)
Author: OK.

  override protected def outputDataType: DataType = new ArrayType(StringType, false)
}
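For context, a minimal usage sketch of the transformer as defined in this commit (not part of the change): it assumes an available sqlContext, as in the test suite below, and the column names and sample sentence are made up.

```scala
// Hypothetical usage sketch, not part of the PR.
import org.apache.spark.ml.feature.NGram

val wordDataFrame = sqlContext.createDataFrame(Seq(
  (0, Array("Hi", "I", "heard", "about", "Spark"))
)).toDF("id", "words")

val ngram = new NGram()
  .setNGramLength(3)        // trigrams; the default of 2 gives bigram features
  .setInputCol("words")
  .setOutputCol("ngrams")

// Each output row contains space-separated n-grams,
// e.g. "Hi I heard", "I heard about", "heard about Spark".
ngram.transform(wordDataFrame).select("ngrams").show()
```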
NGramSuite.scala (new file)
@@ -0,0 +1,68 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends SparkFunSuite with MLlibTestSparkContext {
  import org.apache.spark.ml.feature.NGramSuite._
| test("default behavior yields bigram features") { | ||
| val tokenizer = new NGram() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. rename from "tokenizer"
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. |
||
| .setInputCol("inputTokens") | ||
| .setOutputCol("NGrams") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. |
||
| val dataset = sqlContext.createDataFrame(Seq( | ||
| NGramTestData( | ||
| Array("Test", "for", "ngram", "."), | ||
| Array("Test for", "for ngram", "ngram .") | ||
| ))) | ||
| testNGram(tokenizer, dataset) | ||
| } | ||

  test("NGramLength=4 yields length 4 n-grams") {
    val tokenizer = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("NGrams")
      .setNGramLength(4)
    val dataset = sqlContext.createDataFrame(Seq(
      NGramTestData(
        Array("a", "b", "c", "d", "e"),
        Array("a b c d", "b c d e")
      )))
    testNGram(tokenizer, dataset)
  }
}

Member: ditto (tokenizer)
Author: OK.

object NGramSuite extends SparkFunSuite {

  def testNGram(t: NGram, dataset: DataFrame): Unit = {
    t.transform(dataset)
      .select("NGrams", "wantedNGrams")
      .collect()
      .foreach { case Row(actualNGrams, wantedNGrams) =>
        assert(actualNGrams === wantedNGrams)
      }
  }
}

Member: fix indentation (+2 spaces here and in next line)
Author: OK.
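One possible shape for the extra test requested in the createTransformFunc thread, to sit inside NGramSuite next to the existing tests (not part of this PR; the test name, variable name, and expected values are mine, and the expectations reflect this commit's sliding-based behavior):

```scala
  // Hypothetical follow-up test, not part of the PR as submitted.
  test("empty and single-token inputs with default NGramLength") {
    val ngramTransformer = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("NGrams")
    val dataset = sqlContext.createDataFrame(Seq(
      // With sliding as written, a lone token comes back as a single truncated "n-gram"...
      NGramTestData(Array("a"), Array("a")),
      // ...and an empty token array yields an empty n-gram array.
      NGramTestData(Array[String](), Array[String]())
    ))
    testNGram(ngramTransformer, dataset)
  }
```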
Reviewer: typo
Author: OK.