apache · holdenk · Apr 14, 2016 · Apr 14, 2016 · Apr 14, 2016 · Apr 15, 2016
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -22,7 +22,7 @@ import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
+import org.apache.spark.ml.param.shared.{HasBinary, HasInputCol, HasOutputCol}
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.linalg.{Vectors, VectorUDT}
 import org.apache.spark.rdd.RDD
@@ -34,7 +34,8 @@ import org.apache.spark.util.collection.OpenHashMap
 /**
  * Params for [[CountVectorizer]] and [[CountVectorizerModel]].
  */
-private[feature] trait CountVectorizerParams extends Params with HasInputCol with HasOutputCol {
+private[feature] trait CountVectorizerParams extends Params with HasBinary with HasInputCol
+    with HasOutputCol {
 
   /**
    * Max size of the vocabulary.
@@ -101,19 +102,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
   /** @group getParam */
   def getMinTF: Double = $(minTF)
 
-  /**
-   * Binary toggle to control the output vector values.
-   * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
-   * discrete probabilistic models that model binary events rather than integer counts.
-   * Default: false
-   * @group param
-   */
-  val binary: BooleanParam =
-    new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.")
-
-  /** @group getParam */
-  def getBinary: Boolean = $(binary)
-
   setDefault(binary -> false)
 }
 

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -21,7 +21,7 @@ import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.Transformer
 import org.apache.spark.ml.attribute.AttributeGroup
 import org.apache.spark.ml.param.{BooleanParam, IntParam, ParamMap, ParamValidators}
-import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
+import org.apache.spark.ml.param.shared.{HasBinary, HasInputCol, HasOutputCol}
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.feature
 import org.apache.spark.sql.{DataFrame, Dataset}
@@ -34,7 +34,7 @@ import org.apache.spark.sql.types.{ArrayType, StructType}
  */
 @Experimental
 class HashingTF(override val uid: String)
-  extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable {
+  extends Transformer with HasBinary with HasInputCol with HasOutputCol with DefaultParamsWritable {
 
   def this() = this(Identifiable.randomUID("hashingTF"))
 
@@ -52,17 +52,6 @@ class HashingTF(override val uid: String)
   val numFeatures = new IntParam(this, "numFeatures", "number of features (> 0)",
     ParamValidators.gt(0))
 
-  /**
-   * Binary toggle to control term frequency counts.
-   * If true, all non-zero counts are set to 1.  This is useful for discrete probabilistic
-   * models that model binary events rather than integer counts.
-   * (default = false)
-   * @group param
-   */
-  val binary = new BooleanParam(this, "binary", "If true, all non zero counts are set to 1. " +
-    "This is useful for discrete probabilistic models that model binary events rather " +
-    "than integer counts")
-
   setDefault(numFeatures -> (1 << 18), binary -> false)
 
   /** @group getParam */
@@ -71,9 +60,6 @@ class HashingTF(override val uid: String)
   /** @group setParam */
   def setNumFeatures(value: Int): this.type = set(numFeatures, value)
 
-  /** @group getParam */
-  def getBinary: Boolean = $(binary)
-
   /** @group setParam */
   def setBinary(value: Boolean): this.type = set(binary, value)
 

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -76,7 +76,10 @@ private[shared] object SharedParamsCodeGen {
       ParamDesc[String]("weightCol", "weight column name. If this is not set or empty, we treat " +
         "all instance weights as 1.0"),
       ParamDesc[String]("solver", "the solver algorithm for optimization. If this is not set or " +
-        "empty, default value is 'auto'", Some("\"auto\"")))
+        "empty, default value is 'auto'", Some("\"auto\"")),
+      ParamDesc[Boolean]("binary", "If true, all non-zero counts (after any filters are applied) " +
+        "are set to 1. This is useful for discrete probabilistic models that model binary events " +
+        "rather than integer counts. Default False.", Some("false")))
 
     val code = genSharedParams(params)
     val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala"

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -389,4 +389,21 @@ private[ml] trait HasSolver extends Params {
   /** @group getParam */
   final def getSolver: String = $(solver)
 }
+
+/**
+ * Trait for shared param binary (default: false).
+ */
+private[ml] trait HasBinary extends Params {
+
+  /**
+   * Param for the binary toggle: if true, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default: false.
+   * @group param
+   */
+  final val binary: BooleanParam = new BooleanParam(this, "binary", "If true, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.")
+
+  setDefault(binary, false)
+
+  /** @group getParam */
+  final def getBinary: Boolean = $(binary)
+}
 // scalastyle:on
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
@@ -200,7 +200,8 @@ def getSplits(self):
 
 
 @inherit_doc
-class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
+class CountVectorizer(JavaEstimator, HasBinary, HasInputCol, HasOutputCol, JavaMLReadable,
+                      JavaMLWritable):
     """
     .. note:: Experimental
 
@@ -256,11 +257,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable,
     vocabSize = Param(
         Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.",
         typeConverter=TypeConverters.toInt)
-    binary = Param(
-        Params._dummy(), "binary", "Binary toggle to control the output vector values." +
-        " If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful" +
-        " for discrete probabilistic models that model binary events rather than integer counts." +
-        " Default False", typeConverter=TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,
@@ -510,8 +506,8 @@ def getScalingVec(self):
 
 
 @inherit_doc
-class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable,
-                JavaMLWritable):
+class HashingTF(JavaTransformer, HasBinary, HasInputCol, HasOutputCol, HasNumFeatures,
+                JavaMLReadable, JavaMLWritable):
     """
     .. note:: Experimental
 
@@ -536,11 +532,6 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
     .. versionadded:: 1.3.0
     """
 
-    binary = Param(Params._dummy(), "binary", "If True, all non zero counts are set to 1. " +
-                   "This is useful for discrete probabilistic models that model binary events " +
-                   "rather than integer counts. Default False.",
-                   typeConverter=TypeConverters.toBoolean)
-
     @keyword_only
     def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None):
         """

diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -148,7 +148,11 @@ def get$Name(self):
         ("solver", "the solver algorithm for optimization. If this is not set or empty, " +
          "default value is 'auto'.", "'auto'", "TypeConverters.toString"),
         ("varianceCol", "column name for the biased sample variance of prediction.",
-         None, "TypeConverters.toString")]
+         None, "TypeConverters.toString"),
+        ("binary", "If True, all non-zero counts (after any filters are applied) are set to 1. " +
+         "This is useful for discrete probabilistic models that model binary events rather than " +
+         "integer counts. Default False.", "False", "TypeConverters.toBoolean")
+    ]
 
     code = []
     for name, doc, defaultValueStr, typeConverter in shared:

diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
@@ -583,6 +583,31 @@ def getVarianceCol(self):
         return self.getOrDefault(self.varianceCol)
 
 
+class HasBinary(Params):
+    """
+    Mixin for param binary: If True, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.
+    """
+
+    binary = Param(Params._dummy(), "binary", "If True, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.", typeConverter=TypeConverters.toBoolean)
+
+    def __init__(self):
+        super(HasBinary, self).__init__()
+        self._setDefault(binary=False)
+
+    def setBinary(self, value):
+        """
+        Sets the value of :py:attr:`binary`.
+        """
+        self._set(binary=value)
+        return self
+
+    def getBinary(self):
+        """
+        Gets the value of :py:attr:`binary` or its default value.
+        """
+        return self.getOrDefault(self.binary)
+
 class DecisionTreeParams(Params):
     """
     Mixin for Decision Tree parameters.