diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 922670a41b6b..681b31e8bf44 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -22,7 +22,7 @@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.shared.{HasBinary, HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.{Vectors, VectorUDT} import org.apache.spark.rdd.RDD @@ -34,7 +34,8 @@ import org.apache.spark.util.collection.OpenHashMap /** * Params for [[CountVectorizer]] and [[CountVectorizerModel]]. */ -private[feature] trait CountVectorizerParams extends Params with HasInputCol with HasOutputCol { +private[feature] trait CountVectorizerParams extends Params with HasBinary with HasInputCol + with HasOutputCol { /** * Max size of the vocabulary. @@ -101,19 +102,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit /** @group getParam */ def getMinTF: Double = $(minTF) - /** - * Binary toggle to control the output vector values. - * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for - * discrete probabilistic models that model binary events rather than integer counts. 
- * Default: false - * @group param - */ - val binary: BooleanParam = - new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.") - - /** @group getParam */ - def getBinary: Boolean = $(binary) - setDefault(binary -> false) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index 467ad7307462..dd574ca494cd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{BooleanParam, IntParam, ParamMap, ParamValidators} -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.shared.{HasBinary, HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} @@ -34,7 +34,7 @@ import org.apache.spark.sql.types.{ArrayType, StructType} */ @Experimental class HashingTF(override val uid: String) - extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { + extends Transformer with HasBinary with HasInputCol with HasOutputCol with DefaultParamsWritable { def this() = this(Identifiable.randomUID("hashingTF")) @@ -52,17 +52,6 @@ class HashingTF(override val uid: String) val numFeatures = new IntParam(this, "numFeatures", "number of features (> 0)", ParamValidators.gt(0)) - /** - * Binary toggle to control term frequency counts. - * If true, all non-zero counts are set to 1. This is useful for discrete probabilistic - * models that model binary events rather than integer counts. - * (default = false) - * @group param - */ - val binary = new BooleanParam(this, "binary", "If true, all non zero counts are set to 1. 
" + - "This is useful for discrete probabilistic models that model binary events rather " + - "than integer counts") - setDefault(numFeatures -> (1 << 18), binary -> false) /** @group getParam */ @@ -71,9 +60,6 @@ class HashingTF(override val uid: String) /** @group setParam */ def setNumFeatures(value: Int): this.type = set(numFeatures, value) - /** @group getParam */ - def getBinary: Boolean = $(binary) - /** @group setParam */ def setBinary(value: Boolean): this.type = set(binary, value) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index 1d03a5b4f404..26630401118b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -76,7 +76,10 @@ private[shared] object SharedParamsCodeGen { ParamDesc[String]("weightCol", "weight column name. If this is not set or empty, we treat " + "all instance weights as 1.0"), ParamDesc[String]("solver", "the solver algorithm for optimization. If this is not set or " + - "empty, default value is 'auto'", Some("\"auto\""))) + "empty, default value is 'auto'", Some("\"auto\"")), + ParamDesc[Boolean]("binary", "If true, all non-zero counts (after any filters are applied) " + + "are set to 1. This is useful for discrete probabilistic models that model binary events " + + "rather than integer counts. 
Default False", Some("false"))) val code = genSharedParams(params) val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala" diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 64d6af2766ca..9370a412ea0a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -389,4 +389,21 @@ private[ml] trait HasSolver extends Params { /** @group getParam */ final def getSolver: String = $(solver) } + +/** + * Trait for shared param binary (default: false). + */ +private[ml] trait HasBinary extends Params { + + /** + * Param for If true, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False. + * @group param + */ + final val binary: BooleanParam = new BooleanParam(this, "binary", "If true, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False") + + setDefault(binary, false) + + /** @group getParam */ + final def getBinary: Boolean = $(binary) +} // scalastyle:on diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 809a513316f9..9e85f9d24650 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -200,7 +200,8 @@ def getSplits(self): @inherit_doc -class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): +class CountVectorizer(JavaEstimator, HasBinary, HasInputCol, HasOutputCol, JavaMLReadable, + JavaMLWritable): """ .. 
note:: Experimental @@ -256,11 +257,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, vocabSize = Param( Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.", typeConverter=TypeConverters.toInt) - binary = Param( - Params._dummy(), "binary", "Binary toggle to control the output vector values." + - " If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful" + - " for discrete probabilistic models that model binary events rather than integer counts." + - " Default False", typeConverter=TypeConverters.toBoolean) @keyword_only def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None, @@ -510,8 +506,8 @@ def getScalingVec(self): @inherit_doc -class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable, - JavaMLWritable): +class HashingTF(JavaTransformer, HasBinary, HasInputCol, HasOutputCol, HasNumFeatures, + JavaMLReadable, JavaMLWritable): """ .. note:: Experimental @@ -536,11 +532,6 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java .. versionadded:: 1.3.0 """ - binary = Param(Params._dummy(), "binary", "If True, all non zero counts are set to 1. " + - "This is useful for discrete probabilistic models that model binary events " + - "rather than integer counts. Default False.", - typeConverter=TypeConverters.toBoolean) - @keyword_only def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None): """ diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index a7615c43bee2..1a3d9aec5088 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -148,7 +148,11 @@ def get$Name(self): ("solver", "the solver algorithm for optimization. 
If this is not set or empty, " + "default value is 'auto'.", "'auto'", "TypeConverters.toString"), ("varianceCol", "column name for the biased sample variance of prediction.", - None, "TypeConverters.toString")] + None, "TypeConverters.toString"), + ("binary", "If True, all non-zero counts (after any filters are applied) are set to 1. " + + "This is useful for discrete probabilistic models that model binary events rather than " + + "integer counts. Default False.", "False", "TypeConverters.toBoolean") + ] code = [] for name, doc, defaultValueStr, typeConverter in shared: diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index c9e975525ce1..35983ca7bdaf 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -583,6 +583,31 @@ def getVarianceCol(self): return self.getOrDefault(self.varianceCol) +class HasBinary(Params): + """ + Mixin for param binary: If True, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False. + """ + + binary = Param(Params._dummy(), "binary", "If True, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.", typeConverter=TypeConverters.toBoolean) + + def __init__(self): + super(HasBinary, self).__init__() + self._setDefault(binary=False) + + def setBinary(self, value): + """ + Sets the value of :py:attr:`binary`. + """ + self._set(binary=value) + return self + + def getBinary(self): + """ + Gets the value of binary or its default value. + """ + return self.getOrDefault(self.binary) + + class DecisionTreeParams(Params): """ Mixin for Decision Tree parameters.