Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.shared.{HasBinary, HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.linalg.{Vectors, VectorUDT}
import org.apache.spark.rdd.RDD
Expand All @@ -34,7 +34,8 @@ import org.apache.spark.util.collection.OpenHashMap
/**
* Params for [[CountVectorizer]] and [[CountVectorizerModel]].
*/
private[feature] trait CountVectorizerParams extends Params with HasInputCol with HasOutputCol {
private[feature] trait CountVectorizerParams extends Params with HasBinary with HasInputCol
with HasOutputCol {

/**
* Max size of the vocabulary.
Expand Down Expand Up @@ -101,19 +102,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
/**
 * Reads the value currently held by the `minTF` param.
 * @group getParam
 */
def getMinTF: Double = getOrDefault(minTF)

/**
* Binary toggle to control the output vector values.
* If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
* discrete probabilistic models that model binary events rather than integer counts.
* Default: false
* @group param
*/
val binary: BooleanParam =
new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.")

/** @group getParam */
def getBinary: Boolean = $(binary)

setDefault(binary -> false)
}

Expand Down
18 changes: 2 additions & 16 deletions mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{BooleanParam, IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.shared.{HasBinary, HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
Expand All @@ -34,7 +34,7 @@ import org.apache.spark.sql.types.{ArrayType, StructType}
*/
@Experimental
class HashingTF(override val uid: String)
extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable {
extends Transformer with HasBinary with HasInputCol with HasOutputCol with DefaultParamsWritable {

/** No-arg constructor: delegates to the primary constructor with a uid from `Identifiable.randomUID("hashingTF")`. */
def this() = this(Identifiable.randomUID("hashingTF"))

Expand All @@ -52,17 +52,6 @@ class HashingTF(override val uid: String)
val numFeatures = new IntParam(this, "numFeatures", "number of features (> 0)",
ParamValidators.gt(0))

/**
* Binary toggle to control term frequency counts.
* If true, all non-zero counts are set to 1. This is useful for discrete probabilistic
* models that model binary events rather than integer counts.
* (default = false)
* @group param
*/
val binary = new BooleanParam(this, "binary", "If true, all non zero counts are set to 1. " +
"This is useful for discrete probabilistic models that model binary events rather " +
"than integer counts")

setDefault(numFeatures -> (1 << 18), binary -> false)

/** @group getParam */
Expand All @@ -71,9 +60,6 @@ class HashingTF(override val uid: String)
/**
 * Stores `value` in the `numFeatures` param and returns this instance so calls can be chained.
 * @group setParam
 */
def setNumFeatures(value: Int): this.type = set(numFeatures -> value)

/** @group getParam */
def getBinary: Boolean = $(binary)

/**
 * Stores `value` in the `binary` param and returns this instance so calls can be chained.
 * @group setParam
 */
def setBinary(value: Boolean): this.type = set(binary -> value)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,10 @@ private[shared] object SharedParamsCodeGen {
ParamDesc[String]("weightCol", "weight column name. If this is not set or empty, we treat " +
"all instance weights as 1.0"),
ParamDesc[String]("solver", "the solver algorithm for optimization. If this is not set or " +
"empty, default value is 'auto'", Some("\"auto\"")))
"empty, default value is 'auto'", Some("\"auto\"")),
ParamDesc[Boolean]("binary", "If true, all non-zero counts (after any filters are applied) " +
"are set to 1. This is useful for discrete probabilistic models that model binary events " +
"rather than integer counts. Default False.", Some("false")))

val code = genSharedParams(params)
val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -389,4 +389,21 @@ private[ml] trait HasSolver extends Params {
/** @group getParam */
final def getSolver: String = $(solver)
}

/**
 * Shared-param mixin that contributes a boolean `binary` param (default: false).
 */
private[ml] trait HasBinary extends Params {

  /**
   * Boolean param controlling binary output: when true, all non-zero counts (after any filters
   * are applied) are set to 1. This is useful for discrete probabilistic models that model
   * binary events rather than integer counts. Default: false.
   * @group param
   */
  final val binary: BooleanParam = new BooleanParam(this, "binary", "If true, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.")

  // Must come after the `binary` val above so the param exists when its default is registered.
  setDefault(binary -> false)

  /**
   * Reads the value of the `binary` param, or its default when none has been set.
   * @group getParam
   */
  final def getBinary: Boolean = getOrDefault(binary)
}
// scalastyle:on
17 changes: 4 additions & 13 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,8 @@ def getSplits(self):


@inherit_doc
class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
class CountVectorizer(JavaEstimator, HasBinary, HasInputCol, HasOutputCol, JavaMLReadable,
JavaMLWritable):
"""
.. note:: Experimental

Expand Down Expand Up @@ -256,11 +257,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable,
vocabSize = Param(
Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.",
typeConverter=TypeConverters.toInt)
binary = Param(
Params._dummy(), "binary", "Binary toggle to control the output vector values." +
" If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful" +
" for discrete probabilistic models that model binary events rather than integer counts." +
" Default False", typeConverter=TypeConverters.toBoolean)

@keyword_only
def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,
Expand Down Expand Up @@ -510,8 +506,8 @@ def getScalingVec(self):


@inherit_doc
class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable,
JavaMLWritable):
class HashingTF(JavaTransformer, HasBinary, HasInputCol, HasOutputCol, HasNumFeatures,
JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental

Expand All @@ -536,11 +532,6 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java
.. versionadded:: 1.3.0
"""

binary = Param(Params._dummy(), "binary", "If True, all non zero counts are set to 1. " +
"This is useful for discrete probabilistic models that model binary events " +
"rather than integer counts. Default False.",
typeConverter=TypeConverters.toBoolean)

@keyword_only
def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None):
"""
Expand Down
6 changes: 5 additions & 1 deletion python/pyspark/ml/param/_shared_params_code_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,11 @@ def get$Name(self):
("solver", "the solver algorithm for optimization. If this is not set or empty, " +
"default value is 'auto'.", "'auto'", "TypeConverters.toString"),
("varianceCol", "column name for the biased sample variance of prediction.",
None, "TypeConverters.toString")]
None, "TypeConverters.toString"),
("binary", "If True, all non-zero counts (after any filters are applied) are set to 1. " +
"This is useful for discrete probabilistic models that model binary events rather than " +
"integer counts. Default False.", "False", "TypeConverters.toBoolean")
]

code = []
for name, doc, defaultValueStr, typeConverter in shared:
Expand Down
25 changes: 25 additions & 0 deletions python/pyspark/ml/param/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,31 @@ def getVarianceCol(self):
return self.getOrDefault(self.varianceCol)


class HasBinary(Params):
    """
    Mixin that adds the shared boolean param ``binary``: if True, all non-zero counts (after
    any filters are applied) are set to 1. This is useful for discrete probabilistic models
    that model binary events rather than integer counts. Default False.
    """

    binary = Param(Params._dummy(), "binary", "If True, all non-zero counts (after any filters are applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.", typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(HasBinary, self).__init__()
        # Register the default so getBinary() has a value before setBinary() is ever called.
        default_values = {"binary": False}
        self._setDefault(**default_values)

    def setBinary(self, value):
        """
        Sets the value of :py:attr:`binary` and returns this instance for chaining.
        """
        updates = {"binary": value}
        self._set(**updates)
        return self

    def getBinary(self):
        """
        Gets the value of binary or its default value.
        """
        binary_param = self.binary
        return self.getOrDefault(binary_param)


class DecisionTreeParams(Params):
"""
Mixin for Decision Tree parameters.
Expand Down