-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-14238][ML][MLLIB][PYSPARK] Add binary toggle Param to PySpark HashingTF in ML & MLlib #12079
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
a65b464
dabaf89
829c87e
9c2b4ab
551cc6e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -831,6 +831,25 @@ def test_logistic_regression_summary(self): | |
| self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) | ||
|
|
||
|
|
||
class HashingTFTest(PySparkTestCase):
    """Tests for the ML HashingTF binary toggle (SPARK-14238)."""

    def test_apply_binary_term_freqs(self):
        """With binary=True, repeated terms must still yield a frequency of 1.0."""
        sqlContext = SQLContext(self.sc)
        # "a" occurs twice and "c" three times; the binary toggle should
        # collapse every non-zero count to exactly 1.0.
        df = sqlContext.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
        n = 100
        hashingTF = HashingTF()
        hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
        output = hashingTF.transform(df)
        features = output.select("features").first().features.toArray()
        # NOTE(review): assumes each single-character term hashes to
        # ord(term) % n under the transformer's hash function — confirm
        # against the Scala-side implementation if this ever breaks.
        expected = Vectors.sparse(n, {(ord("a") % n): 1.0,
                                      (ord("b") % n): 1.0,
                                      (ord("c") % n): 1.0}).toArray()
        for i in range(0, n):
            self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) +
                                   ": expected " + str(expected[i]) + ", got " + str(features[i]))
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| from pyspark.ml.tests import * | ||
| if xmlrunner: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -379,6 +379,17 @@ class HashingTF(object): | |
| """ | ||
| def __init__(self, numFeatures=1 << 20): | ||
| self.numFeatures = numFeatures | ||
| self.binary = False | ||
|
|
||
| @since("2.0.0") | ||
| def setBinary(self, value): | ||
| """ | ||
| If true, term frequency vector will be binary such that non-zero | ||
|
||
| term counts will be set to 1 | ||
| (default: false) | ||
|
||
| """ | ||
| self.binary = value | ||
| return self | ||
|
|
||
| @since('1.2.0') | ||
| def indexOf(self, term): | ||
|
|
@@ -398,7 +409,7 @@ def transform(self, document): | |
| freq = {} | ||
| for term in document: | ||
| i = self.indexOf(term) | ||
| freq[i] = freq.get(i, 0) + 1.0 | ||
| freq[i] = 1.0 if self.binary else freq.get(i, 0) + 1.0 | ||
| return Vectors.sparse(self.numFeatures, freq.items()) | ||
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We probably want to mention the default value here (namely false).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @holdenk this issue has been addressed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great! Looking at the incoming PRs, it seems a second PR is also adding a binary feature to another model. It might make sense to move this to a shared param instead of defining it per-model, although that will require coordinating with the other PR's timing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See #12308 (comment)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: in the docstring, change "if true" to "if True" to match Python's boolean literal.