-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21680][ML][MLLIB]optimize Vector compress #18899
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -151,6 +151,12 @@ sealed trait Vector extends Serializable { | |
| @Since("1.4.0") | ||
| def toSparse: SparseVector | ||
|
|
||
| /** | ||
| * Converts this vector to a sparse vector with all explicit zeros removed, given that the number of nonzero elements (nnz) is already known. | ||
| */ | ||
| @Since("2.3.0") | ||
| private[linalg] def toSparse(nnz: Int): SparseVector | ||
|
|
||
| /** | ||
| * Converts this vector to a dense vector. | ||
| */ | ||
|
|
@@ -165,7 +171,7 @@ sealed trait Vector extends Serializable { | |
| val nnz = numNonzeros | ||
| // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes. | ||
| if (1.5 * (nnz + 1.0) < size) { | ||
| toSparse | ||
| toSparse(nnz) | ||
| } else { | ||
| toDense | ||
| } | ||
|
|
@@ -670,8 +676,10 @@ class DenseVector @Since("1.0.0") ( | |
| } | ||
|
|
||
| @Since("1.4.0") | ||
| override def toSparse: SparseVector = { | ||
| val nnz = numNonzeros | ||
| override def toSparse: SparseVector = toSparse(numNonzeros) | ||
|
|
||
| @Since("2.3.0") | ||
| private[linalg] override def toSparse(nnz: Int): SparseVector = { | ||
| val ii = new Array[Int](nnz) | ||
| val vv = new Array[Double](nnz) | ||
| var k = 0 | ||
|
|
@@ -823,8 +831,10 @@ class SparseVector @Since("1.0.0") ( | |
| } | ||
|
|
||
| @Since("1.4.0") | ||
| override def toSparse: SparseVector = { | ||
| val nnz = numNonzeros | ||
| override def toSparse: SparseVector = toSparse(numNonzeros) | ||
|
|
||
| @Since("2.3.0") | ||
|
||
| private[linalg] override def toSparse(nnz: Int): SparseVector = { | ||
| if (nnz == numActives) { | ||
| this | ||
| } else { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1012,6 +1012,10 @@ object MimaExcludes { | |
| ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.setFeatureSubsetStrategy"), | ||
| ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"), | ||
| ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy") | ||
| ) ++ Seq( | ||
| // [SPARK-21680][ML][MLLIB]optimize Vector compress | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hm, does this really cause a MiMa failure? What's the message? Is it about adding the new method to the interface? I think it could be OK because it's a sealed trait that user code can't implement. CC maybe @MLnick or @sethah or @jkbradley for a thought on that.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The error message is "method toSparse(nnz: Int) in trait is present only in current version". |
||
| ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.toSparse"), | ||
| ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Vector.toSparse") | ||
| ) | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't need to be overridden. Just define it in the superclass
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we define
def toSparse: SparseVector = toSparse(numNonzeros) in the superclass, then calling dv.toSparse (there are calls of this kind in the code) produces this error message:
Both toSparse in the DenseVector of type (nnz:Int) org.apache.spark.ml.linalg.SparseVector and toSparse in trait Vector of type =>org.apache.spark.ml.linalg.SparseVector match.
So we should rename toSparse(nnz: Int), maybe to toSparseWithSize(nnz: Int).