Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,21 @@ sealed trait Vector extends Serializable {
* Converts this vector to a sparse vector with all explicit zeros removed.
*/
@Since("2.0.0")
def toSparse: SparseVector
def toSparse: SparseVector = toSparseWithSize(numNonzeros)

/**
* Converts this vector to a sparse vector with all explicit zeros removed, given the number of non-zero elements.
* This method is used to avoid re-computing the number of non-zero elements when it is
* already known. This method should only be called after computing the number of non-zero
* elements via [[numNonzeros]]. e.g.
* {{{
* val nnz = numNonzeros
* val sv = toSparseWithSize(nnz)
* }}}
*
* If `nnz` is under-specified, a [[java.lang.ArrayIndexOutOfBoundsException]] is thrown.
*/
private[linalg] def toSparseWithSize(nnz: Int): SparseVector

/**
* Converts this vector to a dense vector.
Expand All @@ -152,7 +166,7 @@ sealed trait Vector extends Serializable {
val nnz = numNonzeros
// A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes.
if (1.5 * (nnz + 1.0) < size) {
toSparse
toSparseWithSize(nnz)
} else {
toDense
}
Expand Down Expand Up @@ -495,8 +509,7 @@ class DenseVector @Since("2.0.0") ( @Since("2.0.0") val values: Array[Double]) e
nnz
}

override def toSparse: SparseVector = {
val nnz = numNonzeros
private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = {
val ii = new Array[Int](nnz)
val vv = new Array[Double](nnz)
var k = 0
Expand Down Expand Up @@ -635,8 +648,7 @@ class SparseVector @Since("2.0.0") (
nnz
}

override def toSparse: SparseVector = {
val nnz = numNonzeros
private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = {
if (nnz == numActives) {
this
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -318,11 +318,21 @@ class VectorsSuite extends SparkMLFunSuite {
assert(dv0s.numActives === 2)
assert(dv0s === dv0)

assert(dv0.toSparseWithSize(dv0.numNonzeros) === dv0)
val dv0s2 = dv0.toSparseWithSize(dv0.numNonzeros)
assert(dv0s2.numActives === 2)
assert(dv0s2 === dv0s)

val sv0 = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
assert(sv0.toDense === sv0)
val sv0s = sv0.toSparse
assert(sv0s.numActives === 2)
assert(sv0s === sv0)

assert(sv0.toSparseWithSize(sv0.numNonzeros) === sv0)
val sv0s2 = sv0.toSparseWithSize(sv0.numNonzeros)
assert(sv0s2.numActives === 2)
assert(sv0s2 === sv0s)
}

test("Vector.compressed") {
Expand Down
26 changes: 18 additions & 8 deletions mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,21 @@ sealed trait Vector extends Serializable {
* Converts this vector to a sparse vector with all explicit zeros removed.
*/
@Since("1.4.0")
def toSparse: SparseVector
def toSparse: SparseVector = toSparseWithSize(numNonzeros)

/**
* Converts this vector to a sparse vector with all explicit zeros removed, given the number of non-zero elements.
* This method is used to avoid re-computing the number of non-zero elements when it is
* already known. This method should only be called after computing the number of non-zero
* elements via [[numNonzeros]]. e.g.
* {{{
* val nnz = numNonzeros
* val sv = toSparseWithSize(nnz)
* }}}
*
* If `nnz` is under-specified, a [[java.lang.ArrayIndexOutOfBoundsException]] is thrown.
*/
private[linalg] def toSparseWithSize(nnz: Int): SparseVector

/**
* Converts this vector to a dense vector.
Expand All @@ -165,7 +179,7 @@ sealed trait Vector extends Serializable {
val nnz = numNonzeros
// A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes.
if (1.5 * (nnz + 1.0) < size) {
toSparse
toSparseWithSize(nnz)
} else {
toDense
}
Expand Down Expand Up @@ -669,9 +683,7 @@ class DenseVector @Since("1.0.0") (
nnz
}

@Since("1.4.0")
override def toSparse: SparseVector = {
val nnz = numNonzeros
private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = {
val ii = new Array[Int](nnz)
val vv = new Array[Double](nnz)
var k = 0
Expand Down Expand Up @@ -822,9 +834,7 @@ class SparseVector @Since("1.0.0") (
nnz
}

@Since("1.4.0")
override def toSparse: SparseVector = {
val nnz = numNonzeros
private[linalg] override def toSparseWithSize(nnz: Int): SparseVector = {
if (nnz == numActives) {
this
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -351,11 +351,21 @@ class VectorsSuite extends SparkFunSuite with Logging {
assert(dv0s.numActives === 2)
assert(dv0s === dv0)

assert(dv0.toSparseWithSize(dv0.numNonzeros) === dv0)
val dv0s2 = dv0.toSparseWithSize(dv0.numNonzeros)
assert(dv0s2.numActives === 2)
assert(dv0s2 === dv0s)

val sv0 = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
assert(sv0.toDense === sv0)
val sv0s = sv0.toSparse
assert(sv0s.numActives === 2)
assert(sv0s === sv0)

assert(sv0.toSparseWithSize(sv0.numNonzeros) === sv0)
val sv0s2 = sv0.toSparseWithSize(sv0.numNonzeros)
assert(sv0s2.numActives === 2)
assert(sv0s2 === sv0s)
}

test("Vector.compressed") {
Expand Down
4 changes: 4 additions & 0 deletions project/MimaExcludes.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1012,6 +1012,10 @@ object MimaExcludes {
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.setFeatureSubsetStrategy"),
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"),
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy")
) ++ Seq(
// [SPARK-21680][ML][MLLIB] Optimize Vector compress
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, does this really cause a MiMa failure? What's the message; is it about adding the new method to the interface? I think it could be OK because it's a sealed trait that user code can't implement. CC maybe @MLnick or @sethah or @jkbradley for a thought on that.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message is "method toSparse(nnz: Int) in trait is present only in current version"

ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.toSparseWithSize"),
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Vector.toSparseWithSize")
)
}

Expand Down