@@ -440,19 +440,14 @@ private class LinearSVCAggregator(
 
   private val numFeatures: Int = bcFeaturesStd.value.length
   private val numFeaturesPlusIntercept: Int = if (fitIntercept) numFeatures + 1 else numFeatures
-  private val coefficients: Vector = bcCoefficients.value
   private var weightSum: Double = 0.0
   private var lossSum: Double = 0.0
-  require(numFeaturesPlusIntercept == coefficients.size, s"Dimension mismatch. Coefficients " +
-    s"length ${coefficients.size}, FeaturesStd length ${numFeatures}, fitIntercept: $fitIntercept")
-
-  private val coefficientsArray = coefficients match {
-    case dv: DenseVector => dv.values
-    case _ =>
-      throw new IllegalArgumentException(
-        s"coefficients only supports dense vector but got type ${coefficients.getClass}.")
+  @transient private lazy val coefficientsArray = bcCoefficients.value match {
+    case DenseVector(values) => values
+    case _ => throw new IllegalArgumentException(s"coefficients only supports dense vector" +
+      s" but got type ${bcCoefficients.value.getClass}.")
   }
Contributor Author:

Note this eliminates the check for sparse coefficients, but I'm not sure it was ever necessary. We don't do it in other aggregators, and this is a private class, so we don't need to worry about people misusing it. Appreciate other thoughts.

Contributor:

We used to check it, I think? At some point there was a BLAS operation that only worked for dense vectors; I think that was eliminated during the linear model refactor for 2.0/2.1.

Contributor Author:

I changed this up a bit. I think keeping the check but marking the array as @transient lazy val is slightly more explicit about what we are trying to avoid here.

@AnthonyTruchet (Feb 27, 2017):

Race condition with my review below...
Thanks for anticipating it, please ignore it then :-)
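
For context, here is a minimal sketch of the serialization behavior discussed in this thread. The names (`ExampleAggregator`, the `add` update rule) are illustrative, not the PR's actual code; only the field annotations mirror the change:

```scala
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.linalg.{DenseVector, Vector}

// Sketch: a serializable aggregator that holds only a broadcast handle.
class ExampleAggregator(bcCoefficients: Broadcast[Vector]) extends Serializable {

  // A plain `val` here would dereference the broadcast on the driver and ship
  // the materialized array inside every serialized copy of this aggregator,
  // defeating the purpose of broadcasting. `@transient lazy val` is skipped by
  // serialization: each executor rebuilds the field from its locally cached
  // broadcast value on first access, and the dense-only check still runs then.
  @transient private lazy val coefficientsArray: Array[Double] =
    bcCoefficients.value match {
      case DenseVector(values) => values
      case v => throw new IllegalArgumentException(
        s"coefficients only supports dense vector but got type ${v.getClass}.")
    }

  // A plain `lazy val` is computed on first access. The zero-value aggregator
  // the driver serializes into tasks has never called add(), so this
  // (potentially large) all-zeros array is still unallocated when it ships.
  private lazy val gradientSumArray = new Array[Double](coefficientsArray.length)

  def add(label: Double, features: Vector): this.type = {
    var i = 0
    while (i < gradientSumArray.length) {
      gradientSumArray(i) += label * features(i) // placeholder update rule
      i += 1
    }
    this
  }
}
```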

-  private val gradientSumArray = Array.fill[Double](coefficientsArray.length)(0)
+  private lazy val gradientSumArray = new Array[Double](numFeaturesPlusIntercept)
 
   /**
    * Add a new training instance to this LinearSVCAggregator, and update the loss and gradient

@@ -463,6 +458,9 @@ private class LinearSVCAggregator(
    */
   def add(instance: Instance): this.type = {
     instance match { case Instance(label, weight, features) =>
+      require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0")
+      require(numFeatures == features.size, s"Dimensions mismatch when adding new instance." +
+        s" Expecting $numFeatures but got ${features.size}.")
       if (weight == 0.0) return this
       val localFeaturesStd = bcFeaturesStd.value
       val localCoefficients = coefficientsArray

@@ -530,18 +528,15 @@ private class LinearSVCAggregator(
     this
   }

-  def loss: Double = {
-    if (weightSum != 0) {
-      lossSum / weightSum
-    } else 0.0
-  }
+  def loss: Double = if (weightSum != 0) lossSum / weightSum else 0.0
 
   def gradient: Vector = {
     if (weightSum != 0) {
       val result = Vectors.dense(gradientSumArray.clone())
       scal(1.0 / weightSum, result)
       result
-    } else Vectors.dense(Array.fill[Double](coefficientsArray.length)(0))
+    } else {
+      Vectors.dense(new Array[Double](numFeaturesPlusIntercept))
+    }
   }
-
 }
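
As a hedged sketch of where these fields are forced: the shape below is assumed from how Spark ML typically drives such aggregators, since `instances` and the aggregator's `merge` method are not shown in this diff:

```scala
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.rdd.RDD

// Hypothetical driver-side usage: only broadcast handles ride along with the
// serialized zero aggregator; coefficientsArray and gradientSumArray are first
// materialized on executors inside add(), and loss/gradient are read back on
// the driver once aggregation finishes.
def computeLossAndGradient(
    instances: RDD[Instance],
    zeroAgg: LinearSVCAggregator): (Double, Vector) = {
  val aggregated = instances.treeAggregate(zeroAgg)(
    seqOp = (agg, instance) => agg.add(instance),
    combOp = (agg1, agg2) => agg1.merge(agg2))
  (aggregated.loss, aggregated.gradient)
}
```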

@@ -1436,7 +1436,7 @@ private class LogisticAggregator(
     case _ => throw new IllegalArgumentException(s"coefficients only supports dense vector but " +
       s"got type ${bcCoefficients.value.getClass}.)")
   }
-  private val gradientSumArray = new Array[Double](coefficientSize)
+  private lazy val gradientSumArray = new Array[Double](coefficientSize)
 
   if (multinomial && numClasses <= 2) {
     logInfo(s"Multinomial logistic regression for binary classification yields separate " +

@@ -580,10 +580,10 @@ private class ExpectationAggregator(
   private val k: Int = bcWeights.value.length
   private var totalCnt: Long = 0L
   private var newLogLikelihood: Double = 0.0
-  private val newWeights: Array[Double] = new Array[Double](k)
-  private val newMeans: Array[DenseVector] = Array.fill(k)(
+  private lazy val newWeights: Array[Double] = new Array[Double](k)
+  private lazy val newMeans: Array[DenseVector] = Array.fill(k)(
     new DenseVector(Array.fill[Double](numFeatures)(0.0)))
-  private val newCovs: Array[DenseVector] = Array.fill(k)(
+  private lazy val newCovs: Array[DenseVector] = Array.fill(k)(
     new DenseVector(Array.fill[Double](numFeatures * (numFeatures + 1) / 2)(0.0)))
 
   @transient private lazy val oldGaussians = {

@@ -526,7 +526,7 @@ private class AFTAggregator(
   private var totalCnt: Long = 0L
   private var lossSum = 0.0
   // Here we optimize loss function over log(sigma), intercept and coefficients
-  private val gradientSumArray = Array.ofDim[Double](length)
+  private lazy val gradientSumArray = Array.ofDim[Double](length)
 
   def count: Long = totalCnt
   def loss: Double = {

@@ -959,7 +959,7 @@ private class LeastSquaresAggregator(
   @transient private lazy val effectiveCoefficientsVector = effectiveCoefAndOffset._1
   @transient private lazy val offset = effectiveCoefAndOffset._2
 
-  private val gradientSumArray = Array.ofDim[Double](dim)
+  private lazy val gradientSumArray = Array.ofDim[Double](dim)
 
   /**
    * Add a new training instance to this LeastSquaresAggregator, and update the loss and gradient

@@ -23,7 +23,7 @@ import breeze.linalg.{DenseVector => BDV}

 import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.classification.LinearSVCSuite._
-import org.apache.spark.ml.feature.LabeledPoint
+import org.apache.spark.ml.feature.{Instance, LabeledPoint}
 import org.apache.spark.ml.linalg.{Vector, Vectors}
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}

@@ -123,6 +123,21 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
     assert(model2.intercept !== 0.0)
   }
 
+  test("sparse coefficients in SVCAggregator") {
+    val bcCoefficients = spark.sparkContext.broadcast(Vectors.sparse(2, Array(0), Array(1.0)))
+    val bcFeaturesStd = spark.sparkContext.broadcast(Array(1.0))
+    val agg = new LinearSVCAggregator(bcCoefficients, bcFeaturesStd, true)
+    val thrown = withClue("LinearSVCAggregator cannot handle sparse coefficients") {
+      intercept[IllegalArgumentException] {
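+        // coefficientsArray is a @transient lazy val, so the dense-only check
+        // fires on the first add(), not when the aggregator is constructed.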
+        agg.add(Instance(1.0, 1.0, Vectors.dense(1.0)))
+      }
+    }
+    assert(thrown.getMessage.contains("coefficients only supports dense"))
+
+    bcCoefficients.destroy(blocking = false)
+    bcFeaturesStd.destroy(blocking = false)
+  }
+
   test("linearSVC with sample weights") {
     def modelEquals(m1: LinearSVCModel, m2: LinearSVCModel): Unit = {
       assert(m1.coefficients ~== m2.coefficients absTol 0.05)