@@ -195,17 +195,24 @@ object GradientBoostedTrees extends Logging {
baseLearners(0) = firstTreeModel
baseLearnerWeights(0) = 1.0
val startingModel = new GradientBoostedTreesModel(Regression, Array(firstTreeModel), Array(1.0))
logDebug("error of gbt = " + loss.computeError(startingModel, input))

var predError: RDD[(Double, Double)] = GradientBoostedTreesModel.
computeInitialPredictionAndError(input, 1.0, firstTreeModel, loss)
Member:
Use "baseLearnerWeights(0)" instead of "1.0"

logDebug("error of gbt = " + predError.values.mean())

// Note: A model of type regression is used since we require raw prediction
timer.stop("building tree 0")

var bestValidateError = if (validate) loss.computeError(startingModel, validationInput) else 0.0
var validatePredError: RDD[(Double, Double)] = GradientBoostedTreesModel.
computeInitialPredictionAndError(validationInput, 1.0, firstTreeModel, loss)
var bestValidateError = if (validate) validatePredError.values.mean() else 0.0
var bestM = 1

// psuedo-residual for second iteration
Member:
"psuedo" --> "pseudo" (I didn't notice before)

data = input.map(point => LabeledPoint(loss.gradient(startingModel, point),
point.features))
data = predError.zip(input).map {
case ((pred, _), point) => LabeledPoint(loss.gradient(pred, point.label), point.features)
Member:
This looks like a bug---not from you, but from before. Shouldn't there be a "-" in front of "loss.gradient" (as in the gradient computation in the loop below)?

Also, style: Put "case ... =>" on the line above:

data = predError.zip(input).map { case (...) =>
  LabeledPoint(...)
}

Contributor Author:
Could you tell me why there should be a negative sign in any case?

Member:
You can check out Algorithm 1 in this paper: Friedman. "Stochastic Gradient Boosting." 1999.
Intuitively, we want to minimize the loss, and using the gradient of the loss gives useful weights indicating how well/poorly we are doing at predicting each instance. Since the gradient points in the direction of increasing loss, we want to negate it.
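
To make the sign concrete, here is a worked check (not part of the patch; it just plugs numbers into the SquaredError formulas shown later in this diff, using the new point-wise gradient signature):

val label = 3.0
val prediction = 1.0
val grad = 2.0 * (prediction - label)   // SquaredError.gradient(prediction, label) == -4.0
val pseudoResidual = -grad              // 4.0: positive, pushing the prediction up toward the label
assert((pseudoResidual > 0) == (label > prediction))

Without the negation, each new tree would be fit to a target pointing away from the label, and the ensemble would climb the loss instead of descending it.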
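
To make the sign concrete, here is a worked check (not part of the patch; it just plugs numbers into the SquaredError formulas shown later in this diff, using the new point-wise gradient signature):

val label = 3.0
val prediction = 1.0
val grad = 2.0 * (prediction - label)   // SquaredError.gradient(prediction, label) == -4.0
val pseudoResidual = -grad              // 4.0: positive, pushing the prediction up toward the label
assert((pseudoResidual > 0) == (label > prediction))

Without the negation, each new tree would be fit to a target pointing away from the label, and the ensemble would climb the loss instead of descending it.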

Contributor Author:
Got it. Thanks a lot for the intuitive explanation. Any other comments?

}

var m = 1
while (m < numIterations) {
timer.start(s"building tree $m")
@@ -223,14 +230,20 @@
// Note: A model of type regression is used since we require raw prediction
val partialModel = new GradientBoostedTreesModel(
Regression, baseLearners.slice(0, m + 1), baseLearnerWeights.slice(0, m + 1))
logDebug("error of gbt = " + loss.computeError(partialModel, input))

predError = GradientBoostedTreesModel.updatePredictionError(
input, predError, learningRate, model, loss)
logDebug("error of gbt = " + predError.values.mean())

if (validate) {
// Stop training early if
// 1. Reduction in error is less than the validationTol or
// 2. If the error increases, that is if the model is overfit.
// We want the model returned corresponding to the best validation error.
val currentValidateError = loss.computeError(partialModel, validationInput)

validatePredError = GradientBoostedTreesModel.updatePredictionError(
validationInput, validatePredError, learningRate, model, loss)
val currentValidateError = validatePredError.values.mean()
if (bestValidateError - currentValidateError < validationTol) {
return new GradientBoostedTreesModel(
boostingStrategy.treeStrategy.algo,
Expand All @@ -242,8 +255,9 @@ object GradientBoostedTrees extends Logging {
}
}
// Update data with pseudo-residuals
data = input.map(point => LabeledPoint(-loss.gradient(partialModel, point),
point.features))
data = predError.zip(input).map {
case ((pred, _), point) => LabeledPoint(-loss.gradient(pred, point.label), point.features)
Member:
remove extra space after "=>"

}
m += 1
}

@@ -37,14 +37,12 @@ object AbsoluteError extends Loss {
* Method to calculate the gradients for the gradient boosting calculation for least
* absolute error calculation.
* The gradient with respect to F(x) is: sign(F(x) - y)
* @param model Ensemble model
* @param point Instance of the training dataset
* @param prediction Predicted point
* @param label True label.
* @return Loss gradient
*/
override def gradient(
model: TreeEnsembleModel,
point: LabeledPoint): Double = {
if ((point.label - model.predict(point.features)) < 0) 1.0 else -1.0
override def gradient(prediction: Double, label: Double): Double = {
if (label - prediction < 0) 1.0 else -1.0
}

override def computeError(prediction: Double, label: Double): Double = {
@@ -39,17 +39,15 @@ object LogLoss extends Loss {
* Method to calculate the loss gradients for the gradient boosting calculation for binary
* classification
* The gradient with respect to F(x) is: - 4 y / (1 + exp(2 y F(x)))
* @param model Ensemble model
* @param point Instance of the training dataset
* @param prediction Predicted point
Member:
"Predicted label" (Please correct elsewhere too)

* @param label True label.
* @return Loss gradient
*/
override def gradient(
model: TreeEnsembleModel,
point: LabeledPoint): Double = {
val prediction = model.predict(point.features)
- 4.0 * point.label / (1.0 + math.exp(2.0 * point.label * prediction))
override def gradient(prediction: Double, label: Double): Double = {
- 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
}


Member:
Remove extra newline

override def computeError(prediction: Double, label: Double): Double = {
val margin = 2.0 * label * prediction
// The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
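
For reference (not part of the patch), the gradient quoted in the doc comment above follows directly from this error formula. With L(y, F) = 2 * log(1 + exp(-2 y F(x))):

  dL/dF = 2 * (-2 y) * exp(-2 y F(x)) / (1 + exp(-2 y F(x)))
        = - 4 y / (1 + exp(2 y F(x)))

which is exactly the expression implemented in gradient above.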
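
For reference (not part of the patch), the gradient quoted in the doc comment above follows directly from this error formula. With L(y, F) = 2 * log(1 + exp(-2 y F(x))):

  dL/dF = 2 * (-2 y) * exp(-2 y F(x)) / (1 + exp(-2 y F(x)))
        = - 4 y / (1 + exp(2 y F(x)))

which is exactly the expression implemented in gradient above.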
@@ -31,13 +31,11 @@ trait Loss extends Serializable {

/**
* Method to calculate the gradients for the gradient boosting calculation.
* @param model Model of the weak learner.
* @param point Instance of the training dataset.
* @param prediction Predicted feature
* @param label true label.
* @return Loss gradient.
*/
def gradient(
model: TreeEnsembleModel,
point: LabeledPoint): Double
def gradient(prediction: Double, label: Double): Double

/**
* Method to calculate error of the base learner for the gradient boosting calculation.
@@ -37,14 +37,12 @@ object SquaredError extends Loss {
* Method to calculate the gradients for the gradient boosting calculation for least
* squares error calculation.
* The gradient with respect to F(x) is: - 2 (y - F(x))
* @param model Ensemble model
* @param point Instance of the training dataset
* @param prediction Predicted point
* @param label True label.
* @return Loss gradient
*/
override def gradient(
model: TreeEnsembleModel,
point: LabeledPoint): Double = {
2.0 * (model.predict(point.features) - point.label)
override def gradient(prediction: Double, label: Double): Double = {
2.0 * (prediction - label)
}

override def computeError(prediction: Double, label: Double): Double = {
@@ -131,29 +131,19 @@ class GradientBoostedTreesModel(
val numIterations = trees.length
val evaluationArray = Array.fill(numIterations)(0.0)

var predictionAndError: RDD[(Double, Double)] = remappedData.map { i =>
val pred = treeWeights(0) * trees(0).predict(i.features)
val error = loss.computeError(pred, i.label)
(pred, error)
}
var predictionAndError = GradientBoostedTreesModel.computeInitialPredictionAndError(
remappedData, treeWeights(0), trees(0), loss)

evaluationArray(0) = predictionAndError.values.mean()

// Avoid the model being copied across numIterations.
val broadcastTrees = sc.broadcast(trees)
val broadcastWeights = sc.broadcast(treeWeights)

(1 until numIterations).map { nTree =>
Member:
"map" --> "foreach"

predictionAndError = remappedData.zip(predictionAndError).mapPartitions { iter =>
val currentTree = broadcastTrees.value(nTree)
val currentTreeWeight = broadcastWeights.value(nTree)
iter.map {
case (point, (pred, error)) => {
val newPred = pred + currentTree.predict(point.features) * currentTreeWeight
val newError = loss.computeError(newPred, point.label)
(newPred, newError)
}
}
}
predictionAndError = GradientBoostedTreesModel.updatePredictionError(
remappedData, predictionAndError, broadcastWeights.value(nTree),
Member:
You should pass the broadcast variables themselves; don't extract the value here since this happens before the mapPartitions().

broadcastTrees.value(nTree), loss)
evaluationArray(nTree) = predictionAndError.values.mean()
}
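
A sketch of the fix suggested in the comment above (hypothetical signature, not the one in this patch): accept the Broadcast handles themselves and resolve .value inside the partition closure, so each task ships only the lightweight handle rather than the full array of trees:

def updatePredictionError(
    data: RDD[LabeledPoint],
    predictionAndError: RDD[(Double, Double)],
    broadcastTrees: Broadcast[Array[DecisionTreeModel]],   // org.apache.spark.broadcast.Broadcast
    broadcastWeights: Broadcast[Array[Double]],
    treeIndex: Int,
    loss: Loss): RDD[(Double, Double)] = {
  data.zip(predictionAndError).mapPartitions { iter =>
    // .value is read on the executor, after the closure has been shipped
    val tree = broadcastTrees.value(treeIndex)
    val weight = broadcastWeights.value(treeIndex)
    iter.map { case (lp, (pred, _)) =>
      val newPred = pred + tree.predict(lp.features) * weight
      (newPred, loss.computeError(newPred, lp.label))
    }
  }
}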

@@ -166,6 +156,56 @@

object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] {

/**
* Method to compute initial error and prediction as a RDD for the first
Member:
Doc: "Compute the initial predictions and errors for a dataset for the first iteration of gradient boosting." No need to say it's a "method." This is a leftover problem from before, so don't follow previous examples. : )

* iteration of gradient boosting.
* @param data RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
Member:
Doc: "Training data" The generated doc will already include the argument type. Also a leftover problem from before...

* @param initTreeWeight: learning rate assigned to the first tree.
* @param initTree: first DecisionTreeModel
* @param loss: evaluation metric
* @return a RDD with each element being a zip of the prediction and error
* corresponding to every sample.
*/
def computeInitialPredictionAndError(
data: RDD[LabeledPoint],
initTreeWeight: Double,
initTree: DecisionTreeModel, loss: Loss): RDD[(Double, Double)] = {
Member:
style: 1 parameter per line
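
That is, the same declaration formatted with one parameter per line:

def computeInitialPredictionAndError(
    data: RDD[LabeledPoint],
    initTreeWeight: Double,
    initTree: DecisionTreeModel,
    loss: Loss): RDD[(Double, Double)] = {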

data.map { i =>
Member:
rename i -> "lp" or "labeledPoint"

val pred = initTreeWeight * initTree.predict(i.features)
val error = loss.computeError(pred, i.label)
(pred, error)
}
}

/**
* Method to update a zipped predictionError RDD
Member:
ditto

* (as obtained with computeInitialPredictionAndError)
* @param data RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
Member:
ditto

* @param predictionAndError: predictionError RDD
* @param currentTreeWeight: learning rate.
* @param currentTree: first DecisionTree
* @param loss: evaluation metric
* @return a RDD with each element being a zip of the prediction and error
* corresponing to each sample.
Member:
typo: "corresponding"

*/
def updatePredictionError(
data: RDD[LabeledPoint],
predictionAndError: RDD[(Double, Double)],
currentTreeWeight: Double,
currentTree: DecisionTreeModel,
loss: Loss): RDD[(Double, Double)] = {

data.zip(predictionAndError).mapPartitions { iter =>
iter.map {
case (point, (pred, error)) => {
Member:
"point" --> (Use the same variable name for a labeled point everywhere for consistency. I tend to use "lp")

val newPred = pred + currentTree.predict(point.features) * currentTreeWeight
val newError = loss.computeError(newPred, point.label)
(newPred, newError)
}
}
}
}
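
Taken together, the two helpers are meant to be chained across boosting iterations. A rough sketch of the intended usage (mirroring the training loop in the first file of this diff; surrounding names such as baseLearners and baseLearnerWeights are assumed from that context):

var predError = GradientBoostedTreesModel.computeInitialPredictionAndError(
  input, baseLearnerWeights(0), baseLearners(0), loss)
var m = 1
while (m < numIterations) {
  // after fitting tree m, fold its weighted predictions into the cached RDD
  predError = GradientBoostedTreesModel.updatePredictionError(
    input, predError, baseLearnerWeights(m), baseLearners(m), loss)
  m += 1
}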

override def load(sc: SparkContext, path: String): GradientBoostedTreesModel = {
val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path)
val classNameV1_0 = SaveLoadV1_0.thisClassName