-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-5972] [MLlib] Cache residuals and gradient in GBT during training and validation #5330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
923dbf6
5869533
70d3b4c
58f4932
d542bb0
32d409d
0b5d659
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -195,17 +195,24 @@ object GradientBoostedTrees extends Logging { | |
| baseLearners(0) = firstTreeModel | ||
| baseLearnerWeights(0) = 1.0 | ||
| val startingModel = new GradientBoostedTreesModel(Regression, Array(firstTreeModel), Array(1.0)) | ||
| logDebug("error of gbt = " + loss.computeError(startingModel, input)) | ||
|
|
||
| var predError: RDD[(Double, Double)] = GradientBoostedTreesModel. | ||
| computeInitialPredictionAndError(input, 1.0, firstTreeModel, loss) | ||
| logDebug("error of gbt = " + predError.values.mean()) | ||
|
|
||
| // Note: A model of type regression is used since we require raw prediction | ||
| timer.stop("building tree 0") | ||
|
|
||
| var bestValidateError = if (validate) loss.computeError(startingModel, validationInput) else 0.0 | ||
| var validatePredError: RDD[(Double, Double)] = GradientBoostedTreesModel. | ||
| computeInitialPredictionAndError(validationInput, 1.0, firstTreeModel, loss) | ||
| var bestValidateError = if (validate) validatePredError.values.mean() else 0.0 | ||
| var bestM = 1 | ||
|
|
||
| // psuedo-residual for second iteration | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "psuedo" --> "pseudo" (I didn't notice before) |
||
| data = input.map(point => LabeledPoint(loss.gradient(startingModel, point), | ||
| point.features)) | ||
| data = predError.zip(input).map { | ||
| case ((pred, _), point) => LabeledPoint(loss.gradient(pred, point.label), point.features) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks like a bug---not from you, but from before. Shouldn't there be a "-" in front of "loss.gradient" (as in the gradient computation in the loop below)? Also, style: Put "case ... =>" on the line above:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you tell me why should there be a negative sign in any case?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can check out Algorithm 1 in this paper: Friedman. "Stochastic Gradient Boosting." 1999.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it. Thanks a lot for the intuitive explanation. Any other comments? |
||
| } | ||
|
|
||
| var m = 1 | ||
| while (m < numIterations) { | ||
| timer.start(s"building tree $m") | ||
|
|
@@ -223,14 +230,20 @@ object GradientBoostedTrees extends Logging { | |
| // Note: A model of type regression is used since we require raw prediction | ||
| val partialModel = new GradientBoostedTreesModel( | ||
| Regression, baseLearners.slice(0, m + 1), baseLearnerWeights.slice(0, m + 1)) | ||
| logDebug("error of gbt = " + loss.computeError(partialModel, input)) | ||
|
|
||
| predError = GradientBoostedTreesModel.updatePredictionError( | ||
| input, predError, learningRate, model, loss) | ||
| logDebug("error of gbt = " + predError.values.mean()) | ||
|
|
||
| if (validate) { | ||
| // Stop training early if | ||
| // 1. Reduction in error is less than the validationTol or | ||
| // 2. If the error increases, that is if the model is overfit. | ||
| // We want the model returned corresponding to the best validation error. | ||
| val currentValidateError = loss.computeError(partialModel, validationInput) | ||
|
|
||
| validatePredError = GradientBoostedTreesModel.updatePredictionError( | ||
| validationInput, validatePredError, learningRate, model, loss) | ||
| val currentValidateError = validatePredError.values.mean() | ||
| if (bestValidateError - currentValidateError < validationTol) { | ||
| return new GradientBoostedTreesModel( | ||
| boostingStrategy.treeStrategy.algo, | ||
|
|
@@ -242,8 +255,9 @@ object GradientBoostedTrees extends Logging { | |
| } | ||
| } | ||
| // Update data with pseudo-residuals | ||
| data = input.map(point => LabeledPoint(-loss.gradient(partialModel, point), | ||
| point.features)) | ||
| data = predError.zip(input).map { | ||
| case ((pred, _), point) => LabeledPoint(-loss.gradient(pred, point.label), point.features) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove extra space after "=>" |
||
| } | ||
| m += 1 | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,17 +39,15 @@ object LogLoss extends Loss { | |
| * Method to calculate the loss gradients for the gradient boosting calculation for binary | ||
| * classification | ||
| * The gradient with respect to F(x) is: - 4 y / (1 + exp(2 y F(x))) | ||
| * @param model Ensemble model | ||
| * @param point Instance of the training dataset | ||
| * @param prediction Predicted point | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "Predicted label" (Please correct elsewhere too) |
||
| * @param label True label. | ||
| * @return Loss gradient | ||
| */ | ||
| override def gradient( | ||
| model: TreeEnsembleModel, | ||
| point: LabeledPoint): Double = { | ||
| val prediction = model.predict(point.features) | ||
| - 4.0 * point.label / (1.0 + math.exp(2.0 * point.label * prediction)) | ||
| override def gradient(prediction: Double, label: Double): Double = { | ||
| - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction)) | ||
| } | ||
|
|
||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove extra newline |
||
| override def computeError(prediction: Double, label: Double): Double = { | ||
| val margin = 2.0 * label * prediction | ||
| // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -131,29 +131,19 @@ class GradientBoostedTreesModel( | |
| val numIterations = trees.length | ||
| val evaluationArray = Array.fill(numIterations)(0.0) | ||
|
|
||
| var predictionAndError: RDD[(Double, Double)] = remappedData.map { i => | ||
| val pred = treeWeights(0) * trees(0).predict(i.features) | ||
| val error = loss.computeError(pred, i.label) | ||
| (pred, error) | ||
| } | ||
| var predictionAndError = GradientBoostedTreesModel.computeInitialPredictionAndError( | ||
| remappedData, treeWeights(0), trees(0), loss) | ||
|
|
||
| evaluationArray(0) = predictionAndError.values.mean() | ||
|
|
||
| // Avoid the model being copied across numIterations. | ||
| val broadcastTrees = sc.broadcast(trees) | ||
| val broadcastWeights = sc.broadcast(treeWeights) | ||
|
|
||
| (1 until numIterations).map { nTree => | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "map" --> "foreach" |
||
| predictionAndError = remappedData.zip(predictionAndError).mapPartitions { iter => | ||
| val currentTree = broadcastTrees.value(nTree) | ||
| val currentTreeWeight = broadcastWeights.value(nTree) | ||
| iter.map { | ||
| case (point, (pred, error)) => { | ||
| val newPred = pred + currentTree.predict(point.features) * currentTreeWeight | ||
| val newError = loss.computeError(newPred, point.label) | ||
| (newPred, newError) | ||
| } | ||
| } | ||
| } | ||
| predictionAndError = GradientBoostedTreesModel.updatePredictionError( | ||
| remappedData, predictionAndError, broadcastWeights.value(nTree), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should pass the broadcast variables themselves; don't extract the value here since this happens before the mapPartitions(). |
||
| broadcastTrees.value(nTree), loss) | ||
| evaluationArray(nTree) = predictionAndError.values.mean() | ||
| } | ||
|
|
||
|
|
@@ -166,6 +156,56 @@ class GradientBoostedTreesModel( | |
|
|
||
| object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { | ||
|
|
||
| /** | ||
| * Method to compute initial error and prediction as a RDD for the first | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doc: "Compute the initial predictions and errors for a dataset for the first iteration of gradient boosting." No need to say it's a "method." This is a leftover problem from before, so don't follow previous examples. : ) |
||
| * iteration of gradient boosting. | ||
| * @param data RDD of [[org.apache.spark.mllib.regression.LabeledPoint]] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doc: "Training data" The generated doc will already include the argument type. Also a leftover problem from before... |
||
| * @param initTreeWeight: learning rate assigned to the first tree. | ||
| * @param initTree: first DecisionTreeModel | ||
| * @param loss: evaluation metric | ||
| * @return a RDD with each element being a zip of the prediction and error | ||
| * corresponding to every sample. | ||
| */ | ||
| def computeInitialPredictionAndError( | ||
| data: RDD[LabeledPoint], | ||
| initTreeWeight: Double, | ||
| initTree: DecisionTreeModel, loss: Loss): RDD[(Double, Double)] = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. style: 1 parameter per line |
||
| data.map { i => | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. rename i -> "lp" or "labeledPoint" |
||
| val pred = initTreeWeight * initTree.predict(i.features) | ||
| val error = loss.computeError(pred, i.label) | ||
| (pred, error) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Method to update a zipped predictionError RDD | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
| * (as obtained with computeInitialPredictionAndError) | ||
| * @param data RDD of [[org.apache.spark.mllib.regression.LabeledPoint]] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
| * @param predictionAndError: predictionError RDD | ||
| * @param currentTreeWeight: learning rate. | ||
| * @param currentTree: first DecisionTree | ||
| * @param loss: evaluation metric | ||
| * @return a RDD with each element being a zip of the prediction and error | ||
| * corresponing to each sample. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo: "corresponding" |
||
| */ | ||
| def updatePredictionError( | ||
| data: RDD[LabeledPoint], | ||
| predictionAndError: RDD[(Double, Double)], | ||
| currentTreeWeight: Double, | ||
| currentTree: DecisionTreeModel, | ||
| loss: Loss): RDD[(Double, Double)] = { | ||
|
|
||
| data.zip(predictionAndError).mapPartitions { iter => | ||
| iter.map { | ||
| case (point, (pred, error)) => { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "point" --> (Use the same variable name for a labeled point everywhere for consistency. I tend to use "lp") |
||
| val newPred = pred + currentTree.predict(point.features) * currentTreeWeight | ||
| val newError = loss.computeError(newPred, point.label) | ||
| (newPred, newError) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| override def load(sc: SparkContext, path: String): GradientBoostedTreesModel = { | ||
| val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path) | ||
| val classNameV1_0 = SaveLoadV1_0.thisClassName | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use "baseLearnerWeights(0)" instead of "1.0"