[SPARK-5972] Cache residuals and gradient in GBT during training and validation

MechCoder · MechCoder · commit 923dbf6fee7e · 2015-04-02T20:03:04.000+05:30
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -195,17 +195,24 @@ object GradientBoostedTrees extends Logging {
     baseLearners(0) = firstTreeModel
     baseLearnerWeights(0) = 1.0
     val startingModel = new GradientBoostedTreesModel(Regression, Array(firstTreeModel), Array(1.0))
-    logDebug("error of gbt = " + loss.computeError(startingModel, input))
+
+    var predError: RDD[(Double, Double)] = GradientBoostedTreesModel.
+      computeInitialPredictionAndError(input, 1.0, firstTreeModel, loss)
+    logDebug("error of gbt = " + predError.values.mean())
 
     // Note: A model of type regression is used since we require raw prediction
     timer.stop("building tree 0")
 
-    var bestValidateError = if (validate) loss.computeError(startingModel, validationInput) else 0.0
+    var validatePredError: RDD[(Double, Double)] = GradientBoostedTreesModel.
+      computeInitialPredictionAndError(validationInput, 1.0, firstTreeModel, loss)
+    var bestValidateError = if (validate) validatePredError.values.mean() else 0.0
     var bestM = 1
 
     // psuedo-residual for second iteration
-    data = input.map(point => LabeledPoint(loss.gradient(startingModel, point),
-      point.features))
+    data = predError.zip(input).map {
+      case ((pred, _), point) =>  LabeledPoint(loss.gradient(pred, point.label), point.features)
+    }
+
     var m = 1
     while (m < numIterations) {
       timer.start(s"building tree $m")
@@ -223,14 +230,20 @@ object GradientBoostedTrees extends Logging {
       // Note: A model of type regression is used since we require raw prediction
       val partialModel = new GradientBoostedTreesModel(
         Regression, baseLearners.slice(0, m + 1), baseLearnerWeights.slice(0, m + 1))
-      logDebug("error of gbt = " + loss.computeError(partialModel, input))
+
+      predError = GradientBoostedTreesModel.updatePredictionError(
+        input, predError, learningRate, model, loss)
+      logDebug("error of gbt = " + predError.values.mean())
 
       if (validate) {
         // Stop training early if
         // 1. Reduction in error is less than the validationTol or
         // 2. If the error increases, that is if the model is overfit.
         // We want the model returned corresponding to the best validation error.
-        val currentValidateError = loss.computeError(partialModel, validationInput)
+
+        validatePredError = GradientBoostedTreesModel.updatePredictionError(
+          validationInput, validatePredError, learningRate, model, loss)
+        val currentValidateError = validatePredError.values.mean()
         if (bestValidateError - currentValidateError < validationTol) {
           return new GradientBoostedTreesModel(
             boostingStrategy.treeStrategy.algo,
@@ -242,8 +255,9 @@ object GradientBoostedTrees extends Logging {
         }
       }
       // Update data with pseudo-residuals
-      data = input.map(point => LabeledPoint(-loss.gradient(partialModel, point),
-        point.features))
+      data = predError.zip(input).map {
+        case ((pred, _), point) =>  LabeledPoint(-loss.gradient(pred, point.label), point.features)
+      }
       m += 1
     }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala
@@ -37,14 +37,12 @@ object AbsoluteError extends Loss {
    * Method to calculate the gradients for the gradient boosting calculation for least
    * absolute error calculation.
    * The gradient with respect to F(x) is: sign(F(x) - y)
-   * @param model Ensemble model
-   * @param point Instance of the training dataset
+   * @param prediction Predicted point
+   * @param label True label.
    * @return Loss gradient
    */
-  override def gradient(
-      model: TreeEnsembleModel,
-      point: LabeledPoint): Double = {
-    if ((point.label - model.predict(point.features)) < 0) 1.0 else -1.0
+  override def gradient(prediction: Double, label: Double): Double = {
+    if (label - prediction < 0) 1.0 else -1.0
   }
 
   override def computeError(prediction: Double, label: Double): Double = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala
@@ -39,17 +39,15 @@ object LogLoss extends Loss {
    * Method to calculate the loss gradients for the gradient boosting calculation for binary
    * classification
    * The gradient with respect to F(x) is: - 4 y / (1 + exp(2 y F(x)))
-   * @param model Ensemble model
-   * @param point Instance of the training dataset
+   * @param prediction Predicted point
+   * @param label True label.
    * @return Loss gradient
    */
-  override def gradient(
-      model: TreeEnsembleModel,
-      point: LabeledPoint): Double = {
-    val prediction = model.predict(point.features)
-    - 4.0 * point.label / (1.0 + math.exp(2.0 * point.label * prediction))
+  override def gradient(prediction: Double, label: Double): Double = {
+    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
   }
 
+
   override def computeError(prediction: Double, label: Double): Double = {
     val margin = 2.0 * label * prediction
     // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala
@@ -31,13 +31,11 @@ trait Loss extends Serializable {
 
   /**
    * Method to calculate the gradients for the gradient boosting calculation.
-   * @param model Model of the weak learner.
-   * @param point Instance of the training dataset.
+   * @param prediction Predicted feature
+   * @param label true label.
    * @return Loss gradient.
    */
-  def gradient(
-      model: TreeEnsembleModel,
-      point: LabeledPoint): Double
+  def gradient(prediction: Double, label: Double): Double
 
   /**
    * Method to calculate error of the base learner for the gradient boosting calculation.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
@@ -37,14 +37,12 @@ object SquaredError extends Loss {
    * Method to calculate the gradients for the gradient boosting calculation for least
    * squares error calculation.
    * The gradient with respect to F(x) is: - 2 (y - F(x))
-   * @param model Ensemble model
-   * @param point Instance of the training dataset
+   * @param prediction Predicted point
+   * @param label True label.
    * @return Loss gradient
    */
-  override def gradient(
-    model: TreeEnsembleModel,
-    point: LabeledPoint): Double = {
-    2.0 * (model.predict(point.features) - point.label)
+  override def gradient(prediction: Double, label: Double): Double = {
+    2.0 * (prediction - label)
   }
 
   override def computeError(prediction: Double, label: Double): Double = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
@@ -131,29 +131,19 @@ class GradientBoostedTreesModel(
     val numIterations = trees.length
     val evaluationArray = Array.fill(numIterations)(0.0)
 
-    var predictionAndError: RDD[(Double, Double)] = remappedData.map { i =>
-      val pred = treeWeights(0) * trees(0).predict(i.features)
-      val error = loss.computeError(pred, i.label)
-      (pred, error)
-    }
+    var predictionAndError = GradientBoostedTreesModel.computeInitialPredictionAndError(
+      remappedData, treeWeights(0), trees(0), loss)
+
     evaluationArray(0) = predictionAndError.values.mean()
 
     // Avoid the model being copied across numIterations.
     val broadcastTrees = sc.broadcast(trees)
     val broadcastWeights = sc.broadcast(treeWeights)
 
     (1 until numIterations).map { nTree =>
-      predictionAndError = remappedData.zip(predictionAndError).mapPartitions { iter =>
-        val currentTree = broadcastTrees.value(nTree)
-        val currentTreeWeight = broadcastWeights.value(nTree)
-        iter.map {
-          case (point, (pred, error)) => {
-            val newPred = pred + currentTree.predict(point.features) * currentTreeWeight
-            val newError = loss.computeError(newPred, point.label)
-            (newPred, newError)
-          }
-        }
-      }
+      predictionAndError = GradientBoostedTreesModel.updatePredictionError(
+        remappedData, predictionAndError, broadcastWeights.value(nTree),
+        broadcastTrees.value(nTree), loss)
       evaluationArray(nTree) = predictionAndError.values.mean()
     }
 
@@ -166,6 +156,56 @@ class GradientBoostedTreesModel(
 
 object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] {
 
+  /**
+   * Method to compute initial error and prediction as a RDD for the first
+   * iteration of gradient boosting.
+   * @param data RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
+   * @param initTreeWeight: learning rate assigned to the first tree.
+   * @param initTree: first DecisionTreeModel
+   * @param loss: evaluation metric
+   * @return a RDD with each element being a zip of the prediction and error
+   *         corresponding to every sample.
+   */
+  def computeInitialPredictionAndError(
+      data: RDD[LabeledPoint],
+      initTreeWeight: Double,
+      initTree: DecisionTreeModel, loss: Loss): RDD[(Double, Double)] = {
+    data.map { i =>
+      val pred = initTreeWeight * initTree.predict(i.features)
+      val error = loss.computeError(pred, i.label)
+      (pred, error)
+    }
+  }
+
+  /**
+   * Method to update a zipped predictionError RDD
+   * (as obtained with computeInitialPredictionAndError)
+   * @param data RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
+   * @param predictionAndError: predictionError RDD
+   * @param currentTreeWeight: learning rate.
+   * @param currentTree: first DecisionTree
+   * @param loss: evaluation metric
+   * @return a RDD with each element being a zip of the prediction and error
+   *         corresponing to each sample.
+   */
+  def updatePredictionError(
+    data: RDD[LabeledPoint],
+    predictionAndError: RDD[(Double, Double)],
+    currentTreeWeight: Double,
+    currentTree: DecisionTreeModel,
+    loss: Loss): RDD[(Double, Double)] = {
+
+    data.zip(predictionAndError).mapPartitions { iter =>
+      iter.map {
+        case (point, (pred, error)) => {
+          val newPred = pred + currentTree.predict(point.features) * currentTreeWeight
+          val newError = loss.computeError(newPred, point.label)
+          (newPred, newError)
+        }
+      }
+    }
+  }
+
   override def load(sc: SparkContext, path: String): GradientBoostedTreesModel = {
     val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path)
     val classNameV1_0 = SaveLoadV1_0.thisClassName