4 changes: 2 additions & 2 deletions docs/mllib-ensembles.md
@@ -464,8 +464,8 @@ first one being the training dataset and the second being the validation dataset
 The training is stopped when the improvement in the validation error is not more than a certain tolerance
 (supplied by the `validationTol` argument in `BoostingStrategy`). In practice, the validation error
 decreases initially and later increases. There might be cases in which the validation error does not change monotonically,
-and the user is advised to set a large enough negative tolerance and examine the validation curve to tune the number of
-iterations.
+and the user is advised to set a large enough negative tolerance and examine the validation curve using `evaluateEachIteration`
+(which gives the error or loss per iteration) to tune the number of iterations.

### Examples

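As an illustration of the workflow described above, a minimal sketch of picking the iteration count with the new API (trainingData and validationData are placeholder RDD[LabeledPoint] names; SquaredError is one of the provided losses):

import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.loss.SquaredError

val boostingStrategy = BoostingStrategy.defaultParams("Regression")
val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

// One loss value per boosting iteration, computed on the validation set.
val errors = model.evaluateEachIteration(validationData, SquaredError)
// Choose the iteration count with the lowest validation loss.
val bestNumIterations = errors.indexOf(errors.min) + 1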
mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala
@@ -61,4 +61,18 @@ object AbsoluteError extends Loss {
       math.abs(err)
     }.mean()
   }
+
+  /**
+   * Method to calculate loss when the predictions are already known.
+   * Note: This method is used in the method evaluateEachIteration to avoid recomputing the
+   * predicted values from previously fit trees.
+   * @param datum: LabeledPoint
+   * @param prediction: Predicted label.
+   * @return Absolute error of model on the given datapoint.
+   */
+  override def computeError(datum: LabeledPoint, prediction: Double): Double = {
+    val err = datum.label - prediction
+    math.abs(err)
+  }
+
 }

Member: No need for the doc comment; it will be inherited from the overridden method (here and in the other two loss classes).

Contributor Author: But the doc for the return value is different, no?

Member: I think it's OK for the doc for gradient() and computeError() to be generic, as long as the doc for the loss classes describes the specific loss function.

Contributor Author: OK, so should I remove it?

Member: Yes, please.

mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala
@@ -66,4 +66,19 @@ object LogLoss extends Loss {
       2.0 * MLUtils.log1pExp(-margin)
     }.mean()
   }
+
+  /**
+   * Method to calculate loss when the predictions are already known.
+   * Note: This method is used in the method evaluateEachIteration to avoid recomputing the
+   * predicted values from previously fit trees.
+   * @param datum: LabeledPoint
+   * @param prediction: Predicted label.
+   * @return log loss of model on the datapoint.
+   */
+  override def computeError(datum: LabeledPoint, prediction: Double): Double = {
+    val margin = 2.0 * datum.label * prediction
+    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
+    2.0 * MLUtils.log1pExp(-margin)
+  }
+
 }

10 changes: 10 additions & 0 deletions mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala
@@ -49,4 +49,14 @@ trait Loss extends Serializable {
    */
   def computeError(model: TreeEnsembleModel, data: RDD[LabeledPoint]): Double

Member: Provide a default implementation using the other computeError, and then remove the overridden copies from the child classes.

Contributor Author: Great, it gives mental satisfaction to remove huge blocks of code :P

Member: : )

+
+  /**
+   * Method to calculate loss when the predictions are already known.
+   * Note: This method is used in the method evaluateEachIteration to avoid recomputing the
+   * predicted values from previously fit trees.
+   * @param datum: LabeledPoint

Member: No colon after the param name (here and elsewhere).

+   * @param prediction: Predicted label.
+   * @return Measure of model error on datapoint.
+   */
+  def computeError(datum: LabeledPoint, prediction: Double) : Double

Member: Switch the arg order to match the batch computeError more closely. Also, no space before the colon (here and elsewhere).

 }
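
A minimal sketch of the default implementation the reviewer is asking for, with the style fixes applied (args reordered to put the prediction first, no colon after param names, no space before the return-type colon); this is an illustration of the suggestion, not the final committed code:

  /**
   * Method to calculate error of the base learner for the gradient boosting calculation.
   * @param model Ensemble model
   * @param data Dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
   * @return Measure of model error on data
   */
  def computeError(model: TreeEnsembleModel, data: RDD[LabeledPoint]): Double = {
    // Delegate to the per-datapoint overload so loss classes only implement that one.
    data.map(point => computeError(model.predict(point.features), point)).mean()
  }

  /**
   * Method to calculate loss when the predictions are already known.
   * @param prediction Predicted label
   * @param datum LabeledPoint
   * @return Measure of model error on the datapoint
   */
  def computeError(prediction: Double, datum: LabeledPoint): Double
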
mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
@@ -61,4 +61,18 @@ object SquaredError extends Loss {
       err * err
     }.mean()
   }
+
+  /**
+   * Method to calculate loss when the predictions are already known.
+   * Note: This method is used in the method evaluateEachIteration to avoid recomputing the
+   * predicted values from previously fit trees.
+   * @param datum: LabeledPoint
+   * @param prediction: Predicted label.
+   * @return Mean squared error of model on datapoint.
+   */
+  override def computeError(datum: LabeledPoint, prediction: Double): Double = {
+    val err = prediction - datum.label
+    err * err
+  }
+
 }

mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
@@ -28,9 +28,11 @@ import org.apache.spark.{Logging, SparkContext}
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.Algo
 import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.configuration.EnsembleCombiningStrategy._
+import org.apache.spark.mllib.tree.loss.Loss
 import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SQLContext
@@ -108,6 +110,53 @@ class GradientBoostedTreesModel(
   }

   override protected def formatVersion: String = TreeEnsembleModel.SaveLoadV1_0.thisFormatVersion
+
+  /**
+   * Method to compute error or loss for every iteration of gradient boosting.
+   * @param data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
+   * @param loss: evaluation metric.
+   * @return an array with index i having the losses or errors for the ensemble
+   *         containing trees 1 to i + 1

Member: Minor: use 0-based indexing for the doc ("containing trees 0 to i"), or just say "containing the first i+1 trees".

+   */
+  def evaluateEachIteration(
+      data: RDD[LabeledPoint],
+      loss: Loss) : Array[Double] = {
+
+    val sc = data.sparkContext
+    val remappedData = algo match {
+      case Classification => data.map(x => new LabeledPoint((x.label * 2) - 1, x.features))
+      case _ => data
+    }
+    val initialTree = trees(0)

Member: Remove? It is only used once.

+    val numIterations = trees.length
+    val evaluationArray = Array.fill(numIterations)(0.0)
+
+    // Initial weight is 1.0

Member: May as well use the initial weight explicitly, in case it changes for some reason in the future.

+    var predictionErrorModel = remappedData.map {i =>

Member: predictionErrorModel is an odd name (model?). I'd rename it to predictionAndError and possibly add an explicit type for clarity.

+      val pred = initialTree.predict(i.features)
+      val error = loss.computeError(i, pred)
+      (pred, error)
+    }
+    evaluationArray(0) = predictionErrorModel.values.mean()
+
+    // Avoid the model being copied across numIterations.
+    val broadcastTrees = sc.broadcast(trees)
+    val broadcastWeights = sc.broadcast(treeWeights)
+
+    (1 until numIterations).map {nTree =>

Member: Space after {.

+      predictionErrorModel = (remappedData zip predictionErrorModel) map {

Member: I would use mapPartitions; before iterating over the partition elements, extract the trees and weights from the broadcast variables. I believe that reduces overhead a little. Also, try to avoid infix notation, since non-Scala people may not be used to it: remappedData.zip(predictionErrorModel).

+        case (point, (pred, error)) => {
+          val newPred = pred + (

Member: Style: parenthesis on the next line.

+            broadcastTrees.value(nTree).predict(point.features) * broadcastWeights.value(nTree))
+          val newError = loss.computeError(point, newPred)
+          (newPred, newError)
+        }
+      }
+      evaluationArray(nTree) = predictionErrorModel.values.mean()
+    }
+    evaluationArray

Member: You might want to explicitly unpersist the broadcast values before returning. They will get unpersisted once their values go out of scope, but it might take longer.

+  }
+
 }

 object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] {
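
For reference, a minimal sketch of how evaluateEachIteration might look with the reviewers' suggestions folded in (inlined and locally captured first tree, explicit initial weight, predictionAndError rename with an explicit type, mapPartitions with the broadcast values extracted once per partition, no infix notation, and explicit unpersist); it keeps the PR's current computeError(datum, prediction) argument order and is an illustration, not the final committed code:

  def evaluateEachIteration(
      data: RDD[LabeledPoint],
      loss: Loss): Array[Double] = {

    val sc = data.sparkContext
    val remappedData = algo match {
      case Classification => data.map(x => new LabeledPoint((x.label * 2) - 1, x.features))
      case _ => data
    }

    val numIterations = trees.length
    val evaluationArray = Array.fill(numIterations)(0.0)

    // Capture the first tree and its weight locally so the closure does not
    // serialize the whole model; use the weight explicitly instead of assuming 1.0.
    val firstTree = trees(0)
    val firstTreeWeight = treeWeights(0)
    var predictionAndError: RDD[(Double, Double)] = remappedData.map { point =>
      val pred = firstTreeWeight * firstTree.predict(point.features)
      val error = loss.computeError(point, pred)
      (pred, error)
    }
    evaluationArray(0) = predictionAndError.values.mean()

    // Broadcast once so the ensemble is not shipped with every task.
    val broadcastTrees = sc.broadcast(trees)
    val broadcastWeights = sc.broadcast(treeWeights)

    (1 until numIterations).foreach { nTree =>
      predictionAndError = remappedData.zip(predictionAndError).mapPartitions { iter =>
        // Extract broadcast values once per partition to reduce overhead.
        val currentTree = broadcastTrees.value(nTree)
        val currentWeight = broadcastWeights.value(nTree)
        iter.map { case (point, (pred, _)) =>
          val newPred = pred + currentTree.predict(point.features) * currentWeight
          val newError = loss.computeError(point, newPred)
          (newPred, newError)
        }
      }
      evaluationArray(nTree) = predictionAndError.values.mean()
    }

    // Unpersist the broadcast values promptly instead of waiting for GC.
    broadcastTrees.unpersist()
    broadcastWeights.unpersist()
    evaluationArray
  }
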
mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
@@ -175,10 +175,11 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext {
       new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
     val gbtValidate = new GradientBoostedTrees(boostingStrategy)
       .runWithValidation(trainRdd, validateRdd)
-    assert(gbtValidate.numTrees !== numIterations)
+    val numTrees = gbtValidate.numTrees
+    assert(numTrees !== numIterations)

     // Test that it performs better on the validation dataset.
-    val gbt = GradientBoostedTrees.train(trainRdd, boostingStrategy)
+    val gbt = new GradientBoostedTrees(boostingStrategy).run(trainRdd)
     val (errorWithoutValidation, errorWithValidation) = {
       if (algo == Classification) {
         val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
@@ -188,6 +189,17 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext {
       }
     }
     assert(errorWithValidation <= errorWithoutValidation)
+
+    // Test that results from evaluateEachIteration comply with runWithValidation.
+    // Note that convergenceTol is set to 0.0
+    val evaluationArray = gbt.evaluateEachIteration(validateRdd, loss)
+    assert(evaluationArray.length === numIterations)
+    assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
+    var i = 1
+    while (i < numTrees) {
+      assert(evaluationArray(i) < evaluationArray(i - 1))

Member: Small issue: < should be <=.

+      i += 1
+    }
     }
   }
 }
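
A small sketch of the loop with the reviewer's fix applied (the strict < relaxed to <=, since consecutive iterations can leave the validation error unchanged when the tolerance is 0.0):

    var i = 1
    while (i < numTrees) {
      // Validation error should be non-increasing up to the early-stopping point.
      assert(evaluationArray(i) <= evaluationArray(i - 1))
      i += 1
    }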