Skip to content

Commit 9afad56

Browse files
committed
[SPARK-7127] Simplified calls by overriding transformImpl and using a broadcasted model in callUDF to make predictions
1 parent 1f34be4 commit 9afad56

File tree

5 files changed

+34
-50
lines changed

5 files changed

+34
-50
lines changed

mllib/src/main/scala/org/apache/spark/ml/Predictor.scala

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -174,22 +174,20 @@ abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType,
174174
* @return transformed dataset with [[predictionCol]] of type [[Double]]
175175
*/
176176
override def transform(dataset: DataFrame): DataFrame = {
177-
transformImpl(dataset, predict)
178-
}
179-
180-
protected def transformImpl(
181-
dataset: DataFrame,
182-
predictFunc: (FeaturesType) => Double): DataFrame = {
183177
transformSchema(dataset.schema, logging = true)
184178
if ($(predictionCol).nonEmpty) {
185-
dataset.withColumn($(predictionCol), callUDF(predictFunc, DoubleType, col($(featuresCol))))
179+
transformImpl(dataset)
186180
} else {
187181
this.logWarning(s"$uid: Predictor.transform() was called as NOOP" +
188182
" since no output columns were set.")
189183
dataset
190184
}
191185
}
192186

187+
protected def transformImpl(dataset: DataFrame): DataFrame = {
188+
dataset.withColumn($(predictionCol), callUDF(predict _, DoubleType, col($(featuresCol))))
189+
}
190+
193191
/**
194192
* Predict label for the given features.
195193
* This internal method is used to implement [[transform()]] and output [[predictionCol]].

mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ import org.apache.spark.mllib.tree.loss.{LogLoss => OldLogLoss, Loss => OldLoss}
3434
import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel => OldGBTModel}
3535
import org.apache.spark.rdd.RDD
3636
import org.apache.spark.sql.DataFrame
37+
import org.apache.spark.sql.functions._
38+
import org.apache.spark.sql.types.DoubleType
3739

3840
/**
3941
* :: AlphaComponent ::
@@ -176,23 +178,17 @@ final class GBTClassificationModel(
176178

177179
override def treeWeights: Array[Double] = _treeWeights
178180

179-
override def transform(dataset: DataFrame): DataFrame = {
181+
override protected def transformImpl(dataset: DataFrame): DataFrame = {
180182
val bcastModel = dataset.sqlContext.sparkContext.broadcast(this)
181-
val predictFunc = (features: Vector) => predictImpl(features, () => bcastModel.value)
182-
transformImpl(dataset, predictFunc)
183+
dataset.withColumn($(predictionCol), callUDF(bcastModel.value.predict _, DoubleType,
184+
col($(featuresCol))))
183185
}
184186

185187
override protected def predict(features: Vector): Double = {
186-
// TODO: When we add a generic Bagging class, handle transform there: SPARK-7128
187-
// Predict without using a broadcasted model
188-
predictImpl(features, () => this)
189-
}
190-
191-
protected def predictImpl(features: Vector, modelAccesor: () => TreeEnsembleModel): Double = {
188+
// TODO: When we add a generic Boosting class, handle transform there? SPARK-7129
192189
// Classifies by thresholding sum of weighted tree predictions
193-
val treePredictions = modelAccesor().trees.map(_.rootNode.predict(features))
194-
val prediction = blas.ddot(modelAccesor().numTrees, treePredictions, 1,
195-
modelAccesor().treeWeights, 1)
190+
val treePredictions = _trees.map(_.rootNode.predict(features))
191+
val prediction = blas.ddot(numTrees, treePredictions, 1, _treeWeights, 1)
196192
if (prediction > 0.0) 1.0 else 0.0
197193
}
198194

mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
3131
import org.apache.spark.mllib.tree.model.{RandomForestModel => OldRandomForestModel}
3232
import org.apache.spark.rdd.RDD
3333
import org.apache.spark.sql.DataFrame
34+
import org.apache.spark.sql.functions._
35+
import org.apache.spark.sql.types.DoubleType
3436

3537
/**
3638
* :: AlphaComponent ::
@@ -134,23 +136,18 @@ final class RandomForestClassificationModel private[ml] (
134136

135137
override def treeWeights: Array[Double] = _treeWeights
136138

137-
override def transform(dataset: DataFrame): DataFrame = {
139+
override protected def transformImpl(dataset: DataFrame): DataFrame = {
138140
val bcastModel = dataset.sqlContext.sparkContext.broadcast(this)
139-
val predictFunc = (features: Vector) => predictImpl(features, () => bcastModel.value)
140-
transformImpl(dataset, predictFunc)
141+
dataset.withColumn($(predictionCol), callUDF(bcastModel.value.predict _, DoubleType,
142+
col($(featuresCol))))
141143
}
142144

143145
override protected def predict(features: Vector): Double = {
144146
// TODO: When we add a generic Bagging class, handle transform there: SPARK-7128
145-
// Predict without using a broadcasted model
146-
predictImpl(features, () => this)
147-
}
148-
149-
protected def predictImpl(features: Vector, modelAccesor: () => TreeEnsembleModel): Double = {
150147
// Classifies using majority votes.
151148
// Ignore the weights since all are 1.0 for now.
152149
val votes = mutable.Map.empty[Int, Double]
153-
modelAccesor().trees.view.foreach { tree =>
150+
_trees.view.foreach { tree =>
154151
val prediction = tree.rootNode.predict(features).toInt
155152
votes(prediction) = votes.getOrElse(prediction, 0.0) + 1.0 // 1.0 = weight
156153
}

mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ import org.apache.spark.mllib.tree.loss.{AbsoluteError => OldAbsoluteError, Loss
3333
import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel => OldGBTModel}
3434
import org.apache.spark.rdd.RDD
3535
import org.apache.spark.sql.DataFrame
36+
import org.apache.spark.sql.functions._
37+
import org.apache.spark.sql.types.DoubleType
3638

3739
/**
3840
* :: AlphaComponent ::
@@ -165,23 +167,17 @@ final class GBTRegressionModel(
165167

166168
override def treeWeights: Array[Double] = _treeWeights
167169

168-
override def transform(dataset: DataFrame): DataFrame = {
170+
override protected def transformImpl(dataset: DataFrame): DataFrame = {
169171
val bcastModel = dataset.sqlContext.sparkContext.broadcast(this)
170-
val predictFunc = (features: Vector) => predictImpl(features, () => bcastModel.value)
171-
transformImpl(dataset, predictFunc)
172+
dataset.withColumn($(predictionCol), callUDF(bcastModel.value.predict _, DoubleType,
173+
col($(featuresCol))))
172174
}
173175

174176
override protected def predict(features: Vector): Double = {
175-
// TODO: When we add a generic Bagging class, handle transform there: SPARK-7128
176-
// Predict without using a broadcasted model
177-
predictImpl(features, () => this)
178-
}
179-
180-
protected def predictImpl(features: Vector, modelAccesor: () => TreeEnsembleModel): Double = {
177+
// TODO: When we add a generic Boosting class, handle transform there? SPARK-7129
181178
// Classifies by thresholding sum of weighted tree predictions
182-
val treePredictions = modelAccesor().trees.map(_.rootNode.predict(features))
183-
val prediction = blas.ddot(modelAccesor().numTrees, treePredictions, 1,
184-
modelAccesor().treeWeights, 1)
179+
val treePredictions = _trees.map(_.rootNode.predict(features))
180+
val prediction = blas.ddot(numTrees, treePredictions, 1, _treeWeights, 1)
185181
if (prediction > 0.0) 1.0 else 0.0
186182
}
187183

mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
2929
import org.apache.spark.mllib.tree.model.{RandomForestModel => OldRandomForestModel}
3030
import org.apache.spark.rdd.RDD
3131
import org.apache.spark.sql.DataFrame
32+
import org.apache.spark.sql.functions._
33+
import org.apache.spark.sql.types.DoubleType
3234

3335
/**
3436
* :: AlphaComponent ::
@@ -121,22 +123,17 @@ final class RandomForestRegressionModel private[ml] (
121123

122124
override def treeWeights: Array[Double] = _treeWeights
123125

124-
override def transform(dataset: DataFrame): DataFrame = {
126+
override protected def transformImpl(dataset: DataFrame): DataFrame = {
125127
val bcastModel = dataset.sqlContext.sparkContext.broadcast(this)
126-
val predictFunc = (features: Vector) => predictImpl(features, () => bcastModel.value)
127-
transformImpl(dataset, predictFunc)
128+
dataset.withColumn($(predictionCol), callUDF(bcastModel.value.predict _, DoubleType,
129+
col($(featuresCol))))
128130
}
129131

130132
override protected def predict(features: Vector): Double = {
131-
// TODO: When we add a generic Bagging class, handle transform there: SPARK-7128
132-
// Predict without using a broadcasted model
133-
predictImpl(features, () => this)
134-
}
135-
136-
protected def predictImpl(features: Vector, modelAccesor: () => TreeEnsembleModel): Double = {
133+
// TODO: When we add a generic Bagging class, handle transform there. SPARK-7128
137134
// Predict average of tree predictions.
138135
// Ignore the weights since all are 1.0 for now.
139-
modelAccesor().trees.map(_.rootNode.predict(features)).sum / modelAccesor().numTrees
136+
_trees.map(_.rootNode.predict(features)).sum / numTrees
140137
}
141138

142139
override def copy(extra: ParamMap): RandomForestRegressionModel = {

0 commit comments

Comments
 (0)