Skip to content

Commit aaad77b

Browse files
committed
[SPARK-7127] Removed abstract class for broadcasting model, instead passing a prediction function as param to transform
1 parent 83904bb commit aaad77b

File tree

2 files changed

+12
-49
lines changed

2 files changed

+12
-49
lines changed

mllib/src/main/scala/org/apache/spark/ml/Predictor.scala

Lines changed: 7 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
package org.apache.spark.ml
1919

2020
import org.apache.spark.annotation.DeveloperApi
21-
import org.apache.spark.broadcast.Broadcast
2221
import org.apache.spark.ml.param._
2322
import org.apache.spark.ml.param.shared._
2423
import org.apache.spark.ml.util.SchemaUtils
@@ -175,55 +174,25 @@ abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType,
175174
* @return transformed dataset with [[predictionCol]] of type [[Double]]
176175
*/
177176
override def transform(dataset: DataFrame): DataFrame = {
177+
transformImpl(dataset, predict)
178+
}
179+
180+
protected def transformImpl(
181+
dataset: DataFrame,
182+
predictFunc: (FeaturesType) => Double): DataFrame = {
178183
transformSchema(dataset.schema, logging = true)
179184
if ($(predictionCol).nonEmpty) {
180-
transformImpl(dataset)
185+
dataset.withColumn($(predictionCol), callUDF(predictFunc, DoubleType, col($(featuresCol))))
181186
} else {
182187
this.logWarning(s"$uid: Predictor.transform() was called as NOOP" +
183188
" since no output columns were set.")
184189
dataset
185190
}
186191
}
187192

188-
protected def transformImpl(dataset: DataFrame): DataFrame = {
189-
dataset.withColumn($(predictionCol), callUDF(predict _, DoubleType, col($(featuresCol))))
190-
}
191-
192193
/**
193194
* Predict label for the given features.
194195
* This internal method is used to implement [[transform()]] and output [[predictionCol]].
195196
*/
196197
protected def predict(features: FeaturesType): Double
197198
}
198-
199-
200-
/**
201-
* :: DeveloperApi ::
202-
*
203-
* Abstraction for a model for prediction tasks that will broadcast the model used to predict.
204-
*
205-
* @tparam FeaturesType Type of features.
206-
* E.g., [[org.apache.spark.mllib.linalg.VectorUDT]] for vector features.
207-
* @tparam M Specialization of [[PredictionModel]]. If you subclass this type, use this type
208-
* parameter to specify the concrete type for the corresponding model.
209-
*/
210-
@DeveloperApi
211-
abstract class PredictionModelBroadcasting[
212-
FeaturesType, M <: PredictionModelBroadcasting[FeaturesType, M]
213-
]
214-
extends PredictionModel[FeaturesType, M] {
215-
216-
protected def transformImpl(dataset: DataFrame, bcastModel: Broadcast[M]): DataFrame = {
217-
218-
dataset.withColumn($(predictionCol),
219-
callUDF((features: FeaturesType) => predictWithBroadcastModel(features, bcastModel),
220-
DoubleType, col($(featuresCol)))
221-
)
222-
}
223-
224-
/**
225-
* Predict label for the given features using a broadcasted model.
226-
* This internal method is used to implement [[transform()]] and output [[predictionCol]].
227-
*/
228-
protected def predictWithBroadcastModel(features: FeaturesType, bcastModel: Broadcast[M]): Double
229-
}

mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@ package org.apache.spark.ml.classification
2020
import scala.collection.mutable
2121

2222
import org.apache.spark.annotation.AlphaComponent
23-
import org.apache.spark.broadcast.Broadcast
24-
import org.apache.spark.ml.{PredictionModelBroadcasting, Predictor}
23+
import org.apache.spark.ml.{PredictionModel, Predictor}
2524
import org.apache.spark.ml.param.ParamMap
2625
import org.apache.spark.ml.tree.{RandomForestParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel}
2726
import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
@@ -124,7 +123,7 @@ object RandomForestClassifier {
124123
final class RandomForestClassificationModel private[ml] (
125124
override val uid: String,
126125
private val _trees: Array[DecisionTreeClassificationModel])
127-
extends PredictionModelBroadcasting[Vector, RandomForestClassificationModel]
126+
extends PredictionModel[Vector, RandomForestClassificationModel]
128127
with TreeEnsembleModel with Serializable {
129128

130129
require(numTrees > 0, "RandomForestClassificationModel requires at least 1 tree.")
@@ -138,21 +137,16 @@ final class RandomForestClassificationModel private[ml] (
138137

139138
override def transform(dataset: DataFrame): DataFrame = {
140139
val bcastModel = dataset.sqlContext.sparkContext.broadcast(this)
141-
transformImpl(dataset, bcastModel)
140+
val predictFunc = (features: Vector) => predictImpl(features, () => bcastModel.value)
141+
transformImpl(dataset, predictFunc)
142142
}
143143

144144
override protected def predict(features: Vector): Double = {
145145
// TODO: When we add a generic Bagging class, handle transform there: SPARK-7128
146-
// Predict without using a broadcasted mode
146+
// Predict without using a broadcasted model
147147
predictImpl(features, () => this)
148148
}
149149

150-
override protected def predictWithBroadcastModel(features: Vector,
151-
bcastModel: Broadcast[RandomForestClassificationModel]): Double = {
152-
// Predict using the given broadcasted model
153-
predictImpl(features, () => bcastModel.value)
154-
}
155-
156150
protected def predictImpl(features: Vector, modelAccessor: () => TreeEnsembleModel): Double = {
157151
// Classifies using majority votes.
158152
// Ignore the weights since all are 1.0 for now.

0 commit comments

Comments (0)