Commit b74f5a5

Merge branch 'master' into SPARK-14994
2 parents: 045865d + 4607f6e

File tree: 38 files changed, +600 -384 lines

38 files changed

+600
-384
lines changed

core/src/main/scala/org/apache/spark/NewAccumulator.scala
8 additions, 2 deletions

@@ -22,6 +22,8 @@ import java.io.ObjectInputStream
 import java.util.concurrent.atomic.AtomicLong
 import javax.annotation.concurrent.GuardedBy
 
+import scala.collection.JavaConverters._
+
 import org.apache.spark.scheduler.AccumulableInfo
 import org.apache.spark.util.Utils
 
@@ -57,7 +59,7 @@ abstract class NewAccumulator[IN, OUT] extends Serializable {
    * registered before ues, or it will throw exception.
    */
  final def isRegistered: Boolean =
-    metadata != null && AccumulatorContext.originals.containsKey(metadata.id)
+    metadata != null && AccumulatorContext.get(metadata.id).isDefined
 
  private def assertMetadataNotNull(): Unit = {
    if (metadata == null) {
@@ -197,7 +199,7 @@ private[spark] object AccumulatorContext {
    * TODO: Don't use a global map; these should be tied to a SparkContext (SPARK-13051).
    */
  @GuardedBy("AccumulatorContext")
-  val originals = new java.util.HashMap[Long, jl.ref.WeakReference[NewAccumulator[_, _]]]
+  private val originals = new java.util.HashMap[Long, jl.ref.WeakReference[NewAccumulator[_, _]]]
 
  private[this] val nextId = new AtomicLong(0L)
 
@@ -207,6 +209,10 @@
    */
  def newId(): Long = nextId.getAndIncrement
 
+  def numAccums: Int = synchronized(originals.size)
+
+  def accumIds: Set[Long] = synchronized(originals.keySet().asScala.toSet)
+
  /**
   * Register an [[Accumulator]] created on the driver such that it can be used on the executors.
   *
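With `originals` now private, code outside `AccumulatorContext` goes through `get`, `numAccums`, and `accumIds` rather than reading the map directly. A minimal sketch of the new test-side idiom, assuming it lives in the `org.apache.spark` package (the object is `private[spark]`) and that the accumulator exposes its `id`:

package org.apache.spark

// Hypothetical helper illustrating the accessor-based checks that replace
// direct reads of the now-private `originals` map.
private[spark] object AccumulatorContextChecks {
  def assertRegistered(acc: NewAccumulator[_, _]): Unit = {
    val id = acc.id                                    // assumption: NewAccumulator exposes its id
    assert(AccumulatorContext.get(id).isDefined)       // was: originals.containsKey(id)
    assert(AccumulatorContext.accumIds.contains(id))   // was: originals.keySet().asScala
    assert(AccumulatorContext.numAccums > 0)           // was: originals.size
  }
}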

core/src/test/scala/org/apache/spark/AccumulatorSuite.scala
1 addition, 1 deletion

@@ -191,7 +191,7 @@ class AccumulatorSuite extends SparkFunSuite with Matchers with LocalSparkContex
     assert(ref.get.isEmpty)
 
     AccumulatorContext.remove(accId)
-    assert(!AccumulatorContext.originals.containsKey(accId))
+    assert(!AccumulatorContext.get(accId).isDefined)
   }
 
   test("get accum") {

core/src/test/scala/org/apache/spark/InternalAccumulatorSuite.scala
3 additions, 3 deletions

@@ -183,18 +183,18 @@ class InternalAccumulatorSuite extends SparkFunSuite with LocalSparkContext {
       private val myCleaner = new SaveAccumContextCleaner(this)
       override def cleaner: Option[ContextCleaner] = Some(myCleaner)
     }
-    assert(AccumulatorContext.originals.isEmpty)
+    assert(AccumulatorContext.numAccums == 0)
     sc.parallelize(1 to 100).map { i => (i, i) }.reduceByKey { _ + _ }.count()
     val numInternalAccums = TaskMetrics.empty.internalAccums.length
     // We ran 2 stages, so we should have 2 sets of internal accumulators, 1 for each stage
-    assert(AccumulatorContext.originals.size === numInternalAccums * 2)
+    assert(AccumulatorContext.numAccums === numInternalAccums * 2)
     val accumsRegistered = sc.cleaner match {
       case Some(cleaner: SaveAccumContextCleaner) => cleaner.accumsRegisteredForCleanup
       case _ => Seq.empty[Long]
     }
     // Make sure the same set of accumulators is registered for cleanup
     assert(accumsRegistered.size === numInternalAccums * 2)
-    assert(accumsRegistered.toSet === AccumulatorContext.originals.keySet().asScala)
+    assert(accumsRegistered.toSet === AccumulatorContext.accumIds)
   }
 
   /**

mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
67 additions, 3 deletions

@@ -17,14 +17,17 @@
 
 package org.apache.spark.ml.classification
 
+import org.apache.spark.SparkException
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.ml.{PredictionModel, Predictor, PredictorParams}
 import org.apache.spark.ml.param.shared.HasRawPredictionCol
-import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.ml.util.{MetadataUtils, SchemaUtils}
 import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
-import org.apache.spark.sql.{DataFrame, Dataset}
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
 
 /**
  * (private[spark]) Params for classification.
@@ -62,6 +65,67 @@ abstract class Classifier[
   def setRawPredictionCol(value: String): E = set(rawPredictionCol, value).asInstanceOf[E]
 
   // TODO: defaultEvaluator (follow-up PR)
+
+  /**
+   * Extract [[labelCol]] and [[featuresCol]] from the given dataset,
+   * and put it in an RDD with strong types.
+   *
+   * @param dataset  DataFrame with columns for labels ([[org.apache.spark.sql.types.NumericType]])
+   *                 and features ([[Vector]]). Labels are cast to [[DoubleType]].
+   * @param numClasses  Number of classes label can take. Labels must be integers in the range
+   *                    [0, numClasses).
+   * @throws SparkException  if any label is not an integer >= 0
+   */
+  protected def extractLabeledPoints(dataset: Dataset[_], numClasses: Int): RDD[LabeledPoint] = {
+    require(numClasses > 0, s"Classifier (in extractLabeledPoints) found numClasses =" +
+      s" $numClasses, but requires numClasses > 0.")
+    dataset.select(col($(labelCol)).cast(DoubleType), col($(featuresCol))).rdd.map {
+      case Row(label: Double, features: Vector) =>
+        require(label % 1 == 0 && label >= 0 && label < numClasses, s"Classifier was given" +
+          s" dataset with invalid label $label. Labels must be integers in range" +
+          s" [0, 1, ..., $numClasses), where numClasses=$numClasses.")
+        LabeledPoint(label, features)
+    }
+  }
+
+  /**
+   * Get the number of classes. This looks in column metadata first, and if that is missing,
+   * then this assumes classes are indexed 0,1,...,numClasses-1 and computes numClasses
+   * by finding the maximum label value.
+   *
+   * Label validation (ensuring all labels are integers >= 0) needs to be handled elsewhere,
+   * such as in [[extractLabeledPoints()]].
+   *
+   * @param dataset  Dataset which contains a column [[labelCol]]
+   * @param maxNumClasses  Maximum number of classes allowed when inferred from data. If numClasses
+   *                       is specified in the metadata, then maxNumClasses is ignored.
+   * @return  number of classes
+   * @throws IllegalArgumentException  if metadata does not specify numClasses, and the
+   *                                   actual numClasses exceeds maxNumClasses
+   */
+  protected def getNumClasses(dataset: Dataset[_], maxNumClasses: Int = 100): Int = {
+    MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match {
+      case Some(n: Int) => n
+      case None =>
+        // Get number of classes from dataset itself.
+        val maxLabelRow: Array[Row] = dataset.select(max($(labelCol))).take(1)
+        if (maxLabelRow.isEmpty) {
+          throw new SparkException("ML algorithm was given empty dataset.")
+        }
+        val maxDoubleLabel: Double = maxLabelRow.head.getDouble(0)
+        require((maxDoubleLabel + 1).isValidInt, s"Classifier found max label value =" +
+          s" $maxDoubleLabel but requires integers in range [0, ... ${Int.MaxValue})")
+        val numClasses = maxDoubleLabel.toInt + 1
+        require(numClasses <= maxNumClasses, s"Classifier inferred $numClasses from label values" +
+          s" in column $labelCol, but this exceeded the max numClasses ($maxNumClasses) allowed" +
+          s" to be inferred from values. To avoid this error for labels with > $maxNumClasses" +
+          s" classes, specify numClasses explicitly in the metadata; this can be done by applying" +
+          s" StringIndexer to the label column.")
+        logInfo(this.getClass.getCanonicalName + s" inferred $numClasses classes for" +
+          s" labelCol=$labelCol since numClasses was not specified in the column metadata.")
+        numClasses
+    }
+  }
 }
 
 /**
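With both helpers in the base class, a concrete classifier's train method can delegate label validation and class-count inference entirely to Classifier. A minimal sketch of the intended call pattern, assuming a hypothetical subclass (the model type and the final fitting step are placeholders, not part of this patch):

  override protected def train(dataset: Dataset[_]): MyClassificationModel = {
    // Read numClasses from label-column metadata if present; otherwise infer it from
    // max(label), capped by the default maxNumClasses of 100.
    val numClasses: Int = getNumClasses(dataset)
    // Cast labels to Double, validate that they lie in [0, numClasses), and build the RDD.
    val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset, numClasses)
    // ... fit the underlying algorithm on oldDataset ...
    ???
  }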

mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
2 additions, 8 deletions

@@ -85,14 +85,8 @@ class DecisionTreeClassifier @Since("1.4.0") (
   override protected def train(dataset: Dataset[_]): DecisionTreeClassificationModel = {
     val categoricalFeatures: Map[Int, Int] =
       MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol)))
-    val numClasses: Int = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match {
-      case Some(n: Int) => n
-      case None => throw new IllegalArgumentException("DecisionTreeClassifier was given input" +
-        s" with invalid label column ${$(labelCol)}, without the number of classes" +
-        " specified. See StringIndexer.")
-      // TODO: Automatically index labels: SPARK-7126
-    }
-    val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
+    val numClasses: Int = getNumClasses(dataset)
+    val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset, numClasses)
     val strategy = getOldStrategy(categoricalFeatures, numClasses)
     val trees = RandomForest.run(oldDataset, strategy, numTrees = 1, featureSubsetStrategy = "all",
       seed = $(seed), parentUID = Some(uid))

mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
14 additions, 11 deletions

@@ -35,8 +35,9 @@ import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
 import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel => OldGBTModel}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Dataset}
+import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.DoubleType
 
 /**
  * :: Experimental ::
@@ -126,16 +127,16 @@ class GBTClassifier @Since("1.4.0") (
   override protected def train(dataset: Dataset[_]): GBTClassificationModel = {
     val categoricalFeatures: Map[Int, Int] =
       MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol)))
-    val numClasses: Int = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match {
-      case Some(n: Int) => n
-      case None => throw new IllegalArgumentException("GBTClassifier was given input" +
-        s" with invalid label column ${$(labelCol)}, without the number of classes" +
-        " specified. See StringIndexer.")
-      // TODO: Automatically index labels: SPARK-7126
-    }
-    require(numClasses == 2,
-      s"GBTClassifier only supports binary classification but was given numClasses = $numClasses")
-    val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
+    // We copy and modify this from Classifier.extractLabeledPoints since GBT only supports
+    // 2 classes now. This lets us provide a more precise error message.
+    val oldDataset: RDD[LabeledPoint] =
+      dataset.select(col($(labelCol)).cast(DoubleType), col($(featuresCol))).rdd.map {
+        case Row(label: Double, features: Vector) =>
+          require(label == 0 || label == 1, s"GBTClassifier was given" +
+            s" dataset with invalid label $label. Labels must be in {0,1}; note that" +
+            s" GBTClassifier currently only supports binary classification.")
+          LabeledPoint(label, features)
+      }
     val numFeatures = oldDataset.first().features.size
     val boostingStrategy = super.getOldBoostingStrategy(categoricalFeatures, OldAlgo.Classification)
     val (baseLearners, learnerWeights) = GradientBoostedTrees.run(oldDataset, boostingStrategy,
@@ -165,6 +166,7 @@ object GBTClassifier extends DefaultParamsReadable[GBTClassifier] {
  * model for classification.
  * It supports binary labels, as well as both continuous and categorical features.
  * Note: Multiclass labels are not currently supported.
+ *
  * @param _trees  Decision trees in the ensemble.
  * @param _treeWeights  Weights for the decision trees in the ensemble.
  */
@@ -185,6 +187,7 @@ class GBTClassificationModel private[ml](
 
 /**
  * Construct a GBTClassificationModel
+ *
  * @param _trees  Decision trees in the ensemble.
  * @param _treeWeights  Weights for the decision trees in the ensemble.
 */
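The inlined check changes which message a user sees when GBT is handed a non-binary label. A small test-style sketch of the behavior, modeled on the ClassifierSuite tests below (the data and suite context are made up; the require failure inside the map surfaces as a SparkException when the job runs):

  val bad = sqlContext.createDataFrame(Seq(
    LabeledPoint(0.0, Vectors.dense(1.0)),
    LabeledPoint(2.0, Vectors.dense(2.0))))
  val e = intercept[SparkException] {
    new GBTClassifier().setMaxIter(1).fit(bad)
  }
  // The message now names GBTClassifier and {0,1} rather than the generic
  // "invalid label column ... without the number of classes specified" error.
  assert(e.getMessage.contains("Labels must be in {0,1}"))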

mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
2 additions, 8 deletions

@@ -101,14 +101,8 @@ class RandomForestClassifier @Since("1.4.0") (
   override protected def train(dataset: Dataset[_]): RandomForestClassificationModel = {
     val categoricalFeatures: Map[Int, Int] =
       MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol)))
-    val numClasses: Int = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match {
-      case Some(n: Int) => n
-      case None => throw new IllegalArgumentException("RandomForestClassifier was given input" +
-        s" with invalid label column ${$(labelCol)}, without the number of classes" +
-        " specified. See StringIndexer.")
-      // TODO: Automatically index labels: SPARK-7126
-    }
-    val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
+    val numClasses: Int = getNumClasses(dataset)
+    val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset, numClasses)
     val strategy =
       super.getOldStrategy(categoricalFeatures, numClasses, OldAlgo.Classification, getOldImpurity)
     val trees =

mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
4 additions, 0 deletions

@@ -327,5 +327,9 @@ object FPGrowth {
     def javaItems: java.util.List[Item] = {
       items.toList.asJava
     }
+
+    override def toString: String = {
+      s"${items.mkString("{", ",", "}")}: $freq"
+    }
   }
 }
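The new toString gives FreqItemset a readable printed form. A small usage sketch (the items and frequency below are made-up sample values):

  import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset

  val itemset = new FreqItemset[String](Array("a", "b"), 3L)
  // With the override above this prints "{a,b}: 3" instead of the default
  // object hash representation.
  println(itemset)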

mllib/src/test/scala/org/apache/spark/ml/classification/ClassifierSuite.scala
108 additions, 0 deletions

@@ -17,6 +17,86 @@
 
 package org.apache.spark.ml.classification
 
+import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.ml.classification.ClassifierSuite.MockClassifier
+import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{DataFrame, Dataset}
+
+class ClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  test("extractLabeledPoints") {
+    def getTestData(labels: Seq[Double]): DataFrame = {
+      val data = labels.map { label: Double => LabeledPoint(label, Vectors.dense(0.0)) }
+      sqlContext.createDataFrame(data)
+    }
+
+    val c = new MockClassifier
+    // Valid dataset
+    val df0 = getTestData(Seq(0.0, 2.0, 1.0, 5.0))
+    c.extractLabeledPoints(df0, 6).count()
+    // Invalid datasets
+    val df1 = getTestData(Seq(0.0, -2.0, 1.0, 5.0))
+    withClue("Classifier should fail if label is negative") {
+      val e: SparkException = intercept[SparkException] {
+        c.extractLabeledPoints(df1, 6).count()
+      }
+      assert(e.getMessage.contains("given dataset with invalid label"))
+    }
+    val df2 = getTestData(Seq(0.0, 2.1, 1.0, 5.0))
+    withClue("Classifier should fail if label is not an integer") {
+      val e: SparkException = intercept[SparkException] {
+        c.extractLabeledPoints(df2, 6).count()
+      }
+      assert(e.getMessage.contains("given dataset with invalid label"))
+    }
+    // extractLabeledPoints with numClasses specified
+    withClue("Classifier should fail if label is >= numClasses") {
+      val e: SparkException = intercept[SparkException] {
+        c.extractLabeledPoints(df0, numClasses = 5).count()
+      }
+      assert(e.getMessage.contains("given dataset with invalid label"))
+    }
+    withClue("Classifier.extractLabeledPoints should fail if numClasses <= 0") {
+      val e: IllegalArgumentException = intercept[IllegalArgumentException] {
+        c.extractLabeledPoints(df0, numClasses = 0).count()
+      }
+      assert(e.getMessage.contains("but requires numClasses > 0"))
+    }
+  }
+
+  test("getNumClasses") {
+    def getTestData(labels: Seq[Double]): DataFrame = {
+      val data = labels.map { label: Double => LabeledPoint(label, Vectors.dense(0.0)) }
+      sqlContext.createDataFrame(data)
+    }
+
+    val c = new MockClassifier
+    // Valid dataset
+    val df0 = getTestData(Seq(0.0, 2.0, 1.0, 5.0))
+    assert(c.getNumClasses(df0) === 6)
+    // Invalid datasets
+    val df1 = getTestData(Seq(0.0, 2.0, 1.0, 5.1))
+    withClue("getNumClasses should fail if label is max label not an integer") {
+      val e: IllegalArgumentException = intercept[IllegalArgumentException] {
+        c.getNumClasses(df1)
+      }
+      assert(e.getMessage.contains("requires integers in range"))
+    }
+    val df2 = getTestData(Seq(0.0, 2.0, 1.0, Int.MaxValue.toDouble))
+    withClue("getNumClasses should fail if label is max label is >= Int.MaxValue") {
+      val e: IllegalArgumentException = intercept[IllegalArgumentException] {
+        c.getNumClasses(df2)
+      }
+      assert(e.getMessage.contains("requires integers in range"))
+    }
+  }
+}
+
 object ClassifierSuite {
 
   /**
@@ -29,4 +109,32 @@ object ClassifierSuite {
     "rawPredictionCol" -> "myRawPrediction"
   )
 
+  class MockClassifier(override val uid: String)
+    extends Classifier[Vector, MockClassifier, MockClassificationModel] {
+
+    def this() = this(Identifiable.randomUID("mockclassifier"))
+
+    override def copy(extra: ParamMap): MockClassifier = throw new NotImplementedError()
+
+    override def train(dataset: Dataset[_]): MockClassificationModel =
+      throw new NotImplementedError()
+
+    // Make methods public
+    override def extractLabeledPoints(dataset: Dataset[_], numClasses: Int): RDD[LabeledPoint] =
+      super.extractLabeledPoints(dataset, numClasses)
+    def getNumClasses(dataset: Dataset[_]): Int = super.getNumClasses(dataset)
+  }
+
+  class MockClassificationModel(override val uid: String)
+    extends ClassificationModel[Vector, MockClassificationModel] {
+
+    def this() = this(Identifiable.randomUID("mockclassificationmodel"))
+
+    protected def predictRaw(features: Vector): Vector = throw new NotImplementedError()
+
+    override def copy(extra: ParamMap): MockClassificationModel = throw new NotImplementedError()
+
+    override def numClasses: Int = throw new NotImplementedError()
+  }
+
 }

mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
6 additions, 0 deletions

@@ -342,6 +342,12 @@ class DecisionTreeClassifierSuite
     }
   }
 
+  test("Fitting without numClasses in metadata") {
+    val df: DataFrame = sqlContext.createDataFrame(TreeTests.featureImportanceData(sc))
+    val dt = new DecisionTreeClassifier().setMaxDepth(1)
+    dt.fit(df)
+  }
+
   /////////////////////////////////////////////////////////////////////////////
   // Tests of model save/load
   /////////////////////////////////////////////////////////////////////////////
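This new test exercises the inference path added in Classifier.getNumClasses: the label column carries no numClasses metadata, so the count is derived from the maximum label. For contrast, a minimal sketch of how a caller could attach numClasses metadata explicitly so that inference is skipped, reusing the df from the test above (column names and the class count of 3 are illustrative):

  import org.apache.spark.ml.attribute.NominalAttribute
  import org.apache.spark.sql.functions.col

  // Attach ML attribute metadata declaring 3 classes on the label column.
  val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(3).toMetadata()
  val dfWithMeta = df.select(col("features"), col("label").as("label", labelMeta))

  // getNumClasses would now read 3 from the metadata instead of scanning the labels.
  new DecisionTreeClassifier().setMaxDepth(1).fit(dfWithMeta)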
