From ecddf1597bde986cf216e632d8cf3075875a6918 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 16 Nov 2016 20:35:20 -0800
Subject: [PATCH 01/12] Remove deprecated methods for ML.

---
 .../classification/LogisticRegression.scala   |  8 +++--
 .../RandomForestClassifier.scala              |  9 ------
 .../spark/ml/feature/ChiSqSelector.scala      |  7 -----
 .../org/apache/spark/ml/param/params.scala    | 15 ----------
 .../ml/regression/LinearRegression.scala      |  3 --
 .../ml/regression/RandomForestRegressor.scala |  8 -----
 .../org/apache/spark/ml/tree/treeParams.scala | 13 ++++----
 python/pyspark/ml/util.py                     | 30 +++++++++++++++++--
 8 files changed, 40 insertions(+), 53 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 18b9b3043db8..fca7f6a26284 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -40,7 +40,7 @@ import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions.{col, lit}
-import org.apache.spark.sql.types.DoubleType
+import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.VersionUtils
@@ -176,8 +176,12 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
     }
   }

-  override def validateParams(): Unit = {
+  override protected def validateAndTransformSchema(
+      schema: StructType,
+      fitting: Boolean,
+      featuresDataType: DataType): StructType = {
     checkThresholdConsistency()
+    super.validateAndTransformSchema(schema, fitting, featuresDataType)
   }
 }
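Note: the threshold/thresholds consistency check that used to live in validateParams() now runs inside validateAndTransformSchema, so inconsistent values surface when fit() validates the schema. A minimal sketch of the user-visible behavior (assuming Spark 2.1; `training` is a placeholder (label, features) DataFrame):

```scala
import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression()

// thresholds = Array(0.3, 0.7) implies threshold = 0.7, so pairing it with
// threshold = 0.25 in the same ParamMap is inconsistent; after this patch the
// mismatch throws IllegalArgumentException during fit()'s schema validation:
val model = lr.fit(training, lr.thresholds -> Array(0.3, 0.7), lr.threshold -> 0.25)
```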
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 52345b0626c4..3c784d5555a4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -221,15 +221,6 @@ class RandomForestClassificationModel private[ml] (
     }
   }

-  /**
-   * Number of trees in ensemble
-   *
-   * @deprecated Use [[getNumTrees]] instead. This method will be removed in 2.1.0
-   */
-  // TODO: Once this is removed, then this class can inherit from RandomForestClassifierParams
-  @deprecated("Use getNumTrees instead. This method will be removed in 2.1.0.", "2.0.0")
-  val numTrees: Int = trees.length
-
   @Since("1.4.0")
   override def copy(extra: ParamMap): RandomForestClassificationModel = {
     copyValues(new RandomForestClassificationModel(uid, _trees, numFeatures, numClasses), extra)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 653fa41124f8..7cd0f159c6be 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -216,13 +216,6 @@ final class ChiSqSelectorModel private[ml] (
   @Since("1.6.0")
   def setOutputCol(value: String): this.type = set(outputCol, value)

-  /**
-   * @group setParam
-   */
-  @Since("1.6.0")
-  @deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0")
-  def setLabelCol(value: String): this.type = set(labelCol, value)
-
   @Since("2.0.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
     val transformedSchema = transformSchema(dataset.schema, logging = true)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 9245931b27ca..993ac8cb2126 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -546,21 +546,6 @@ trait Params extends Identifiable with Serializable {
       .map(m => m.invoke(this).asInstanceOf[Param[_]])
   }

-  /**
-   * Validates parameter values stored internally.
-   * Raise an exception if any parameter value is invalid.
-   *
-   * This only needs to check for interactions between parameters.
-   * Parameter value checks which do not depend on other parameters are handled by
-   * `Param.validate()`. This method does not handle input/output column parameters;
-   * those are checked during schema validation.
-   * @deprecated Will be removed in 2.1.0. All the checks should be merged into transformSchema
-   */
-  @deprecated("Will be removed in 2.1.0. Checks should be merged into transformSchema.", "2.0.0")
-  def validateParams(): Unit = {
-    // Do nothing by default. Override to handle Param interactions.
-  }
-
   /**
    * Explains a param.
    * @param param input param, must belong to this instance.
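Note: the removals above are breaking for callers, but migration is mechanical. A sketch (assuming Spark 2.1; `training` is a placeholder DataFrame):

```scala
import org.apache.spark.ml.classification.RandomForestClassifier

val rf = new RandomForestClassifier().setNumTrees(30)
val model = rf.fit(training)

// Before (Spark <= 2.0): model.numTrees  -- the deprecated val removed above
// After  (Spark >= 2.1):
val n: Int = model.getNumTrees  // 30
```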
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 9639b07496c1..f8d754ddb3e3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -605,9 +605,6 @@ class LinearRegressionSummary private[regression] (
     private val privateModel: LinearRegressionModel,
     private val diagInvAtWA: Array[Double]) extends Serializable {

-  @deprecated("The model field is deprecated and will be removed in 2.1.0.", "2.0.0")
-  val model: LinearRegressionModel = privateModel
-
   @transient private val metrics = new RegressionMetrics(
     predictions
       .select(col(predictionCol), col(labelCol).cast(DoubleType))

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index 0ad00aa6f928..b123bc9360b9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -181,14 +181,6 @@ class RandomForestRegressionModel private[ml] (
     _trees.map(_.rootNode.predictImpl(features).prediction).sum / getNumTrees
   }

-  /**
-   * Number of trees in ensemble
-   * @deprecated Use [[getNumTrees]] instead. This method will be removed in 2.1.0
-   */
-  // TODO: Once this is removed, then this class can inherit from RandomForestRegressorParams
-  @deprecated("Use getNumTrees instead. This method will be removed in 2.1.0.", "2.0.0")
-  val numTrees: Int = trees.length
-
   @Since("1.4.0")
   override def copy(extra: ParamMap): RandomForestRegressionModel = {
     copyValues(new RandomForestRegressionModel(uid, _trees, numFeatures), extra).setParent(parent)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 57c7e44e9760..30500109ca66 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -435,7 +435,12 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
   setDefault(maxIter -> 20, stepSize -> 0.1)

   /** @group setParam */
-  def setMaxIter(value: Int): this.type = set(maxIter, value)
+  def setMaxIter(value: Int): this.type = {
+    require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)(
+      value), "GBT parameter stepSize should be in interval (0, 1], " +
+      s"but it was given invalid value $value.")
+    set(maxIter, value)
+  }

   /**
    * Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each
@@ -445,12 +450,6 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
    */
   def setStepSize(value: Double): this.type = set(stepSize, value)

-  override def validateParams(): Unit = {
-    require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)(
-      getStepSize), "GBT parameter stepSize should be in interval (0, 1], " +
-      s"but it was given invalid value $getStepSize.")
-  }
-
   /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */
   private[ml] def getOldBoostingStrategy(
       categoricalFeatures: Map[Int, Int],
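Note: callers of the removed LinearRegressionSummary.model field navigate from the model to its summary instead of the reverse. A sketch (assuming Spark 2.1; `training` is a placeholder DataFrame):

```scala
import org.apache.spark.ml.regression.LinearRegression

val lrModel = new LinearRegression().fit(training)
val summary = lrModel.summary   // training summary attached to the model

// Before (<= 2.0): summary.model returned the parent model -- removed above.
// After: keep your own model reference.
println(summary.rootMeanSquaredError)
println(lrModel.coefficients)
```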
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 7d39c3012235..42f6df9ca280 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -81,6 +81,10 @@ def context(self, sqlContext):
         """Sets the SQL context to use for saving."""
         raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))

+    def session(self, sparkSession):
+        """Sets the Spark Session to use for saving."""
+        raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))
+

 @inherit_doc
 class JavaMLWriter(MLWriter):
@@ -105,10 +109,19 @@ def overwrite(self):
         return self

     def context(self, sqlContext):
-        """Sets the SQL context to use for saving."""
+        """
+        Sets the SQL context to use for saving.
+        .. note:: Deprecated in 2.1, use session instead.
+        """
+        warnings.warn("Deprecated in 2.1, use session instead.")
         self._jwrite.context(sqlContext._ssql_ctx)
         return self

+    def session(self, sparkSession):
+        """Sets the Spark Session to use for saving."""
+        self._jwrite.session(sparkSession._jsparkSession)
+        return self
+

 @inherit_doc
 class MLWritable(object):
@@ -158,6 +171,10 @@ def context(self, sqlContext):
         """Sets the SQL context to use for loading."""
         raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))

+    def session(self, sparkSession):
+        """Sets the Spark Session to use for loading."""
+        raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))
+

 @inherit_doc
 class JavaMLReader(MLReader):
@@ -180,10 +197,19 @@ def load(self, path):
         return self._clazz._from_java(java_obj)

     def context(self, sqlContext):
-        """Sets the SQL context to use for loading."""
+        """
+        Sets the SQL context to use for loading.
+        .. note:: Deprecated in 2.1, use session instead.
+        """
+        warnings.warn("Deprecated in 2.1, use session instead.")
         self._jread.context(sqlContext._ssql_ctx)
         return self

+    def session(self, sparkSession):
+        """Sets the Spark Session to use for loading."""
+        self._jread.session(sparkSession._jsparkSession)
+        return self
+
     @classmethod
     def _java_loader_class(cls, clazz):
         """
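Note: the new Python session() methods mirror the Scala writer/reader API. Scala-side usage for comparison (a sketch assuming Spark 2.1; `spark`, `model`, and the save path are placeholders, with `model` being any MLWritable):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("ml-io-example").getOrCreate()

// Preferred: attach the SparkSession to the writer.
model.write.session(spark).overwrite().save("/tmp/example-model")

// Deprecated path, kept for compatibility until 2.2:
// model.write.context(spark.sqlContext).save("/tmp/example-model")
```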
From 9a8c92691e7ec0b9d37eed0cf6f9dbcc4d4d622f Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 16 Nov 2016 23:47:30 -0800
Subject: [PATCH 02/12] Add mima excludes.

---
 project/MimaExcludes.scala | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 12f7ed202b9d..6a58d2006171 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -867,6 +867,22 @@ object MimaExcludes {
       // [SPARK-12221] Add CPU time to metrics
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.TaskMetrics.this"),
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.TaskMetricDistributions.this")
+    ) ++ Seq(
+      // [SPARK-18481] ML 2.1 QA: Remove deprecated methods for ML
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.PipelineStage.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.param.JavaParams.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.param.Params.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.GBTClassificationModel.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegression.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.GBTClassifier.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionModel.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.numTrees"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.feature.ChiSqSelectorModel.setLabelCol"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.evaluation.Evaluator.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.GBTRegressor.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.GBTRegressionModel.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.LinearRegressionSummary.model"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees")
     )
   }
From 13988b453569fc8e814952f0ef7ce20e634a724a Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 18 Nov 2016 03:01:27 -0800
Subject: [PATCH 03/12] Fix wrong location of GBT step size check.

---
 .../org/apache/spark/ml/tree/treeParams.scala | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 30500109ca66..c1dfa50ed3ce 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -435,12 +435,8 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
   setDefault(maxIter -> 20, stepSize -> 0.1)

   /** @group setParam */
-  def setMaxIter(value: Int): this.type = {
-    require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)(
-      value), "GBT parameter stepSize should be in interval (0, 1], " +
-      s"but it was given invalid value $value.")
-    set(maxIter, value)
-  }
+  def setMaxIter(value: Int): this.type = set(maxIter, value)
+
   /**
    * Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each
@@ -448,7 +444,12 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
    * (default = 0.1)
    * @group setParam
    */
-  def setStepSize(value: Double): this.type = set(stepSize, value)
+  def setStepSize(value: Double): this.type = {
+    require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)(
+      value), "GBT parameter stepSize should be in interval (0, 1], " +
+      s"but it was given invalid value $value.")
+    set(stepSize, value)
+  }

   /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */
   private[ml] def getOldBoostingStrategy(
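Note: with the check relocated into setStepSize, an out-of-range value now fails at the setter call itself rather than in a later validateParams() pass — exactly what the test added in PATCH 08 asserts. A sketch (assuming Spark 2.1):

```scala
import org.apache.spark.ml.classification.GBTClassifier

val gbt = new GBTClassifier()
gbt.setStepSize(0.05) // fine: inside (0, 1]
gbt.setStepSize(10)   // throws IllegalArgumentException immediately, before any fit()
```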
From 7beb5688e83a49c9e0f2270fe413c2f2fff39ecd Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Sat, 19 Nov 2016 07:03:20 -0800
Subject: [PATCH 04/12] Remove RandomForestClassificationModelParams and RandomForestRegressionModelParams.

---
 .../ml/classification/GBTClassifier.scala     |  5 ++
 .../RandomForestClassifier.scala              |  2 +-
 .../spark/ml/regression/GBTRegressor.scala    |  5 ++
 .../ml/regression/RandomForestRegressor.scala |  2 +-
 .../org/apache/spark/ml/tree/treeModels.scala |  5 --
 .../org/apache/spark/ml/tree/treeParams.scala | 63 +++++++------------
 6 files changed, 34 insertions(+), 48 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index f8f164e8c14b..d7d4612e6d17 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -201,6 +201,11 @@ class GBTClassificationModel private[ml](
   @Since("1.4.0")
   override def trees: Array[DecisionTreeRegressionModel] = _trees

+  /**
+   * Number of trees in ensemble
+   */
+  val getNumTrees: Int = trees.length
+
   @Since("1.4.0")
   override def treeWeights: Array[Double] = _treeWeights

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 3c784d5555a4..f0035acb1d65 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -158,7 +158,7 @@ class RandomForestClassificationModel private[ml] (
     @Since("1.6.0") override val numFeatures: Int,
     @Since("1.5.0") override val numClasses: Int)
   extends ProbabilisticClassificationModel[Vector, RandomForestClassificationModel]
-  with RandomForestClassificationModelParams with TreeEnsembleModel[DecisionTreeClassificationModel]
+  with RandomForestClassifierParams with TreeEnsembleModel[DecisionTreeClassificationModel]
   with MLWritable with Serializable {

   require(_trees.nonEmpty, "RandomForestClassificationModel requires at least 1 tree.")

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index fa69d60836e6..0f6e178bf318 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -183,6 +183,11 @@ class GBTRegressionModel private[ml](
   @Since("1.4.0")
   override def trees: Array[DecisionTreeRegressionModel] = _trees

+  /**
+   * Number of trees in ensemble
+   */
+  val getNumTrees: Int = trees.length
+
   @Since("1.4.0")
   override def treeWeights: Array[Double] = _treeWeights

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index b123bc9360b9..e9230b88e8cf 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -144,7 +144,7 @@ class RandomForestRegressionModel private[ml] (
     private val _trees: Array[DecisionTreeRegressionModel],
     override val numFeatures: Int)
   extends PredictionModel[Vector, RandomForestRegressionModel]
-  with RandomForestRegressionModelParams with TreeEnsembleModel[DecisionTreeRegressionModel]
+  with RandomForestRegressorParams with TreeEnsembleModel[DecisionTreeRegressionModel]
   with MLWritable with Serializable {

   require(_trees.nonEmpty, "RandomForestRegressionModel requires at least 1 tree.")

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
index d3cbc363799a..0d6e9034e5ce 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
@@ -95,11 +95,6 @@ private[ml] trait TreeEnsembleModel[M <: DecisionTreeModel] {
   /** Trees in this ensemble. Warning: These have null parent Estimators. */
   def trees: Array[M]

-  /**
-   * Number of trees in ensemble
-   */
-  val getNumTrees: Int = trees.length
-
   /** Weights for each tree, zippable with [[trees]] */
   def treeWeights: Array[Double]

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index c1dfa50ed3ce..311580040d1d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -317,8 +317,28 @@ private[ml] trait TreeEnsembleParams extends DecisionTreeParams {
   }
 }

-/** Used for [[RandomForestParams]] */
-private[ml] trait HasFeatureSubsetStrategy extends Params {
+/**
+ * Parameters for Random Forest algorithms.
+ */
+private[ml] trait RandomForestParams extends TreeEnsembleParams {
+
+  /**
+   * Number of trees to train (>= 1).
+   * If 1, then no bootstrapping is used. If > 1, then bootstrapping is done.
+   * TODO: Change to always do bootstrapping (simpler). SPARK-7130
+   * (default = 20)
+   * @group param
+   */
+  final val numTrees: IntParam = new IntParam(this, "numTrees", "Number of trees to train (>= 1)",
+    ParamValidators.gtEq(1))
+
+  setDefault(numTrees -> 20)
+
+  /** @group setParam */
+  def setNumTrees(value: Int): this.type = set(numTrees, value)
+
+  /** @group getParam */
+  final def getNumTrees: Int = $(numTrees)

   /**
    * The number of features to consider for splits at each tree node.
@@ -364,38 +384,6 @@ private[ml] trait HasFeatureSubsetStrategy extends Params {
   final def getFeatureSubsetStrategy: String = $(featureSubsetStrategy).toLowerCase
 }

-/**
- * Used for [[RandomForestParams]].
- * This is separated out from [[RandomForestParams]] because of an issue with the
- * `numTrees` method conflicting with this Param in the Estimator.
- */
-private[ml] trait HasNumTrees extends Params {
-
-  /**
-   * Number of trees to train (>= 1).
-   * If 1, then no bootstrapping is used. If > 1, then bootstrapping is done.
-   * TODO: Change to always do bootstrapping (simpler). SPARK-7130
-   * (default = 20)
-   * @group param
-   */
-  final val numTrees: IntParam = new IntParam(this, "numTrees", "Number of trees to train (>= 1)",
-    ParamValidators.gtEq(1))
-
-  setDefault(numTrees -> 20)
-
-  /** @group setParam */
-  def setNumTrees(value: Int): this.type = set(numTrees, value)
-
-  /** @group getParam */
-  final def getNumTrees: Int = $(numTrees)
-}
-
-/**
- * Parameters for Random Forest algorithms.
- */
-private[ml] trait RandomForestParams extends TreeEnsembleParams
-  with HasFeatureSubsetStrategy with HasNumTrees
-
 private[spark] object RandomForestParams {
   // These options should be lowercase.
   final val supportedFeatureSubsetStrategies: Array[String] =
@@ -405,15 +393,9 @@ private[spark] object RandomForestParams {
 private[ml] trait RandomForestClassifierParams
   extends RandomForestParams with TreeClassifierParams

-private[ml] trait RandomForestClassificationModelParams extends TreeEnsembleParams
-  with HasFeatureSubsetStrategy with TreeClassifierParams
-
 private[ml] trait RandomForestRegressorParams
   extends RandomForestParams with TreeRegressorParams

-private[ml] trait RandomForestRegressionModelParams extends TreeEnsembleParams
-  with HasFeatureSubsetStrategy with TreeRegressorParams
-
 /**
  * Parameters for Gradient-Boosted Tree algorithms.
  *
@@ -437,7 +419,6 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
   /** @group setParam */
   def setMaxIter(value: Int): this.type = set(maxIter, value)
-
   /**
    * Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each
    * estimator.
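Note: after this consolidation, estimator and model share one RandomForestClassifierParams/RandomForestRegressorParams trait, so getNumTrees on a fitted model reads the numTrees param copied from the estimator. A sketch (assuming Spark 2.1; `training` is a placeholder DataFrame):

```scala
import org.apache.spark.ml.classification.RandomForestClassifier

val rf = new RandomForestClassifier()
  .setNumTrees(30)
  .setFeatureSubsetStrategy("sqrt")

val model = rf.fit(training)
model.getNumTrees   // 30, via the shared param (copied from the estimator)
model.trees.length  // 30 as well -- the actual ensemble size
```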
From 36c2e248c8c1436b126352ee50ded5f01c03c4c8 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Sat, 19 Nov 2016 07:21:47 -0800
Subject: [PATCH 05/12] Better deprecated docs for estimator/transformer read/write context function.

---
 .../scala/org/apache/spark/ml/util/ReadWrite.scala |  2 +-
 python/pyspark/ml/util.py                          | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
index bc4f9e6716ee..22abf4c976e9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
@@ -46,7 +46,7 @@ private[util] sealed trait BaseReadWrite {
    * Sets the Spark SQLContext to use for saving/loading.
    */
  @Since("1.6.0")
-  @deprecated("Use session instead", "2.0.0")
+  @deprecated("Use session instead. This method will be removed in 2.2.0.", "2.0.0")
   def context(sqlContext: SQLContext): this.type = {
     optionSparkSession = Option(sqlContext.sparkSession)
     this

diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 42f6df9ca280..9c5cd1568148 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -78,7 +78,10 @@ def overwrite(self):
         raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))

     def context(self, sqlContext):
-        """Sets the SQL context to use for saving."""
+        """
+        Sets the SQL context to use for saving.
+        .. note:: Deprecated in 2.1, use session instead.
+        """
         raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))

     def session(self, sparkSession):
         """Sets the Spark Session to use for saving."""
@@ -168,7 +171,10 @@ def load(self, path):
         raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))

     def context(self, sqlContext):
-        """Sets the SQL context to use for loading."""
+        """
+        Sets the SQL context to use for loading.
+        .. note:: Deprecated in 2.1, use session instead.
+        """
         raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))

     def session(self, sparkSession):
From 8d3c47a2cb3f2ffcf72b6798c224ac0f29c9e6b2 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Sat, 19 Nov 2016 08:09:15 -0800
Subject: [PATCH 06/12] Add functions to mima excludes.

---
 project/MimaExcludes.scala | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 6a58d2006171..e1b03fe2bb49 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -882,7 +882,17 @@ object MimaExcludes {
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.GBTRegressor.validateParams"),
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.GBTRegressionModel.validateParams"),
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.LinearRegressionSummary.model"),
-      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees")
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.RandomForestClassifier"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.RandomForestRegressor"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel"),
+      ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.getNumTrees"),
+      ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.getNumTrees"),
+      ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.numTrees"),
+      ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.setFeatureSubsetStrategy"),
+      ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"),
+      ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy")
     )
   }
From 0c85fd4143c666c06a3db9c89ac50eb4981b2849 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 23 Nov 2016 21:52:52 -0800
Subject: [PATCH 07/12] Update docs.

---
 .../spark/ml/classification/GBTClassifier.scala     |  1 +
 .../apache/spark/ml/regression/GBTRegressor.scala   |  1 +
 .../scala/org/apache/spark/ml/tree/treeParams.scala |  4 ++++
 python/pyspark/ml/util.py                           | 12 ++++++------
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index d7d4612e6d17..a857691c8059 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -204,6 +204,7 @@ class GBTClassificationModel private[ml](
   /**
    * Number of trees in ensemble
    */
+  @Since("2.0.0")
   val getNumTrees: Int = trees.length

   @Since("1.4.0")

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index 0f6e178bf318..8ab09226297d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -186,6 +186,7 @@ class GBTRegressionModel private[ml](
   /**
    * Number of trees in ensemble
    */
+  @Since("2.0.0")
   val getNumTrees: Int = trees.length

   @Since("1.4.0")

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 311580040d1d..0fce196e63d3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -327,6 +327,10 @@ private[ml] trait RandomForestParams extends TreeEnsembleParams {
    * If 1, then no bootstrapping is used. If > 1, then bootstrapping is done.
    * TODO: Change to always do bootstrapping (simpler). SPARK-7130
    * (default = 20)
+   *
+   * Note: The reason that we cannot add this to both GBT and RF (i.e. in TreeEnsembleParams)
+   * is the param `maxIter` controls how many trees a GBT has. The semantics in the algorithms
+   * are a bit different.
    * @group param
    */
   final val numTrees: IntParam = new IntParam(this, "numTrees", "Number of trees to train (>= 1)",

diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 9c5cd1568148..bec4b2895210 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -80,7 +80,7 @@ def overwrite(self):
     def context(self, sqlContext):
         """
         Sets the SQL context to use for saving.
-        .. note:: Deprecated in 2.1, use session instead.
+        .. note:: Deprecated in 2.1 and will be removed in 2.2, use session instead.
         """
         raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))

@@ -114,9 +114,9 @@ def overwrite(self):
     def context(self, sqlContext):
         """
         Sets the SQL context to use for saving.
-        .. note:: Deprecated in 2.1, use session instead.
+        .. note:: Deprecated in 2.1 and will be removed in 2.2, use session instead.
         """
-        warnings.warn("Deprecated in 2.1, use session instead.")
+        warnings.warn("Deprecated in 2.1 and will be removed in 2.2, use session instead.")
         self._jwrite.context(sqlContext._ssql_ctx)
         return self

@@ -173,7 +173,7 @@ def load(self, path):
     def context(self, sqlContext):
         """
         Sets the SQL context to use for loading.
-        .. note:: Deprecated in 2.1, use session instead.
+        .. note:: Deprecated in 2.1 and will be removed in 2.2, use session instead.
         """
         raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))
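Note: the doc added to treeParams.scala above is worth a concrete contrast. Both ensembles below end up with 20 trees, but through different params, which is why numTrees cannot simply live in the shared TreeEnsembleParams. A sketch (assuming Spark 2.1):

```scala
import org.apache.spark.ml.classification.{GBTClassifier, RandomForestClassifier}

val rf  = new RandomForestClassifier().setNumTrees(20) // 20 independently trained trees
val gbt = new GBTClassifier().setMaxIter(20)           // 20 boosting iterations = 20 trees
```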
""" raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self)) @@ -205,9 +205,9 @@ def load(self, path): def context(self, sqlContext): """ Sets the SQL context to use for loading. - .. note:: Deprecated in 2.1, use session instead. + .. note:: Deprecated in 2.1 and will be removed in 2.2, use session instead. """ - warnings.warn("Deprecated in 2.1, use session instead.") + warnings.warn("Deprecated in 2.1 and will be removed in 2.2, use session instead.") self._jread.context(sqlContext._ssql_ctx) return self From 1d8cef5f251614f0aef37fdd19426f5bf298beb4 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 23 Nov 2016 22:32:48 -0800 Subject: [PATCH 08/12] Add test cases for param validation. --- .../spark/ml/classification/GBTClassifierSuite.scala | 8 ++++++++ .../spark/ml/classification/LogisticRegressionSuite.scala | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index 3492709677d4..7c36745ab213 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -70,6 +70,14 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext ParamsSuite.checkParams(model) } + test("GBT parameter stepSize should be in interval (0, 1]") { + withClue("GBT parameter stepSize should be in interval (0, 1]") { + intercept[IllegalArgumentException] { + new GBTClassifier().setStepSize(10) + } + } + } + test("Binary classification with continuous features: Log Loss") { val categoricalFeatures = Map.empty[Int, Int] testCombinations.foreach { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 2877285eb4d5..b7407b7edb53 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -190,6 +190,12 @@ class LogisticRegressionSuite } } // thresholds and threshold must be consistent: values + withClue("fit with ParamMap should throw error if threshold, thresholds do not match.") { + intercept[IllegalArgumentException] { + lr2.fit(smallBinaryDataset, + lr2.thresholds -> Array(0.3, 0.7), lr2.threshold -> (expectedThreshold / 2.0)) + } + } withClue("fit with ParamMap should throw error if threshold, thresholds do not match.") { intercept[IllegalArgumentException] { val lr2model = lr2.fit(smallBinaryDataset, From 3c8ee2d445bcb22ecf16e1f67408fbe81136cc13 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 25 Nov 2016 01:41:13 -0800 Subject: [PATCH 09/12] Use separate stepSize and add doc for transformSchema --- .../scala/org/apache/spark/ml/Pipeline.scala | 3 +++ .../org/apache/spark/ml/tree/treeParams.scala | 23 +++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index f406f8c426d0..acb3b0fc18b3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -45,6 +45,9 @@ abstract class PipelineStage extends Params with Logging { * :: DeveloperApi :: * * Check transform validity and derive the output 
From 3c8ee2d445bcb22ecf16e1f67408fbe81136cc13 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 25 Nov 2016 01:41:13 -0800
Subject: [PATCH 09/12] Use separate stepSize and add doc for transformSchema

---
 .../scala/org/apache/spark/ml/Pipeline.scala  |  3 +++
 .../org/apache/spark/ml/tree/treeParams.scala | 23 +++++++++++--------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index f406f8c426d0..acb3b0fc18b3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -45,6 +45,9 @@ abstract class PipelineStage extends Params with Logging {
    * :: DeveloperApi ::
    *
    * Check transform validity and derive the output schema from the input schema.
+   * This only needs to check for interactions between parameters. Raise an exception if
+   * any parameter value is invalid. Parameter value checks which do not depend on other
+   * parameters are handled by `Param.validate()`.
    *
    * Typical implementation should first conduct verification on schema change and parameter
    * validity, including complex parameter interaction checks.

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 0fce196e63d3..49bc9e6809af 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -405,7 +405,7 @@ private[ml] trait RandomForestRegressorParams
  *
  * Note: Marked as private and DeveloperApi since this may be made public in the future.
  */
-private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasStepSize {
+private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter {

   /* TODO: Add this doc when we add this param. SPARK-7132
    * Threshold for stopping early when runWithValidation is used.
@@ -424,17 +424,20 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
   def setMaxIter(value: Int): this.type = set(maxIter, value)

   /**
-   * Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each
-   * estimator.
+   * Param for Step size (a.k.a. learning rate) in interval (0, 1] for shrinking
+   * the contribution of each estimator.
    * (default = 0.1)
-   * @group setParam
+   * @group param
    */
-  def setStepSize(value: Double): this.type = {
-    require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)(
-      value), "GBT parameter stepSize should be in interval (0, 1], " +
-      s"but it was given invalid value $value.")
-    set(stepSize, value)
-  }
+  final val stepSize: DoubleParam = new DoubleParam(this, "stepSize", "Step size " +
+    "(a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.",
+    ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true))
+
+  /** @group getParam */
+  final def getStepSize: Double = $(stepSize)
+
+  /** @group setParam */
+  def setStepSize(value: Double): this.type = set(stepSize, value)

   /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */
   private[ml] def getOldBoostingStrategy(
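Note: attaching the validator to the param itself (rather than overriding validateParams or a setter) means any set() call is range-checked as soon as the value is supplied. The same pattern in a self-contained sketch (MyShrinkageParams is a hypothetical class, not part of Spark):

```scala
import org.apache.spark.ml.param.{DoubleParam, ParamMap, Params, ParamValidators}
import org.apache.spark.ml.util.Identifiable

class MyShrinkageParams(override val uid: String) extends Params {
  def this() = this(Identifiable.randomUID("myShrinkageParams"))

  // The validator lives on the param, so every set() call is range-checked.
  final val shrinkage: DoubleParam = new DoubleParam(this, "shrinkage",
    "shrinkage in (0, 1]",
    ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true))

  setDefault(shrinkage -> 0.1)

  def setShrinkage(value: Double): this.type = set(shrinkage, value)

  override def copy(extra: ParamMap): MyShrinkageParams = defaultCopy(extra)
}

new MyShrinkageParams().setShrinkage(2.0) // throws IllegalArgumentException
```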
From c874d0e41a6c892fce66396a7ca5a51b897202e1 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 25 Nov 2016 01:48:41 -0800
Subject: [PATCH 10/12] Update docs

---
 mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index acb3b0fc18b3..38176b96ba2e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -45,9 +45,10 @@ abstract class PipelineStage extends Params with Logging {
    * :: DeveloperApi ::
    *
    * Check transform validity and derive the output schema from the input schema.
-   * This only needs to check for interactions between parameters. Raise an exception if
-   * any parameter value is invalid. Parameter value checks which do not depend on other
-   * parameters are handled by `Param.validate()`.
+   *
+   * We check validity for interactions between parameters during `transformSchema` and
+   * raise an exception if any parameter value is invalid. Parameter value checks which
+   * do not depend on other parameters are handled by `Param.validate()`.
    *
    * Typical implementation should first conduct verification on schema change and parameter
    * validity, including complex parameter interaction checks.

From 157acf92784d58e159095c59242bd8e1304ad7a4 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 25 Nov 2016 02:21:25 -0800
Subject: [PATCH 11/12] Add excludes to MimaExcludes

---
 project/MimaExcludes.scala | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index e1b03fe2bb49..84014014f2f5 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -885,8 +885,12 @@ object MimaExcludes {
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"),
       ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.RandomForestClassifier"),
       ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.GBTClassifier"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.GBTClassificationModel"),
       ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.RandomForestRegressor"),
       ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.GBTRegressor"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.GBTRegressionModel"),
       ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.getNumTrees"),
       ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.getNumTrees"),
       ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.numTrees"),

From 864be6e1fd09080af0234800bf26d1e248e245d4 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 25 Nov 2016 08:47:11 -0800
Subject: [PATCH 12/12] Set default value after variable declaration

---
 .../src/main/scala/org/apache/spark/ml/tree/treeParams.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 49bc9e6809af..83a4f9d6d129 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -418,8 +418,6 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter {
   // final val validationTol: DoubleParam = new DoubleParam(this, "validationTol", "")
   // validationTol -> 1e-5

-  setDefault(maxIter -> 20, stepSize -> 0.1)
-
   /** @group setParam */
   def setMaxIter(value: Int): this.type = set(maxIter, value)

@@ -439,6 +437,8 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter {
   /** @group setParam */
   def setStepSize(value: Double): this.type = set(stepSize, value)

+  setDefault(maxIter -> 20, stepSize -> 0.1)
+
   /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */
   private[ml] def getOldBoostingStrategy(
       categoricalFeatures: Map[Int, Int],
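Note on PATCH 12: Scala initializes vals in declaration order, so a setDefault call placed above the stepSize declaration would run while the param field is still uninitialized (null). The hypothetical trait below (not Spark code) mirrors the layout the patch arrives at:

```scala
import org.apache.spark.ml.param.{DoubleParam, Params}

trait StepSized extends Params {
  final val stepSize: DoubleParam = new DoubleParam(this, "stepSize", "step size")

  def setStepSize(value: Double): this.type = set(stepSize, value)

  // Safe only here: stepSize is already initialized when this statement runs.
  setDefault(stepSize -> 0.1)
}
```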