From ecddf1597bde986cf216e632d8cf3075875a6918 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 16 Nov 2016 20:35:20 -0800
Subject: [PATCH 01/12] Remove deprecated methods for ML.

---
 .../classification/LogisticRegression.scala   |  8 +++--
 .../RandomForestClassifier.scala              |  9 ------
 .../spark/ml/feature/ChiSqSelector.scala      |  7 -----
 .../org/apache/spark/ml/param/params.scala    | 15 ----------
 .../ml/regression/LinearRegression.scala      |  3 --
 .../ml/regression/RandomForestRegressor.scala |  8 -----
 .../org/apache/spark/ml/tree/treeParams.scala | 13 ++++----
 python/pyspark/ml/util.py                     | 30 +++++++++++++++++--
 8 files changed, 40 insertions(+), 53 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 18b9b3043db8..fca7f6a26284 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -40,7 +40,7 @@ import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions.{col, lit}
-import org.apache.spark.sql.types.DoubleType
+import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.VersionUtils
@@ -176,8 +176,12 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
     }
   }

-  override def validateParams(): Unit = {
+  override protected def validateAndTransformSchema(
+      schema: StructType,
+      fitting: Boolean,
+      featuresDataType: DataType): StructType = {
     checkThresholdConsistency()
+    super.validateAndTransformSchema(schema, fitting, featuresDataType)
   }
 }
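Note: the threshold/thresholds consistency check that used to live in validateParams() now runs inside validateAndTransformSchema, so inconsistent values surface when fit() validates the schema. A minimal sketch of the user-visible behavior (assuming Spark 2.1; `training` is a placeholder (label, features) DataFrame):

```scala
import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression()

// thresholds = Array(0.3, 0.7) implies threshold = 0.7, so pairing it with
// threshold = 0.25 in the same ParamMap is inconsistent; after this patch the
// mismatch throws IllegalArgumentException during fit()'s schema validation:
val model = lr.fit(training, lr.thresholds -> Array(0.3, 0.7), lr.threshold -> 0.25)
```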
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 52345b0626c4..3c784d5555a4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -221,15 +221,6 @@ class RandomForestClassificationModel private[ml] (
     }
   }

-  /**
-   * Number of trees in ensemble
-   *
-   * @deprecated Use [[getNumTrees]] instead. This method will be removed in 2.1.0
-   */
-  // TODO: Once this is removed, then this class can inherit from RandomForestClassifierParams
-  @deprecated("Use getNumTrees instead. This method will be removed in 2.1.0.", "2.0.0")
-  val numTrees: Int = trees.length
-
   @Since("1.4.0")
   override def copy(extra: ParamMap): RandomForestClassificationModel = {
     copyValues(new RandomForestClassificationModel(uid, _trees, numFeatures, numClasses), extra)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 653fa41124f8..7cd0f159c6be 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -216,13 +216,6 @@ final class ChiSqSelectorModel private[ml] (
   @Since("1.6.0")
   def setOutputCol(value: String): this.type = set(outputCol, value)

-  /**
-   * @group setParam
-   */
-  @Since("1.6.0")
-  @deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0")
-  def setLabelCol(value: String): this.type = set(labelCol, value)
-
   @Since("2.0.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
     val transformedSchema = transformSchema(dataset.schema, logging = true)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 9245931b27ca..993ac8cb2126 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -546,21 +546,6 @@ trait Params extends Identifiable with Serializable {
       .map(m => m.invoke(this).asInstanceOf[Param[_]])
   }

-  /**
-   * Validates parameter values stored internally.
-   * Raise an exception if any parameter value is invalid.
-   *
-   * This only needs to check for interactions between parameters.
-   * Parameter value checks which do not depend on other parameters are handled by
-   * `Param.validate()`. This method does not handle input/output column parameters;
-   * those are checked during schema validation.
-   * @deprecated Will be removed in 2.1.0. All the checks should be merged into transformSchema
-   */
-  @deprecated("Will be removed in 2.1.0. Checks should be merged into transformSchema.", "2.0.0")
-  def validateParams(): Unit = {
-    // Do nothing by default. Override to handle Param interactions.
-  }
-
   /**
    * Explains a param.
    * @param param input param, must belong to this instance.
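Note: the removals above are breaking for callers, but migration is mechanical. A sketch (assuming Spark 2.1; `training` is a placeholder DataFrame):

```scala
import org.apache.spark.ml.classification.RandomForestClassifier

val rf = new RandomForestClassifier().setNumTrees(30)
val model = rf.fit(training)

// Before (Spark <= 2.0): model.numTrees  -- the deprecated val removed above
// After  (Spark >= 2.1):
val n: Int = model.getNumTrees  // 30
```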
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 9639b07496c1..f8d754ddb3e3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -605,9 +605,6 @@ class LinearRegressionSummary private[regression] (
     private val privateModel: LinearRegressionModel,
     private val diagInvAtWA: Array[Double]) extends Serializable {

-  @deprecated("The model field is deprecated and will be removed in 2.1.0.", "2.0.0")
-  val model: LinearRegressionModel = privateModel
-
   @transient private val metrics = new RegressionMetrics(
     predictions
       .select(col(predictionCol), col(labelCol).cast(DoubleType))

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index 0ad00aa6f928..b123bc9360b9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -181,14 +181,6 @@ class RandomForestRegressionModel private[ml] (
     _trees.map(_.rootNode.predictImpl(features).prediction).sum / getNumTrees
   }

-  /**
-   * Number of trees in ensemble
-   * @deprecated Use [[getNumTrees]] instead. This method will be removed in 2.1.0
-   */
-  // TODO: Once this is removed, then this class can inherit from RandomForestRegressorParams
-  @deprecated("Use getNumTrees instead. This method will be removed in 2.1.0.", "2.0.0")
-  val numTrees: Int = trees.length
-
   @Since("1.4.0")
   override def copy(extra: ParamMap): RandomForestRegressionModel = {
     copyValues(new RandomForestRegressionModel(uid, _trees, numFeatures), extra).setParent(parent)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 57c7e44e9760..30500109ca66 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -435,7 +435,12 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
   setDefault(maxIter -> 20, stepSize -> 0.1)

   /** @group setParam */
-  def setMaxIter(value: Int): this.type = set(maxIter, value)
+  def setMaxIter(value: Int): this.type = {
+    require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)(
+      value), "GBT parameter stepSize should be in interval (0, 1], " +
+      s"but it was given invalid value $value.")
+    set(maxIter, value)
+  }

   /**
    * Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each
@@ -445,12 +450,6 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
    */
   def setStepSize(value: Double): this.type = set(stepSize, value)

-  override def validateParams(): Unit = {
-    require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)(
-      getStepSize), "GBT parameter stepSize should be in interval (0, 1], " +
-      s"but it was given invalid value $getStepSize.")
-  }
-
   /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */
   private[ml] def getOldBoostingStrategy(
       categoricalFeatures: Map[Int, Int],
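Note: callers of the removed LinearRegressionSummary.model field navigate from the model to its summary instead of the reverse. A sketch (assuming Spark 2.1; `training` is a placeholder DataFrame):

```scala
import org.apache.spark.ml.regression.LinearRegression

val lrModel = new LinearRegression().fit(training)
val summary = lrModel.summary   // training summary attached to the model

// Before (<= 2.0): summary.model returned the parent model -- removed above.
// After: keep your own model reference.
println(summary.rootMeanSquaredError)
println(lrModel.coefficients)
```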
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 7d39c3012235..42f6df9ca280 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -81,6 +81,10 @@ def context(self, sqlContext):
         """Sets the SQL context to use for saving."""
         raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))

+    def session(self, sparkSession):
+        """Sets the Spark Session to use for saving."""
+        raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))
+

 @inherit_doc
 class JavaMLWriter(MLWriter):
@@ -105,10 +109,19 @@ def overwrite(self):
         return self

     def context(self, sqlContext):
-        """Sets the SQL context to use for saving."""
+        """
+        Sets the SQL context to use for saving.
+        .. note:: Deprecated in 2.1, use session instead.
+        """
+        warnings.warn("Deprecated in 2.1, use session instead.")
         self._jwrite.context(sqlContext._ssql_ctx)
         return self

+    def session(self, sparkSession):
+        """Sets the Spark Session to use for saving."""
+        self._jwrite.session(sparkSession._jsparkSession)
+        return self
+

 @inherit_doc
 class MLWritable(object):
@@ -158,6 +171,10 @@ def context(self, sqlContext):
         """Sets the SQL context to use for loading."""
         raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))

+    def session(self, sparkSession):
+        """Sets the Spark Session to use for loading."""
+        raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))
+

 @inherit_doc
 class JavaMLReader(MLReader):
@@ -180,10 +197,19 @@ def load(self, path):
         return self._clazz._from_java(java_obj)

     def context(self, sqlContext):
-        """Sets the SQL context to use for loading."""
+        """
+        Sets the SQL context to use for loading.
+        .. note:: Deprecated in 2.1, use session instead.
+        """
+        warnings.warn("Deprecated in 2.1, use session instead.")
         self._jread.context(sqlContext._ssql_ctx)
         return self

+    def session(self, sparkSession):
+        """Sets the Spark Session to use for loading."""
+        self._jread.session(sparkSession._jsparkSession)
+        return self
+
     @classmethod
     def _java_loader_class(cls, clazz):
         """
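Note: the new Python session() methods mirror the Scala writer/reader API. Scala-side usage for comparison (a sketch assuming Spark 2.1; `spark`, `model`, and the save path are placeholders, with `model` being any MLWritable):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("ml-io-example").getOrCreate()

// Preferred: attach the SparkSession to the writer.
model.write.session(spark).overwrite().save("/tmp/example-model")

// Deprecated path, kept for compatibility until 2.2:
// model.write.context(spark.sqlContext).save("/tmp/example-model")
```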
From 9a8c92691e7ec0b9d37eed0cf6f9dbcc4d4d622f Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 16 Nov 2016 23:47:30 -0800
Subject: [PATCH 02/12] Add mima excludes.

---
 project/MimaExcludes.scala | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 12f7ed202b9d..6a58d2006171 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -867,6 +867,22 @@ object MimaExcludes {
       // [SPARK-12221] Add CPU time to metrics
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.TaskMetrics.this"),
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.TaskMetricDistributions.this")
+    ) ++ Seq(
+      // [SPARK-18481] ML 2.1 QA: Remove deprecated methods for ML
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.PipelineStage.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.param.JavaParams.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.param.Params.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.GBTClassificationModel.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegression.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.GBTClassifier.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionModel.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.numTrees"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.feature.ChiSqSelectorModel.setLabelCol"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.evaluation.Evaluator.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.GBTRegressor.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.GBTRegressionModel.validateParams"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.LinearRegressionSummary.model"),
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees")
     )
   }
From 13988b453569fc8e814952f0ef7ce20e634a724a Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 18 Nov 2016 03:01:27 -0800
Subject: [PATCH 03/12] Fix wrong location of GBT step size check.

---
 .../org/apache/spark/ml/tree/treeParams.scala | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 30500109ca66..c1dfa50ed3ce 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -435,12 +435,8 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
   setDefault(maxIter -> 20, stepSize -> 0.1)

   /** @group setParam */
-  def setMaxIter(value: Int): this.type = {
-    require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)(
-      value), "GBT parameter stepSize should be in interval (0, 1], " +
-      s"but it was given invalid value $value.")
-    set(maxIter, value)
-  }
+  def setMaxIter(value: Int): this.type = set(maxIter, value)
+
   /**
    * Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each
@@ -448,7 +444,12 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
    * (default = 0.1)
    * @group setParam
    */
-  def setStepSize(value: Double): this.type = set(stepSize, value)
+  def setStepSize(value: Double): this.type = {
+    require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)(
+      value), "GBT parameter stepSize should be in interval (0, 1], " +
+      s"but it was given invalid value $value.")
+    set(stepSize, value)
+  }

   /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */
   private[ml] def getOldBoostingStrategy(
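Note: with the check relocated into setStepSize, an out-of-range value now fails at the setter call itself rather than in a later validateParams() pass — exactly what the test added in PATCH 08 asserts. A sketch (assuming Spark 2.1):

```scala
import org.apache.spark.ml.classification.GBTClassifier

val gbt = new GBTClassifier()
gbt.setStepSize(0.05) // fine: inside (0, 1]
gbt.setStepSize(10)   // throws IllegalArgumentException immediately, before any fit()
```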
From 7beb5688e83a49c9e0f2270fe413c2f2fff39ecd Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Sat, 19 Nov 2016 07:03:20 -0800
Subject: [PATCH 04/12] Remove RandomForestClassificationModelParams and RandomForestRegressionModelParams.

---
 .../ml/classification/GBTClassifier.scala     |  5 ++
 .../RandomForestClassifier.scala              |  2 +-
 .../spark/ml/regression/GBTRegressor.scala    |  5 ++
 .../ml/regression/RandomForestRegressor.scala |  2 +-
 .../org/apache/spark/ml/tree/treeModels.scala |  5 --
 .../org/apache/spark/ml/tree/treeParams.scala | 63 +++++++------------
 6 files changed, 34 insertions(+), 48 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index f8f164e8c14b..d7d4612e6d17 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -201,6 +201,11 @@ class GBTClassificationModel private[ml](
   @Since("1.4.0")
   override def trees: Array[DecisionTreeRegressionModel] = _trees

+  /**
+   * Number of trees in ensemble
+   */
+  val getNumTrees: Int = trees.length
+
   @Since("1.4.0")
   override def treeWeights: Array[Double] = _treeWeights

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 3c784d5555a4..f0035acb1d65 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -158,7 +158,7 @@ class RandomForestClassificationModel private[ml] (
     @Since("1.6.0") override val numFeatures: Int,
     @Since("1.5.0") override val numClasses: Int)
   extends ProbabilisticClassificationModel[Vector, RandomForestClassificationModel]
-  with RandomForestClassificationModelParams with TreeEnsembleModel[DecisionTreeClassificationModel]
+  with RandomForestClassifierParams with TreeEnsembleModel[DecisionTreeClassificationModel]
   with MLWritable with Serializable {

   require(_trees.nonEmpty, "RandomForestClassificationModel requires at least 1 tree.")

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index fa69d60836e6..0f6e178bf318 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -183,6 +183,11 @@ class GBTRegressionModel private[ml](
   @Since("1.4.0")
   override def trees: Array[DecisionTreeRegressionModel] = _trees

+  /**
+   * Number of trees in ensemble
+   */
+  val getNumTrees: Int = trees.length
+
   @Since("1.4.0")
   override def treeWeights: Array[Double] = _treeWeights

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index b123bc9360b9..e9230b88e8cf 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -144,7 +144,7 @@ class RandomForestRegressionModel private[ml] (
     private val _trees: Array[DecisionTreeRegressionModel],
     override val numFeatures: Int)
   extends PredictionModel[Vector, RandomForestRegressionModel]
-  with RandomForestRegressionModelParams with TreeEnsembleModel[DecisionTreeRegressionModel]
+  with RandomForestRegressorParams with TreeEnsembleModel[DecisionTreeRegressionModel]
   with MLWritable with Serializable {

   require(_trees.nonEmpty, "RandomForestRegressionModel requires at least 1 tree.")

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
index d3cbc363799a..0d6e9034e5ce 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala
@@ -95,11 +95,6 @@ private[ml] trait TreeEnsembleModel[M <: DecisionTreeModel] {
   /** Trees in this ensemble. Warning: These have null parent Estimators. */
   def trees: Array[M]

-  /**
-   * Number of trees in ensemble
-   */
-  val getNumTrees: Int = trees.length
-
   /** Weights for each tree, zippable with [[trees]] */
   def treeWeights: Array[Double]

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index c1dfa50ed3ce..311580040d1d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -317,8 +317,28 @@ private[ml] trait TreeEnsembleParams extends DecisionTreeParams {
   }
 }

-/** Used for [[RandomForestParams]] */
-private[ml] trait HasFeatureSubsetStrategy extends Params {
+/**
+ * Parameters for Random Forest algorithms.
+ */
+private[ml] trait RandomForestParams extends TreeEnsembleParams {
+
+  /**
+   * Number of trees to train (>= 1).
+   * If 1, then no bootstrapping is used. If > 1, then bootstrapping is done.
+   * TODO: Change to always do bootstrapping (simpler). SPARK-7130
+   * (default = 20)
+   * @group param
+   */
+  final val numTrees: IntParam = new IntParam(this, "numTrees", "Number of trees to train (>= 1)",
+    ParamValidators.gtEq(1))
+
+  setDefault(numTrees -> 20)
+
+  /** @group setParam */
+  def setNumTrees(value: Int): this.type = set(numTrees, value)
+
+  /** @group getParam */
+  final def getNumTrees: Int = $(numTrees)

   /**
    * The number of features to consider for splits at each tree node.
@@ -364,38 +384,6 @@ private[ml] trait HasFeatureSubsetStrategy extends Params {
   final def getFeatureSubsetStrategy: String = $(featureSubsetStrategy).toLowerCase
 }

-/**
- * Used for [[RandomForestParams]].
- * This is separated out from [[RandomForestParams]] because of an issue with the
- * `numTrees` method conflicting with this Param in the Estimator.
- */
-private[ml] trait HasNumTrees extends Params {
-
-  /**
-   * Number of trees to train (>= 1).
-   * If 1, then no bootstrapping is used. If > 1, then bootstrapping is done.
-   * TODO: Change to always do bootstrapping (simpler). SPARK-7130
-   * (default = 20)
-   * @group param
-   */
-  final val numTrees: IntParam = new IntParam(this, "numTrees", "Number of trees to train (>= 1)",
-    ParamValidators.gtEq(1))
-
-  setDefault(numTrees -> 20)
-
-  /** @group setParam */
-  def setNumTrees(value: Int): this.type = set(numTrees, value)
-
-  /** @group getParam */
-  final def getNumTrees: Int = $(numTrees)
-}
-
-/**
- * Parameters for Random Forest algorithms.
- */
-private[ml] trait RandomForestParams extends TreeEnsembleParams
-  with HasFeatureSubsetStrategy with HasNumTrees
-
 private[spark] object RandomForestParams {
   // These options should be lowercase.
   final val supportedFeatureSubsetStrategies: Array[String] =
@@ -405,15 +393,9 @@ private[spark] object RandomForestParams {
 private[ml] trait RandomForestClassifierParams
   extends RandomForestParams with TreeClassifierParams

-private[ml] trait RandomForestClassificationModelParams extends TreeEnsembleParams
-  with HasFeatureSubsetStrategy with TreeClassifierParams
-
 private[ml] trait RandomForestRegressorParams
   extends RandomForestParams with TreeRegressorParams

-private[ml] trait RandomForestRegressionModelParams extends TreeEnsembleParams
-  with HasFeatureSubsetStrategy with TreeRegressorParams
-
 /**
  * Parameters for Gradient-Boosted Tree algorithms.
  *
@@ -437,7 +419,6 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
   /** @group setParam */
   def setMaxIter(value: Int): this.type = set(maxIter, value)
-
   /**
    * Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each
    * estimator.
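Note: after this consolidation, estimator and model share one RandomForestClassifierParams/RandomForestRegressorParams trait, so getNumTrees on a fitted model reads the numTrees param copied from the estimator. A sketch (assuming Spark 2.1; `training` is a placeholder DataFrame):

```scala
import org.apache.spark.ml.classification.RandomForestClassifier

val rf = new RandomForestClassifier()
  .setNumTrees(30)
  .setFeatureSubsetStrategy("sqrt")

val model = rf.fit(training)
model.getNumTrees   // 30, via the shared param (copied from the estimator)
model.trees.length  // 30 as well -- the actual ensemble size
```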
From 36c2e248c8c1436b126352ee50ded5f01c03c4c8 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Sat, 19 Nov 2016 07:21:47 -0800
Subject: [PATCH 05/12] Better deprecated docs for estimator/transformer read/write context function.

---
 .../scala/org/apache/spark/ml/util/ReadWrite.scala |  2 +-
 python/pyspark/ml/util.py                          | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
index bc4f9e6716ee..22abf4c976e9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
@@ -46,7 +46,7 @@ private[util] sealed trait BaseReadWrite {
    * Sets the Spark SQLContext to use for saving/loading.
    */
  @Since("1.6.0")
-  @deprecated("Use session instead", "2.0.0")
+  @deprecated("Use session instead. This method will be removed in 2.2.0.", "2.0.0")
   def context(sqlContext: SQLContext): this.type = {
     optionSparkSession = Option(sqlContext.sparkSession)
     this

diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 42f6df9ca280..9c5cd1568148 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -78,7 +78,10 @@ def overwrite(self):
         raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))

     def context(self, sqlContext):
-        """Sets the SQL context to use for saving."""
+        """
+        Sets the SQL context to use for saving.
+        .. note:: Deprecated in 2.1, use session instead.
+        """
         raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))

     def session(self, sparkSession):
         """Sets the Spark Session to use for saving."""
@@ -168,7 +171,10 @@ def load(self, path):
         raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))

     def context(self, sqlContext):
-        """Sets the SQL context to use for loading."""
+        """
+        Sets the SQL context to use for loading.
+        .. note:: Deprecated in 2.1, use session instead.
+        """
         raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))

     def session(self, sparkSession):
From 8d3c47a2cb3f2ffcf72b6798c224ac0f29c9e6b2 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Sat, 19 Nov 2016 08:09:15 -0800
Subject: [PATCH 06/12] Add functions to mima excludes.

---
 project/MimaExcludes.scala | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 6a58d2006171..e1b03fe2bb49 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -882,7 +882,17 @@ object MimaExcludes {
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.GBTRegressor.validateParams"),
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.GBTRegressionModel.validateParams"),
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.LinearRegressionSummary.model"),
-      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees")
+      ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.RandomForestClassifier"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.RandomForestRegressor"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel"),
+      ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.getNumTrees"),
+      ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.getNumTrees"),
+      ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.numTrees"),
+      ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.setFeatureSubsetStrategy"),
+      ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"),
+      ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy")
     )
   }
From 0c85fd4143c666c06a3db9c89ac50eb4981b2849 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 23 Nov 2016 21:52:52 -0800
Subject: [PATCH 07/12] Update docs.

---
 .../spark/ml/classification/GBTClassifier.scala     |  1 +
 .../apache/spark/ml/regression/GBTRegressor.scala   |  1 +
 .../scala/org/apache/spark/ml/tree/treeParams.scala |  4 ++++
 python/pyspark/ml/util.py                           | 12 ++++++------
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index d7d4612e6d17..a857691c8059 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -204,6 +204,7 @@ class GBTClassificationModel private[ml](
   /**
    * Number of trees in ensemble
    */
+  @Since("2.0.0")
   val getNumTrees: Int = trees.length

   @Since("1.4.0")

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index 0f6e178bf318..8ab09226297d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -186,6 +186,7 @@ class GBTRegressionModel private[ml](
   /**
    * Number of trees in ensemble
    */
+  @Since("2.0.0")
   val getNumTrees: Int = trees.length

   @Since("1.4.0")

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 311580040d1d..0fce196e63d3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -327,6 +327,10 @@ private[ml] trait RandomForestParams extends TreeEnsembleParams {
    * If 1, then no bootstrapping is used. If > 1, then bootstrapping is done.
    * TODO: Change to always do bootstrapping (simpler). SPARK-7130
    * (default = 20)
+   *
+   * Note: The reason that we cannot add this to both GBT and RF (i.e. in TreeEnsembleParams)
+   * is the param `maxIter` controls how many trees a GBT has. The semantics in the algorithms
+   * are a bit different.
    * @group param
    */
   final val numTrees: IntParam = new IntParam(this, "numTrees", "Number of trees to train (>= 1)",

diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
index 9c5cd1568148..bec4b2895210 100644
--- a/python/pyspark/ml/util.py
+++ b/python/pyspark/ml/util.py
@@ -80,7 +80,7 @@ def overwrite(self):
     def context(self, sqlContext):
         """
         Sets the SQL context to use for saving.
-        .. note:: Deprecated in 2.1, use session instead.
+        .. note:: Deprecated in 2.1 and will be removed in 2.2, use session instead.
         """
         raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))

@@ -114,9 +114,9 @@ def overwrite(self):
     def context(self, sqlContext):
         """
         Sets the SQL context to use for saving.
-        .. note:: Deprecated in 2.1, use session instead.
+        .. note:: Deprecated in 2.1 and will be removed in 2.2, use session instead.
         """
-        warnings.warn("Deprecated in 2.1, use session instead.")
+        warnings.warn("Deprecated in 2.1 and will be removed in 2.2, use session instead.")
         self._jwrite.context(sqlContext._ssql_ctx)
         return self

@@ -173,7 +173,7 @@ def load(self, path):
     def context(self, sqlContext):
         """
         Sets the SQL context to use for loading.
-        .. note:: Deprecated in 2.1, use session instead.
+        .. note:: Deprecated in 2.1 and will be removed in 2.2, use session instead.
         """
         raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))
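Note: the doc added to treeParams.scala above is worth a concrete contrast. Both ensembles below end up with 20 trees, but through different params, which is why numTrees cannot simply live in the shared TreeEnsembleParams. A sketch (assuming Spark 2.1):

```scala
import org.apache.spark.ml.classification.{GBTClassifier, RandomForestClassifier}

val rf  = new RandomForestClassifier().setNumTrees(20) // 20 independently trained trees
val gbt = new GBTClassifier().setMaxIter(20)           // 20 boosting iterations = 20 trees
```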
""" raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self)) @@ -205,9 +205,9 @@ def load(self, path): def context(self, sqlContext): """ Sets the SQL context to use for loading. - .. note:: Deprecated in 2.1, use session instead. + .. note:: Deprecated in 2.1 and will be removed in 2.2, use session instead. """ - warnings.warn("Deprecated in 2.1, use session instead.") + warnings.warn("Deprecated in 2.1 and will be removed in 2.2, use session instead.") self._jread.context(sqlContext._ssql_ctx) return self From 1d8cef5f251614f0aef37fdd19426f5bf298beb4 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 23 Nov 2016 22:32:48 -0800 Subject: [PATCH 08/12] Add test cases for param validation. --- .../spark/ml/classification/GBTClassifierSuite.scala | 8 ++++++++ .../spark/ml/classification/LogisticRegressionSuite.scala | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index 3492709677d4..7c36745ab213 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -70,6 +70,14 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext ParamsSuite.checkParams(model) } + test("GBT parameter stepSize should be in interval (0, 1]") { + withClue("GBT parameter stepSize should be in interval (0, 1]") { + intercept[IllegalArgumentException] { + new GBTClassifier().setStepSize(10) + } + } + } + test("Binary classification with continuous features: Log Loss") { val categoricalFeatures = Map.empty[Int, Int] testCombinations.foreach { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 2877285eb4d5..b7407b7edb53 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -190,6 +190,12 @@ class LogisticRegressionSuite } } // thresholds and threshold must be consistent: values + withClue("fit with ParamMap should throw error if threshold, thresholds do not match.") { + intercept[IllegalArgumentException] { + lr2.fit(smallBinaryDataset, + lr2.thresholds -> Array(0.3, 0.7), lr2.threshold -> (expectedThreshold / 2.0)) + } + } withClue("fit with ParamMap should throw error if threshold, thresholds do not match.") { intercept[IllegalArgumentException] { val lr2model = lr2.fit(smallBinaryDataset, From 3c8ee2d445bcb22ecf16e1f67408fbe81136cc13 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 25 Nov 2016 01:41:13 -0800 Subject: [PATCH 09/12] Use separate stepSize and add doc for transformSchema --- .../scala/org/apache/spark/ml/Pipeline.scala | 3 +++ .../org/apache/spark/ml/tree/treeParams.scala | 23 +++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index f406f8c426d0..acb3b0fc18b3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -45,6 +45,9 @@ abstract class PipelineStage extends Params with Logging { * :: DeveloperApi :: * * Check transform validity and derive the output 
From 3c8ee2d445bcb22ecf16e1f67408fbe81136cc13 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 25 Nov 2016 01:41:13 -0800
Subject: [PATCH 09/12] Use separate stepSize and add doc for transformSchema

---
 .../scala/org/apache/spark/ml/Pipeline.scala  |  3 +++
 .../org/apache/spark/ml/tree/treeParams.scala | 23 +++++++++++--------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index f406f8c426d0..acb3b0fc18b3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -45,6 +45,9 @@ abstract class PipelineStage extends Params with Logging {
    * :: DeveloperApi ::
    *
    * Check transform validity and derive the output schema from the input schema.
+   * This only needs to check for interactions between parameters. Raise an exception if
+   * any parameter value is invalid. Parameter value checks which do not depend on other
+   * parameters are handled by `Param.validate()`.
    *
    * Typical implementation should first conduct verification on schema change and parameter
    * validity, including complex parameter interaction checks.

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 0fce196e63d3..49bc9e6809af 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -405,7 +405,7 @@ private[ml] trait RandomForestRegressorParams
  *
  * Note: Marked as private and DeveloperApi since this may be made public in the future.
  */
-private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasStepSize {
+private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter {

   /* TODO: Add this doc when we add this param. SPARK-7132
    * Threshold for stopping early when runWithValidation is used.
@@ -424,17 +424,20 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS
   def setMaxIter(value: Int): this.type = set(maxIter, value)

   /**
-   * Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each
-   * estimator.
+   * Param for Step size (a.k.a. learning rate) in interval (0, 1] for shrinking
+   * the contribution of each estimator.
    * (default = 0.1)
-   * @group setParam
+   * @group param
    */
-  def setStepSize(value: Double): this.type = {
-    require(ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)(
-      value), "GBT parameter stepSize should be in interval (0, 1], " +
-      s"but it was given invalid value $value.")
-    set(stepSize, value)
-  }
+  final val stepSize: DoubleParam = new DoubleParam(this, "stepSize", "Step size " +
+    "(a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.",
+    ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true))
+
+  /** @group getParam */
+  final def getStepSize: Double = $(stepSize)
+
+  /** @group setParam */
+  def setStepSize(value: Double): this.type = set(stepSize, value)

   /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */
   private[ml] def getOldBoostingStrategy(
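Note: attaching the validator to the param itself (rather than overriding validateParams or a setter) means any set() call is range-checked as soon as the value is supplied. The same pattern in a self-contained sketch (MyShrinkageParams is a hypothetical class, not part of Spark):

```scala
import org.apache.spark.ml.param.{DoubleParam, ParamMap, Params, ParamValidators}
import org.apache.spark.ml.util.Identifiable

class MyShrinkageParams(override val uid: String) extends Params {
  def this() = this(Identifiable.randomUID("myShrinkageParams"))

  // The validator lives on the param, so every set() call is range-checked.
  final val shrinkage: DoubleParam = new DoubleParam(this, "shrinkage",
    "shrinkage in (0, 1]",
    ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true))

  setDefault(shrinkage -> 0.1)

  def setShrinkage(value: Double): this.type = set(shrinkage, value)

  override def copy(extra: ParamMap): MyShrinkageParams = defaultCopy(extra)
}

new MyShrinkageParams().setShrinkage(2.0) // throws IllegalArgumentException
```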
From c874d0e41a6c892fce66396a7ca5a51b897202e1 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 25 Nov 2016 01:48:41 -0800
Subject: [PATCH 10/12] Update docs

---
 mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
index acb3b0fc18b3..38176b96ba2e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -45,9 +45,10 @@ abstract class PipelineStage extends Params with Logging {
    * :: DeveloperApi ::
    *
    * Check transform validity and derive the output schema from the input schema.
-   * This only needs to check for interactions between parameters. Raise an exception if
-   * any parameter value is invalid. Parameter value checks which do not depend on other
-   * parameters are handled by `Param.validate()`.
+   *
+   * We check validity for interactions between parameters during `transformSchema` and
+   * raise an exception if any parameter value is invalid. Parameter value checks which
+   * do not depend on other parameters are handled by `Param.validate()`.
    *
    * Typical implementation should first conduct verification on schema change and parameter
    * validity, including complex parameter interaction checks.

From 157acf92784d58e159095c59242bd8e1304ad7a4 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 25 Nov 2016 02:21:25 -0800
Subject: [PATCH 11/12] Add excludes to MimaExcludes

---
 project/MimaExcludes.scala | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index e1b03fe2bb49..84014014f2f5 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -885,8 +885,12 @@ object MimaExcludes {
       ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"),
       ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.RandomForestClassifier"),
       ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.GBTClassifier"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.classification.GBTClassificationModel"),
       ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.RandomForestRegressor"),
       ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.GBTRegressor"),
+      ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.regression.GBTRegressionModel"),
       ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.getNumTrees"),
       ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.getNumTrees"),
       ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.numTrees"),

From 864be6e1fd09080af0234800bf26d1e248e245d4 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 25 Nov 2016 08:47:11 -0800
Subject: [PATCH 12/12] Set default value after variable declaration

---
 .../src/main/scala/org/apache/spark/ml/tree/treeParams.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 49bc9e6809af..83a4f9d6d129 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -418,8 +418,6 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter {
   // final val validationTol: DoubleParam = new DoubleParam(this, "validationTol", "")
   // validationTol -> 1e-5

-  setDefault(maxIter -> 20, stepSize -> 0.1)
-
   /** @group setParam */
   def setMaxIter(value: Int): this.type = set(maxIter, value)

@@ -439,6 +437,8 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter {
   /** @group setParam */
   def setStepSize(value: Double): this.type = set(stepSize, value)

+  setDefault(maxIter -> 20, stepSize -> 0.1)
+
   /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */
   private[ml] def getOldBoostingStrategy(
       categoricalFeatures: Map[Int, Int],
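Note on PATCH 12: Scala initializes vals in declaration order, so a setDefault call placed above the stepSize declaration would run while the param field is still uninitialized (null). The hypothetical trait below (not Spark code) mirrors the layout the patch arrives at:

```scala
import org.apache.spark.ml.param.{DoubleParam, Params}

trait StepSized extends Params {
  final val stepSize: DoubleParam = new DoubleParam(this, "stepSize", "step size")

  def setStepSize(value: Double): this.type = set(stepSize, value)

  // Safe only here: stepSize is already initialized when this statement runs.
  setDefault(stepSize -> 0.1)
}
```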