diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/FMClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/FMClassifier.scala index a4d242753a789..1511cb61b8b30 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/FMClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/FMClassifier.scala @@ -86,7 +86,6 @@ class FMClassifier @Since("3.0.0") ( */ @Since("3.0.0") def setFactorSize(value: Int): this.type = set(factorSize, value) - setDefault(factorSize -> 8) /** * Set whether to fit intercept term. @@ -96,7 +95,6 @@ class FMClassifier @Since("3.0.0") ( */ @Since("3.0.0") def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) - setDefault(fitIntercept -> true) /** * Set whether to fit linear term. @@ -106,7 +104,6 @@ class FMClassifier @Since("3.0.0") ( */ @Since("3.0.0") def setFitLinear(value: Boolean): this.type = set(fitLinear, value) - setDefault(fitLinear -> true) /** * Set the L2 regularization parameter. @@ -116,7 +113,6 @@ class FMClassifier @Since("3.0.0") ( */ @Since("3.0.0") def setRegParam(value: Double): this.type = set(regParam, value) - setDefault(regParam -> 0.0) /** * Set the mini-batch fraction parameter. @@ -126,7 +122,6 @@ class FMClassifier @Since("3.0.0") ( */ @Since("3.0.0") def setMiniBatchFraction(value: Double): this.type = set(miniBatchFraction, value) - setDefault(miniBatchFraction -> 1.0) /** * Set the standard deviation of initial coefficients. @@ -136,7 +131,6 @@ class FMClassifier @Since("3.0.0") ( */ @Since("3.0.0") def setInitStd(value: Double): this.type = set(initStd, value) - setDefault(initStd -> 0.01) /** * Set the maximum number of iterations. @@ -146,7 +140,6 @@ class FMClassifier @Since("3.0.0") ( */ @Since("3.0.0") def setMaxIter(value: Int): this.type = set(maxIter, value) - setDefault(maxIter -> 100) /** * Set the initial step size for the first step (like learning rate). @@ -156,7 +149,6 @@ class FMClassifier @Since("3.0.0") ( */ @Since("3.0.0") def setStepSize(value: Double): this.type = set(stepSize, value) - setDefault(stepSize -> 1.0) /** * Set the convergence tolerance of iterations. @@ -166,7 +158,6 @@ class FMClassifier @Since("3.0.0") ( */ @Since("3.0.0") def setTol(value: Double): this.type = set(tol, value) - setDefault(tol -> 1E-6) /** * Set the solver algorithm used for optimization. @@ -177,7 +168,6 @@ class FMClassifier @Since("3.0.0") ( */ @Since("3.0.0") def setSolver(value: String): this.type = set(solver, value) - setDefault(solver -> AdamW) /** * Set the random seed for weight initialization. 
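The change applied throughout this patch is mechanical: the per-setter `setDefault` lines in each estimator are removed and the defaults are declared once, next to the params, in the shared `*Params` trait. A minimal sketch of the target shape, using hypothetical names (`MyFMParams`/`MyFM` are illustrations, not classes from this patch):

```scala
import org.apache.spark.ml.param.{IntParam, ParamMap, Params}
import org.apache.spark.ml.util.Identifiable

// The param and its default live together in the shared trait.
trait MyFMParams extends Params {
  final val factorSize = new IntParam(this, "factorSize",
    "dimensionality of the factors (>= 1)")
  final def getFactorSize: Int = $(factorSize)

  // One consolidated call, inherited by every class that mixes in this trait.
  setDefault(factorSize -> 8)
}

// The estimator keeps only the setter; its own setDefault line is gone.
class MyFM(override val uid: String) extends MyFMParams {
  def this() = this(Identifiable.randomUID("myFM"))
  def setFactorSize(value: Int): this.type = set(factorSize, value)
  override def copy(extra: ParamMap): MyFM = defaultCopy(extra)
}
```

`new MyFM().getFactorSize` still returns 8, so nothing changes for callers; the default is simply declared in one place instead of being repeated next to each setter.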
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala index 905789090d625..88621b34beb13 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala @@ -54,6 +54,9 @@ private[classification] trait LinearSVCParams extends ClassifierParams with HasR */ final override val threshold: DoubleParam = new DoubleParam(this, "threshold", "threshold in binary classification prediction applied to rawPrediction") + + setDefault(regParam -> 0.0, maxIter -> 100, fitIntercept -> true, tol -> 1E-6, + standardization -> true, threshold -> 0.0, aggregationDepth -> 2) } /** @@ -81,7 +84,6 @@ class LinearSVC @Since("2.2.0") ( */ @Since("2.2.0") def setRegParam(value: Double): this.type = set(regParam, value) - setDefault(regParam -> 0.0) /** * Set the maximum number of iterations. @@ -91,7 +93,6 @@ class LinearSVC @Since("2.2.0") ( */ @Since("2.2.0") def setMaxIter(value: Int): this.type = set(maxIter, value) - setDefault(maxIter -> 100) /** * Whether to fit an intercept term. @@ -101,7 +102,6 @@ class LinearSVC @Since("2.2.0") ( */ @Since("2.2.0") def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) - setDefault(fitIntercept -> true) /** * Set the convergence tolerance of iterations. @@ -112,7 +112,6 @@ class LinearSVC @Since("2.2.0") ( */ @Since("2.2.0") def setTol(value: Double): this.type = set(tol, value) - setDefault(tol -> 1E-6) /** * Whether to standardize the training features before fitting the model. @@ -122,7 +121,6 @@ class LinearSVC @Since("2.2.0") ( */ @Since("2.2.0") def setStandardization(value: Boolean): this.type = set(standardization, value) - setDefault(standardization -> true) /** * Set the value of param [[weightCol]]. @@ -141,7 +139,6 @@ class LinearSVC @Since("2.2.0") ( */ @Since("2.2.0") def setThreshold(value: Double): this.type = set(threshold, value) - setDefault(threshold -> 0.0) /** * Suggested depth for treeAggregate (greater than or equal to 2). 
@@ -153,7 +150,6 @@ class LinearSVC @Since("2.2.0") ( */ @Since("2.2.0") def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) - setDefault(aggregationDepth -> 2) @Since("2.2.0") override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra) @@ -300,7 +296,6 @@ class LinearSVCModel private[classification] ( @Since("2.2.0") def setThreshold(value: Double): this.type = set(threshold, value) - setDefault(threshold, 0.0) private val margin: Vector => Double = (features) => { BLAS.dot(features, coefficients) + intercept diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 50c14d086957f..a61eb329977cd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -248,6 +248,10 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas isSet(lowerBoundsOnIntercepts) || isSet(upperBoundsOnIntercepts) } + setDefault(regParam -> 0.0, elasticNetParam -> 0.0, maxIter -> 100, tol -> 1E-6, + fitIntercept -> true, family -> "auto", standardization -> true, threshold -> 0.5, + aggregationDepth -> 2) + override protected def validateAndTransformSchema( schema: StructType, fitting: Boolean, @@ -290,7 +294,6 @@ class LogisticRegression @Since("1.2.0") ( */ @Since("1.2.0") def setRegParam(value: Double): this.type = set(regParam, value) - setDefault(regParam -> 0.0) /** * Set the ElasticNet mixing parameter. @@ -306,7 +309,6 @@ class LogisticRegression @Since("1.2.0") ( */ @Since("1.4.0") def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value) - setDefault(elasticNetParam -> 0.0) /** * Set the maximum number of iterations. @@ -316,7 +318,6 @@ class LogisticRegression @Since("1.2.0") ( */ @Since("1.2.0") def setMaxIter(value: Int): this.type = set(maxIter, value) - setDefault(maxIter -> 100) /** * Set the convergence tolerance of iterations. @@ -327,7 +328,6 @@ class LogisticRegression @Since("1.2.0") ( */ @Since("1.4.0") def setTol(value: Double): this.type = set(tol, value) - setDefault(tol -> 1E-6) /** * Whether to fit an intercept term. @@ -337,7 +337,6 @@ class LogisticRegression @Since("1.2.0") ( */ @Since("1.4.0") def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) - setDefault(fitIntercept -> true) /** * Sets the value of param [[family]]. @@ -347,7 +346,6 @@ class LogisticRegression @Since("1.2.0") ( */ @Since("2.1.0") def setFamily(value: String): this.type = set(family, value) - setDefault(family -> "auto") /** * Whether to standardize the training features before fitting the model. @@ -361,11 +359,9 @@ class LogisticRegression @Since("1.2.0") ( */ @Since("1.5.0") def setStandardization(value: Boolean): this.type = set(standardization, value) - setDefault(standardization -> true) @Since("1.5.0") override def setThreshold(value: Double): this.type = super.setThreshold(value) - setDefault(threshold -> 0.5) @Since("1.5.0") override def getThreshold: Double = super.getThreshold @@ -396,7 +392,6 @@ class LogisticRegression @Since("1.2.0") ( */ @Since("2.1.0") def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) - setDefault(aggregationDepth -> 2) /** * Set the lower bounds on coefficients if fitting under bound constrained optimization. 
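Because `LinearSVCModel` mixes in the same `LinearSVCParams` trait as the estimator, the consolidated defaults reach the fitted model as well, which is why its own `setDefault(threshold, 0.0)` can be dropped above. An illustrative check (not part of the patch), using the default values now declared in the trait:

```scala
import org.apache.spark.ml.classification.LinearSVC

// Defaults now come from LinearSVCParams, so a fresh estimator reports them
// immediately, and a fitted LinearSVCModel shares the same threshold default.
val svc = new LinearSVC()
assert(svc.getRegParam == 0.0 && svc.getMaxIter == 100)
assert(svc.getThreshold == 0.0 && svc.getAggregationDepth == 2)
```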
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index 94681ae9ef796..409a39c5b8ca9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -63,6 +63,8 @@ private[classification] trait NaiveBayesParams extends PredictorParams with HasW /** @group getParam */ final def getModelType: String = $(modelType) + + setDefault(smoothing -> 1.0, modelType -> NaiveBayes.Multinomial) } // scalastyle:off line.size.limit @@ -106,7 +108,6 @@ class NaiveBayes @Since("1.5.0") ( */ @Since("1.5.0") def setSmoothing(value: Double): this.type = set(smoothing, value) - setDefault(smoothing -> 1.0) /** * Set the model type using a string (case-sensitive). @@ -116,7 +117,6 @@ class NaiveBayes @Since("1.5.0") ( */ @Since("1.5.0") def setModelType(value: String): this.type = set(modelType, value) - setDefault(modelType -> Multinomial) /** * Sets the value of param [[weightCol]]. diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index 6c7112b80569f..b649b1dd95231 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -71,6 +71,8 @@ private[clustering] trait BisectingKMeansParams extends Params with HasMaxIter @Since("2.0.0") def getMinDivisibleClusterSize: Double = $(minDivisibleClusterSize) + setDefault(k -> 4, maxIter -> 20, minDivisibleClusterSize -> 1.0) + /** * Validates and transforms the input schema. * @param schema input schema @@ -225,11 +227,6 @@ class BisectingKMeans @Since("2.0.0") ( @Since("2.0.0") override val uid: String) extends Estimator[BisectingKMeansModel] with BisectingKMeansParams with DefaultParamsWritable { - setDefault( - k -> 4, - maxIter -> 20, - minDivisibleClusterSize -> 1.0) - @Since("2.0.0") override def copy(extra: ParamMap): BisectingKMeans = defaultCopy(extra) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index d779e602545cf..26a866bbecaeb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -59,6 +59,8 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w @Since("2.0.0") def getK: Int = $(k) + setDefault(k -> 2, maxIter -> 100, tol -> 0.01) + /** * Validates and transforms the input schema. 
* @@ -323,11 +325,6 @@ class GaussianMixture @Since("2.0.0") ( @Since("2.0.0") override val uid: String) extends Estimator[GaussianMixtureModel] with GaussianMixtureParams with DefaultParamsWritable { - setDefault( - k -> 2, - maxIter -> 100, - tol -> 0.01) - @Since("2.0.0") override def copy(extra: ParamMap): GaussianMixture = defaultCopy(extra) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index a42c920e24987..5370318bc6adb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -86,6 +86,9 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe @Since("1.5.0") def getInitSteps: Int = $(initSteps) + setDefault(k -> 2, maxIter -> 20, initMode -> MLlibKMeans.K_MEANS_PARALLEL, initSteps -> 2, + tol -> 1e-4, distanceMeasure -> DistanceMeasure.EUCLIDEAN) + /** * Validates and transforms the input schema. * @param schema input schema @@ -270,14 +273,6 @@ class KMeans @Since("1.5.0") ( @Since("1.5.0") override val uid: String) extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable { - setDefault( - k -> 2, - maxIter -> 20, - initMode -> MLlibKMeans.K_MEANS_PARALLEL, - initSteps -> 2, - tol -> 1e-4, - distanceMeasure -> DistanceMeasure.EUCLIDEAN) - @Since("1.5.0") override def copy(extra: ParamMap): KMeans = defaultCopy(extra) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 4e86b3b247ace..c1b76fb40b2f6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -199,8 +199,6 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM " with estimates of the topic mixture distribution for each document (often called \"theta\"" + " in the literature). Returns a vector of zeros for an empty document.") - setDefault(topicDistributionCol -> "topicDistribution") - /** @group getParam */ @Since("1.6.0") def getTopicDistributionCol: String = $(topicDistributionCol) @@ -315,6 +313,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM @Since("2.0.0") def getKeepLastCheckpoint: Boolean = $(keepLastCheckpoint) + setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10, + learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05, + optimizeDocConcentration -> true, keepLastCheckpoint -> true, + topicDistributionCol -> "topicDistribution") + /** * Validates and transforms the input schema. * @@ -863,10 +866,6 @@ class LDA @Since("1.6.0") ( @Since("1.6.0") def this() = this(Identifiable.randomUID("lda")) - setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10, - learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05, - optimizeDocConcentration -> true, keepLastCheckpoint -> true) - /** * The features for LDA should be a `Vector` representing the word counts in a document. * The vector should be of length vocabSize, with counts for each term (word). 
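Moving a `setDefault` call does not change what counts as a user-set value: a default is returned by the getters, but `isSet` stays false until a setter is called explicitly. A small sketch of that distinction with the KMeans defaults declared above (the assertions are illustrative, not tests from this patch):

```scala
import org.apache.spark.ml.clustering.KMeans

val km = new KMeans()
assert(km.getK == 2 && km.getMaxIter == 20)  // defaults now come from KMeansParams
assert(!km.isSet(km.k))                      // ...but k is not user-set yet
km.setK(5)
assert(km.isSet(km.k) && km.getK == 5)       // explicit values still win over defaults
```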
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/PowerIterationClustering.scala index 812a426a062c1..1466b32bef530 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/PowerIterationClustering.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/PowerIterationClustering.scala @@ -91,7 +91,7 @@ private[clustering] trait PowerIterationClusteringParams extends Params with Has @Since("2.4.0") def getDstCol: String = $(dstCol) - setDefault(srcCol -> "src", dstCol -> "dst") + setDefault(srcCol -> "src", dstCol -> "dst", k -> 2, maxIter -> 20, initMode -> "random") } /** @@ -111,11 +111,6 @@ class PowerIterationClustering private[clustering] ( @Since("2.4.0") override val uid: String) extends PowerIterationClusteringParams with DefaultParamsWritable { - setDefault( - k -> 2, - maxIter -> 20, - initMode -> "random") - @Since("2.4.0") def this() = this(Identifiable.randomUID("PowerIterationClustering")) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index 7733225c0c4d5..cbec87d7dd715 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -80,8 +80,6 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va @Since("3.0.0") def setNumBins(value: Int): this.type = set(numBins, value) - setDefault(numBins -> 1000) - /** @group setParam */ @Since("1.5.0") def setRawPredictionCol(value: String): this.type = set(rawPredictionCol, value) @@ -94,7 +92,7 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va @Since("3.0.0") def setWeightCol(value: String): this.type = set(weightCol, value) - setDefault(metricName -> "areaUnderROC") + setDefault(metricName -> "areaUnderROC", numBins -> 1000) @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala index 1d6540e970383..3bf115e72b431 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala @@ -63,8 +63,6 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid @Since("1.5.0") def setMetricName(value: String): this.type = set(metricName, value) - setDefault(metricName -> "f1") - /** @group setParam */ @Since("1.5.0") def setPredictionCol(value: String): this.type = set(predictionCol, value) @@ -104,8 +102,6 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid @Since("3.0.0") def setMetricLabel(value: Double): this.type = set(metricLabel, value) - setDefault(metricLabel -> 0.0) - /** * The beta value, which controls precision vs recall weighting, * used in `"weightedFMeasure"`, `"fMeasureByLabel"`. @@ -127,8 +123,6 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid @Since("3.0.0") def setBeta(value: Double): this.type = set(beta, value) - setDefault(beta -> 1.0) - /** * param for eps. 
log-loss is undefined for p=0 or p=1, so probabilities are clipped to * max(eps, min(1 - eps, p)). Must be in range (0, 0.5). The default value is 1e-15. @@ -149,7 +143,7 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid @Since("3.0.0") def setEps(value: Double): this.type = set(eps, value) - setDefault(eps -> 1e-15) + setDefault(metricName -> "f1", eps -> 1e-15, metricLabel -> 0.0, beta -> 1.0) @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MultilabelClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MultilabelClassificationEvaluator.scala index a8db5452bd56c..632e67c59696c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MultilabelClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MultilabelClassificationEvaluator.scala @@ -65,8 +65,6 @@ class MultilabelClassificationEvaluator @Since("3.0.0") (@Since("3.0.0") overrid @Since("3.0.0") def setMetricName(value: String): this.type = set(metricName, value) - setDefault(metricName -> "f1Measure") - /** * param for the class whose metric will be computed in `"precisionByLabel"`, `"recallByLabel"`, * `"f1MeasureByLabel"`. @@ -86,8 +84,6 @@ class MultilabelClassificationEvaluator @Since("3.0.0") (@Since("3.0.0") overrid /** @group setParam */ def setMetricLabel(value: Double): this.type = set(metricLabel, value) - setDefault(metricLabel -> 0.0) - /** @group setParam */ @Since("3.0.0") def setPredictionCol(value: String): this.type = set(predictionCol, value) @@ -96,6 +92,8 @@ class MultilabelClassificationEvaluator @Since("3.0.0") (@Since("3.0.0") overrid @Since("3.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) + setDefault(metricLabel -> 0.0, metricName -> "f1Measure") + @Since("3.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala index c5dea6c177e21..3a2ddf4d97ee3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RankingEvaluator.scala @@ -61,8 +61,6 @@ class RankingEvaluator @Since("3.0.0") (@Since("3.0.0") override val uid: String @Since("3.0.0") def setMetricName(value: String): this.type = set(metricName, value) - setDefault(metricName -> "meanAveragePrecision") - /** * param for ranking position value used in `"meanAveragePrecisionAtK"`, `"precisionAtK"`, * `"ndcgAtK"`, `"recallAtK"`. Must be > 0. The default value is 10. 
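One detail about placement: the consolidated `setDefault` calls sit after the params they reference have been declared (here after the last param and setter definitions of the evaluators, elsewhere near the end of the `*Params` traits). Pairs like `metricName -> "f1"` call a method on the `Param` val, so the val must already be initialized when `setDefault` runs, which is presumably why the calls were placed there. A hypothetical trait sketching that ordering constraint:

```scala
import org.apache.spark.ml.param.{DoubleParam, Params}

trait MyEvaluatorParams extends Params {
  final val eps = new DoubleParam(this, "eps", "probability clipping value (> 0)")
  final def getEps: Double = $(eps)

  // Fine: eps is initialized above. Hoisting this call above the val declaration
  // would fail at construction time, because trait body statements run in
  // declaration order and `eps -> 1e-15` would be invoked on a still-null Param.
  setDefault(eps -> 1e-15)
}
```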
@@ -83,8 +81,6 @@ class RankingEvaluator @Since("3.0.0") (@Since("3.0.0") override val uid: String @Since("3.0.0") def setK(value: Int): this.type = set(k, value) - setDefault(k -> 10) - /** @group setParam */ @Since("3.0.0") def setPredictionCol(value: String): this.type = set(predictionCol, value) @@ -93,6 +89,8 @@ class RankingEvaluator @Since("3.0.0") (@Since("3.0.0") override val uid: String @Since("3.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) + setDefault(k -> 10, metricName -> "meanAveragePrecision") + @Since("3.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala index 18a8dda0c76ef..de256600982e5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala @@ -79,8 +79,6 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui @Since("3.0.0") def setThroughOrigin(value: Boolean): this.type = set(throughOrigin, value) - setDefault(throughOrigin -> false) - /** @group setParam */ @Since("1.4.0") def setPredictionCol(value: String): this.type = set(predictionCol, value) @@ -93,7 +91,7 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui @Since("3.0.0") def setWeightCol(value: String): this.type = set(weightCol, value) - setDefault(metricName -> "rmse") + setDefault(metricName -> "rmse", throughOrigin -> false) @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 76f4f944f11d5..c582d92ec538b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -54,7 +54,6 @@ private[feature] trait ChiSqSelectorParams extends Params "Number of features that selector will select, ordered by ascending p-value. 
If the" + " number of features is < numTopFeatures, then this will select all features.", ParamValidators.gtEq(1)) - setDefault(numTopFeatures -> 50) /** @group getParam */ @Since("1.6.0") @@ -70,7 +69,6 @@ private[feature] trait ChiSqSelectorParams extends Params final val percentile = new DoubleParam(this, "percentile", "Percentile of features that selector will select, ordered by ascending p-value.", ParamValidators.inRange(0, 1)) - setDefault(percentile -> 0.1) /** @group getParam */ @Since("2.1.0") @@ -85,7 +83,6 @@ private[feature] trait ChiSqSelectorParams extends Params @Since("2.1.0") final val fpr = new DoubleParam(this, "fpr", "The highest p-value for features to be kept.", ParamValidators.inRange(0, 1)) - setDefault(fpr -> 0.05) /** @group getParam */ @Since("2.1.0") @@ -100,7 +97,6 @@ private[feature] trait ChiSqSelectorParams extends Params @Since("2.2.0") final val fdr = new DoubleParam(this, "fdr", "The upper bound of the expected false discovery rate.", ParamValidators.inRange(0, 1)) - setDefault(fdr -> 0.05) /** @group getParam */ def getFdr: Double = $(fdr) @@ -114,7 +110,6 @@ private[feature] trait ChiSqSelectorParams extends Params @Since("2.2.0") final val fwe = new DoubleParam(this, "fwe", "The upper bound of the expected family-wise error rate.", ParamValidators.inRange(0, 1)) - setDefault(fwe -> 0.05) /** @group getParam */ def getFwe: Double = $(fwe) @@ -129,11 +124,13 @@ private[feature] trait ChiSqSelectorParams extends Params "The selector type of the ChisqSelector. " + "Supported options: " + OldChiSqSelector.supportedSelectorTypes.mkString(", "), ParamValidators.inArray[String](OldChiSqSelector.supportedSelectorTypes)) - setDefault(selectorType -> OldChiSqSelector.NumTopFeatures) /** @group getParam */ @Since("2.1.0") def getSelectorType: String = $(selectorType) + + setDefault(numTopFeatures -> 50, percentile -> 0.1, fpr -> 0.05, fdr -> 0.05, fwe -> 0.05, + selectorType -> OldChiSqSelector.NumTopFeatures) } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 64f1722f5fcb8..ad1010da5c104 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -64,6 +64,8 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasInp /** @group getParam */ def getMissingValue: Double = $(missingValue) + setDefault(strategy -> Imputer.mean, missingValue -> Double.NaN) + /** Returns the input and output column names corresponding in pair. 
*/ private[feature] def getInOutCols(): (Array[String], Array[String]) = { if (isSet(inputCol)) { @@ -144,8 +146,6 @@ class Imputer @Since("2.2.0") (@Since("2.2.0") override val uid: String) @Since("3.0.0") def setRelativeError(value: Double): this.type = set(relativeError, value) - setDefault(strategy -> Imputer.mean, missingValue -> Double.NaN) - override def fit(dataset: Dataset[_]): ImputerModel = { transformSchema(dataset.schema, logging = true) val spark = dataset.sparkSession diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala index c84892c974b90..56798a8e61a61 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala @@ -58,6 +58,8 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H /** @group getParam */ def getMax: Double = $(max) + setDefault(min -> 0.0, max -> 1.0) + /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { require($(min) < $(max), s"The specified min(${$(min)}) is larger or equal to max(${$(max)})") @@ -93,8 +95,6 @@ class MinMaxScaler @Since("1.5.0") (@Since("1.5.0") override val uid: String) @Since("1.5.0") def this() = this(Identifiable.randomUID("minMaxScal")) - setDefault(min -> 0.0, max -> 1.0) - /** @group setParam */ @Since("1.5.0") def setInputCol(value: String): this.type = set(inputCol, value) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index 0ef092f6be463..5a500fefb57ec 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -53,8 +53,6 @@ private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid "during fitting, invalid data will result in an error.", ParamValidators.inArray(OneHotEncoder.supportedHandleInvalids)) - setDefault(handleInvalid, OneHotEncoder.ERROR_INVALID) - /** * Whether to drop the last category in the encoded vector (default: true) * @group param @@ -62,12 +60,13 @@ private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid @Since("2.3.0") final val dropLast: BooleanParam = new BooleanParam(this, "dropLast", "whether to drop the last category") - setDefault(dropLast -> true) /** @group getParam */ @Since("2.3.0") def getDropLast: Boolean = $(dropLast) + setDefault(handleInvalid -> OneHotEncoder.ERROR_INVALID, dropLast -> true) + /** Returns the input and output column names corresponding in pair. */ private[feature] def getInOutCols(): (Array[String], Array[String]) = { if (isSet(inputCol)) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index 4eedfc4dc0efa..b93c9b1fcd204 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -46,7 +46,6 @@ private[feature] trait QuantileDiscretizerBase extends Params val numBuckets = new IntParam(this, "numBuckets", "Number of buckets (quantiles, or " + "categories) into which data points are grouped. 
Must be >= 2.", ParamValidators.gtEq(2)) - setDefault(numBuckets -> 2) /** @group getParam */ def getNumBuckets: Int = getOrDefault(numBuckets) @@ -82,7 +81,8 @@ private[feature] trait QuantileDiscretizerBase extends Params "how to handle invalid entries. Options are skip (filter out rows with invalid values), " + "error (throw an error), or keep (keep invalid values in a special additional bucket).", ParamValidators.inArray(Bucketizer.supportedHandleInvalids)) - setDefault(handleInvalid, Bucketizer.ERROR_INVALID) + + setDefault(handleInvalid -> Bucketizer.ERROR_INVALID, numBuckets -> 2) } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 7ccfafa4ac813..b8da020017f12 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -60,7 +60,6 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol with @Since("2.1.0") val forceIndexLabel: BooleanParam = new BooleanParam(this, "forceIndexLabel", "Force to index label whether it is numeric or string") - setDefault(forceIndexLabel -> false) /** @group getParam */ @Since("2.1.0") @@ -80,7 +79,6 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol with "type. Options are 'skip' (filter out rows with invalid data), error (throw an error), " + "or 'keep' (put invalid data in a special additional bucket, at index numLabels).", ParamValidators.inArray(StringIndexer.supportedHandleInvalids)) - setDefault(handleInvalid, StringIndexer.ERROR_INVALID) /** * Param for how to order categories of a string FEATURE column used by `StringIndexer`. @@ -113,12 +111,14 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol with "The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " + "RFormula drops the same category as R when encoding strings.", ParamValidators.inArray(StringIndexer.supportedStringOrderType)) - setDefault(stringIndexerOrderType, StringIndexer.frequencyDesc) /** @group getParam */ @Since("2.3.0") def getStringIndexerOrderType: String = $(stringIndexerOrderType) + setDefault(forceIndexLabel -> false, handleInvalid -> StringIndexer.ERROR_INVALID, + stringIndexerOrderType -> StringIndexer.frequencyDesc) + protected def hasLabelCol(schema: StructType): Boolean = { schema.map(_.name).contains($(labelCol)) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RobustScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RobustScaler.scala index bd9be779fedbd..12aba69528891 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RobustScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RobustScaler.scala @@ -50,8 +50,6 @@ private[feature] trait RobustScalerParams extends Params with HasInputCol with H /** @group getParam */ def getLower: Double = $(lower) - setDefault(lower -> 0.25) - /** * Upper quantile to calculate quantile range, shared by all features * Default: 0.75 @@ -64,8 +62,6 @@ private[feature] trait RobustScalerParams extends Params with HasInputCol with H /** @group getParam */ def getUpper: Double = $(upper) - setDefault(upper -> 0.75) - /** * Whether to center the data with median before scaling. * It will build a dense output, so take care when applying to sparse input. 
@@ -78,8 +74,6 @@ private[feature] trait RobustScalerParams extends Params with HasInputCol with H /** @group getParam */ def getWithCentering: Boolean = $(withCentering) - setDefault(withCentering -> false) - /** * Whether to scale the data to quantile range. * Default: true @@ -91,7 +85,7 @@ private[feature] trait RobustScalerParams extends Params with HasInputCol with H /** @group getParam */ def getWithScaling: Boolean = $(withScaling) - setDefault(withScaling -> true) + setDefault(withScaling -> true, lower -> 0.25, upper -> 0.75, withCentering -> false) /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index be32f44287b6a..77e3d902e0456 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -56,8 +56,6 @@ private[feature] trait StringIndexerBase extends Params with HasHandleInvalid wi "or 'keep' (put invalid data in a special additional bucket, at index numLabels).", ParamValidators.inArray(StringIndexer.supportedHandleInvalids)) - setDefault(handleInvalid, StringIndexer.ERROR_INVALID) - /** * Param for how to order labels of string column. The first label after ordering is assigned * an index of 0. @@ -84,6 +82,9 @@ private[feature] trait StringIndexerBase extends Params with HasHandleInvalid wi @Since("2.3.0") def getStringOrderType: String = $(stringOrderType) + setDefault(handleInvalid -> StringIndexer.ERROR_INVALID, + stringOrderType -> StringIndexer.frequencyDesc) + /** Returns the input and output column names corresponding in pair. */ private[feature] def getInOutCols(): (Array[String], Array[String]) = { ParamValidators.checkSingleVsMultiColumnParams(this, Seq(outputCol), Seq(outputCols)) @@ -155,7 +156,6 @@ class StringIndexer @Since("1.4.0") ( /** @group setParam */ @Since("2.3.0") def setStringOrderType(value: String): this.type = set(stringOrderType, value) - setDefault(stringOrderType, StringIndexer.frequencyDesc) /** @group setParam */ @Since("1.4.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 866074fb1453e..b7cf4392cd177 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -60,8 +60,6 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu "number of categories of the feature).", ParamValidators.inArray(VectorIndexer.supportedHandleInvalids)) - setDefault(handleInvalid, VectorIndexer.ERROR_INVALID) - /** * Threshold for the number of values a categorical feature can take. 
* If a feature is found to have {@literal >} maxCategories values, then it is declared @@ -75,10 +73,10 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu " If a feature is found to have > maxCategories values, then it is declared continuous.", ParamValidators.gtEq(2)) - setDefault(maxCategories -> 20) - /** @group getParam */ def getMaxCategories: Int = $(maxCategories) + + setDefault(maxCategories -> 20, handleInvalid -> VectorIndexer.ERROR_INVALID) } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala index be91844ba39e6..3590b9118f3b8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala @@ -57,8 +57,6 @@ final class VectorSlicer @Since("1.5.0") (@Since("1.5.0") override val uid: Stri "An array of indices to select features from a vector column." + " There can be no overlap with names.", VectorSlicer.validIndices) - setDefault(indices -> Array.emptyIntArray) - /** @group getParam */ @Since("1.5.0") def getIndices: Array[Int] = $(indices) @@ -79,8 +77,6 @@ final class VectorSlicer @Since("1.5.0") (@Since("1.5.0") override val uid: Stri "An array of feature names to select features from a vector column." + " There can be no overlap with indices.", VectorSlicer.validNames) - setDefault(names -> Array.empty[String]) - /** @group getParam */ @Since("1.5.0") def getNames: Array[String] = $(names) @@ -97,6 +93,8 @@ final class VectorSlicer @Since("1.5.0") (@Since("1.5.0") override val uid: Stri @Since("1.5.0") def setOutputCol(value: String): this.type = set(outputCol, value) + setDefault(indices -> Array.emptyIntArray, names -> Array.empty[String]) + @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { // Validity checks diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index bbfcbfbe038ef..287fce4243743 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -47,7 +47,6 @@ private[feature] trait Word2VecBase extends Params final val vectorSize = new IntParam( this, "vectorSize", "the dimension of codes after transforming from words (> 0)", ParamValidators.gt(0)) - setDefault(vectorSize -> 100) /** @group getParam */ def getVectorSize: Int = $(vectorSize) @@ -60,7 +59,6 @@ private[feature] trait Word2VecBase extends Params final val windowSize = new IntParam( this, "windowSize", "the window size (context words from [-window, window]) (> 0)", ParamValidators.gt(0)) - setDefault(windowSize -> 5) /** @group expertGetParam */ def getWindowSize: Int = $(windowSize) @@ -73,7 +71,6 @@ private[feature] trait Word2VecBase extends Params final val numPartitions = new IntParam( this, "numPartitions", "number of partitions for sentences of words (> 0)", ParamValidators.gt(0)) - setDefault(numPartitions -> 1) /** @group getParam */ def getNumPartitions: Int = $(numPartitions) @@ -86,7 +83,6 @@ private[feature] trait Word2VecBase extends Params */ final val minCount = new IntParam(this, "minCount", "the minimum number of times a token must " + "appear to be included in the word2vec model's vocabulary (>= 0)", ParamValidators.gtEq(0)) - setDefault(minCount -> 5) /** @group getParam */ def getMinCount: Int = $(minCount) @@ -101,13 +97,12 @@ private[feature] trait 
Word2VecBase extends Params final val maxSentenceLength = new IntParam(this, "maxSentenceLength", "Maximum length " + "(in words) of each sentence in the input data. Any sentence longer than this threshold will " + "be divided into chunks up to the size (> 0)", ParamValidators.gt(0)) - setDefault(maxSentenceLength -> 1000) /** @group getParam */ def getMaxSentenceLength: Int = $(maxSentenceLength) - setDefault(stepSize -> 0.025) - setDefault(maxIter -> 1) + setDefault(vectorSize -> 100, windowSize -> 5, numPartitions -> 1, minCount -> 5, + maxSentenceLength -> 1000, stepSize -> 0.025, maxIter -> 1) /** * Validate and transform the input schema. diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala index e50d4255b1f37..de9d57e400551 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala @@ -50,7 +50,6 @@ private[fpm] trait FPGrowthParams extends Params with HasPredictionCol { */ @Since("2.2.0") val itemsCol: Param[String] = new Param[String](this, "itemsCol", "items column name") - setDefault(itemsCol -> "items") /** @group getParam */ @Since("2.2.0") @@ -66,7 +65,6 @@ private[fpm] trait FPGrowthParams extends Params with HasPredictionCol { val minSupport: DoubleParam = new DoubleParam(this, "minSupport", "the minimal support level of a frequent pattern", ParamValidators.inRange(0.0, 1.0)) - setDefault(minSupport -> 0.3) /** @group getParam */ @Since("2.2.0") @@ -95,12 +93,13 @@ private[fpm] trait FPGrowthParams extends Params with HasPredictionCol { val minConfidence: DoubleParam = new DoubleParam(this, "minConfidence", "minimal confidence for generating Association Rule", ParamValidators.inRange(0.0, 1.0)) - setDefault(minConfidence -> 0.8) /** @group getParam */ @Since("2.2.0") def getMinConfidence: Double = $(minConfidence) + setDefault(minSupport -> 0.3, itemsCol -> "items", minConfidence -> 0.8) + /** * Validates and transforms the input schema. * @param schema input schema diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 2da65a0bcf790..86989c54bde3c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -59,7 +59,6 @@ private[regression] trait AFTSurvivalRegressionParams extends PredictorParams /** @group getParam */ @Since("1.6.0") def getCensorCol: String = $(censorCol) - setDefault(censorCol -> "censor") /** * Param for quantile probabilities array. @@ -75,7 +74,6 @@ private[regression] trait AFTSurvivalRegressionParams extends PredictorParams /** @group getParam */ @Since("1.6.0") def getQuantileProbabilities: Array[Double] = $(quantileProbabilities) - setDefault(quantileProbabilities -> Array(0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99)) /** * Param for quantiles column name. @@ -94,6 +92,10 @@ private[regression] trait AFTSurvivalRegressionParams extends PredictorParams isDefined(quantilesCol) && $(quantilesCol).nonEmpty } + setDefault(censorCol -> "censor", + quantileProbabilities -> Array(0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99), + fitIntercept -> true, maxIter -> 100, tol -> 1E-6, aggregationDepth -> 2) + /** * Validates and transforms the input schema with the provided param map. 
* @param schema input schema @@ -150,7 +152,6 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S */ @Since("1.6.0") def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) - setDefault(fitIntercept -> true) /** * Set the maximum number of iterations. @@ -159,7 +160,6 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S */ @Since("1.6.0") def setMaxIter(value: Int): this.type = set(maxIter, value) - setDefault(maxIter -> 100) /** * Set the convergence tolerance of iterations. @@ -169,7 +169,6 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S */ @Since("1.6.0") def setTol(value: Double): this.type = set(tol, value) - setDefault(tol -> 1E-6) /** * Suggested depth for treeAggregate (greater than or equal to 2). @@ -180,7 +179,6 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S */ @Since("2.1.0") def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) - setDefault(aggregationDepth -> 2) /** * Extract [[featuresCol]], [[labelCol]] and [[censorCol]] from input dataset, diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 64e5e191ffd17..354c583ecc9ae 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -104,6 +104,10 @@ private[regression] trait LinearRegressionParams extends PredictorParams @Since("2.3.0") def getEpsilon: Double = $(epsilon) + setDefault(regParam -> 0.0, fitIntercept -> true, standardization -> true, + elasticNetParam -> 0.0, maxIter -> 100, tol -> 1E-6, solver -> Auto, + aggregationDepth -> 2, loss -> SquaredError, epsilon -> 1.35) + override protected def validateAndTransformSchema( schema: StructType, fitting: Boolean, @@ -190,7 +194,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String */ @Since("1.3.0") def setRegParam(value: Double): this.type = set(regParam, value) - setDefault(regParam -> 0.0) /** * Set if we should fit the intercept. @@ -200,7 +203,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String */ @Since("1.5.0") def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) - setDefault(fitIntercept -> true) /** * Whether to standardize the training features before fitting the model. @@ -216,7 +218,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String */ @Since("1.5.0") def setStandardization(value: Boolean): this.type = set(standardization, value) - setDefault(standardization -> true) /** * Set the ElasticNet mixing parameter. @@ -232,7 +233,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String */ @Since("1.4.0") def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value) - setDefault(elasticNetParam -> 0.0) /** * Set the maximum number of iterations. @@ -242,7 +242,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String */ @Since("1.3.0") def setMaxIter(value: Int): this.type = set(maxIter, value) - setDefault(maxIter -> 100) /** * Set the convergence tolerance of iterations. 
@@ -253,7 +252,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String */ @Since("1.4.0") def setTol(value: Double): this.type = set(tol, value) - setDefault(tol -> 1E-6) /** * Whether to over-/under-sample training instances according to the given weights in weightCol. @@ -282,7 +280,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String */ @Since("1.6.0") def setSolver(value: String): this.type = set(solver, value) - setDefault(solver -> Auto) /** * Suggested depth for treeAggregate (greater than or equal to 2). @@ -294,7 +291,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String */ @Since("2.1.0") def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) - setDefault(aggregationDepth -> 2) /** * Sets the value of param [[loss]]. @@ -304,7 +300,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String */ @Since("2.3.0") def setLoss(value: String): this.type = set(loss, value) - setDefault(loss -> SquaredError) /** * Sets the value of param [[epsilon]]. @@ -314,7 +309,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String */ @Since("2.3.0") def setEpsilon(value: Double): this.type = set(epsilon, value) - setDefault(epsilon -> 1.35) override protected def train(dataset: Dataset[_]): LinearRegressionModel = instrumented { instr => // Extract the number of features before deciding optimization solver. diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index 7e2c287f146fb..19ea8ae4775d8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -333,8 +333,6 @@ private[ml] trait TreeEnsembleParams extends DecisionTreeParams { "Fraction of the training data used for learning each decision tree, in range (0, 1].", ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) - setDefault(subsamplingRate -> 1.0) - /** @group getParam */ final def getSubsamplingRate: Double = $(subsamplingRate) @@ -386,10 +384,10 @@ private[ml] trait TreeEnsembleParams extends DecisionTreeParams { || Try(value.toInt).filter(_ > 0).isSuccess || Try(value.toDouble).filter(_ > 0).filter(_ <= 1.0).isSuccess) - setDefault(featureSubsetStrategy -> "auto") - /** @group getParam */ final def getFeatureSubsetStrategy: String = $(featureSubsetStrategy).toLowerCase(Locale.ROOT) + + setDefault(subsamplingRate -> 1.0, featureSubsetStrategy -> "auto") } /** @@ -448,8 +446,6 @@ private[ml] trait RandomForestParams extends TreeEnsembleParams { new IntParam(this, "numTrees", "Number of trees to train (at least 1)", ParamValidators.gtEq(1)) - setDefault(numTrees -> 20) - /** @group getParam */ final def getNumTrees: Int = $(numTrees) @@ -461,11 +457,11 @@ private[ml] trait RandomForestParams extends TreeEnsembleParams { final val bootstrap: BooleanParam = new BooleanParam(this, "bootstrap", "Whether bootstrap samples are used when building trees.") - setDefault(bootstrap -> true) - /** @group getParam */ @Since("3.0.0") final def getBootstrap: Boolean = $(bootstrap) + + setDefault(numTrees -> 20, bootstrap -> true) } private[ml] trait RandomForestClassifierParams @@ -518,9 +514,7 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS "(a.k.a. 
learning rate) in interval (0, 1] for shrinking the contribution of each estimator.", ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) - setDefault(maxIter -> 20, stepSize -> 0.1, validationTol -> 0.01) - - setDefault(featureSubsetStrategy -> "all") + setDefault(maxIter -> 20, stepSize -> 0.1, validationTol -> 0.01, featureSubsetStrategy -> "all") /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */ private[ml] def getOldBoostingStrategy( diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala b/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala index 4d9e664850c12..dd0139b94f098 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala @@ -63,6 +63,9 @@ trait DefaultReadWriteTest extends TempDirectory { self: Suite => (instance.getOrDefault(p), newInstance.getOrDefault(p)) match { case (Array(values), Array(newValues)) => assert(values === newValues, s"Values do not match on param ${p.name}.") + case (value: Double, newValue: Double) => + assert(value.isNaN && newValue.isNaN || value == newValue, + s"Values do not match on param ${p.name}.") case (value, newValue) => assert(value === newValue, s"Values do not match on param ${p.name}.") } diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 369761e8f6fa7..c8e15ca1568cc 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -178,6 +178,11 @@ class _LinearSVCParams(_JavaClassifierParams, HasRegParam, HasMaxIter, HasFitInt " all predictions 0.0 and -Inf will make all predictions 1.0.", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_LinearSVCParams, self).__init__(*args) + self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, fitIntercept=True, + standardization=True, threshold=0.0, aggregationDepth=2) + @inherit_doc class LinearSVC(JavaClassifier, _LinearSVCParams, JavaMLWritable, JavaMLReadable): @@ -244,6 +249,8 @@ class LinearSVC(JavaClassifier, _LinearSVCParams, JavaMLWritable, JavaMLReadable True >>> model.intercept == model2.intercept True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 2.2.0 """ @@ -262,8 +269,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(LinearSVC, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.LinearSVC", self.uid) - self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, fitIntercept=True, - standardization=True, threshold=0.0, aggregationDepth=2) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -427,6 +432,10 @@ class _LogisticRegressionParams(_JavaProbabilisticClassifierParams, HasRegParam, "classes for multinomial regression.", typeConverter=TypeConverters.toVector) + def __init__(self, *args): + super(_LogisticRegressionParams, self).__init__(*args) + self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5, family="auto") + @since("1.4.0") def setThreshold(self, value): """ @@ -616,6 +625,8 @@ class LogisticRegression(JavaProbabilisticClassifier, _LogisticRegressionParams, True >>> model2 LogisticRegressionModel: uid=..., numClasses=2, numFeatures=2 + >>> blorModel.transform(test0).take(1) == model2.transform(test0).take(1) + True .. 
versionadded:: 1.3.0 """ @@ -642,7 +653,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(LogisticRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.LogisticRegression", self.uid) - self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5, family="auto") kwargs = self._input_kwargs self.setParams(**kwargs) self._checkThresholdConsistency() @@ -1130,7 +1140,12 @@ class _DecisionTreeClassifierParams(_DecisionTreeParams, _TreeClassifierParams): """ Params for :py:class:`DecisionTreeClassifier` and :py:class:`DecisionTreeClassificationModel`. """ - pass + + def __init__(self, *args): + super(_DecisionTreeClassifierParams, self).__init__(*args) + self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, + maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, + impurity="gini", leafCol="", minWeightFractionPerNode=0.0) @inherit_doc @@ -1197,7 +1212,8 @@ class DecisionTreeClassifier(JavaProbabilisticClassifier, _DecisionTreeClassifie >>> model2 = DecisionTreeClassificationModel.load(model_path) >>> model.featureImportances == model2.featureImportances True - + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> df3 = spark.createDataFrame([ ... (1.0, 0.2, Vectors.dense(1.0)), ... (1.0, 0.8, Vectors.dense(1.0)), @@ -1229,9 +1245,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(DecisionTreeClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.DecisionTreeClassifier", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="gini", leafCol="", minWeightFractionPerNode=0.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1365,7 +1378,14 @@ class _RandomForestClassifierParams(_RandomForestParams, _TreeClassifierParams): """ Params for :py:class:`RandomForestClassifier` and :py:class:`RandomForestClassificationModel`. """ - pass + + def __init__(self, *args): + super(_RandomForestClassifierParams, self).__init__(*args) + self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, + maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, + impurity="gini", numTrees=20, featureSubsetStrategy="auto", + subsamplingRate=1.0, leafCol="", minWeightFractionPerNode=0.0, + bootstrap=True) @inherit_doc @@ -1437,6 +1457,8 @@ class RandomForestClassifier(JavaProbabilisticClassifier, _RandomForestClassifie >>> model2 = RandomForestClassificationModel.load(model_path) >>> model.featureImportances == model2.featureImportances True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. 
versionadded:: 1.4.0 """ @@ -1459,11 +1481,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(RandomForestClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.RandomForestClassifier", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="gini", numTrees=20, featureSubsetStrategy="auto", - subsamplingRate=1.0, leafCol="", minWeightFractionPerNode=0.0, - bootstrap=True) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1633,6 +1650,14 @@ class _GBTClassifierParams(_GBTParams, _HasVarianceImpurity): "Supported options: " + ", ".join(supportedLossTypes), typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_GBTClassifierParams, self).__init__(*args) + self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, + maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, + lossType="logistic", maxIter=20, stepSize=0.1, subsamplingRate=1.0, + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, + leafCol="", minWeightFractionPerNode=0.0) + @since("1.4.0") def getLossType(self): """ @@ -1724,6 +1749,8 @@ class GBTClassifier(JavaProbabilisticClassifier, _GBTClassifierParams, True >>> model.treeWeights == model2.treeWeights True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> model.trees [DecisionTreeRegressionModel...depth=..., DecisionTreeRegressionModel...] >>> validation = spark.createDataFrame([(0.0, Vectors.dense(-1.0),)], @@ -1760,11 +1787,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(GBTClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.GBTClassifier", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1, subsamplingRate=1.0, - impurity="variance", featureSubsetStrategy="all", validationTol=0.01, - leafCol="", minWeightFractionPerNode=0.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1962,6 +1984,10 @@ class _NaiveBayesParams(_JavaPredictorParams, HasWeightCol): "and gaussian.", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_NaiveBayesParams, self).__init__(*args) + self._setDefault(smoothing=1.0, modelType="multinomial") + @since("1.5.0") def getSmoothing(self): """ @@ -2045,6 +2071,8 @@ class NaiveBayes(JavaProbabilisticClassifier, _NaiveBayesParams, HasThresholds, True >>> model.theta == model2.theta True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> nb = nb.setThresholds([0.01, 10.00]) >>> model3 = nb.fit(df) >>> result = model3.transform(test0).head() @@ -2080,7 +2108,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(NaiveBayes, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.NaiveBayes", self.uid) - self._setDefault(smoothing=1.0, modelType="multinomial") kwargs = self._input_kwargs self.setParams(**kwargs) @@ -2172,8 +2199,8 @@ class _MultilayerPerceptronParams(_JavaProbabilisticClassifierParams, HasSeed, H initialWeights = Param(Params._dummy(), "initialWeights", "The initial weights of the model.", typeConverter=TypeConverters.toVector) - def __init__(self): - 
super(_MultilayerPerceptronParams, self).__init__() + def __init__(self, *args): + super(_MultilayerPerceptronParams, self).__init__(*args) self._setDefault(maxIter=100, tol=1E-6, blockSize=128, stepSize=0.03, solver="l-bfgs") @since("1.6.0") @@ -2255,6 +2282,8 @@ class MultilayerPerceptronClassifier(JavaProbabilisticClassifier, _MultilayerPer True >>> model.weights == model2.weights True + >>> model.transform(testDF).take(1) == model2.transform(testDF).take(1) + True >>> mlp2 = mlp2.setInitialWeights(list(range(0, 12))) >>> model3 = mlp2.fit(df) >>> model3.weights != model2.weights @@ -2429,6 +2458,8 @@ class OneVsRest(Estimator, _OneVsRestParams, HasParallelism, JavaMLReadable, Jav >>> model2 = OneVsRestModel.load(model_path) >>> model2.transform(test0).head().newPrediction 0.0 + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> model.transform(test2).columns ['features', 'rawPrediction', 'newPrediction'] @@ -2854,6 +2885,17 @@ class FMClassifier(JavaProbabilisticClassifier, _FactorizationMachinesParams, Ja DenseVector([14.8232]) >>> model.factors DenseMatrix(1, 2, [0.0163, -0.0051], 1) + >>> model_path = temp_path + "/fm_model" + >>> model.save(model_path) + >>> model2 = FMClassificationModel.load(model_path) + >>> model2.intercept + -7.316665276826291 + >>> model2.linear + DenseVector([14.8232]) + >>> model2.factors + DenseMatrix(1, 2, [0.0163, -0.0051], 1) + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 3.0.0 """ diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index e1c2732af31b9..2060bbc625a02 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -108,6 +108,10 @@ class _GaussianMixtureParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionC k = Param(Params._dummy(), "k", "Number of independent Gaussians in the mixture model. " + "Must be > 1.", typeConverter=TypeConverters.toInt) + def __init__(self, *args): + super(_GaussianMixtureParams, self).__init__(*args) + self._setDefault(k=2, tol=0.01, maxIter=100, aggregationDepth=2) + @since("2.0.0") def getK(self): """ @@ -321,6 +325,8 @@ class GaussianMixture(JavaEstimator, _GaussianMixtureParams, JavaMLWritable, Jav Row(mean=DenseVector([0.825, 0.8675])) >>> model2.gaussiansDF.select("cov").head() Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False)) + >>> model.transform(df).take(1) == model2.transform(df).take(1) + True >>> gm2.setWeightCol("weight") GaussianMixture... @@ -339,7 +345,6 @@ def __init__(self, featuresCol="features", predictionCol="prediction", k=2, super(GaussianMixture, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.GaussianMixture", self.uid) - self._setDefault(k=2, tol=0.01, maxIter=100, aggregationDepth=2) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -493,6 +498,11 @@ class _KMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, HasTo initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " + "initialization mode. 
Must be > 0.", typeConverter=TypeConverters.toInt) + def __init__(self, *args): + super(_KMeansParams, self).__init__(*args) + self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, + distanceMeasure="euclidean") + @since("1.5.0") def getK(self): """ @@ -622,6 +632,8 @@ class KMeans(JavaEstimator, _KMeansParams, JavaMLWritable, JavaMLReadable): array([ True, True], dtype=bool) >>> model.clusterCenters()[1] == model2.clusterCenters()[1] array([ True, True], dtype=bool) + >>> model.transform(df).take(1) == model2.transform(df).take(1) + True .. versionadded:: 1.5.0 """ @@ -637,8 +649,6 @@ def __init__(self, featuresCol="features", predictionCol="prediction", k=2, """ super(KMeans, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid) - self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, - distanceMeasure="euclidean") kwargs = self._input_kwargs self.setParams(**kwargs) @@ -747,6 +757,10 @@ class _BisectingKMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionC "proportion of points (if < 1.0) of a divisible cluster.", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_BisectingKMeansParams, self).__init__(*args) + self._setDefault(maxIter=20, k=4, minDivisibleClusterSize=1.0) + @since("2.0.0") def getK(self): """ @@ -897,6 +911,8 @@ class BisectingKMeans(JavaEstimator, _BisectingKMeansParams, JavaMLWritable, Jav array([ True, True], dtype=bool) >>> model.clusterCenters()[1] == model2.clusterCenters()[1] array([ True, True], dtype=bool) + >>> model.transform(df).take(1) == model2.transform(df).take(1) + True .. versionadded:: 2.0.0 """ @@ -913,7 +929,6 @@ def __init__(self, featuresCol="features", predictionCol="prediction", maxIter=2 super(BisectingKMeans, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.BisectingKMeans", self.uid) - self._setDefault(maxIter=20, k=4, minDivisibleClusterSize=1.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1056,6 +1071,13 @@ class _LDAParams(HasMaxIter, HasFeaturesCol, HasSeed, HasCheckpointInterval): " partition is lost, so set this bit with care.", TypeConverters.toBoolean) + def __init__(self, *args): + super(_LDAParams, self).__init__(*args) + self._setDefault(maxIter=20, checkpointInterval=10, + k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51, + subsamplingRate=0.05, optimizeDocConcentration=True, + topicDistributionCol="topicDistribution", keepLastCheckpoint=True) + @since("2.0.0") def getK(self): """ @@ -1366,6 +1388,8 @@ class LDA(JavaEstimator, _LDAParams, JavaMLReadable, JavaMLWritable): >>> local_model_path = temp_path + "/lda_local_model" >>> localModel.save(local_model_path) >>> sameLocalModel = LocalLDAModel.load(local_model_path) + >>> model.transform(df).take(1) == sameLocalModel.transform(df).take(1) + True .. 
versionadded:: 2.0.0 """ @@ -1385,10 +1409,6 @@ def __init__(self, featuresCol="features", maxIter=20, seed=None, checkpointInte """ super(LDA, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.LDA", self.uid) - self._setDefault(maxIter=20, checkpointInterval=10, - k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51, - subsamplingRate=0.05, optimizeDocConcentration=True, - topicDistributionCol="topicDistribution", keepLastCheckpoint=True) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1581,6 +1601,10 @@ class _PowerIterationClusteringParams(HasMaxIter, HasWeightCol): "Name of the input column for destination vertex IDs.", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_PowerIterationClusteringParams, self).__init__(*args) + self._setDefault(k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst") + @since("2.4.0") def getK(self): """ @@ -1654,6 +1678,8 @@ class PowerIterationClustering(_PowerIterationClusteringParams, JavaParams, Java 2 >>> pic2.getMaxIter() 40 + >>> pic2.assignClusters(df).take(6) == assignments.take(6) + True .. versionadded:: 2.4.0 """ @@ -1668,7 +1694,6 @@ def __init__(self, k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst super(PowerIterationClustering, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.clustering.PowerIterationClustering", self.uid) - self._setDefault(k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst") kwargs = self._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 6df2f74bcfc9d..4f42bb337fe48 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -94,6 +94,8 @@ class Binarizer(JavaTransformer, HasThreshold, HasThresholds, HasInputCol, HasOu >>> loadedBinarizer = Binarizer.load(binarizerPath) >>> loadedBinarizer.getThreshold() == binarizer.getThreshold() True + >>> loadedBinarizer.transform(df).take(1) == binarizer.transform(df).take(1) + True >>> df2 = spark.createDataFrame([(0.5, 0.3)], ["values1", "values2"]) >>> binarizer2 = Binarizer(thresholds=[0.0, 1.0]) >>> binarizer2.setInputCols(["values1", "values2"]).setOutputCols(["output1", "output2"]) @@ -197,6 +199,10 @@ class _LSHParams(HasInputCol, HasOutputCol): "and decreasing it improves the running performance.", typeConverter=TypeConverters.toInt) + def __init__(self, *args): + super(_LSHParams, self).__init__(*args) + self._setDefault(numHashTables=1) + def getNumHashTables(self): """ Gets the value of numHashTables or its default value. 
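(Illustrative aside, not part of the patch: a minimal sketch of what the relocated `numHashTables` default looks like from user code, assuming a local SparkSession is available. With the default declared in the shared `_LSHParams.__init__`, both LSH estimators pick it up without setting it in their own constructors.)

# Hypothetical usage sketch; classes and getters are the pyspark.ml.feature ones shown in the hunks above.
from pyspark.sql import SparkSession
from pyspark.ml.feature import BucketedRandomProjectionLSH, MinHashLSH

spark = SparkSession.builder.master("local[1]").getOrCreate()
brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", bucketLength=1.0)
mh = MinHashLSH(inputCol="features", outputCol="hashes")
# Both report the mixin default, 1, even though neither constructor calls _setDefault(numHashTables=1) anymore.
print(brp.getNumHashTables(), mh.getNumHashTables())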
@@ -392,7 +398,6 @@ def __init__(self, inputCol=None, outputCol=None, seed=None, numHashTables=1, super(BucketedRandomProjectionLSH, self).__init__() self._java_obj = \ self._new_java_obj("org.apache.spark.ml.feature.BucketedRandomProjectionLSH", self.uid) - self._setDefault(numHashTables=1) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -479,6 +484,8 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu >>> loadedBucketizer = Bucketizer.load(bucketizerPath) >>> loadedBucketizer.getSplits() == bucketizer.getSplits() True + >>> loadedBucketizer.transform(df).take(1) == bucketizer.transform(df).take(1) + True >>> bucketed = bucketizer.setHandleInvalid("skip").transform(df).collect() >>> len(bucketed) 4 @@ -735,6 +742,8 @@ class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, Jav >>> loadedModel = CountVectorizerModel.load(modelPath) >>> loadedModel.vocabulary == model.vocabulary True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True >>> fromVocabModel = CountVectorizerModel.from_vocabulary(["a", "b", "c"], ... inputCol="raw", outputCol="vectors") >>> fromVocabModel.transform(df).show(truncate=False) @@ -922,6 +931,8 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit >>> dctPath = temp_path + "/dct" >>> dct.save(dctPath) >>> loadedDtc = DCT.load(dctPath) + >>> loadedDtc.transform(df1).take(1) == dct.transform(df1).take(1) + True >>> loadedDtc.getInverse() False @@ -1005,6 +1016,8 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada >>> loadedEp = ElementwiseProduct.load(elementwiseProductPath) >>> loadedEp.getScalingVec() == ep.getScalingVec() True + >>> loadedEp.transform(df).take(1) == ep.transform(df).take(1) + True .. versionadded:: 1.5.0 """ @@ -1203,6 +1216,8 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java >>> loadedHashingTF = HashingTF.load(hashingTFPath) >>> loadedHashingTF.getNumFeatures() == hashingTF.getNumFeatures() True + >>> loadedHashingTF.transform(df).take(1) == hashingTF.transform(df).take(1) + True >>> hashingTF.indexOf("b") 5 @@ -1287,6 +1302,10 @@ class _IDFParams(HasInputCol, HasOutputCol): "minimum number of documents in which a term should appear for filtering", typeConverter=TypeConverters.toInt) + def __init__(self, *args): + super(_IDFParams, self).__init__(*args) + self._setDefault(minDocFreq=0) + @since("1.4.0") def getMinDocFreq(self): """ @@ -1347,7 +1366,6 @@ def __init__(self, minDocFreq=0, inputCol=None, outputCol=None): """ super(IDF, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid) - self._setDefault(minDocFreq=0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1447,6 +1465,10 @@ class _ImputerParams(HasInputCol, HasInputCols, HasOutputCol, HasOutputCols, Has "The placeholder for the missing values. 
All occurrences of missingValue " "will be imputed.", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_ImputerParams, self).__init__(*args) + self._setDefault(strategy="mean", missingValue=float("nan"), relativeError=0.001) + @since("2.2.0") def getStrategy(self): """ @@ -1584,7 +1606,6 @@ def __init__(self, strategy="mean", missingValue=float("nan"), inputCols=None, """ super(Imputer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Imputer", self.uid) - self._setDefault(strategy="mean", missingValue=float("nan"), relativeError=0.001) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1813,6 +1834,8 @@ class MaxAbsScaler(JavaEstimator, _MaxAbsScalerParams, JavaMLReadable, JavaMLWri >>> loadedModel = MaxAbsScalerModel.load(modelPath) >>> loadedModel.maxAbs == model.maxAbs True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 2.0.0 """ @@ -1953,7 +1976,6 @@ def __init__(self, inputCol=None, outputCol=None, seed=None, numHashTables=1): """ super(MinHashLSH, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinHashLSH", self.uid) - self._setDefault(numHashTables=1) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -2004,6 +2026,10 @@ class _MinMaxScalerParams(HasInputCol, HasOutputCol): max = Param(Params._dummy(), "max", "Upper bound of the output feature range", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_MinMaxScalerParams, self).__init__(*args) + self._setDefault(min=0.0, max=1.0) + @since("1.6.0") def getMin(self): """ @@ -2067,6 +2093,8 @@ class MinMaxScaler(JavaEstimator, _MinMaxScalerParams, JavaMLReadable, JavaMLWri True >>> loadedModel.originalMax == model.originalMax True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 1.6.0 """ @@ -2078,7 +2106,6 @@ def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None): """ super(MinMaxScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid) - self._setDefault(min=0.0, max=1.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -2212,6 +2239,8 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWr >>> loadedNGram = NGram.load(ngramPath) >>> loadedNGram.getN() == ngram.getN() True + >>> loadedNGram.transform(df).take(1) == ngram.transform(df).take(1) + True .. versionadded:: 1.5.0 """ @@ -2292,6 +2321,8 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav >>> loadedNormalizer = Normalizer.load(normalizerPath) >>> loadedNormalizer.getP() == normalizer.getP() True + >>> loadedNormalizer.transform(df).take(1) == normalizer.transform(df).take(1) + True .. versionadded:: 1.4.0 """ @@ -2365,6 +2396,10 @@ class _OneHotEncoderParams(HasInputCol, HasInputCols, HasOutputCol, HasOutputCol dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category", typeConverter=TypeConverters.toBoolean) + def __init__(self, *args): + super(_OneHotEncoderParams, self).__init__(*args) + self._setDefault(handleInvalid="error", dropLast=True) + @since("2.3.0") def getDropLast(self): """ @@ -2425,6 +2460,8 @@ class OneHotEncoder(JavaEstimator, _OneHotEncoderParams, JavaMLReadable, JavaMLW >>> loadedModel = OneHotEncoderModel.load(modelPath) >>> loadedModel.categorySizes == model.categorySizes True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. 
versionadded:: 2.3.0 """ @@ -2439,7 +2476,6 @@ def __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropL super(OneHotEncoder, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.feature.OneHotEncoder", self.uid) - self._setDefault(handleInvalid="error", dropLast=True) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -2586,6 +2622,8 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead >>> loadedPx = PolynomialExpansion.load(polyExpansionPath) >>> loadedPx.getDegree() == px.getDegree() True + >>> loadedPx.transform(df).take(1) == px.transform(df).take(1) + True .. versionadded:: 1.4.0 """ @@ -2882,6 +2920,11 @@ class _RobustScalerParams(HasInputCol, HasOutputCol, HasRelativeError): withScaling = Param(Params._dummy(), "withScaling", "Whether to scale the data to " "quantile range", typeConverter=TypeConverters.toBoolean) + def __init__(self, *args): + super(_RobustScalerParams, self).__init__(*args) + self._setDefault(lower=0.25, upper=0.75, withCentering=False, withScaling=True, + relativeError=0.001) + @since("3.0.0") def getLower(self): """ @@ -2957,6 +3000,8 @@ class RobustScaler(JavaEstimator, _RobustScalerParams, JavaMLReadable, JavaMLWri True >>> loadedModel.range == model.range True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 3.0.0 """ @@ -2970,8 +3015,6 @@ def __init__(self, lower=0.25, upper=0.75, withCentering=False, withScaling=True """ super(RobustScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RobustScaler", self.uid) - self._setDefault(lower=0.25, upper=0.75, withCentering=False, withScaling=True, - relativeError=0.001) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -3117,6 +3160,8 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, True >>> loadedReTokenizer.getGaps() == reTokenizer.getGaps() True + >>> loadedReTokenizer.transform(df).take(1) == reTokenizer.transform(df).take(1) + True .. versionadded:: 1.4.0 """ @@ -3241,6 +3286,8 @@ class SQLTransformer(JavaTransformer, JavaMLReadable, JavaMLWritable): >>> loadedSqlTrans = SQLTransformer.load(sqlTransformerPath) >>> loadedSqlTrans.getStatement() == sqlTrans.getStatement() True + >>> loadedSqlTrans.transform(df).take(1) == sqlTrans.transform(df).take(1) + True .. versionadded:: 1.6.0 """ @@ -3295,6 +3342,10 @@ class _StandardScalerParams(HasInputCol, HasOutputCol): withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation", typeConverter=TypeConverters.toBoolean) + def __init__(self, *args): + super(_StandardScalerParams, self).__init__(*args) + self._setDefault(withMean=False, withStd=True) + @since("1.4.0") def getWithMean(self): """ @@ -3352,6 +3403,8 @@ class StandardScaler(JavaEstimator, _StandardScalerParams, JavaMLReadable, JavaM True >>> loadedModel.mean == model.mean True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. 
versionadded:: 1.4.0 """ @@ -3363,7 +3416,6 @@ def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None): """ super(StandardScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid) - self._setDefault(withMean=False, withStd=True) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -3512,6 +3564,8 @@ class StringIndexer(JavaEstimator, _StringIndexerParams, JavaMLReadable, JavaMLW >>> loadedModel = StringIndexerModel.load(modelPath) >>> loadedModel.labels == model.labels True + >>> loadedModel.transform(stringIndDf).take(1) == model.transform(stringIndDf).take(1) + True >>> indexToStringPath = temp_path + "/index-to-string" >>> inverter.save(indexToStringPath) >>> loadedInverter = IndexToString.load(indexToStringPath) @@ -3802,6 +3856,8 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, True >>> loadedRemover.getCaseSensitive() == remover.getCaseSensitive() True + >>> loadedRemover.transform(df).take(1) == remover.transform(df).take(1) + True >>> df2 = spark.createDataFrame([(["a", "b", "c"], ["a", "b"])], ["text1", "text2"]) >>> remover2 = StopWordsRemover(stopWords=["b"]) >>> remover2.setInputCols(["text1", "text2"]).setOutputCols(["words1", "words2"]) @@ -4114,6 +4170,10 @@ class _VectorIndexerParams(HasInputCol, HasOutputCol, HasHandleInvalid): "of categories of the feature).", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_VectorIndexerParams, self).__init__(*args) + self._setDefault(maxCategories=20, handleInvalid="error") + @since("1.4.0") def getMaxCategories(self): """ @@ -4194,6 +4254,8 @@ class VectorIndexer(JavaEstimator, _VectorIndexerParams, JavaMLReadable, JavaMLW True >>> loadedModel.categoryMaps == model.categoryMaps True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True >>> dfWithInvalid = spark.createDataFrame([(Vectors.dense([3.0, 1.0]),)], ["a"]) >>> indexer.getHandleInvalid() 'error' @@ -4214,7 +4276,6 @@ def __init__(self, maxCategories=20, inputCol=None, outputCol=None, handleInvali """ super(VectorIndexer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid) - self._setDefault(maxCategories=20, handleInvalid="error") kwargs = self._input_kwargs self.setParams(**kwargs) @@ -4337,6 +4398,8 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, J True >>> loadedVs.getNames() == vs.getNames() True + >>> loadedVs.transform(df).take(1) == vs.transform(df).take(1) + True .. versionadded:: 1.6.0 """ @@ -4436,6 +4499,11 @@ class _Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCo "be divided into chunks up to the size.", typeConverter=TypeConverters.toInt) + def __init__(self, *args): + super(_Word2VecParams, self).__init__(*args) + self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, + windowSize=5, maxSentenceLength=1000) + @since("1.4.0") def getVectorSize(self): """ @@ -4530,6 +4598,8 @@ class Word2Vec(JavaEstimator, _Word2VecParams, JavaMLReadable, JavaMLWritable): True >>> loadedModel.getVectors().first().vector == model.getVectors().first().vector True + >>> loadedModel.transform(doc).take(1) == model.transform(doc).take(1) + True .. 
versionadded:: 1.4.0 """ @@ -4543,8 +4613,6 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, """ super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) - self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - windowSize=5, maxSentenceLength=1000) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -4736,6 +4804,8 @@ class PCA(JavaEstimator, _PCAParams, JavaMLReadable, JavaMLWritable): True >>> loadedModel.explainedVariance == model.explainedVariance True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 1.5.0 """ @@ -4852,6 +4922,11 @@ class _RFormulaParams(HasFeaturesCol, HasLabelCol, HasHandleInvalid): "additional bucket, at index numLabels).", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_RFormulaParams, self).__init__(*args) + self._setDefault(forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", + handleInvalid="error") + @since("1.5.0") def getFormula(self): """ @@ -4954,8 +5029,6 @@ def __init__(self, formula=None, featuresCol="features", labelCol="label", """ super(RFormula, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid) - self._setDefault(forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", - handleInvalid="error") kwargs = self._input_kwargs self.setParams(**kwargs) @@ -5064,6 +5137,11 @@ class _ChiSqSelectorParams(HasFeaturesCol, HasOutputCol, HasLabelCol): fwe = Param(Params._dummy(), "fwe", "The upper bound of the expected family-wise error rate.", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_ChiSqSelectorParams, self).__init__(*args) + self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, + fpr=0.05, fdr=0.05, fwe=0.05) + @since("2.1.0") def getSelectorType(self): """ @@ -5160,6 +5238,8 @@ class ChiSqSelector(JavaEstimator, _ChiSqSelectorParams, JavaMLReadable, JavaMLW >>> loadedModel = ChiSqSelectorModel.load(modelPath) >>> loadedModel.selectedFeatures == model.selectedFeatures True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 2.0.0 """ @@ -5175,8 +5255,6 @@ def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, """ super(ChiSqSelector, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid) - self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, - fpr=0.05, fdr=0.05, fwe=0.05) kwargs = self._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/fpm.py b/python/pyspark/ml/fpm.py index 7d933daf9e032..0e302d53d97e7 100644 --- a/python/pyspark/ml/fpm.py +++ b/python/pyspark/ml/fpm.py @@ -56,6 +56,11 @@ class _FPGrowthParams(HasPredictionCol): "but will affect the association rules generation.", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_FPGrowthParams, self).__init__(*args) + self._setDefault(minSupport=0.3, minConfidence=0.8, + itemsCol="items", predictionCol="prediction") + def getItemsCol(self): """ Gets the value of itemsCol or its default value. 
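(Illustrative aside, not part of the patch: a sketch of the intended effect for FPGrowth, assuming a local SparkSession and that the fitted model shares the `_FPGrowthParams` mixin. Declaring the defaults in `_FPGrowthParams.__init__` means estimator and model report the same default values.)

from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth

spark = SparkSession.builder.master("local[1]").getOrCreate()
data = spark.createDataFrame([(["a", "b"],), (["a", "c"],)], ["items"])
fp = FPGrowth()
print(fp.getMinSupport(), fp.getMinConfidence())  # 0.3 0.8, defaulted in _FPGrowthParams
model = fp.fit(data)
print(model.getMinSupport())  # the model sees the same default through the shared params mixin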
@@ -194,6 +199,11 @@ class FPGrowth(JavaEstimator, _FPGrowthParams, JavaMLWritable, JavaMLReadable): >>> new_data = spark.createDataFrame([(["t", "s"], )], ["items"]) >>> sorted(fpm.transform(new_data).first().newPrediction) [u'x', u'y', u'z'] + >>> model_path = temp_path + "/fpm_model" + >>> fpm.save(model_path) + >>> model2 = FPGrowthModel.load(model_path) + >>> fpm.transform(data).take(1) == model2.transform(data).take(1) + True .. versionadded:: 2.2.0 """ @@ -206,8 +216,6 @@ def __init__(self, minSupport=0.3, minConfidence=0.8, itemsCol="items", """ super(FPGrowth, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.fpm.FPGrowth", self.uid) - self._setDefault(minSupport=0.3, minConfidence=0.8, - itemsCol="items", predictionCol="prediction") kwargs = self._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index 99d80aa867bda..62b856046234a 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -46,6 +46,10 @@ class _ALSModelParams(HasPredictionCol, HasBlockSize): "Supported values: 'nan', 'drop'.", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_ALSModelParams, self).__init__(*args) + self._setDefault(blockSize=4096) + @since("1.4.0") def getUserCol(self): """ @@ -99,6 +103,14 @@ class _ALSParams(_ALSModelParams, HasMaxIter, HasRegParam, HasCheckpointInterval "StorageLevel for ALS model factors.", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_ALSParams, self).__init__(*args) + self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, + implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", + ratingCol="rating", nonnegative=False, checkpointInterval=10, + intermediateStorageLevel="MEMORY_AND_DISK", + finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan") + @since("1.4.0") def getRank(self): """ @@ -275,6 +287,8 @@ class ALS(JavaEstimator, _ALSParams, JavaMLWritable, JavaMLReadable): True >>> sorted(model.itemFactors.collect()) == sorted(model2.itemFactors.collect()) True + >>> model.transform(test).take(1) == model2.transform(test).take(1) + True .. versionadded:: 1.4.0 """ @@ -294,12 +308,6 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB """ super(ALS, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.recommendation.ALS", self.uid) - self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, - implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", - ratingCol="rating", nonnegative=False, checkpointInterval=10, - intermediateStorageLevel="MEMORY_AND_DISK", - finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan", - blockSize=4096) kwargs = self._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 9c3c1e6ca01ae..550efb27c1e97 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -79,6 +79,10 @@ class _LinearRegressionParams(_JavaPredictorParams, HasRegParam, HasElasticNetPa "robustness. Must be > 1.0. 
Only valid when loss is huber", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_LinearRegressionParams, self).__init__(*args) + self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, loss="squaredError", epsilon=1.35) + @since("2.3.0") def getEpsilon(self): """ @@ -158,6 +162,8 @@ class LinearRegression(JavaRegressor, _LinearRegressionParams, JavaMLWritable, J True >>> model.intercept == model2.intercept True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> model.numFeatures 1 >>> model.write().format("pmml").save(model_path + "_2") @@ -179,7 +185,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(LinearRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.LinearRegression", self.uid) - self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, loss="squaredError", epsilon=1.35) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -584,6 +589,10 @@ class _IsotonicRegressionParams(HasFeaturesCol, HasLabelCol, HasPredictionCol, H "The index of the feature if featuresCol is a vector column, no effect otherwise.", typeConverter=TypeConverters.toInt) + def __init__(self, *args): + super(_IsotonicRegressionParams, self).__init__(*args) + self._setDefault(isotonic=True, featureIndex=0) + def getIsotonic(self): """ Gets the value of isotonic or its default value. @@ -633,6 +642,8 @@ class IsotonicRegression(JavaEstimator, _IsotonicRegressionParams, HasWeightCol, True >>> model.predictions == model2.predictions True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 1.6.0 """ @@ -646,7 +657,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(IsotonicRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.IsotonicRegression", self.uid) - self._setDefault(isotonic=True, featureIndex=0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -773,7 +783,11 @@ class _DecisionTreeRegressorParams(_DecisionTreeParams, _TreeRegressorParams, Ha .. versionadded:: 3.0.0 """ - pass + def __init__(self, *args): + super(_DecisionTreeRegressorParams, self).__init__(*args) + self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, + maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, + impurity="variance", leafCol="", minWeightFractionPerNode=0.0) @inherit_doc @@ -829,6 +843,8 @@ class DecisionTreeRegressor(JavaRegressor, _DecisionTreeRegressorParams, JavaMLW True >>> model.depth == model2.depth True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> model.transform(test1).head().variance 0.0 @@ -860,9 +876,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(DecisionTreeRegressor, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.DecisionTreeRegressor", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance", leafCol="", minWeightFractionPerNode=0.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1017,7 +1030,14 @@ class _RandomForestRegressorParams(_RandomForestParams, _TreeRegressorParams): .. 
versionadded:: 3.0.0 """ - pass + + def __init__(self, *args): + super(_RandomForestRegressorParams, self).__init__(*args) + self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, + maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, + impurity="variance", subsamplingRate=1.0, numTrees=20, + featureSubsetStrategy="auto", leafCol="", minWeightFractionPerNode=0.0, + bootstrap=True) @inherit_doc @@ -1078,6 +1098,8 @@ class RandomForestRegressor(JavaRegressor, _RandomForestRegressorParams, JavaMLW >>> model2 = RandomForestRegressionModel.load(model_path) >>> model.featureImportances == model2.featureImportances True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 1.4.0 """ @@ -1100,11 +1122,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(RandomForestRegressor, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.RandomForestRegressor", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance", subsamplingRate=1.0, numTrees=20, - featureSubsetStrategy="auto", leafCol="", minWeightFractionPerNode=0.0, - bootstrap=True) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1275,6 +1292,14 @@ class _GBTRegressorParams(_GBTParams, _TreeRegressorParams): "Supported options: " + ", ".join(supportedLossTypes), typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_GBTRegressorParams, self).__init__(*args) + self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, + maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, + leafCol="", minWeightFractionPerNode=0.0) + @since("1.4.0") def getLossType(self): """ @@ -1338,6 +1363,8 @@ class GBTRegressor(JavaRegressor, _GBTRegressorParams, JavaMLWritable, JavaMLRea True >>> model.treeWeights == model2.treeWeights True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> model.trees [DecisionTreeRegressionModel...depth=..., DecisionTreeRegressionModel...] 
>>> validation = spark.createDataFrame([(0.0, Vectors.dense(-1.0))], @@ -1372,11 +1399,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred """ super(GBTRegressor, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, - impurity="variance", featureSubsetStrategy="all", validationTol=0.01, - leafCol="", minWeightFractionPerNode=0.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1595,6 +1617,12 @@ class _AFTSurvivalRegressionParams(_JavaPredictorParams, HasMaxIter, HasTol, Has "corresponding quantileProbabilities if it is set.", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_AFTSurvivalRegressionParams, self).__init__(*args) + self._setDefault(censorCol="censor", + quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], + maxIter=100, tol=1E-6) + @since("1.6.0") def getCensorCol(self): """ @@ -1667,6 +1695,8 @@ class AFTSurvivalRegression(JavaRegressor, _AFTSurvivalRegressionParams, True >>> model.scale == model2.scale True + >>> model.transform(df).take(1) == model2.transform(df).take(1) + True .. versionadded:: 1.6.0 """ @@ -1685,9 +1715,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(AFTSurvivalRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.AFTSurvivalRegression", self.uid) - self._setDefault(censorCol="censor", - quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], - maxIter=100, tol=1E-6) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1847,8 +1874,8 @@ class _GeneralizedLinearRegressionParams(_JavaPredictorParams, HasFitIntercept, "or empty, we treat all instance offsets as 0.0", typeConverter=TypeConverters.toString) - def __init__(self): - super(_GeneralizedLinearRegressionParams, self).__init__() + def __init__(self, *args): + super(_GeneralizedLinearRegressionParams, self).__init__(*args) self._setDefault(family="gaussian", maxIter=25, tol=1e-6, regParam=0.0, solver="irls", variancePower=0.0, aggregationDepth=2) @@ -1966,6 +1993,8 @@ class GeneralizedLinearRegression(JavaRegressor, _GeneralizedLinearRegressionPar True >>> model.coefficients[0] == model2.coefficients[0] True + >>> model.transform(df).take(1) == model2.transform(df).take(1) + True .. versionadded:: 2.0.0 """ @@ -2357,8 +2386,8 @@ class _FactorizationMachinesParams(_JavaPredictorParams, HasMaxIter, HasStepSize solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + "options: gd, adamW. 
(Default adamW)", typeConverter=TypeConverters.toString) - def __init__(self): - super(_FactorizationMachinesParams, self).__init__() + def __init__(self, *args): + super(_FactorizationMachinesParams, self).__init__(*args) self._setDefault(factorSize=8, fitIntercept=True, fitLinear=True, regParam=0.0, miniBatchFraction=1.0, initStd=0.01, maxIter=100, stepSize=1.0, tol=1e-6, solver="adamW") @@ -2436,6 +2465,17 @@ class FMRegressor(JavaRegressor, _FactorizationMachinesParams, JavaMLWritable, J DenseVector([0.9978]) >>> model.factors DenseMatrix(1, 2, [0.0173, 0.0021], 1) + >>> model_path = temp_path + "/fm_model" + >>> model.save(model_path) + >>> model2 = FMRegressionModel.load(model_path) + >>> model2.intercept + -0.0032501766849261557 + >>> model2.linear + DenseVector([0.9978]) + >>> model2.factors + DenseMatrix(1, 2, [0.0173, 0.0021], 1) + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 3.0.0 """ diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index 777b4930ce8c9..2b119ea866236 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -366,15 +366,14 @@ def test_java_params(self): if not name.endswith('Model') and not name.endswith('Params') \ and issubclass(cls, JavaParams) and not inspect.isabstract(cls) \ and not name.startswith('Java') and name != '_LSH': - # NOTE: disable check_params_exist until there is parity with Scala API - check_params(self, cls(), check_params_exist=False) + check_params(self, cls(), check_params_exist=True) # Additional classes that need explicit construction from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'), - check_params_exist=False) + check_params_exist=True) check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'), - check_params_exist=False) + check_params_exist=True) if __name__ == "__main__": diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index cb4542c5d025f..e564ff7a10448 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -200,6 +200,10 @@ class _CrossValidatorParams(_ValidatorParams): numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation", typeConverter=TypeConverters.toInt) + def __init__(self, *args): + super(_CrossValidatorParams, self).__init__(*args) + self._setDefault(numFolds=3) + @since("1.4.0") def getNumFolds(self): """ @@ -249,6 +253,8 @@ class CrossValidator(Estimator, _CrossValidatorParams, HasParallelism, HasCollec [0.5, ... >>> evaluator.evaluate(cvModel.transform(dataset)) 0.8333... + >>> evaluator.evaluate(cvModelRead.transform(dataset)) + 0.8333... .. versionadded:: 1.4.0 """ @@ -261,7 +267,7 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numF seed=None, parallelism=1, collectSubModels=False) """ super(CrossValidator, self).__init__() - self._setDefault(numFolds=3, parallelism=1) + self._setDefault(parallelism=1) kwargs = self._input_kwargs self._set(**kwargs) @@ -545,6 +551,10 @@ class _TrainValidationSplitParams(_ValidatorParams): trainRatio = Param(Params._dummy(), "trainRatio", "Param for ratio between train and\ validation data. 
Must be between 0 and 1.", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_TrainValidationSplitParams, self).__init__(*args) + self._setDefault(trainRatio=0.75) + @since("2.0.0") def getTrainRatio(self): """ @@ -590,6 +600,8 @@ class TrainValidationSplit(Estimator, _TrainValidationSplitParams, HasParallelis [0.5, ... >>> evaluator.evaluate(tvsModel.transform(dataset)) 0.833... + >>> evaluator.evaluate(tvsModelRead.transform(dataset)) + 0.833... .. versionadded:: 2.0.0 """ @@ -602,7 +614,7 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, trai parallelism=1, collectSubModels=False, seed=None) """ super(TrainValidationSplit, self).__init__() - self._setDefault(trainRatio=0.75, parallelism=1) + self._setDefault(parallelism=1) kwargs = self._input_kwargs self._set(**kwargs)
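(Illustrative aside, not part of the patch: the net effect for the tuning classes. The fold/ratio defaults now live on the shared validator-params mixins, while `parallelism`, an estimator-only param here, stays defaulted in the concrete constructors. No SparkSession is needed for this sketch because these estimators are pure Python.)

from pyspark.ml.tuning import CrossValidator, TrainValidationSplit

cv = CrossValidator()
tvs = TrainValidationSplit()
print(cv.getNumFolds())     # 3, defaulted in _CrossValidatorParams
print(tvs.getTrainRatio())  # 0.75, defaulted in _TrainValidationSplitParams
print(cv.getParallelism())  # 1, still defaulted in CrossValidator.__init__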