diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 64f1722f5fcb8..ad1010da5c104 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -64,6 +64,8 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasInp /** @group getParam */ def getMissingValue: Double = $(missingValue) + setDefault(strategy -> Imputer.mean, missingValue -> Double.NaN) + /** Returns the input and output column names corresponding in pair. */ private[feature] def getInOutCols(): (Array[String], Array[String]) = { if (isSet(inputCol)) { @@ -144,8 +146,6 @@ class Imputer @Since("2.2.0") (@Since("2.2.0") override val uid: String) @Since("3.0.0") def setRelativeError(value: Double): this.type = set(relativeError, value) - setDefault(strategy -> Imputer.mean, missingValue -> Double.NaN) - override def fit(dataset: Dataset[_]): ImputerModel = { transformSchema(dataset.schema, logging = true) val spark = dataset.sparkSession diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala index 90187c331e835..22c4ca9cddf4b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala @@ -58,6 +58,8 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H /** @group getParam */ def getMax: Double = $(max) + setDefault(min -> 0.0, max -> 1.0) + /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { require($(min) < $(max), s"The specified min(${$(min)}) is larger or equal to max(${$(max)})") @@ -93,8 +95,6 @@ class MinMaxScaler @Since("1.5.0") (@Since("1.5.0") override val uid: String) @Since("1.5.0") def this() = this(Identifiable.randomUID("minMaxScal")) - setDefault(min -> 0.0, max -> 1.0) - /** @group setParam */ @Since("1.5.0") def setInputCol(value: String): this.type = set(inputCol, value) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index 0ef092f6be463..5a500fefb57ec 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -53,8 +53,6 @@ private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid "during fitting, invalid data will result in an error.", ParamValidators.inArray(OneHotEncoder.supportedHandleInvalids)) - setDefault(handleInvalid, OneHotEncoder.ERROR_INVALID) - /** * Whether to drop the last category in the encoded vector (default: true) * @group param @@ -62,12 +60,13 @@ private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid @Since("2.3.0") final val dropLast: BooleanParam = new BooleanParam(this, "dropLast", "whether to drop the last category") - setDefault(dropLast -> true) /** @group getParam */ @Since("2.3.0") def getDropLast: Boolean = $(dropLast) + setDefault(handleInvalid -> OneHotEncoder.ERROR_INVALID, dropLast -> true) + /** Returns the input and output column names corresponding in pair. 
*/ private[feature] def getInOutCols(): (Array[String], Array[String]) = { if (isSet(inputCol)) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index 4eedfc4dc0efa..b93c9b1fcd204 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -46,7 +46,6 @@ private[feature] trait QuantileDiscretizerBase extends Params val numBuckets = new IntParam(this, "numBuckets", "Number of buckets (quantiles, or " + "categories) into which data points are grouped. Must be >= 2.", ParamValidators.gtEq(2)) - setDefault(numBuckets -> 2) /** @group getParam */ def getNumBuckets: Int = getOrDefault(numBuckets) @@ -82,7 +81,8 @@ private[feature] trait QuantileDiscretizerBase extends Params "how to handle invalid entries. Options are skip (filter out rows with invalid values), " + "error (throw an error), or keep (keep invalid values in a special additional bucket).", ParamValidators.inArray(Bucketizer.supportedHandleInvalids)) - setDefault(handleInvalid, Bucketizer.ERROR_INVALID) + + setDefault(handleInvalid -> Bucketizer.ERROR_INVALID, numBuckets -> 2) } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 7ccfafa4ac813..b8da020017f12 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -60,7 +60,6 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol with @Since("2.1.0") val forceIndexLabel: BooleanParam = new BooleanParam(this, "forceIndexLabel", "Force to index label whether it is numeric or string") - setDefault(forceIndexLabel -> false) /** @group getParam */ @Since("2.1.0") @@ -80,7 +79,6 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol with "type. Options are 'skip' (filter out rows with invalid data), error (throw an error), " + "or 'keep' (put invalid data in a special additional bucket, at index numLabels).", ParamValidators.inArray(StringIndexer.supportedHandleInvalids)) - setDefault(handleInvalid, StringIndexer.ERROR_INVALID) /** * Param for how to order categories of a string FEATURE column used by `StringIndexer`. @@ -113,12 +111,14 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol with "The default value is 'frequencyDesc'. 
When the ordering is set to 'alphabetDesc', " + "RFormula drops the same category as R when encoding strings.", ParamValidators.inArray(StringIndexer.supportedStringOrderType)) - setDefault(stringIndexerOrderType, StringIndexer.frequencyDesc) /** @group getParam */ @Since("2.3.0") def getStringIndexerOrderType: String = $(stringIndexerOrderType) + setDefault(forceIndexLabel -> false, handleInvalid -> StringIndexer.ERROR_INVALID, + stringIndexerOrderType -> StringIndexer.frequencyDesc) + protected def hasLabelCol(schema: StructType): Boolean = { schema.map(_.name).contains($(labelCol)) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RobustScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RobustScaler.scala index 72ab3dbc31016..e8f325ec58432 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RobustScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RobustScaler.scala @@ -50,8 +50,6 @@ private[feature] trait RobustScalerParams extends Params with HasInputCol with H /** @group getParam */ def getLower: Double = $(lower) - setDefault(lower -> 0.25) - /** * Upper quantile to calculate quantile range, shared by all features * Default: 0.75 @@ -64,8 +62,6 @@ private[feature] trait RobustScalerParams extends Params with HasInputCol with H /** @group getParam */ def getUpper: Double = $(upper) - setDefault(upper -> 0.75) - /** * Whether to center the data with median before scaling. * It will build a dense output, so take care when applying to sparse input. @@ -78,8 +74,6 @@ private[feature] trait RobustScalerParams extends Params with HasInputCol with H /** @group getParam */ def getWithCentering: Boolean = $(withCentering) - setDefault(withCentering -> false) - /** * Whether to scale the data to quantile range. * Default: true @@ -91,7 +85,7 @@ private[feature] trait RobustScalerParams extends Params with HasInputCol with H /** @group getParam */ def getWithScaling: Boolean = $(withScaling) - setDefault(withScaling -> true) + setDefault(withScaling -> true, lower -> 0.25, upper -> 0.75, withCentering -> false) /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala index 627133968d149..46052a89fdf1a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala @@ -50,7 +50,6 @@ private[feature] trait SelectorParams extends Params "Number of features that selector will select, ordered by ascending p-value. 
If the" + " number of features is < numTopFeatures, then this will select all features.", ParamValidators.gtEq(1)) - setDefault(numTopFeatures -> 50) /** @group getParam */ @Since("3.1.0") @@ -66,7 +65,6 @@ private[feature] trait SelectorParams extends Params final val percentile = new DoubleParam(this, "percentile", "Percentile of features that selector will select, ordered by ascending p-value.", ParamValidators.inRange(0, 1)) - setDefault(percentile -> 0.1) /** @group getParam */ @Since("3.1.0") @@ -81,7 +79,6 @@ private[feature] trait SelectorParams extends Params @Since("3.1.0") final val fpr = new DoubleParam(this, "fpr", "The higest p-value for features to be kept.", ParamValidators.inRange(0, 1)) - setDefault(fpr -> 0.05) /** @group getParam */ @Since("3.1.0") @@ -96,7 +93,6 @@ private[feature] trait SelectorParams extends Params @Since("3.1.0") final val fdr = new DoubleParam(this, "fdr", "The upper bound of the expected false discovery rate.", ParamValidators.inRange(0, 1)) - setDefault(fdr -> 0.05) /** @group getParam */ def getFdr: Double = $(fdr) @@ -110,7 +106,6 @@ private[feature] trait SelectorParams extends Params @Since("3.1.0") final val fwe = new DoubleParam(this, "fwe", "The upper bound of the expected family-wise error rate.", ParamValidators.inRange(0, 1)) - setDefault(fwe -> 0.05) /** @group getParam */ def getFwe: Double = $(fwe) @@ -125,12 +120,13 @@ private[feature] trait SelectorParams extends Params "The selector type. Supported options: numTopFeatures, percentile, fpr, fdr, fwe", ParamValidators.inArray(Array("numTopFeatures", "percentile", "fpr", "fdr", "fwe"))) - setDefault(selectorType -> "numTopFeatures") /** @group getParam */ @Since("3.1.0") def getSelectorType: String = $(selectorType) + setDefault(numTopFeatures -> 50, percentile -> 0.1, fpr -> 0.05, fdr -> 0.05, fwe -> 0.05, + selectorType -> "numTopFeatures") } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index be32f44287b6a..ab51fe6e78bd7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -56,8 +56,6 @@ private[feature] trait StringIndexerBase extends Params with HasHandleInvalid wi "or 'keep' (put invalid data in a special additional bucket, at index numLabels).", ParamValidators.inArray(StringIndexer.supportedHandleInvalids)) - setDefault(handleInvalid, StringIndexer.ERROR_INVALID) - /** * Param for how to order labels of string column. The first label after ordering is assigned * an index of 0. 
@@ -80,6 +78,9 @@ private[feature] trait StringIndexerBase extends Params with HasHandleInvalid wi s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.", ParamValidators.inArray(StringIndexer.supportedStringOrderType)) + setDefault(handleInvalid -> StringIndexer.ERROR_INVALID, + stringOrderType -> StringIndexer.frequencyDesc) + /** @group getParam */ @Since("2.3.0") def getStringOrderType: String = $(stringOrderType) @@ -155,7 +156,6 @@ class StringIndexer @Since("1.4.0") ( /** @group setParam */ @Since("2.3.0") def setStringOrderType(value: String): this.type = set(stringOrderType, value) - setDefault(stringOrderType, StringIndexer.frequencyDesc) /** @group setParam */ @Since("1.4.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 866074fb1453e..b7cf4392cd177 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -60,8 +60,6 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu "number of categories of the feature).", ParamValidators.inArray(VectorIndexer.supportedHandleInvalids)) - setDefault(handleInvalid, VectorIndexer.ERROR_INVALID) - /** * Threshold for the number of values a categorical feature can take. * If a feature is found to have {@literal >} maxCategories values, then it is declared @@ -75,10 +73,10 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu " If a feature is found to have > maxCategories values, then it is declared continuous.", ParamValidators.gtEq(2)) - setDefault(maxCategories -> 20) - /** @group getParam */ def getMaxCategories: Int = $(maxCategories) + + setDefault(maxCategories -> 20, handleInvalid -> VectorIndexer.ERROR_INVALID) } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala index be91844ba39e6..3590b9118f3b8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala @@ -57,8 +57,6 @@ final class VectorSlicer @Since("1.5.0") (@Since("1.5.0") override val uid: Stri "An array of indices to select features from a vector column." + " There can be no overlap with names.", VectorSlicer.validIndices) - setDefault(indices -> Array.emptyIntArray) - /** @group getParam */ @Since("1.5.0") def getIndices: Array[Int] = $(indices) @@ -79,8 +77,6 @@ final class VectorSlicer @Since("1.5.0") (@Since("1.5.0") override val uid: Stri "An array of feature names to select features from a vector column." 
+ " There can be no overlap with indices.", VectorSlicer.validNames) - setDefault(names -> Array.empty[String]) - /** @group getParam */ @Since("1.5.0") def getNames: Array[String] = $(names) @@ -97,6 +93,8 @@ final class VectorSlicer @Since("1.5.0") (@Since("1.5.0") override val uid: Stri @Since("1.5.0") def setOutputCol(value: String): this.type = set(outputCol, value) + setDefault(indices -> Array.emptyIntArray, names -> Array.empty[String]) + @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { // Validity checks diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index db2665fa2e4a3..01db39f9e3921 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -47,7 +47,6 @@ private[feature] trait Word2VecBase extends Params final val vectorSize = new IntParam( this, "vectorSize", "the dimension of codes after transforming from words (> 0)", ParamValidators.gt(0)) - setDefault(vectorSize -> 100) /** @group getParam */ def getVectorSize: Int = $(vectorSize) @@ -60,7 +59,6 @@ private[feature] trait Word2VecBase extends Params final val windowSize = new IntParam( this, "windowSize", "the window size (context words from [-window, window]) (> 0)", ParamValidators.gt(0)) - setDefault(windowSize -> 5) /** @group expertGetParam */ def getWindowSize: Int = $(windowSize) @@ -73,7 +71,6 @@ private[feature] trait Word2VecBase extends Params final val numPartitions = new IntParam( this, "numPartitions", "number of partitions for sentences of words (> 0)", ParamValidators.gt(0)) - setDefault(numPartitions -> 1) /** @group getParam */ def getNumPartitions: Int = $(numPartitions) @@ -86,7 +83,6 @@ private[feature] trait Word2VecBase extends Params */ final val minCount = new IntParam(this, "minCount", "the minimum number of times a token must " + "appear to be included in the word2vec model's vocabulary (>= 0)", ParamValidators.gtEq(0)) - setDefault(minCount -> 5) /** @group getParam */ def getMinCount: Int = $(minCount) @@ -101,13 +97,12 @@ private[feature] trait Word2VecBase extends Params final val maxSentenceLength = new IntParam(this, "maxSentenceLength", "Maximum length " + "(in words) of each sentence in the input data. Any sentence longer than this threshold will " + "be divided into chunks up to the size (> 0)", ParamValidators.gt(0)) - setDefault(maxSentenceLength -> 1000) /** @group getParam */ def getMaxSentenceLength: Int = $(maxSentenceLength) - setDefault(stepSize -> 0.025) - setDefault(maxIter -> 1) + setDefault(vectorSize -> 100, windowSize -> 5, numPartitions -> 1, minCount -> 5, + maxSentenceLength -> 1000, stepSize -> 0.025, maxIter -> 1) /** * Validate and transform the input schema. 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index 7e2c287f146fb..19ea8ae4775d8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -333,8 +333,6 @@ private[ml] trait TreeEnsembleParams extends DecisionTreeParams { "Fraction of the training data used for learning each decision tree, in range (0, 1].", ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) - setDefault(subsamplingRate -> 1.0) - /** @group getParam */ final def getSubsamplingRate: Double = $(subsamplingRate) @@ -386,10 +384,10 @@ private[ml] trait TreeEnsembleParams extends DecisionTreeParams { || Try(value.toInt).filter(_ > 0).isSuccess || Try(value.toDouble).filter(_ > 0).filter(_ <= 1.0).isSuccess) - setDefault(featureSubsetStrategy -> "auto") - /** @group getParam */ final def getFeatureSubsetStrategy: String = $(featureSubsetStrategy).toLowerCase(Locale.ROOT) + + setDefault(subsamplingRate -> 1.0, featureSubsetStrategy -> "auto") } /** @@ -448,8 +446,6 @@ private[ml] trait RandomForestParams extends TreeEnsembleParams { new IntParam(this, "numTrees", "Number of trees to train (at least 1)", ParamValidators.gtEq(1)) - setDefault(numTrees -> 20) - /** @group getParam */ final def getNumTrees: Int = $(numTrees) @@ -461,11 +457,11 @@ private[ml] trait RandomForestParams extends TreeEnsembleParams { final val bootstrap: BooleanParam = new BooleanParam(this, "bootstrap", "Whether bootstrap samples are used when building trees.") - setDefault(bootstrap -> true) - /** @group getParam */ @Since("3.0.0") final def getBootstrap: Boolean = $(bootstrap) + + setDefault(numTrees -> 20, bootstrap -> true) } private[ml] trait RandomForestClassifierParams @@ -518,9 +514,7 @@ private[ml] trait GBTParams extends TreeEnsembleParams with HasMaxIter with HasS "(a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.", ParamValidators.inRange(0, 1, lowerInclusive = false, upperInclusive = true)) - setDefault(maxIter -> 20, stepSize -> 0.1, validationTol -> 0.01) - - setDefault(featureSubsetStrategy -> "all") + setDefault(maxIter -> 20, stepSize -> 0.1, validationTol -> 0.01, featureSubsetStrategy -> "all") /** (private[ml]) Create a BoostingStrategy instance to use with the old API. */ private[ml] def getOldBoostingStrategy( diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index e99c55b0cdd85..275d3c5510f7d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -55,8 +55,6 @@ private[ml] trait CrossValidatorParams extends ValidatorParams { /** @group getParam */ def getNumFolds: Int = $(numFolds) - setDefault(numFolds -> 3) - /** * Param for the column name of user specified fold number. Once this is specified, * `CrossValidator` won't do random k-fold split. 
Note that this column should be @@ -68,7 +66,7 @@ private[ml] trait CrossValidatorParams extends ValidatorParams { def getFoldCol: String = $(foldCol) - setDefault(foldCol, "") + setDefault(foldCol -> "", numFolds -> 3) } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala b/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala index 4d9e664850c12..dd0139b94f098 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala @@ -63,6 +63,9 @@ trait DefaultReadWriteTest extends TempDirectory { self: Suite => (instance.getOrDefault(p), newInstance.getOrDefault(p)) match { case (Array(values), Array(newValues)) => assert(values === newValues, s"Values do not match on param ${p.name}.") + case (value: Double, newValue: Double) => + assert(value.isNaN && newValue.isNaN || value == newValue, + s"Values do not match on param ${p.name}.") case (value, newValue) => assert(value === newValue, s"Values do not match on param ${p.name}.") } diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index e192e8c252d50..61390705e9f13 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -513,8 +513,8 @@ class _LinearSVCParams(_ClassifierParams, HasRegParam, HasMaxIter, HasFitInterce " all predictions 0.0 and -Inf will make all predictions 1.0.", typeConverter=TypeConverters.toFloat) - def __init__(self): - super(_LinearSVCParams, self).__init__() + def __init__(self, *args): + super(_LinearSVCParams, self).__init__(*args) self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, fitIntercept=True, standardization=True, threshold=0.0, aggregationDepth=2, blockSize=1) @@ -587,6 +587,8 @@ class LinearSVC(_JavaClassifier, _LinearSVCParams, JavaMLWritable, JavaMLReadabl True >>> model.intercept == model2.intercept True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 2.2.0 """ @@ -820,8 +822,8 @@ class _LogisticRegressionParams(_ProbabilisticClassifierParams, HasRegParam, "classes for multinomial regression.", typeConverter=TypeConverters.toVector) - def __init__(self): - super(_LogisticRegressionParams, self).__init__() + def __init__(self, *args): + super(_LogisticRegressionParams, self).__init__(*args) self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5, family="auto", blockSize=1) @@ -1018,6 +1020,8 @@ class LogisticRegression(_JavaProbabilisticClassifier, _LogisticRegressionParams True >>> model2 LogisticRegressionModel: uid=..., numClasses=2, numFeatures=2 + >>> blorModel.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 1.3.0 """ @@ -1313,8 +1317,8 @@ class _DecisionTreeClassifierParams(_DecisionTreeParams, _TreeClassifierParams): Params for :py:class:`DecisionTreeClassifier` and :py:class:`DecisionTreeClassificationModel`. 
""" - def __init__(self): - super(_DecisionTreeClassifierParams, self).__init__() + def __init__(self, *args): + super(_DecisionTreeClassifierParams, self).__init__(*args) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", leafCol="", minWeightFractionPerNode=0.0) @@ -1384,7 +1388,8 @@ class DecisionTreeClassifier(_JavaProbabilisticClassifier, _DecisionTreeClassifi >>> model2 = DecisionTreeClassificationModel.load(model_path) >>> model.featureImportances == model2.featureImportances True - + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> df3 = spark.createDataFrame([ ... (1.0, 0.2, Vectors.dense(1.0)), ... (1.0, 0.8, Vectors.dense(1.0)), @@ -1550,8 +1555,8 @@ class _RandomForestClassifierParams(_RandomForestParams, _TreeClassifierParams): Params for :py:class:`RandomForestClassifier` and :py:class:`RandomForestClassificationModel`. """ - def __init__(self): - super(_RandomForestClassifierParams, self).__init__() + def __init__(self, *args): + super(_RandomForestClassifierParams, self).__init__(*args) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20, featureSubsetStrategy="auto", @@ -1628,6 +1633,8 @@ class RandomForestClassifier(_JavaProbabilisticClassifier, _RandomForestClassifi >>> model2 = RandomForestClassificationModel.load(model_path) >>> model.featureImportances == model2.featureImportances True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 1.4.0 """ @@ -1893,8 +1900,8 @@ class _GBTClassifierParams(_GBTParams, _HasVarianceImpurity): "Supported options: " + ", ".join(supportedLossTypes), typeConverter=TypeConverters.toString) - def __init__(self): - super(_GBTClassifierParams, self).__init__() + def __init__(self, *args): + super(_GBTClassifierParams, self).__init__(*args) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, subsamplingRate=1.0, @@ -1992,6 +1999,8 @@ class GBTClassifier(_JavaProbabilisticClassifier, _GBTClassifierParams, True >>> model.treeWeights == model2.treeWeights True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> model.trees [DecisionTreeRegressionModel...depth=..., DecisionTreeRegressionModel...] 
>>> validation = spark.createDataFrame([(0.0, Vectors.dense(-1.0),)], @@ -2225,8 +2234,8 @@ class _NaiveBayesParams(_PredictorParams, HasWeightCol): "and gaussian.", typeConverter=TypeConverters.toString) - def __init__(self): - super(_NaiveBayesParams, self).__init__() + def __init__(self, *args): + super(_NaiveBayesParams, self).__init__(*args) self._setDefault(smoothing=1.0, modelType="multinomial") @since("1.5.0") @@ -2312,6 +2321,8 @@ class NaiveBayes(_JavaProbabilisticClassifier, _NaiveBayesParams, HasThresholds, True >>> model.theta == model2.theta True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> nb = nb.setThresholds([0.01, 10.00]) >>> model3 = nb.fit(df) >>> result = model3.transform(test0).head() @@ -2438,8 +2449,8 @@ class _MultilayerPerceptronParams(_ProbabilisticClassifierParams, HasSeed, HasMa initialWeights = Param(Params._dummy(), "initialWeights", "The initial weights of the model.", typeConverter=TypeConverters.toVector) - def __init__(self): - super(_MultilayerPerceptronParams, self).__init__() + def __init__(self, *args): + super(_MultilayerPerceptronParams, self).__init__(*args) self._setDefault(maxIter=100, tol=1E-6, blockSize=128, stepSize=0.03, solver="l-bfgs") @since("1.6.0") @@ -2521,6 +2532,8 @@ class MultilayerPerceptronClassifier(_JavaProbabilisticClassifier, _MultilayerPe True >>> model.weights == model2.weights True + >>> model.transform(testDF).take(1) == model2.transform(testDF).take(1) + True >>> mlp2 = mlp2.setInitialWeights(list(range(0, 12))) >>> model3 = mlp2.fit(df) >>> model3.weights != model2.weights @@ -2695,6 +2708,8 @@ class OneVsRest(Estimator, _OneVsRestParams, HasParallelism, JavaMLReadable, Jav >>> model2 = OneVsRestModel.load(model_path) >>> model2.transform(test0).head().newPrediction 0.0 + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> model.transform(test2).columns ['features', 'rawPrediction', 'newPrediction'] @@ -3120,6 +3135,17 @@ class FMClassifier(_JavaProbabilisticClassifier, _FactorizationMachinesParams, J DenseVector([14.8232]) >>> model.factors DenseMatrix(1, 2, [0.0163, -0.0051], 1) + >>> model_path = temp_path + "/fm_model" + >>> model.save(model_path) + >>> model2 = FMClassificationModel.load(model_path) + >>> model2.intercept + -7.316665276826291 + >>> model2.linear + DenseVector([14.8232]) + >>> model2.factors + DenseMatrix(1, 2, [0.0163, -0.0051], 1) + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 3.0.0 """ diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 6ca413d696368..2d70f876849f8 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -109,8 +109,8 @@ class _GaussianMixtureParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionC k = Param(Params._dummy(), "k", "Number of independent Gaussians in the mixture model. 
" + "Must be > 1.", typeConverter=TypeConverters.toInt) - def __init__(self): - super(_GaussianMixtureParams, self).__init__() + def __init__(self, *args): + super(_GaussianMixtureParams, self).__init__(*args) self._setDefault(k=2, tol=0.01, maxIter=100, aggregationDepth=2, blockSize=1) @since("2.0.0") @@ -325,6 +325,8 @@ class GaussianMixture(JavaEstimator, _GaussianMixtureParams, JavaMLWritable, Jav Row(mean=DenseVector([0.825, 0.8675])) >>> model2.gaussiansDF.select("cov").head() Row(cov=DenseMatrix(2, 2, [0.0056, -0.0051, -0.0051, 0.0046], False)) + >>> model.transform(df).take(1) == model2.transform(df).take(1) + True >>> gm2.setWeightCol("weight") GaussianMixture... @@ -503,8 +505,8 @@ class _KMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, HasTo initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " + "initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt) - def __init__(self): - super(_KMeansParams, self).__init__() + def __init__(self, *args): + super(_KMeansParams, self).__init__(*args) self._setDefault(k=2, initMode="k-means||", initSteps=2, tol=1e-4, maxIter=20, distanceMeasure="euclidean") @@ -637,6 +639,8 @@ class KMeans(JavaEstimator, _KMeansParams, JavaMLWritable, JavaMLReadable): array([ True, True], dtype=bool) >>> model.clusterCenters()[1] == model2.clusterCenters()[1] array([ True, True], dtype=bool) + >>> model.transform(df).take(1) == model2.transform(df).take(1) + True .. versionadded:: 1.5.0 """ @@ -760,8 +764,8 @@ class _BisectingKMeansParams(HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionC "proportion of points (if < 1.0) of a divisible cluster.", typeConverter=TypeConverters.toFloat) - def __init__(self): - super(_BisectingKMeansParams, self).__init__() + def __init__(self, *args): + super(_BisectingKMeansParams, self).__init__(*args) self._setDefault(maxIter=20, k=4, minDivisibleClusterSize=1.0) @since("2.0.0") @@ -914,6 +918,8 @@ class BisectingKMeans(JavaEstimator, _BisectingKMeansParams, JavaMLWritable, Jav array([ True, True], dtype=bool) >>> model.clusterCenters()[1] == model2.clusterCenters()[1] array([ True, True], dtype=bool) + >>> model.transform(df).take(1) == model2.transform(df).take(1) + True .. versionadded:: 2.0.0 """ @@ -1072,8 +1078,8 @@ class _LDAParams(HasMaxIter, HasFeaturesCol, HasSeed, HasCheckpointInterval): " partition is lost, so set this bit with care.", TypeConverters.toBoolean) - def __init__(self): - super(_LDAParams, self).__init__() + def __init__(self, *args): + super(_LDAParams, self).__init__(*args) self._setDefault(maxIter=20, checkpointInterval=10, k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51, subsamplingRate=0.05, optimizeDocConcentration=True, @@ -1389,6 +1395,8 @@ class LDA(JavaEstimator, _LDAParams, JavaMLReadable, JavaMLWritable): >>> local_model_path = temp_path + "/lda_local_model" >>> localModel.save(local_model_path) >>> sameLocalModel = LocalLDAModel.load(local_model_path) + >>> model.transform(df).take(1) == sameLocalModel.transform(df).take(1) + True .. 
versionadded:: 2.0.0 """ @@ -1600,8 +1608,8 @@ class _PowerIterationClusteringParams(HasMaxIter, HasWeightCol): "Name of the input column for destination vertex IDs.", typeConverter=TypeConverters.toString) - def __init__(self): - super(_PowerIterationClusteringParams, self).__init__() + def __init__(self, *args): + super(_PowerIterationClusteringParams, self).__init__(*args) self._setDefault(k=2, maxIter=20, initMode="random", srcCol="src", dstCol="dst") @since("2.4.0") @@ -1677,6 +1685,8 @@ class PowerIterationClustering(_PowerIterationClusteringParams, JavaParams, Java 2 >>> pic2.getMaxIter() 40 + >>> pic2.assignClusters(df).take(6) == assignments.take(6) + True .. versionadded:: 2.4.0 """ diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index a319dace6869a..2220293d54ba4 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -92,6 +92,8 @@ class Binarizer(JavaTransformer, HasThreshold, HasThresholds, HasInputCol, HasOu >>> loadedBinarizer = Binarizer.load(binarizerPath) >>> loadedBinarizer.getThreshold() == binarizer.getThreshold() True + >>> loadedBinarizer.transform(df).take(1) == binarizer.transform(df).take(1) + True >>> df2 = spark.createDataFrame([(0.5, 0.3)], ["values1", "values2"]) >>> binarizer2 = Binarizer(thresholds=[0.0, 1.0]) >>> binarizer2.setInputCols(["values1", "values2"]).setOutputCols(["output1", "output2"]) @@ -195,6 +197,10 @@ class _LSHParams(HasInputCol, HasOutputCol): "and decreasing it improves the running performance.", typeConverter=TypeConverters.toInt) + def __init__(self, *args): + super(_LSHParams, self).__init__(*args) + self._setDefault(numHashTables=1) + def getNumHashTables(self): """ Gets the value of numHashTables or its default value. @@ -390,7 +396,6 @@ def __init__(self, inputCol=None, outputCol=None, seed=None, numHashTables=1, super(BucketedRandomProjectionLSH, self).__init__() self._java_obj = \ self._new_java_obj("org.apache.spark.ml.feature.BucketedRandomProjectionLSH", self.uid) - self._setDefault(numHashTables=1) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -477,6 +482,8 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu >>> loadedBucketizer = Bucketizer.load(bucketizerPath) >>> loadedBucketizer.getSplits() == bucketizer.getSplits() True + >>> loadedBucketizer.transform(df).take(1) == bucketizer.transform(df).take(1) + True >>> bucketed = bucketizer.setHandleInvalid("skip").transform(df).collect() >>> len(bucketed) 4 @@ -733,6 +740,8 @@ class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, Jav >>> loadedModel = CountVectorizerModel.load(modelPath) >>> loadedModel.vocabulary == model.vocabulary True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True >>> fromVocabModel = CountVectorizerModel.from_vocabulary(["a", "b", "c"], ... inputCol="raw", outputCol="vectors") >>> fromVocabModel.transform(df).show(truncate=False) @@ -920,6 +929,8 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit >>> dctPath = temp_path + "/dct" >>> dct.save(dctPath) >>> loadedDtc = DCT.load(dctPath) + >>> loadedDtc.transform(df1).take(1) == dct.transform(df1).take(1) + True >>> loadedDtc.getInverse() False @@ -1003,6 +1014,8 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada >>> loadedEp = ElementwiseProduct.load(elementwiseProductPath) >>> loadedEp.getScalingVec() == ep.getScalingVec() True + >>> loadedEp.transform(df).take(1) == ep.transform(df).take(1) + True .. 
versionadded:: 1.5.0 """ @@ -1201,6 +1214,8 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java >>> loadedHashingTF = HashingTF.load(hashingTFPath) >>> loadedHashingTF.getNumFeatures() == hashingTF.getNumFeatures() True + >>> loadedHashingTF.transform(df).take(1) == hashingTF.transform(df).take(1) + True >>> hashingTF.indexOf("b") 5 @@ -1292,6 +1307,10 @@ def getMinDocFreq(self): """ return self.getOrDefault(self.minDocFreq) + def __init__(self, *args): + super(_IDFParams, self).__init__(*args) + self._setDefault(minDocFreq=0) + @inherit_doc class IDF(JavaEstimator, _IDFParams, JavaMLReadable, JavaMLWritable): @@ -1345,7 +1364,6 @@ def __init__(self, minDocFreq=0, inputCol=None, outputCol=None): """ super(IDF, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid) - self._setDefault(minDocFreq=0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1445,6 +1463,10 @@ class _ImputerParams(HasInputCol, HasInputCols, HasOutputCol, HasOutputCols, Has "The placeholder for the missing values. All occurrences of missingValue " "will be imputed.", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_ImputerParams, self).__init__(*args) + self._setDefault(strategy="mean", missingValue=float("nan"), relativeError=0.001) + @since("2.2.0") def getStrategy(self): """ @@ -1582,7 +1604,6 @@ def __init__(self, strategy="mean", missingValue=float("nan"), inputCols=None, """ super(Imputer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Imputer", self.uid) - self._setDefault(strategy="mean", missingValue=float("nan"), relativeError=0.001) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1811,6 +1832,8 @@ class MaxAbsScaler(JavaEstimator, _MaxAbsScalerParams, JavaMLReadable, JavaMLWri >>> loadedModel = MaxAbsScalerModel.load(modelPath) >>> loadedModel.maxAbs == model.maxAbs True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 2.0.0 """ @@ -1951,7 +1974,6 @@ def __init__(self, inputCol=None, outputCol=None, seed=None, numHashTables=1): """ super(MinHashLSH, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinHashLSH", self.uid) - self._setDefault(numHashTables=1) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -2002,6 +2024,10 @@ class _MinMaxScalerParams(HasInputCol, HasOutputCol): max = Param(Params._dummy(), "max", "Upper bound of the output feature range", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_MinMaxScalerParams, self).__init__(*args) + self._setDefault(min=0.0, max=1.0) + @since("1.6.0") def getMin(self): """ @@ -2065,6 +2091,8 @@ class MinMaxScaler(JavaEstimator, _MinMaxScalerParams, JavaMLReadable, JavaMLWri True >>> loadedModel.originalMax == model.originalMax True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. 
versionadded:: 1.6.0 """ @@ -2076,7 +2104,6 @@ def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None): """ super(MinMaxScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid) - self._setDefault(min=0.0, max=1.0) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -2209,6 +2236,8 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWr >>> loadedNGram = NGram.load(ngramPath) >>> loadedNGram.getN() == ngram.getN() True + >>> loadedNGram.transform(df).take(1) == ngram.transform(df).take(1) + True .. versionadded:: 1.5.0 """ @@ -2289,6 +2318,8 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav >>> loadedNormalizer = Normalizer.load(normalizerPath) >>> loadedNormalizer.getP() == normalizer.getP() True + >>> loadedNormalizer.transform(df).take(1) == normalizer.transform(df).take(1) + True .. versionadded:: 1.4.0 """ @@ -2362,6 +2393,10 @@ class _OneHotEncoderParams(HasInputCol, HasInputCols, HasOutputCol, HasOutputCol dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category", typeConverter=TypeConverters.toBoolean) + def __init__(self, *args): + super(_OneHotEncoderParams, self).__init__(*args) + self._setDefault(handleInvalid="error", dropLast=True) + @since("2.3.0") def getDropLast(self): """ @@ -2422,6 +2457,8 @@ class OneHotEncoder(JavaEstimator, _OneHotEncoderParams, JavaMLReadable, JavaMLW >>> loadedModel = OneHotEncoderModel.load(modelPath) >>> loadedModel.categorySizes == model.categorySizes True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 2.3.0 """ @@ -2436,7 +2473,6 @@ def __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropL super(OneHotEncoder, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.feature.OneHotEncoder", self.uid) - self._setDefault(handleInvalid="error", dropLast=True) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -2583,6 +2619,8 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead >>> loadedPx = PolynomialExpansion.load(polyExpansionPath) >>> loadedPx.getDegree() == px.getDegree() True + >>> loadedPx.transform(df).take(1) == px.transform(df).take(1) + True .. versionadded:: 1.4.0 """ @@ -2879,6 +2917,11 @@ class _RobustScalerParams(HasInputCol, HasOutputCol, HasRelativeError): withScaling = Param(Params._dummy(), "withScaling", "Whether to scale the data to " "quantile range", typeConverter=TypeConverters.toBoolean) + def __init__(self, *args): + super(_RobustScalerParams, self).__init__(*args) + self._setDefault(lower=0.25, upper=0.75, withCentering=False, withScaling=True, + relativeError=0.001) + @since("3.0.0") def getLower(self): """ @@ -2954,6 +2997,8 @@ class RobustScaler(JavaEstimator, _RobustScalerParams, JavaMLReadable, JavaMLWri True >>> loadedModel.range == model.range True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. 
versionadded:: 3.0.0 """ @@ -2967,8 +3012,6 @@ def __init__(self, lower=0.25, upper=0.75, withCentering=False, withScaling=True """ super(RobustScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RobustScaler", self.uid) - self._setDefault(lower=0.25, upper=0.75, withCentering=False, withScaling=True, - relativeError=0.001) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -3113,6 +3156,8 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, True >>> loadedReTokenizer.getGaps() == reTokenizer.getGaps() True + >>> loadedReTokenizer.transform(df).take(1) == reTokenizer.transform(df).take(1) + True .. versionadded:: 1.4.0 """ @@ -3237,6 +3282,8 @@ class SQLTransformer(JavaTransformer, JavaMLReadable, JavaMLWritable): >>> loadedSqlTrans = SQLTransformer.load(sqlTransformerPath) >>> loadedSqlTrans.getStatement() == sqlTrans.getStatement() True + >>> loadedSqlTrans.transform(df).take(1) == sqlTrans.transform(df).take(1) + True .. versionadded:: 1.6.0 """ @@ -3291,6 +3338,10 @@ class _StandardScalerParams(HasInputCol, HasOutputCol): withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation", typeConverter=TypeConverters.toBoolean) + def __init__(self, *args): + super(_StandardScalerParams, self).__init__(*args) + self._setDefault(withMean=False, withStd=True) + @since("1.4.0") def getWithMean(self): """ @@ -3348,6 +3399,8 @@ class StandardScaler(JavaEstimator, _StandardScalerParams, JavaMLReadable, JavaM True >>> loadedModel.mean == model.mean True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 1.4.0 """ @@ -3359,7 +3412,6 @@ def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None): """ super(StandardScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid) - self._setDefault(withMean=False, withStd=True) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -3513,6 +3565,8 @@ class StringIndexer(JavaEstimator, _StringIndexerParams, JavaMLReadable, JavaMLW >>> loadedInverter = IndexToString.load(indexToStringPath) >>> loadedInverter.getLabels() == inverter.getLabels() True + >>> loadedModel.transform(stringIndDf).take(1) == model.transform(stringIndDf).take(1) + True >>> stringIndexer.getStringOrderType() 'frequencyDesc' >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="error", @@ -3798,6 +3852,8 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, True >>> loadedRemover.getCaseSensitive() == remover.getCaseSensitive() True + >>> loadedRemover.transform(df).take(1) == remover.transform(df).take(1) + True >>> df2 = spark.createDataFrame([(["a", "b", "c"], ["a", "b"])], ["text1", "text2"]) >>> remover2 = StopWordsRemover(stopWords=["b"]) >>> remover2.setInputCols(["text1", "text2"]).setOutputCols(["words1", "words2"]) @@ -4109,6 +4165,10 @@ class _VectorIndexerParams(HasInputCol, HasOutputCol, HasHandleInvalid): "of categories of the feature).", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_VectorIndexerParams, self).__init__(*args) + self._setDefault(maxCategories=20, handleInvalid="error") + @since("1.4.0") def getMaxCategories(self): """ @@ -4189,6 +4249,8 @@ class VectorIndexer(JavaEstimator, _VectorIndexerParams, JavaMLReadable, JavaMLW True >>> loadedModel.categoryMaps == model.categoryMaps True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True 
>>> dfWithInvalid = spark.createDataFrame([(Vectors.dense([3.0, 1.0]),)], ["a"]) >>> indexer.getHandleInvalid() 'error' @@ -4209,7 +4271,6 @@ def __init__(self, maxCategories=20, inputCol=None, outputCol=None, handleInvali """ super(VectorIndexer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid) - self._setDefault(maxCategories=20, handleInvalid="error") kwargs = self._input_kwargs self.setParams(**kwargs) @@ -4332,6 +4393,8 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, J True >>> loadedVs.getNames() == vs.getNames() True + >>> loadedVs.transform(df).take(1) == vs.transform(df).take(1) + True .. versionadded:: 1.6.0 """ @@ -4431,6 +4494,11 @@ class _Word2VecParams(HasStepSize, HasMaxIter, HasSeed, HasInputCol, HasOutputCo "be divided into chunks up to the size.", typeConverter=TypeConverters.toInt) + def __init__(self, *args): + super(_Word2VecParams, self).__init__(*args) + self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, + windowSize=5, maxSentenceLength=1000) + @since("1.4.0") def getVectorSize(self): """ @@ -4524,6 +4592,8 @@ class Word2Vec(JavaEstimator, _Word2VecParams, JavaMLReadable, JavaMLWritable): True >>> loadedModel.getVectors().first().vector == model.getVectors().first().vector True + >>> loadedModel.transform(doc).take(1) == model.transform(doc).take(1) + True .. versionadded:: 1.4.0 """ @@ -4537,8 +4607,6 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, """ super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) - self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - windowSize=5, maxSentenceLength=1000) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -4730,6 +4798,8 @@ class PCA(JavaEstimator, _PCAParams, JavaMLReadable, JavaMLWritable): True >>> loadedModel.explainedVariance == model.explainedVariance True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. 
versionadded:: 1.5.0 """ @@ -4846,6 +4916,11 @@ class _RFormulaParams(HasFeaturesCol, HasLabelCol, HasHandleInvalid): "additional bucket, at index numLabels).", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_RFormulaParams, self).__init__(*args) + self._setDefault(forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", + handleInvalid="error") + @since("1.5.0") def getFormula(self): """ @@ -4948,8 +5023,6 @@ def __init__(self, formula=None, featuresCol="features", labelCol="label", """ super(RFormula, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid) - self._setDefault(forceIndexLabel=False, stringIndexerOrderType="frequencyDesc", - handleInvalid="error") kwargs = self._input_kwargs self.setParams(**kwargs) @@ -5058,6 +5131,11 @@ class _SelectorParams(HasFeaturesCol, HasOutputCol, HasLabelCol): fwe = Param(Params._dummy(), "fwe", "The upper bound of the expected family-wise error rate.", typeConverter=TypeConverters.toFloat) + def __init__(self, *args): + super(_SelectorParams, self).__init__(*args) + self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, + fpr=0.05, fdr=0.05, fwe=0.05) + @since("2.1.0") def getSelectorType(self): """ @@ -5257,6 +5335,8 @@ class ANOVASelector(_Selector, JavaMLReadable, JavaMLWritable): >>> loadedModel = ANOVASelectorModel.load(modelPath) >>> loadedModel.selectedFeatures == model.selectedFeatures True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 3.1.0 """ @@ -5272,8 +5352,6 @@ def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, """ super(ANOVASelector, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ANOVASelector", self.uid) - self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, - fpr=0.05, fdr=0.05, fwe=0.05) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -5356,6 +5434,8 @@ class ChiSqSelector(_Selector, JavaMLReadable, JavaMLWritable): >>> loadedModel = ChiSqSelectorModel.load(modelPath) >>> loadedModel.selectedFeatures == model.selectedFeatures True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 2.0.0 """ @@ -5371,8 +5451,6 @@ def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, """ super(ChiSqSelector, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid) - self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, - fpr=0.05, fdr=0.05, fwe=0.05) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -5459,6 +5537,8 @@ class FValueSelector(_Selector, JavaMLReadable, JavaMLWritable): >>> loadedModel = FValueSelectorModel.load(modelPath) >>> loadedModel.selectedFeatures == model.selectedFeatures True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. 
versionadded:: 3.1.0 """ @@ -5474,8 +5554,6 @@ def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, """ super(FValueSelector, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FValueSelector", self.uid) - self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, - fpr=0.05, fdr=0.05, fwe=0.05) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -5652,6 +5730,8 @@ class VarianceThresholdSelector(JavaEstimator, _VarianceThresholdSelectorParams, >>> loadedModel = VarianceThresholdSelectorModel.load(modelPath) >>> loadedModel.selectedFeatures == model.selectedFeatures True + >>> loadedModel.transform(df).take(1) == model.transform(df).take(1) + True .. versionadded:: 3.1.0 """ diff --git a/python/pyspark/ml/fpm.py b/python/pyspark/ml/fpm.py index a1a8a4e3e3ac4..37d3b6eec02d0 100644 --- a/python/pyspark/ml/fpm.py +++ b/python/pyspark/ml/fpm.py @@ -55,8 +55,8 @@ class _FPGrowthParams(HasPredictionCol): "but will affect the association rules generation.", typeConverter=TypeConverters.toFloat) - def __init__(self): - super(_FPGrowthParams, self).__init__() + def __init__(self, *args): + super(_FPGrowthParams, self).__init__(*args) self._setDefault(minSupport=0.3, minConfidence=0.8, itemsCol="items", predictionCol="prediction") @@ -197,6 +197,11 @@ class FPGrowth(JavaEstimator, _FPGrowthParams, JavaMLWritable, JavaMLReadable): >>> new_data = spark.createDataFrame([(["t", "s"], )], ["items"]) >>> sorted(fpm.transform(new_data).first().newPrediction) ['x', 'y', 'z'] + >>> model_path = temp_path + "/fpm_model" + >>> fpm.save(model_path) + >>> model2 = FPGrowthModel.load(model_path) + >>> fpm.transform(data).take(1) == model2.transform(data).take(1) + True .. versionadded:: 2.2.0 """ diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index 99d80aa867bda..62b856046234a 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -46,6 +46,10 @@ class _ALSModelParams(HasPredictionCol, HasBlockSize): "Supported values: 'nan', 'drop'.", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_ALSModelParams, self).__init__(*args) + self._setDefault(blockSize=4096) + @since("1.4.0") def getUserCol(self): """ @@ -99,6 +103,14 @@ class _ALSParams(_ALSModelParams, HasMaxIter, HasRegParam, HasCheckpointInterval "StorageLevel for ALS model factors.", typeConverter=TypeConverters.toString) + def __init__(self, *args): + super(_ALSParams, self).__init__(*args) + self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, + implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", + ratingCol="rating", nonnegative=False, checkpointInterval=10, + intermediateStorageLevel="MEMORY_AND_DISK", + finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan") + @since("1.4.0") def getRank(self): """ @@ -275,6 +287,8 @@ class ALS(JavaEstimator, _ALSParams, JavaMLWritable, JavaMLReadable): True >>> sorted(model.itemFactors.collect()) == sorted(model2.itemFactors.collect()) True + >>> model.transform(test).take(1) == model2.transform(test).take(1) + True .. 
versionadded:: 1.4.0 """ @@ -294,12 +308,6 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB """ super(ALS, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.recommendation.ALS", self.uid) - self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, - implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", - ratingCol="rating", nonnegative=False, checkpointInterval=10, - intermediateStorageLevel="MEMORY_AND_DISK", - finalStorageLevel="MEMORY_AND_DISK", coldStartStrategy="nan", - blockSize=4096) kwargs = self._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 6fe6486c5a04a..4a8d1530b8a6f 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -104,8 +104,8 @@ class _LinearRegressionParams(_PredictorParams, HasRegParam, HasElasticNetParam, "robustness. Must be > 1.0. Only valid when loss is huber", typeConverter=TypeConverters.toFloat) - def __init__(self): - super(_LinearRegressionParams, self).__init__() + def __init__(self, *args): + super(_LinearRegressionParams, self).__init__(*args) self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, loss="squaredError", epsilon=1.35, blockSize=1) @@ -190,6 +190,8 @@ class LinearRegression(_JavaRegressor, _LinearRegressionParams, JavaMLWritable, True >>> model.intercept == model2.intercept True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> model.numFeatures 1 >>> model.write().format("pmml").save(model_path + "_2") @@ -622,8 +624,8 @@ class _IsotonicRegressionParams(HasFeaturesCol, HasLabelCol, HasPredictionCol, H "The index of the feature if featuresCol is a vector column, no effect otherwise.", typeConverter=TypeConverters.toInt) - def __init__(self): - super(_IsotonicRegressionParams, self).__init__() + def __init__(self, *args): + super(_IsotonicRegressionParams, self).__init__(*args) self._setDefault(isotonic=True, featureIndex=0) def getIsotonic(self): @@ -675,6 +677,8 @@ class IsotonicRegression(JavaEstimator, _IsotonicRegressionParams, HasWeightCol, True >>> model.predictions == model2.predictions True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 1.6.0 """ @@ -814,8 +818,8 @@ class _DecisionTreeRegressorParams(_DecisionTreeParams, _TreeRegressorParams, Ha .. versionadded:: 3.0.0 """ - def __init__(self): - super(_DecisionTreeRegressorParams, self).__init__() + def __init__(self, *args): + super(_DecisionTreeRegressorParams, self).__init__(*args) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", leafCol="", minWeightFractionPerNode=0.0) @@ -876,7 +880,8 @@ class DecisionTreeRegressor(_JavaRegressor, _DecisionTreeRegressorParams, JavaML True >>> model.transform(test1).head().variance 0.0 - + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> df3 = spark.createDataFrame([ ... (1.0, 0.2, Vectors.dense(1.0)), ... (1.0, 0.8, Vectors.dense(1.0)), @@ -1060,8 +1065,8 @@ class _RandomForestRegressorParams(_RandomForestParams, _TreeRegressorParams): .. 
versionadded:: 3.0.0 """ - def __init__(self): - super(_RandomForestRegressorParams, self).__init__() + def __init__(self, *args): + super(_RandomForestRegressorParams, self).__init__(*args) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", subsamplingRate=1.0, numTrees=20, @@ -1127,6 +1132,8 @@ class RandomForestRegressor(_JavaRegressor, _RandomForestRegressorParams, JavaML >>> model2 = RandomForestRegressionModel.load(model_path) >>> model.featureImportances == model2.featureImportances True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True .. versionadded:: 1.4.0 """ @@ -1319,8 +1326,8 @@ class _GBTRegressorParams(_GBTParams, _TreeRegressorParams): "Supported options: " + ", ".join(supportedLossTypes), typeConverter=TypeConverters.toString) - def __init__(self): - super(_GBTRegressorParams, self).__init__() + def __init__(self, *args): + super(_GBTRegressorParams, self).__init__(*args) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, @@ -1390,6 +1397,8 @@ class GBTRegressor(_JavaRegressor, _GBTRegressorParams, JavaMLWritable, JavaMLRe True >>> model.treeWeights == model2.treeWeights True + >>> model.transform(test0).take(1) == model2.transform(test0).take(1) + True >>> model.trees [DecisionTreeRegressionModel...depth=..., DecisionTreeRegressionModel...] >>> validation = spark.createDataFrame([(0.0, Vectors.dense(-1.0))], @@ -1642,8 +1651,8 @@ class _AFTSurvivalRegressionParams(_PredictorParams, HasMaxIter, HasTol, HasFitI "corresponding quantileProbabilities if it is set.", typeConverter=TypeConverters.toString) - def __init__(self): - super(_AFTSurvivalRegressionParams, self).__init__() + def __init__(self, *args): + super(_AFTSurvivalRegressionParams, self).__init__(*args) self._setDefault(censorCol="censor", quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], maxIter=100, tol=1E-6, blockSize=1) @@ -1722,6 +1731,8 @@ class AFTSurvivalRegression(_JavaRegressor, _AFTSurvivalRegressionParams, True >>> model.scale == model2.scale True + >>> model.transform(df).take(1) == model2.transform(df).take(1) + True .. versionadded:: 1.6.0 """ @@ -1906,8 +1917,8 @@ class _GeneralizedLinearRegressionParams(_PredictorParams, HasFitIntercept, HasM "or empty, we treat all instance offsets as 0.0", typeConverter=TypeConverters.toString) - def __init__(self): - super(_GeneralizedLinearRegressionParams, self).__init__() + def __init__(self, *args): + super(_GeneralizedLinearRegressionParams, self).__init__(*args) self._setDefault(family="gaussian", maxIter=25, tol=1e-6, regParam=0.0, solver="irls", variancePower=0.0, aggregationDepth=2) @@ -2025,6 +2036,8 @@ class GeneralizedLinearRegression(_JavaRegressor, _GeneralizedLinearRegressionPa True >>> model.coefficients[0] == model2.coefficients[0] True + >>> model.transform(df).take(1) == model2.transform(df).take(1) + True .. versionadded:: 2.0.0 """ @@ -2391,7 +2404,7 @@ def __repr__(self): class _FactorizationMachinesParams(_PredictorParams, HasMaxIter, HasStepSize, HasTol, - HasSolver, HasSeed, HasFitIntercept, HasRegParam): + HasSolver, HasSeed, HasFitIntercept, HasRegParam, HasWeightCol): """ Params for :py:class:`FMRegressor`, :py:class:`FMRegressionModel`, :py:class:`FMClassifier` and :py:class:`FMClassifierModel`. 
@@ -2416,8 +2429,8 @@ class _FactorizationMachinesParams(_PredictorParams, HasMaxIter, HasStepSize, Ha
     solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " +
                    "options: gd, adamW. (Default adamW)", typeConverter=TypeConverters.toString)
-    def __init__(self):
-        super(_FactorizationMachinesParams, self).__init__()
+    def __init__(self, *args):
+        super(_FactorizationMachinesParams, self).__init__(*args)
         self._setDefault(factorSize=8, fitIntercept=True, fitLinear=True, regParam=0.0,
                          miniBatchFraction=1.0, initStd=0.01, maxIter=100, stepSize=1.0,
                          tol=1e-6, solver="adamW")
@@ -2495,6 +2508,17 @@ class FMRegressor(_JavaRegressor, _FactorizationMachinesParams, JavaMLWritable,
     DenseVector([0.9978])
     >>> model.factors
     DenseMatrix(1, 2, [0.0173, 0.0021], 1)
+    >>> model_path = temp_path + "/fm_model"
+    >>> model.save(model_path)
+    >>> model2 = FMRegressionModel.load(model_path)
+    >>> model2.intercept
+    -0.0032501766849261557
+    >>> model2.linear
+    DenseVector([0.9978])
+    >>> model2.factors
+    DenseMatrix(1, 2, [0.0173, 0.0021], 1)
+    >>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+    True
     .. versionadded:: 3.0.0
     """
diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py
index e1abd59a2d7b2..44731568b6d1f 100644
--- a/python/pyspark/ml/tests/test_param.py
+++ b/python/pyspark/ml/tests/test_param.py
@@ -359,16 +359,14 @@ def test_java_params(self):
                     and issubclass(cls, JavaParams) and not inspect.isabstract(cls) \
                     and not re.match("_?Java", name) and name != '_LSH' \
                     and name != '_Selector':
-                # NOTE: disable check_params_exist until there is parity with Scala API
-
-                check_params(self, cls(), check_params_exist=False)
+                check_params(self, cls(), check_params_exist=True)
         # Additional classes that need explicit construction
         from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
         check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'),
-                     check_params_exist=False)
+                     check_params_exist=True)
         check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'),
-                     check_params_exist=False)
+                     check_params_exist=True)
 if __name__ == "__main__":
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 7f3d942e2e456..d7800e0c9020e 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -206,6 +206,10 @@ class _CrossValidatorParams(_ValidatorParams):
                     "with range [0, numFolds) and Spark will throw exception on out-of-range " +
                     "fold numbers.", typeConverter=TypeConverters.toString)
+    def __init__(self, *args):
+        super(_CrossValidatorParams, self).__init__(*args)
+        self._setDefault(numFolds=3, foldCol="")
+
     @since("1.4.0")
     def getNumFolds(self):
         """
@@ -262,6 +266,8 @@ class CrossValidator(Estimator, _CrossValidatorParams, HasParallelism, HasCollec
     [0.5, ...
     >>> evaluator.evaluate(cvModel.transform(dataset))
     0.8333...
+    >>> evaluator.evaluate(cvModelRead.transform(dataset))
+    0.8333...
     .. versionadded:: 1.4.0
     """
@@ -274,7 +280,7 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numF
                  seed=None, parallelism=1, collectSubModels=False, foldCol="")
         """
         super(CrossValidator, self).__init__()
-        self._setDefault(numFolds=3, parallelism=1, foldCol="")
+        self._setDefault(parallelism=1)
         kwargs = self._input_kwargs
         self._set(**kwargs)
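With `numFolds` and `foldCol` now defaulted in `_CrossValidatorParams.__init__`, the estimator constructor only has to set `parallelism`, and the defaults are visible on any class that mixes the params trait in. A quick sketch of the observable behaviour, assuming a build that includes this change; constructing the estimator below is pure Python and needs no active SparkSession:

    from pyspark.ml.tuning import CrossValidator

    cv = CrossValidator()
    assert cv.getNumFolds() == 3      # default now comes from _CrossValidatorParams
    assert cv.getFoldCol() == ""      # likewise
    assert cv.getParallelism() == 1   # still set in CrossValidator.__init__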
@@ -600,6 +606,10 @@ class _TrainValidationSplitParams(_ValidatorParams):
     trainRatio = Param(Params._dummy(), "trainRatio", "Param for ratio between train and\
     validation data. Must be between 0 and 1.", typeConverter=TypeConverters.toFloat)
+    def __init__(self, *args):
+        super(_TrainValidationSplitParams, self).__init__(*args)
+        self._setDefault(trainRatio=0.75)
+
     @since("2.0.0")
     def getTrainRatio(self):
         """
@@ -645,8 +655,11 @@ class TrainValidationSplit(Estimator, _TrainValidationSplitParams, HasParallelis
     [0.5, ...
     >>> evaluator.evaluate(tvsModel.transform(dataset))
     0.833...
+    >>> evaluator.evaluate(tvsModelRead.transform(dataset))
+    0.833...
     .. versionadded:: 2.0.0
+
     """
     @keyword_only
@@ -657,7 +670,7 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, trai
                  parallelism=1, collectSubModels=False, seed=None)
         """
         super(TrainValidationSplit, self).__init__()
-        self._setDefault(trainRatio=0.75, parallelism=1)
+        self._setDefault(parallelism=1)
         kwargs = self._input_kwargs
         self._set(**kwargs)
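TrainValidationSplit gets the mirror-image treatment: the `trainRatio=0.75` default moves into `_TrainValidationSplitParams.__init__`, leaving the estimator to default only `parallelism`. As with CrossValidator, the effect is observable without fitting anything, again assuming a build that includes this change:

    from pyspark.ml.tuning import TrainValidationSplit

    tvs = TrainValidationSplit()
    assert tvs.getTrainRatio() == 0.75   # inherited from _TrainValidationSplitParams
    assert tvs.getParallelism() == 1     # still set in TrainValidationSplit.__init__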