diff --git a/docs/ml-features.md b/docs/ml-features.md
index d2f036fb083d..53c822c335f5 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1188,7 +1188,9 @@ categorical features. The number of bins is set by the `numBuckets` parameter. I
 that the number of buckets used will be smaller than this value, for example, if there are too few
 distinct values of the input to create enough distinct quantiles.
 
-NaN values: Note also that QuantileDiscretizer
+NaN values:
+NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will produce
+a `Bucketizer` model for making predictions. During the transformation, `Bucketizer`
 will raise an error when it finds NaN values in the dataset, but the user can also choose to either
 keep or remove NaN values within the dataset by setting `handleInvalid`. If the user chooses to keep
 NaN values, they will be handled specially and placed into their own bucket, for example, if 4 buckets
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index d07b4adebb08..f62d8262e0d6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -307,7 +307,6 @@ class LogisticRegression @Since("1.2.0") (
 
   private var optInitialModel: Option[LogisticRegressionModel] = None
 
-  /** @group setParam */
   private[spark] def setInitialModel(model: LogisticRegressionModel): this.type = {
     this.optInitialModel = Some(model)
     this
@@ -318,8 +317,9 @@ class LogisticRegression @Since("1.2.0") (
     train(dataset, handlePersistence)
   }
 
-  protected[spark] def train(dataset: Dataset[_], handlePersistence: Boolean):
-    LogisticRegressionModel = {
+  protected[spark] def train(
+      dataset: Dataset[_],
+      handlePersistence: Boolean): LogisticRegressionModel = {
     val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
     val instances: RDD[Instance] =
       dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index f1a7676c74b0..2ee8b6723ff4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -33,7 +33,7 @@ import org.apache.spark.sql.types.DoubleType
 /**
  * Params for Naive Bayes Classifiers.
  */
-private[ml] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
+private[classification] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
 
   /**
    * The smoothing parameter.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index 1143f0f565eb..dbeb9cfe5e21 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -84,11 +84,12 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make Bucketizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
 
   /** @group getParam */
   @Since("2.1.0")
@@ -145,7 +146,7 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] {
   private[feature] val SKIP_INVALID: String = "skip"
   private[feature] val ERROR_INVALID: String = "error"
   private[feature] val KEEP_INVALID: String = "keep"
-  private[feature] val supportedHandleInvalid: Array[String] =
+  private[feature] val supportedHandleInvalids: Array[String] =
     Array(SKIP_INVALID, ERROR_INVALID, KEEP_INVALID)
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 653fa41124f8..9725125b6cb0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -82,11 +82,13 @@ private[feature] trait ChiSqSelectorParams extends Params
    * Default value is 0.05.
    * @group param
    */
+  @Since("2.1.0")
   final val fpr = new DoubleParam(this, "fpr", "The highest p-value for features to be kept.",
     ParamValidators.inRange(0, 1))
   setDefault(fpr -> 0.05)
 
   /** @group getParam */
+  @Since("2.1.0")
   def getFpr: Double = $(fpr)
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index b9e01dde70d8..b2ec37bf935f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -72,11 +72,12 @@ private[feature] trait QuantileDiscretizerBase extends Params
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make QuantileDiscretizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
   setDefault(handleInvalid, Bucketizer.ERROR_INVALID)
 
   /** @group getParam */
@@ -91,8 +92,10 @@ private[feature] trait QuantileDiscretizerBase extends Params
  * possible that the number of buckets used will be smaller than this value, for example, if there
  * are too few distinct values of the input to create enough distinct quantiles.
  *
- * NaN handling: Note also that
- * QuantileDiscretizer will raise an error when it finds NaN values in the dataset, but the user can
+ * NaN handling:
+ * NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will
+ * produce a `Bucketizer` model for making predictions. During the transformation,
+ * `Bucketizer` will raise an error when it finds NaN values in the dataset, but the user can
  * also choose to either keep or remove NaN values within the dataset by setting `handleInvalid`.
  * If the user chooses to keep NaN values, they will be handled specially and placed into their own
  * bucket, for example, if 4 buckets are used, then non-NaN data will be put into buckets[0-3],
diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala
index 96fd0d18b5ae..dc3bcc662733 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala
@@ -34,7 +34,7 @@ import org.apache.spark.mllib.linalg.CholeskyDecomposition
  * @param objectiveHistory Option containing the objective history when an optimization program is
  *                         used to solve the normal equations. None when an analytic solver is used.
  */
-private[ml] class NormalEquationSolution(
+private[optim] class NormalEquationSolution(
     val coefficients: Array[Double],
     val aaInv: Option[Array[Double]],
     val objectiveHistory: Option[Array[Double]])
@@ -42,7 +42,7 @@ private[ml] class NormalEquationSolution(
 /**
  * Interface for classes that solve the normal equations locally.
  */
-private[ml] sealed trait NormalEquationSolver {
+private[optim] sealed trait NormalEquationSolver {
 
   /** Solve the normal equations from summary statistics. */
   def solve(
@@ -56,7 +56,7 @@ private[ml] sealed trait NormalEquationSolver {
 /**
  * A class that solves the normal equations directly, using Cholesky decomposition.
  */
-private[ml] class CholeskySolver extends NormalEquationSolver {
+private[optim] class CholeskySolver extends NormalEquationSolver {
 
   override def solve(
       bBar: Double,
@@ -75,7 +75,7 @@ private[ml] class CholeskySolver extends NormalEquationSolver {
 /**
  * A class for solving the normal equations using Quasi-Newton optimization methods.
  */
-private[ml] class QuasiNewtonSolver(
+private[optim] class QuasiNewtonSolver(
     fitIntercept: Boolean,
     maxIter: Int,
     tol: Double,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 767d056861a8..bc3a6b2ad430 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -391,13 +391,13 @@ class NaiveBayes private (
 object NaiveBayes {
 
   /** String name for multinomial model type. */
-  private[spark] val Multinomial: String = "multinomial"
+  private[classification] val Multinomial: String = "multinomial"
 
   /** String name for Bernoulli model type. */
-  private[spark] val Bernoulli: String = "bernoulli"
+  private[classification] val Bernoulli: String = "bernoulli"
 
   /* Set of modelTypes that NaiveBayes supports */
-  private[spark] val supportedModelTypes = Set(Multinomial, Bernoulli)
+  private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)
 
   /**
    * Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
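The visibility changes in this patch all follow one pattern: narrow each member's qualified-private scope from `private[spark]` or `private[ml]` to the innermost package or object that actually uses it. A minimal standalone sketch of how Scala's `private[X]` qualifier behaves (the `demo` packages and names below are hypothetical, not Spark code):

```scala
package demo {
  package inner {
    object Names {
      // Visible anywhere under `demo` -- analogous to private[spark].
      private[demo] val Broad: String = "broad"
      // Visible only under `demo.inner` -- analogous to private[classification]
      // or private[optim] above.
      private[inner] val Narrow: String = "narrow"
      // Visible only inside object Names -- analogous to private[HashingTF] below.
      private[Names] val Tight: String = "tight"

      def describe: String = s"$Broad $Narrow $Tight" // all three accessible here
    }

    object SamePackage {
      val ok: String = Names.Narrow // compiles: same package demo.inner
    }
  }

  object Sibling {
    val ok: String = inner.Names.Broad // compiles: still inside demo
    // inner.Names.Narrow would not compile here: Sibling is outside demo.inner
  }
}
```

Narrowing the qualifier this way makes accidental cross-package use a compile error, which is why the remaining hunks can safely drop the broader `private[spark]` scopes.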
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index f9156b642785..9f71c4f6c11b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -262,7 +262,7 @@ private[spark] object ChiSqSelector {
   val Percentile: String = "percentile"
 
   /** String name for `fpr` selector type. */
-  private[spark] val FPR: String = "fpr"
+  val FPR: String = "fpr"
 
   /** Set of selector types that ChiSqSelector supports. */
   val supportedSelectorTypes: Array[String] = Array(NumTopFeatures, Percentile, FPR)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
index bc26655104a9..9abdd44a635d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -131,9 +131,9 @@ class HashingTF(val numFeatures: Int) extends Serializable {
 
 object HashingTF {
 
-  private[spark] val Native: String = "native"
+  private[HashingTF] val Native: String = "native"
 
-  private[spark] val Murmur3: String = "murmur3"
+  private[HashingTF] val Murmur3: String = "murmur3"
 
   private val seed = 42
 
@@ -141,7 +141,7 @@ object HashingTF {
    * Calculate a hash code value for the term object using the native Scala implementation.
    * This is the default hash algorithm used in Spark 1.6 and earlier.
    */
-  private[spark] def nativeHash(term: Any): Int = term.##
+  private[HashingTF] def nativeHash(term: Any): Int = term.##
 
   /**
    * Calculate a hash code value for the term object using
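Taken together, the documentation hunks above describe a specific runtime contract for NaN values: they are dropped while `QuantileDiscretizer` computes its splits, and the resulting `Bucketizer` then decides what to do with them at transform time. A short sketch of that behavior (assumes a running `SparkSession` named `spark`; the column names are illustrative only):

```scala
import org.apache.spark.ml.feature.QuantileDiscretizer

val df = spark.createDataFrame(
  Seq(1.0, 2.0, 3.0, 4.0, Double.NaN).map(Tuple1.apply)
).toDF("hour")

val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("bucket")
  .setNumBuckets(4)

// NaN rows are ignored while the quantile splits are computed; fit()
// returns a Bucketizer model.
val bucketizer = discretizer.fit(df)

// With the default handleInvalid = "error", transform() throws on the NaN
// row. "keep" routes NaNs into their own extra bucket (bucket 4 when 4
// regular buckets are used); "skip" filters those rows out instead.
bucketizer.setHandleInvalid("keep")
bucketizer.transform(df).show()
```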