@@ -42,6 +42,16 @@ import org.apache.spark.sql.functions._
* learning algorithm for classification.
* It supports binary labels, as well as both continuous and categorical features.
* Note: Multiclass labels are not currently supported.
*
* The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
*
* Notes on Gradient Boosting vs. TreeBoost:
* - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
* - Both algorithms learn tree ensembles by minimizing loss functions.
* - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
* based on the loss function, whereas the original gradient boosting method does not.
* - We expect to implement TreeBoost in the future:
* [https://issues.apache.org/jira/browse/SPARK-4240]
*/
@Since("1.4.0")
@Experimental
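To make the GBT note above concrete, here is a minimal sketch of the stochastic gradient boosting loop, with the point where TreeBoost would diverge marked in a comment. This is not Spark's implementation; `fitRegressionTree` and the 50% subsample are hypothetical stand-ins.

{{{
// Sketch only: plain stochastic gradient boosting, not Spark's code.
def boost(
    data: Seq[(Double, Array[Double])],       // (label, features)
    numIterations: Int,
    learningRate: Double,
    lossGradient: (Double, Double) => Double  // d loss / d prediction
  ): Array[Double] => Double = {
  var trees = List.empty[Array[Double] => Double]
  def predict(x: Array[Double]): Double = trees.map(_(x)).sum
  for (_ <- 1 to numIterations) {
    // Each iteration fits a regression tree to the pseudo-residuals
    // (negative loss gradients) of the current ensemble, computed on a
    // random subsample of the corpus ("stochastic").
    val sample = scala.util.Random.shuffle(data).take(data.size / 2)
    val residuals = sample.map { case (y, x) => (-lossGradient(y, predict(x)), x) }
    val tree = fitRegressionTree(residuals)
    // TreeBoost would additionally re-optimize each leaf's output against
    // the original loss here; plain gradient boosting keeps the raw leaf
    // values and only scales them by the learning rate.
    trees = ((x: Array[Double]) => learningRate * tree(x)) :: trees
  }
  predict
}

// Hypothetical helper: fits a squared-error regression tree to (target, features) pairs.
def fitRegressionTree(points: Seq[(Double, Array[Double])]): Array[Double] => Double = ???
}}}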
@@ -774,10 +774,10 @@ sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary
*/
sealed trait LogisticRegressionSummary extends Serializable {

/** Dataframe outputted by the model's `transform` method. */
/** Dataframe output by the model's `transform` method. */
def predictions: DataFrame

/** Field in "predictions" which gives the calibrated probability of each class as a vector. */
/** Field in "predictions" which gives the probability of each class as a vector. */
def probabilityCol: String

/** Field in "predictions" which gives the true label of each instance (if available). */
@@ -792,8 +792,8 @@ sealed trait LogisticRegressionSummary extends Serializable {
* :: Experimental ::
* Logistic regression training results.
*
* @param predictions dataframe outputted by the model's `transform` method.
* @param probabilityCol field in "predictions" which gives the calibrated probability of
* @param predictions dataframe output by the model's `transform` method.
* @param probabilityCol field in "predictions" which gives the probability of
* each class as a vector.
* @param labelCol field in "predictions" which gives the true label of each instance.
* @param featuresCol field in "predictions" which gives the features of each instance as a vector.
@@ -816,8 +816,8 @@ class BinaryLogisticRegressionTrainingSummary private[classification] (
* :: Experimental ::
* Binary Logistic regression results for a given model.
*
* @param predictions dataframe outputted by the model's `transform` method.
* @param probabilityCol field in "predictions" which gives the calibrated probability of
* @param predictions dataframe output by the model's `transform` method.
* @param probabilityCol field in "predictions" which gives the probability of
* each class as a vector.
* @param labelCol field in "predictions" which gives the true label of each instance.
* @param featuresCol field in "predictions" which gives the features of each instance as a vector.
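A usage sketch for the summary fields documented in these hunks, assuming a DataFrame `training` with "label" and "features" columns:

{{{
import org.apache.spark.ml.classification.LogisticRegression

val model = new LogisticRegression().fit(training)
val summary = model.summary
// `predictions` is the DataFrame output by the model's transform method;
// `probabilityCol` names its vector-valued probability column.
summary.predictions.select(summary.probabilityCol, summary.labelCol).show(5)
}}}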
34 changes: 23 additions & 11 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -190,6 +190,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
def getTopicDistributionCol: String = $(topicDistributionCol)

/**
* For Online optimizer only: [[optimizer]] = "online".
*
* A (positive) learning parameter that downweights early iterations. Larger values make early
* iterations count less.
* This is called "tau0" in the Online LDA paper (Hoffman et al., 2010)
@@ -198,15 +200,18 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* @group expertParam
*/
@Since("1.6.0")
final val learningOffset = new DoubleParam(this, "learningOffset", "A (positive) learning" +
" parameter that downweights early iterations. Larger values make early iterations count less.",
final val learningOffset = new DoubleParam(this, "learningOffset", "(For online optimizer)" +
" A (positive) learning parameter that downweights early iterations. Larger values make early" +
" iterations count less.",
ParamValidators.gt(0))

/** @group expertGetParam */
@Since("1.6.0")
def getLearningOffset: Double = $(learningOffset)

/**
* For Online optimizer only: [[optimizer]] = "online".
*
* Learning rate, set as an exponential decay rate.
* This should be between (0.5, 1.0] to guarantee asymptotic convergence.
* This is called "kappa" in the Online LDA paper (Hoffman et al., 2010).
@@ -215,15 +220,17 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* @group expertParam
*/
@Since("1.6.0")
final val learningDecay = new DoubleParam(this, "learningDecay", "Learning rate, set as an" +
" exponential decay rate. This should be between (0.5, 1.0] to guarantee asymptotic" +
" convergence.", ParamValidators.gt(0))
final val learningDecay = new DoubleParam(this, "learningDecay", "(For online optimizer)" +
" Learning rate, set as an exponential decay rate. This should be between (0.5, 1.0] to" +
" guarantee asymptotic convergence.", ParamValidators.gt(0))

/** @group expertGetParam */
@Since("1.6.0")
def getLearningDecay: Double = $(learningDecay)

/**
* For Online optimizer only: [[optimizer]] = "online".
*
* Fraction of the corpus to be sampled and used in each iteration of mini-batch gradient descent,
* in range (0, 1].
*
@@ -239,15 +246,18 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* @group param
*/
@Since("1.6.0")
final val subsamplingRate = new DoubleParam(this, "subsamplingRate", "Fraction of the corpus" +
" to be sampled and used in each iteration of mini-batch gradient descent, in range (0, 1].",
final val subsamplingRate = new DoubleParam(this, "subsamplingRate", "(For online optimizer)" +
" Fraction of the corpus to be sampled and used in each iteration of mini-batch" +
" gradient descent, in range (0, 1].",
ParamValidators.inRange(0.0, 1.0, lowerInclusive = false, upperInclusive = true))

/** @group getParam */
@Since("1.6.0")
def getSubsamplingRate: Double = $(subsamplingRate)

/**
* For Online optimizer only (currently): [[optimizer]] = "online".
*
* Indicates whether the docConcentration (Dirichlet parameter for
* document-topic distribution) will be optimized during training.
* Setting this to true will make the model more expressive and fit the training data better.
@@ -257,15 +267,17 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
*/
@Since("1.6.0")
final val optimizeDocConcentration = new BooleanParam(this, "optimizeDocConcentration",
"Indicates whether the docConcentration (Dirichlet parameter for document-topic" +
" distribution) will be optimized during training.")
"(For online optimizer only, currently) Indicates whether the docConcentration" +
" (Dirichlet parameter for document-topic distribution) will be optimized during training.")

/** @group expertGetParam */
@Since("1.6.0")
def getOptimizeDocConcentration: Boolean = $(optimizeDocConcentration)

/**
* For EM optimizer, if using checkpointing, this indicates whether to keep the last
* For EM optimizer only: [[optimizer]] = "em".
*
* If using checkpointing, this indicates whether to keep the last
* checkpoint. If false, then the checkpoint will be deleted. Deleting the checkpoint can
* cause failures if a data partition is lost, so set this bit with care.
* Note that checkpoints will be cleaned up via reference counting, regardless.
@@ -279,7 +291,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
*/
@Since("2.0.0")
final val keepLastCheckpoint = new BooleanParam(this, "keepLastCheckpoint",
"For EM optimizer, if using checkpointing, this indicates whether to keep the last" +
"(For EM optimizer) If using checkpointing, this indicates whether to keep the last" +
" checkpoint. If false, then the checkpoint will be deleted. Deleting the checkpoint can" +
" cause failures if a data partition is lost, so set this bit with care.")

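A configuration sketch for the optimizer-specific parameters documented above, assuming a DataFrame `corpus` of document term-count vectors; note which parameters belong to which optimizer:

{{{
import org.apache.spark.ml.clustering.LDA

// Online optimizer: learningOffset, learningDecay, subsamplingRate and
// optimizeDocConcentration apply.
val onlineLDA = new LDA()
  .setOptimizer("online")
  .setK(10)
  .setLearningOffset(1024.0)         // tau0: downweights early iterations
  .setLearningDecay(0.51)            // kappa: in (0.5, 1.0] for convergence
  .setSubsamplingRate(0.05)          // mini-batch fraction in (0, 1]
  .setOptimizeDocConcentration(true)

// EM optimizer: keepLastCheckpoint applies instead.
val emLDA = new LDA()
  .setOptimizer("em")
  .setK(10)
  .setKeepLastCheckpoint(true)

val model = onlineLDA.fit(corpus)
}}}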
@@ -41,6 +41,18 @@ import org.apache.spark.sql.functions._
* [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]]
* learning algorithm for regression.
* It supports both continuous and categorical features.
*
* The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
*
* Notes on Gradient Boosting vs. TreeBoost:
* - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
* - Both algorithms learn tree ensembles by minimizing loss functions.
* - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
* based on the loss function, whereas the original gradient boosting method does not.
* - When the loss is SquaredError, these methods give the same result, but they could differ
* for other loss functions.
* - We expect to implement TreeBoost in the future:
* [https://issues.apache.org/jira/browse/SPARK-4240]
*/
@Since("1.4.0")
@Experimental
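The SquaredError point in this hunk can be made precise. At step m, gradient boosting fits a tree to the pseudo-residuals, while TreeBoost additionally re-solves each leaf output against the loss:

{{{
r_i^{(m)} = -\left[ \frac{\partial L(y_i, F(x_i))}{\partial F(x_i)} \right]_{F = F_{m-1}}
\qquad
\gamma_{jm} = \arg\min_{\gamma} \sum_{x_i \in R_{jm}} L\big(y_i, F_{m-1}(x_i) + \gamma\big)
}}}

For L(y, F) = (y - F)^2 / 2 the pseudo-residuals are r_i = y_i - F_{m-1}(x_i), and the leaf minimizer \gamma_{jm} is their mean over the leaf, which is exactly what a squared-error regression tree already predicts, so the two methods coincide. For other losses (absolute error, log-loss) the minimizer is not the mean of the pseudo-residuals, and the results can differ.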
@@ -165,7 +165,11 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
setDefault(tol -> 1E-6)

/**
* Sets the regularization parameter.
* Sets the regularization parameter for L2 regularization.
* The regularization term is
* {{{
* 0.5 * regParam * L2norm(coefficients)^2
* }}}
* Default is 0.0.
* @group setParam
*/
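A small sketch of the penalty this formula implies, with illustrative values (the commented estimator call is the usual way to set it):

{{{
// Regularization term: 0.5 * regParam * ||coefficients||_2^2
val regParam = 0.3
val coefficients = Array(0.5, -1.2, 2.0)
val penalty = 0.5 * regParam * coefficients.map(c => c * c).sum

// On the estimator itself:
// new GeneralizedLinearRegression().setRegParam(0.3)
}}}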
@@ -772,7 +776,7 @@ object GeneralizedLinearRegressionModel extends MLReadable[GeneralizedLinearRegr
* :: Experimental ::
* Summarizing Generalized Linear regression Fits.
*
* @param predictions predictions outputted by the model's `transform` method
* @param predictions predictions output by the model's `transform` method
* @param predictionCol field in "predictions" which gives the prediction value of each instance
* @param model the model that should be summarized
* @param diagInvAtWA diagonal of matrix (A^T * W * A)^-1 in the last iteration
@@ -933,6 +937,9 @@ class GeneralizedLinearRegressionSummary private[regression] (

/**
* Standard error of estimated coefficients and intercept.
*
* If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
* then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
lazy val coefficientStandardErrors: Array[Double] = {
@@ -941,6 +948,9 @@

/**
* T-statistic of estimated coefficients and intercept.
*
* If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
* then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
lazy val tValues: Array[Double] = {
@@ -954,6 +964,9 @@

/**
* Two-sided p-value of estimated coefficients and intercept.
*
* If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
* then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
lazy val pValues: Array[Double] = {
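A sketch of separating the intercept entry from these arrays, assuming a fitted `model: GeneralizedLinearRegressionModel`:

{{{
val summary = model.summary
val se = summary.coefficientStandardErrors
// With fitIntercept == true the intercept's entry comes last.
val (coefficientSE, interceptSE) =
  if (model.getFitIntercept) (se.dropRight(1), Some(se.last)) else (se, None)
}}}

The same indexing applies to tValues and pValues.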
@@ -513,7 +513,7 @@ object LinearRegressionModel extends MLReadable[LinearRegressionModel] {
* Linear regression training results. Currently, the training summary ignores the
* training weights except for the objective trace.
*
* @param predictions predictions outputted by the model's `transform` method.
* @param predictions predictions output by the model's `transform` method.
* @param objectiveHistory objective function (scaled loss + regularization) at each iteration.
*/
@Since("1.5.0")
@@ -549,7 +549,7 @@ class LinearRegressionTrainingSummary private[regression] (
* :: Experimental ::
* Linear regression results evaluated on a dataset.
*
* @param predictions predictions outputted by the model's `transform` method.
* @param predictions predictions output by the model's `transform` method.
* @param predictionCol Field in "predictions" which gives the predicted value of the label at
* each instance.
* @param labelCol Field in "predictions" which gives the true label of each instance.
@@ -655,8 +655,11 @@ class LinearRegressionSummary private[regression] (

/**
* Standard error of estimated coefficients and intercept.
*
* This value is only available when using the "normal" solver.
*
* If [[LinearRegression.fitIntercept]] is set to true,
* then the last element returned corresponds to the intercept.
*
* @see [[LinearRegression.solver]]
*/
lazy val coefficientStandardErrors: Array[Double] = {
@@ -679,8 +682,11 @@

/**
* T-statistic of estimated coefficients and intercept.
*
* This value is only available when using the "normal" solver.
*
* If [[LinearRegression.fitIntercept]] is set to true,
* then the last element returned corresponds to the intercept.
*
* @see [[LinearRegression.solver]]
*/
lazy val tValues: Array[Double] = {
@@ -699,8 +705,11 @@

/**
* Two-sided p-value of estimated coefficients and intercept.
*
* This value is only available when using the "normal" solver.
*
* If [[LinearRegression.fitIntercept]] is set to true,
* then the last element returned corresponds to the intercept.
*
* @see [[LinearRegression.solver]]
*/
lazy val pValues: Array[Double] = {
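These statistics require the "normal" solver; a usage sketch, assuming a DataFrame `training` with "label" and "features" columns:

{{{
import org.apache.spark.ml.regression.LinearRegression

val model = new LinearRegression()
  .setSolver("normal")       // with "l-bfgs" these fields are unavailable
  .setFitIntercept(true)
  .fit(training)

val pv = model.summary.pValues
val interceptPValue = pv.last             // intercept entry comes last
val coefficientPValues = pv.dropRight(1)
}}}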
@@ -30,22 +30,6 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel


/**
* A package that implements
* [[http://en.wikipedia.org/wiki/Gradient_boosting Stochastic Gradient Boosting]]
* for regression and binary classification.
*
* The implementation is based upon:
* J.H. Friedman. "Stochastic Gradient Boosting." 1999.
*
* Notes on Gradient Boosting vs. TreeBoost:
* - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
* - Both algorithms learn tree ensembles by minimizing loss functions.
* - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
* based on the loss function, whereas the original gradient boosting method does not.
* - When the loss is SquaredError, these methods give the same result, but they could differ
* for other loss functions.
*/
private[spark] object GradientBoostedTrees extends Logging {

/**
2 changes: 1 addition & 1 deletion python/pyspark/ml/classification.py
@@ -291,7 +291,7 @@ def predictions(self):
@since("2.0.0")
def probabilityCol(self):
"""
Field in "predictions" which gives the calibrated probability
Field in "predictions" which gives the probability
of each class as a vector.
"""
return self._call_java("probabilityCol")
9 changes: 9 additions & 0 deletions python/pyspark/ml/regression.py
@@ -331,6 +331,9 @@ def coefficientStandardErrors(self):
Standard error of estimated coefficients and intercept.
This value is only available when using the "normal" solver.

If :py:attr:`LinearRegression.fitIntercept` is set to True,
then the last element returned corresponds to the intercept.

.. seealso:: :py:attr:`LinearRegression.solver`
"""
return self._call_java("coefficientStandardErrors")
@@ -342,6 +345,9 @@ def tValues(self):
T-statistic of estimated coefficients and intercept.
This value is only available when using the "normal" solver.

If :py:attr:`LinearRegression.fitIntercept` is set to True,
then the last element returned corresponds to the intercept.

.. seealso:: :py:attr:`LinearRegression.solver`
"""
return self._call_java("tValues")
@@ -353,6 +359,9 @@ def pValues(self):
Two-sided p-value of estimated coefficients and intercept.
This value is only available when using the "normal" solver.

If :py:attr:`LinearRegression.fitIntercept` is set to True,
then the last element returned corresponds to the intercept.

.. seealso:: :py:attr:`LinearRegression.solver`
"""
return self._call_java("pValues")
2 changes: 2 additions & 0 deletions python/pyspark/ml/tuning.py
@@ -588,6 +588,8 @@ def _to_java(self):
class TrainValidationSplitModel(Model, ValidatorParams, MLReadable, MLWritable):
"""
Model from train validation split.

.. versionadded:: 2.0.0
"""

def __init__(self, bestModel):
2 changes: 1 addition & 1 deletion python/pyspark/ml/wrapper.py
@@ -249,7 +249,7 @@ def __init__(self, java_model=None):
"""
Initialize this instance with a Java model object.
Subclasses should call this constructor, initialize params,
and then call _transformer_params_from_java.
and then call _transfer_params_from_java.

This instance can be instantiated without specifying java_model,
it will be assigned after that, but this scenario only used by
4 changes: 4 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -154,6 +154,8 @@ object functions {
/**
* Aggregate function: returns the approximate number of distinct items in a group.
*
* @param rsd maximum estimation error allowed (default = 0.05)
*
* @group agg_funcs
* @since 1.3.0
*/
@@ -164,6 +166,8 @@ object functions {
/**
* Aggregate function: returns the approximate number of distinct items in a group.
*
* @param rsd maximum estimation error allowed (default = 0.05)
*
* @group agg_funcs
* @since 1.3.0
*/
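A usage sketch for the rsd parameter documented in these two hunks, assuming a DataFrame `df` with a "user_id" column; a smaller allowed error makes the underlying sketch larger:

{{{
import org.apache.spark.sql.functions.approxCountDistinct

df.agg(approxCountDistinct("user_id"))        // default rsd = 0.05
df.agg(approxCountDistinct("user_id", 0.01))  // allow at most 1% relative error
}}}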