From 487b3614f7f4de2956f1af23091b9aacbbf57281 Mon Sep 17 00:00:00 2001
From: MechCoder
Date: Fri, 14 Aug 2015 01:00:45 +0530
Subject: [PATCH 1/8] [SPARK-9906] [ML] User guide for LogisticRegressionSummary

---
 docs/ml-guide.md | 135 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index b6ca50e98db02..e022d535d7fdb 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -801,6 +801,141 @@ jsc.stop();

+## Examples: Summaries for LogisticRegression
+
+Once LogisticRegression has been run on data, it is useful to extract statistics such as the
+loss per iteration, which gives an intuition about convergence and overfitting, and metrics
+that show how well the model has performed on the training and test data.
+
+LogisticRegressionTrainingSummary provides an interface to access such information,
+i.e. the objectiveHistory and the metrics evaluated on the training data, with very
+little code to be written by the user.
+In the future, a method will be made available on the fitted LogisticRegressionModel to
+obtain a LogisticRegressionSummary of the test data as well.
+
+This example illustrates the use of LogisticRegressionTrainingSummary on some toy data.
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.classification.{LogisticRegression, BinaryLogisticRegressionSummary}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.{Row, SQLContext}

val conf = new SparkConf().setAppName("LogisticRegressionSummary")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._

// Use some random data for demonstration.
// Note that the RDD of LabeledPoints can be converted to a dataframe directly.
val data = sc.parallelize(Array(
  LabeledPoint(0.0, Vectors.dense(0.2, 4.5, 1.6)),
  LabeledPoint(1.0, Vectors.dense(3.1, 6.8, 3.6)),
  LabeledPoint(0.0, Vectors.dense(2.4, 0.9, 1.9)),
  LabeledPoint(1.0, Vectors.dense(9.1, 3.1, 3.6)),
  LabeledPoint(0.0, Vectors.dense(2.5, 1.9, 9.1)))
)
val logRegDataFrame = data.toDF()

// Run Logistic Regression on your toy data.
// Since LogisticRegression is an estimator, it returns an instance of LogisticRegressionModel
// which is a transformer.
val logReg = new LogisticRegression()
logReg.setMaxIter(5)
logReg.setRegParam(0.01)
val logRegModel = logReg.fit(logRegDataFrame)

// Extract the summary directly from the returned LogisticRegressionModel instance.
val trainingSummary = logRegModel.summary

// Obtain the loss per iteration. This should generally decrease from one
// iteration to the next, and show negligible change once the solver converges.
val objectiveHistory = trainingSummary.objectiveHistory
objectiveHistory.foreach(loss => println(loss))

// Obtain the metrics useful to judge performance on test data.
val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary]

// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
val roc = binarySummary.roc
val falsePositiveRate = roc.select("FPR")
val area = binarySummary.areaUnderROC

// Obtain the threshold with the highest fMeasure.
val fMeasure = binarySummary.fMeasureByThreshold
val fScoreRDD = fMeasure.map { case Row(thresh: Double, fscore: Double) => (thresh, fscore) }
val (highThresh, highFScore) = fScoreRDD.fold((0.0, 0.0))((threshFScore1, threshFScore2) => {
  if (threshFScore1._2 > threshFScore2._2) threshFScore1 else threshFScore2
})
{% endhighlight %}
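The binary summary exposes a few more per-threshold metrics than the ones used above.
As a side note, the following is a minimal sketch (not part of the original example;
it reuses the `binarySummary` value from the listing) of the precision-recall metrics
available in the same `DataFrame` form:

{% highlight scala %}
// A sketch of the other per-threshold metrics on BinaryLogisticRegressionSummary.
// Each call returns a DataFrame with a "threshold" column and a metric column.
val pr = binarySummary.pr                                      // precision-recall curve
val precisionByThreshold = binarySummary.precisionByThreshold  // (threshold, precision)
val recallByThreshold = binarySummary.recallByThreshold        // (threshold, recall)
precisionByThreshold.show()
recallByThreshold.show()
{% endhighlight %}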
</div>

<div data-lang="java" markdown="1">
{% highlight java %}
import com.google.common.collect.Lists;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.classification.LogisticRegressionModel;
import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;

// Use some random data for demonstration.
// Note that the RDD of LabeledPoints can be converted to a dataframe directly.
JavaRDD<LabeledPoint> data = sc.parallelize(Lists.newArrayList(
  new LabeledPoint(0.0, Vectors.dense(0.2, 4.5, 1.6)),
  new LabeledPoint(1.0, Vectors.dense(3.1, 6.8, 3.6)),
  new LabeledPoint(0.0, Vectors.dense(2.4, 0.9, 1.9)),
  new LabeledPoint(1.0, Vectors.dense(9.1, 3.1, 3.6)),
  new LabeledPoint(0.0, Vectors.dense(2.5, 1.9, 9.1)))
);
DataFrame logRegDataFrame = sql.createDataFrame(data, LabeledPoint.class);

// Run Logistic Regression on your toy data.
// Since LogisticRegression is an estimator, it returns an instance of LogisticRegressionModel
// which is a transformer.
LogisticRegression logReg = new LogisticRegression();
logReg.setMaxIter(5);
logReg.setRegParam(0.01);
LogisticRegressionModel logRegModel = logReg.fit(logRegDataFrame);

// Extract the summary directly from the returned LogisticRegressionModel instance.
LogisticRegressionTrainingSummary trainingSummary = logRegModel.summary();

// Obtain the loss per iteration. This should generally decrease from one
// iteration to the next, and show negligible change once the solver converges.
double[] objectiveHistory = trainingSummary.objectiveHistory();
for (double lossPerIteration: objectiveHistory) {
  System.out.println(lossPerIteration);
}

// Obtain the metrics useful to judge performance on test data.
BinaryLogisticRegressionSummary binarySummary = (BinaryLogisticRegressionSummary) trainingSummary;

// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
DataFrame roc = binarySummary.roc();
DataFrame falsePositiveRate = roc.select("FPR");
double area = binarySummary.areaUnderROC();

// Obtain the threshold with the highest fMeasure.
DataFrame fMeasure = binarySummary.fMeasureByThreshold();


{% highlight %}
</div>
</div>

# Dependencies

Spark ML currently depends on MLlib and has the same dependencies.

From 56cb35bdc555ce476e7fe441ae926f9f585a616d Mon Sep 17 00:00:00 2001
From: MechCoder
Date: Sat, 15 Aug 2015 21:10:55 +0530
Subject: [PATCH 2/8] minor

---
 docs/ml-guide.md | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index e022d535d7fdb..9fda93e29fe1a 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -803,17 +803,20 @@ jsc.stop();

 ## Examples: Summaries for LogisticRegression

-Once LogisticRegression has been run on data, it is useful to extract statistics such as the
+Once [`LogisticRegression`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression)
+has been run on data, it is useful to extract statistics such as the
 loss per iteration, which gives an intuition about convergence and overfitting, and metrics
 that show how well the model has performed on the training and test data.

-LogisticRegressionTrainingSummary provides an interface to access such information,
-i.e. the objectiveHistory and the metrics evaluated on the training data, with very
-little code to be written by the user.
-In the future, a method will be made available on the fitted LogisticRegressionModel to
-obtain a LogisticRegressionSummary of the test data as well.
+[`LogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionTrainingSummary)
+provides an interface to access such information, i.e. the objectiveHistory and the
+metrics evaluated on the training data, with very little code to be written by the user.
+In the future, a method will be made available on the fitted
+[`LogisticRegressionModel`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionModel) to obtain
+a [`LogisticRegressionSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionSummary)
+of the test data as well.

-This example illustrates the use of LogisticRegressionTrainingSummary on some toy data.
+This example illustrates the use of `LogisticRegressionTrainingSummary` on some toy data.
@@ -861,8 +864,9 @@ val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary

 // Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
 val roc = binarySummary.roc
-val falsePositiveRate = roc.select("FPR")
-val area = binarySummary.areaUnderROC
+roc.show()
+roc.select("FPR").show()
+println(binarySummary.areaUnderROC)

 // Obtain the threshold with the highest fMeasure.
 val fMeasure = binarySummary.fMeasureByThreshold
@@ -921,8 +925,9 @@ BinaryLogisticRegressionSummary binarySummary = (BinaryLogisticRegressionSummary

 // Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
 DataFrame roc = binarySummary.roc();
-DataFrame falsePositiveRate = roc.select("FPR");
-double area = binarySummary.areaUnderROC();
+roc.show();
+roc.select("FPR").show();
+System.out.println(binarySummary.areaUnderROC());

 // Obtain the threshold with the highest fMeasure.
 DataFrame fMeasure = binarySummary.fMeasureByThreshold();

From 983127077bf44c55fd85d0ca9bd206f3dd3b74fe Mon Sep 17 00:00:00 2001
From: MechCoder
Date: Sun, 16 Aug 2015 10:27:05 +0530
Subject: [PATCH 3/8] remove threshold selection

---
 docs/ml-guide.md | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index 9fda93e29fe1a..fc7e5e4474ee7 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -809,12 +809,9 @@ loss per iteration, which gives an intuition about convergence and overfitting,
 that show how well the model has performed on the training and test data.

 [`LogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionTrainingSummary)
-provides an interface to access such information, i.e. the objectiveHistory and the
-metrics evaluated on the training data, with very little code to be written by the user.
-In the future, a method will be made available on the fitted
-[`LogisticRegressionModel`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionModel) to obtain
-a [`LogisticRegressionSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionSummary)
-of the test data as well.
+provides an interface to access such information, i.e. the `objectiveHistory` and the
+metrics evaluated on the training data, with very little code to be written by the user.

 This example illustrates the use of `LogisticRegressionTrainingSummary` on some toy data.

 roc.show()
 roc.select("FPR").show()
 println(binarySummary.areaUnderROC)

-// Obtain the threshold with the highest fMeasure.
+// Print all threshold, fMeasure pairs.
 val fMeasure = binarySummary.fMeasureByThreshold
-val fScoreRDD = fMeasure.map { case Row(thresh: Double, fscore: Double) => (thresh, fscore) }
-val (highThresh, highFScore) = fScoreRDD.fold((0.0, 0.0))((threshFScore1, threshFScore2) => {
-  if (threshFScore1._2 > threshFScore2._2) threshFScore1 else threshFScore2
-})
+fMeasure.foreach { case Row(thresh: Double, fscore: Double) =>
+  println(s"Threshold: $thresh, F-Measure: $fscore") }

 {% endhighlight %}
@@ -881,7 +877,9 @@ val (highThresh, highFScore) = fScoreRDD.fold((0.0, 0.0))((threshFScore1, thresh {% highlight java %} import com.google.common.collect.Lists; +import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary; import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; @@ -890,6 +888,11 @@ import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; + +SparkConf conf = new SparkConf().setAppName("LogisticRegressionSummary"); +JavaSparkContext jsc = new JavaSparkContext(conf); +SQLContext jsql = new SQLContext(jsc); // Use some random data for demonstration. // Note that the RDD of LabeledPoints can be converted to a dataframe directly. @@ -929,18 +932,15 @@ roc.show(); roc.select("FPR").show(); System.out.println(binarySummary.areaUnderROC()); -// Obtain the threshold with the highest fMeasure. +// Print all threshold, fMeasure pairs. DataFrame fMeasure = binarySummary.fMeasureByThreshold(); - - -{% highlight %} +for (Row r: fMeasure.collect()) { + System.out.println("Threshold: " + r.get(0) + ", F-Measure: " + r.get(1)); +} +{% endhighlight %}
# Dependencies

Spark ML currently depends on MLlib and has the same dependencies.

From 1ab3d9c7fe2f8e5bb8b018b145d742514cdcc8f8 Mon Sep 17 00:00:00 2001
From: MechCoder
Date: Sun, 16 Aug 2015 16:12:20 +0530
Subject: [PATCH 4/8] Add example based on threshold selection

---
 docs/ml-guide.md | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index fc7e5e4474ee7..1f37b413d45d8 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -865,11 +865,14 @@ roc.show()
 roc.select("FPR").show()
 println(binarySummary.areaUnderROC)

-// Print all threshold, fMeasure pairs.
+// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
+// this selected threshold.
 val fMeasure = binarySummary.fMeasureByThreshold
-fMeasure.foreach { case Row(thresh: Double, fscore: Double) =>
-  println(s"Threshold: $thresh, F-Measure: $fscore") }
+val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
+val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).
+  select("threshold").head().getDouble(0)
+logReg.setThreshold(bestThreshold)
+logReg.fit(logRegDataFrame)
 {% endhighlight %}

@@ -889,6 +892,7 @@ import org.apache.spark.mllib.linalg.Vectors;
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
+import static org.apache.spark.sql.functions.*;

 SparkConf conf = new SparkConf().setAppName("LogisticRegressionSummary");
 JavaSparkContext jsc = new JavaSparkContext(conf);
@@ -932,11 +936,14 @@ roc.show();
 roc.select("FPR").show();
 System.out.println(binarySummary.areaUnderROC());

-// Print all threshold, fMeasure pairs.
+// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
+// this selected threshold.
 DataFrame fMeasure = binarySummary.fMeasureByThreshold();
-for (Row r: fMeasure.collect()) {
-  System.out.println("Threshold: " + r.get(0) + ", F-Measure: " + r.get(1));
-}
+double maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0);
+double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)).
+  select("threshold").head().getDouble(0);
+logReg.setThreshold(bestThreshold);
+logReg.fit(logRegDataFrame);
 {% endhighlight %}

From 9825b149cae8f8c81c4a18d8452a1377135318ea Mon Sep 17 00:00:00 2001
From: MechCoder
Date: Tue, 18 Aug 2015 11:29:58 +0530
Subject: [PATCH 5/8] Move from ml-guide to ml/linear_models

---
 docs/ml-guide.md | 147 ---------------------------------
 docs/ml-linear-methods.md | 165 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 165 insertions(+), 147 deletions(-)

diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index 1f37b413d45d8..b6ca50e98db02 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -801,153 +801,6 @@ jsc.stop();

-## Examples: Summaries for LogisticRegression
-
-Once [`LogisticRegression`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression)
-has been run on data, it is useful to extract statistics such as the
-loss per iteration, which gives an intuition about convergence and overfitting, and metrics
-that show how well the model has performed on the training and test data.
-
-[`LogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionTrainingSummary)
-provides an interface to access such information, 
i.e. the `objectiveHistory` and the
-metrics evaluated on the training data, with very little code to be written by the user.
-
-This example illustrates the use of `LogisticRegressionTrainingSummary` on some toy data.
-
-<div class="codetabs">
-
-<div data-lang="scala" markdown="1">
-{% highlight scala %}
-import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.ml.classification.{LogisticRegression, BinaryLogisticRegressionSummary}
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.sql.{Row, SQLContext}
-
-val conf = new SparkConf().setAppName("LogisticRegressionSummary")
-val sc = new SparkContext(conf)
-val sqlContext = new SQLContext(sc)
-import sqlContext.implicits._
-
-// Use some random data for demonstration.
-// Note that the RDD of LabeledPoints can be converted to a dataframe directly.
-val data = sc.parallelize(Array(
-  LabeledPoint(0.0, Vectors.dense(0.2, 4.5, 1.6)),
-  LabeledPoint(1.0, Vectors.dense(3.1, 6.8, 3.6)),
-  LabeledPoint(0.0, Vectors.dense(2.4, 0.9, 1.9)),
-  LabeledPoint(1.0, Vectors.dense(9.1, 3.1, 3.6)),
-  LabeledPoint(0.0, Vectors.dense(2.5, 1.9, 9.1)))
-)
-val logRegDataFrame = data.toDF()
-
-// Run Logistic Regression on your toy data.
-// Since LogisticRegression is an estimator, it returns an instance of LogisticRegressionModel
-// which is a transformer.
-val logReg = new LogisticRegression()
-logReg.setMaxIter(5)
-logReg.setRegParam(0.01)
-val logRegModel = logReg.fit(logRegDataFrame)
-
-// Extract the summary directly from the returned LogisticRegressionModel instance.
-val trainingSummary = logRegModel.summary
-
-// Obtain the loss per iteration. This should generally decrease from one
-// iteration to the next, and show negligible change once the solver converges.
-val objectiveHistory = trainingSummary.objectiveHistory
-objectiveHistory.foreach(loss => println(loss))
-
-// Obtain the metrics useful to judge performance on test data.
-val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary]
-
-// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
-val roc = binarySummary.roc
-roc.show()
-roc.select("FPR").show()
-println(binarySummary.areaUnderROC)
-
-// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
-// this selected threshold.
-val fMeasure = binarySummary.fMeasureByThreshold
-val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
-val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).
-  select("threshold").head().getDouble(0)
-logReg.setThreshold(bestThreshold)
-logReg.fit(logRegDataFrame)
-{% endhighlight %}
-</div>
-
-<div data-lang="java" markdown="1">
-{% highlight java %}
-import com.google.common.collect.Lists;
-
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary;
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.ml.classification.LogisticRegressionModel;
-import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
-import static org.apache.spark.sql.functions.*;
-
-SparkConf conf = new SparkConf().setAppName("LogisticRegressionSummary");
-JavaSparkContext jsc = new JavaSparkContext(conf);
-SQLContext jsql = new SQLContext(jsc);
-
-// Use some random data for demonstration.
-// Note that the RDD of LabeledPoints can be converted to a dataframe directly.
-JavaRDD<LabeledPoint> data = sc.parallelize(Lists.newArrayList(
-  new LabeledPoint(0.0, Vectors.dense(0.2, 4.5, 1.6)),
-  new LabeledPoint(1.0, Vectors.dense(3.1, 6.8, 3.6)),
-  new LabeledPoint(0.0, Vectors.dense(2.4, 0.9, 1.9)),
-  new LabeledPoint(1.0, Vectors.dense(9.1, 3.1, 3.6)),
-  new LabeledPoint(0.0, Vectors.dense(2.5, 1.9, 9.1)))
-);
-DataFrame logRegDataFrame = sql.createDataFrame(data, LabeledPoint.class);
-
-// Run Logistic Regression on your toy data.
-// Since LogisticRegression is an estimator, it returns an instance of LogisticRegressionModel
-// which is a transformer.
-LogisticRegression logReg = new LogisticRegression();
-logReg.setMaxIter(5);
-logReg.setRegParam(0.01);
-LogisticRegressionModel logRegModel = logReg.fit(logRegDataFrame);
-
-// Extract the summary directly from the returned LogisticRegressionModel instance.
-LogisticRegressionTrainingSummary trainingSummary = logRegModel.summary();
-
-// Obtain the loss per iteration. This should generally decrease from one
-// iteration to the next, and show negligible change once the solver converges.
-double[] objectiveHistory = trainingSummary.objectiveHistory();
-for (double lossPerIteration: objectiveHistory) {
-  System.out.println(lossPerIteration);
-}
-
-// Obtain the metrics useful to judge performance on test data.
-BinaryLogisticRegressionSummary binarySummary = (BinaryLogisticRegressionSummary) trainingSummary;
-
-// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
-DataFrame roc = binarySummary.roc();
-roc.show();
-roc.select("FPR").show();
-System.out.println(binarySummary.areaUnderROC());
-
-// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
-// this selected threshold.
-DataFrame fMeasure = binarySummary.fMeasureByThreshold();
-double maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0);
-double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)).
-  select("threshold").head().getDouble(0);
-logReg.setThreshold(bestThreshold);
-logReg.fit(logRegDataFrame);
-{% endhighlight %}
-</div>
-</div>
-

 # Dependencies

 Spark ML currently depends on MLlib and has the same dependencies.

diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md
index 1ac83d94c9e81..0594c3f39ef1e 100644
--- a/docs/ml-linear-methods.md
+++ b/docs/ml-linear-methods.md
@@ -127,3 +127,168 @@ print("Intercept: " + str(lrModel.intercept))

 The optimization algorithm underlies the implementation is called [Orthant-Wise Limited-memory QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
 (OWL-QN). It is an extension of L-BFGS that can effectively handle L1 regularization and elastic net.
+
+### Model Summaries
+
+Once a linear model is fit on data, it is useful to extract statistics such as the
+loss per iteration and metrics to understand how well the model has performed on training
+and test data. The examples below illustrate how to use the summaries obtained from the
+`summary` method of fitted linear models.
+
+Note that the predictions and metrics which are stored as dataframes obtained from the summary
+are transient and are not available on the driver. This is because these are as expensive
+to store as the original data itself.
+
+#### Example: Summary for LogisticRegression
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
[`LogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionTrainingSummary)
provides an interface to access information such as the `objectiveHistory` and the metrics
evaluated on the training data, with very little code to be written by the user.
[`LogisticRegression`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression)
currently supports only binary classification, so in order to access the binary metrics
the summary must be explicitly cast to
[`BinaryLogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary),
as done in the code below. This avoids raising errors for multiclass output while providing
extensibility when multiclass classification is supported in the future.

This example illustrates the use of `LogisticRegressionTrainingSummary` on some toy data.

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.classification.{LogisticRegression, BinaryLogisticRegressionSummary}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.{Row, SQLContext}

val conf = new SparkConf().setAppName("LogisticRegressionSummary")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._

// Use some random data for demonstration.
// Note that the RDD of LabeledPoints can be converted to a dataframe directly.
val data = sc.parallelize(Array(
  LabeledPoint(0.0, Vectors.dense(0.2, 4.5, 1.6)),
  LabeledPoint(1.0, Vectors.dense(3.1, 6.8, 3.6)),
  LabeledPoint(0.0, Vectors.dense(2.4, 0.9, 1.9)),
  LabeledPoint(1.0, Vectors.dense(9.1, 3.1, 3.6)),
  LabeledPoint(0.0, Vectors.dense(2.5, 1.9, 9.1)))
)
val logRegDataFrame = data.toDF()

// Run Logistic Regression on your toy data.
// Since LogisticRegression is an estimator, it returns an instance of LogisticRegressionModel
// which is a transformer.
val logReg = new LogisticRegression().setMaxIter(5).setRegParam(0.01)
val logRegModel = logReg.fit(logRegDataFrame)

// Extract the summary directly from the returned LogisticRegressionModel instance.
val trainingSummary = logRegModel.summary

// Obtain the loss per iteration.
val objectiveHistory = trainingSummary.objectiveHistory
objectiveHistory.foreach(loss => println(loss))

// Obtain the metrics useful to judge performance on test data.
// We cast the summary to a BinaryLogisticRegressionSummary since the problem is a
// binary classification problem.
val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary]

// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
val roc = binarySummary.roc
roc.show()
roc.select("FPR").show()
println(binarySummary.areaUnderROC)

// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
// this selected threshold.
val fMeasure = binarySummary.fMeasureByThreshold
val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).
  select("threshold").head().getDouble(0)
logReg.setThreshold(bestThreshold)
logReg.fit(logRegDataFrame)
{% endhighlight %}
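As a side note, the cast above can also be written as a pattern match, so that an
unexpected summary type fails with a descriptive error instead of a `ClassCastException`.
A minimal sketch (not part of the original example; it reuses `trainingSummary` from the
listing above, and the variable name is illustrative only):

{% highlight scala %}
// A sketch of a more defensive cast: pattern matching gives a clear error
// message if a non-binary summary is ever returned.
val binMetrics = trainingSummary match {
  case b: BinaryLogisticRegressionSummary => b
  case other => sys.error(s"Unexpected summary type: ${other.getClass.getName}")
}
{% endhighlight %}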
</div>
<div data-lang="java" markdown="1">
[`LogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/LogisticRegressionTrainingSummary.html)
provides an interface to access information such as the `objectiveHistory` and the metrics
evaluated on the training data, with very little code to be written by the user.
[`LogisticRegression`](api/java/org/apache/spark/ml/classification/LogisticRegression.html)
currently supports only binary classification, so in order to access the binary metrics
the summary must be explicitly cast to
[`BinaryLogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/BinaryLogisticRegressionTrainingSummary.html),
as done in the code below. This avoids raising errors for multiclass output while providing
extensibility when multiclass classification is supported in the future.

This example illustrates the use of `LogisticRegressionTrainingSummary` on some toy data.

{% highlight java %}
import com.google.common.collect.Lists;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary;
import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.classification.LogisticRegressionModel;
import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import static org.apache.spark.sql.functions.*;

SparkConf conf = new SparkConf().setAppName("LogisticRegressionSummary");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext jsql = new SQLContext(jsc);

// Use some random data for demonstration.
// Note that the RDD of LabeledPoints can be converted to a dataframe directly.
JavaRDD<LabeledPoint> data = jsc.parallelize(Lists.newArrayList(
  new LabeledPoint(0.0, Vectors.dense(0.2, 4.5, 1.6)),
  new LabeledPoint(1.0, Vectors.dense(3.1, 6.8, 3.6)),
  new LabeledPoint(0.0, Vectors.dense(2.4, 0.9, 1.9)),
  new LabeledPoint(1.0, Vectors.dense(9.1, 3.1, 3.6)),
  new LabeledPoint(0.0, Vectors.dense(2.5, 1.9, 9.1)))
);
DataFrame logRegDataFrame = jsql.createDataFrame(data, LabeledPoint.class);

// Run Logistic Regression on your toy data.
// Since LogisticRegression is an estimator, it returns an instance of LogisticRegressionModel
// which is a transformer.
LogisticRegression logReg = new LogisticRegression().setMaxIter(5).setRegParam(0.01);
LogisticRegressionModel logRegModel = logReg.fit(logRegDataFrame);

// Extract the summary directly from the returned LogisticRegressionModel instance.
LogisticRegressionTrainingSummary trainingSummary = logRegModel.summary();

// Obtain the loss per iteration.
double[] objectiveHistory = trainingSummary.objectiveHistory();
for (double lossPerIteration: objectiveHistory) {
  System.out.println(lossPerIteration);
}

// Obtain the metrics useful to judge performance on test data.
BinaryLogisticRegressionSummary binarySummary = (BinaryLogisticRegressionSummary) trainingSummary;

// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
+DataFrame roc = binarySummary.roc(); +roc.show(); +roc.select("FPR").show(); +System.out.println(binarySummary.areaUnderROC()); + +// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with +// this selected threshold. +DataFrame fMeasure = binarySummary.fMeasureByThreshold(); +double maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0); +double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)). + select("threshold").head().getDouble(0); +logReg.setThreshold(bestThreshold); +logReg.fit(logRegDataFrame); +{% endhighlight %} +
</div>
</div>

From 1dfe7f6766ff986fcbcdeedfe84e369c83c20343 Mon Sep 17 00:00:00 2001
From: MechCoder
Date: Tue, 18 Aug 2015 11:36:13 +0530
Subject: [PATCH 6/8] remove sparkcontext and sqlcontext imports

---
 docs/ml-linear-methods.md | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md
index 0594c3f39ef1e..12b95144b5b10 100644
--- a/docs/ml-linear-methods.md
+++ b/docs/ml-linear-methods.md
@@ -155,16 +155,10 @@ extensibility when multiclass classification is supported in the future.

 This example illustrates the use of `LogisticRegressionTrainingSummary` on some toy data.

 {% highlight scala %}
-import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.ml.classification.{LogisticRegression, BinaryLogisticRegressionSummary}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.sql.{Row, SQLContext}
-
-val conf = new SparkConf().setAppName("LogisticRegressionSummary")
-val sc = new SparkContext(conf)
-val sqlContext = new SQLContext(sc)
-import sqlContext.implicits._
+import org.apache.spark.sql.Row

 // Use some random data for demonstration.
@@ -228,9 +222,7 @@ This example illustrates the use of `LogisticRegressionTrainingSummary` on some

 {% highlight java %}
 import com.google.common.collect.Lists;

-import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary;
 import org.apache.spark.ml.classification.LogisticRegression;
 import org.apache.spark.ml.classification.LogisticRegressionModel;
 import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary;
 import org.apache.spark.mllib.regression.LabeledPoint;
 import org.apache.spark.mllib.linalg.Vectors;
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
 import static org.apache.spark.sql.functions.*;

-SparkConf conf = new SparkConf().setAppName("LogisticRegressionSummary");
-JavaSparkContext jsc = new JavaSparkContext(conf);
-SQLContext jsql = new SQLContext(jsc);
-
 // Use some random data for demonstration.

From 4060c5b20db6193f8dda1cf256e26e5bdf27d8ce Mon Sep 17 00:00:00 2001
From: Feynman Liang
Date: Tue, 18 Aug 2015 11:16:24 -0700
Subject: [PATCH 7/8] Documentation cleanup in `ml-linear-methods`

* Adds MathJax to Elastic Net exposition
* Adds regularization term to Elastic Net formula and describes correspondence
  between formula symbols and ML pipeline parameters
* Joins LogisticRegressionSummary example with training example to reuse code

---
 docs/ml-linear-methods.md | 177 +++++++++++++++-----------------------
 1 file changed, 71 insertions(+), 106 deletions(-)

diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md
index 12b95144b5b10..85045db0528c3 100644
--- a/docs/ml-linear-methods.md
+++ b/docs/ml-linear-methods.md
@@ -23,20 +23,41 @@ displayTitle: <a href="ml-guide.html">ML</a> - Linear Methods
 \]`

-In MLlib, we implement popular linear methods such as logistic regression and linear least squares with L1 or L2 regularization. Refer to [the linear methods in mllib](mllib-linear-methods.html) for details. 
In `spark.ml`, we also include Pipelines API for [Elastic net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid of L1 and L2 regularization proposed in [this paper](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf). Mathematically it is defined as a linear combination of the L1-norm and the L2-norm: +In MLlib, we implement popular linear methods such as logistic +regression and linear least squares with $L_1$ or $L_2$ regularization. +Refer to [the linear methods in mllib](mllib-linear-methods.html) for +details. In `spark.ml`, we also include Pipelines API for [Elastic +net](http://en.wikipedia.org/wiki/Elastic_net_regularization), a hybrid +of $L_1$ and $L_2$ regularization proposed in [Zou et al, Regularization +and variable selection via the elastic +net](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf). +Mathematically, it is defined as a convex combination of the $L_1$ and +the $L_2$ regularization terms: `\[ -\alpha \|\wv\|_1 + (1-\alpha) \frac{1}{2}\|\wv\|_2^2, \alpha \in [0, 1]. +\alpha~\lambda \|\wv\|_1 + (1-\alpha) \frac{\lambda}{2}\|\wv\|_2^2, \alpha \in [0, 1], \lambda \geq 0. \]` -By setting $\alpha$ properly, it contains both L1 and L2 regularization as special cases. For example, if a [linear regression](https://en.wikipedia.org/wiki/Linear_regression) model is trained with the elastic net parameter $\alpha$ set to $1$, it is equivalent to a [Lasso](http://en.wikipedia.org/wiki/Least_squares#Lasso_method) model. On the other hand, if $\alpha$ is set to $0$, the trained model reduces to a [ridge regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model. We implement Pipelines API for both linear regression and logistic regression with elastic net regularization. - -**Examples** +By setting $\alpha$ properly, elastic net contains both $L_1$ and $L_2$ +regularization as special cases. For example, if a [linear +regression](https://en.wikipedia.org/wiki/Linear_regression) model is +trained with the elastic net parameter $\alpha$ set to $1$, it is +equivalent to a +[Lasso](http://en.wikipedia.org/wiki/Least_squares#Lasso_method) model. +On the other hand, if $\alpha$ is set to $0$, the trained model reduces +to a [ridge +regression](http://en.wikipedia.org/wiki/Tikhonov_regularization) model. +We implement Pipelines API for both linear regression and logistic +regression with elastic net regularization. + +## Example: Logistic Regression + +The following example shows how to train a logistic regression model +with elastic net regularization. `elasticNetParam` corresponds to +$\alpha$ and `regParam` corresponds to $\lambda$.
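+
+Before the full example, here is how the two special cases above map onto these
+parameters (a minimal sketch, not part of the original guide; the setter names are
+the actual `LogisticRegression` methods, while the variable names are illustrative only):
+
+{% highlight scala %}
+import org.apache.spark.ml.classification.LogisticRegression
+
+// Lasso: alpha = 1, i.e. a pure L1 penalty of lambda * ||w||_1
+val lasso = new LogisticRegression().setElasticNetParam(1.0).setRegParam(0.3)
+
+// Ridge: alpha = 0, i.e. a pure L2 penalty of (lambda / 2) * ||w||_2^2
+val ridge = new LogisticRegression().setElasticNetParam(0.0).setRegParam(0.3)
+{% endhighlight %}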
- {% highlight scala %} - import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.mllib.util.MLUtils @@ -53,15 +74,11 @@ val lrModel = lr.fit(training) // Print the weights and intercept for logistic regression println(s"Weights: ${lrModel.weights} Intercept: ${lrModel.intercept}") - {% endhighlight %} -
- {% highlight java %} - import org.apache.spark.ml.classification.LogisticRegression; import org.apache.spark.ml.classification.LogisticRegressionModel; import org.apache.spark.mllib.regression.LabeledPoint; @@ -99,9 +116,7 @@ public class LogisticRegressionWithElasticNetExample {
- {% highlight python %} - from pyspark.ml.classification import LogisticRegression from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.util import MLUtils @@ -118,67 +133,33 @@ lrModel = lr.fit(training) print("Weights: " + str(lrModel.weights)) print("Intercept: " + str(lrModel.intercept)) {% endhighlight %} -
</div>
</div>

-### Optimization
-
-The optimization algorithm underlies the implementation is called [Orthant-Wise Limited-memory QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
-(OWL-QN). It is an extension of L-BFGS that can effectively handle L1 regularization and elastic net.
-
-### Model Summaries
-
-Once a linear model is fit on data, it is useful to extract statistics such as the
-loss per iteration and metrics to understand how well the model has performed on training
-and test data. The examples below illustrate how to use the summaries obtained from the
-`summary` method of fitted linear models.
-
-Note that the predictions and metrics which are stored as dataframes obtained from the summary
-are transient and are not available on the driver. This is because these are as expensive
-to store as the original data itself.
+The `spark.ml` implementation of logistic regression also supports
+extracting a summary of the model over the training set. Note that the
+predictions and metrics which are stored as `Datafram`s in
+`BinaryLogisticRegressionSummary` are annoted `@transient` and hence
+only available on the driver.
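+
+Also note that `summary` throws an exception when called on a model that has no
+summary attached (for example, a model that was not freshly trained). A minimal
+sketch of a defensive check (not part of the original guide; it assumes the
+`hasSummary` method on the fitted model):
+
+{% highlight scala %}
+// Guard against a missing training summary before accessing it.
+if (lrModel.hasSummary) {
+  println(lrModel.summary.objectiveHistory.mkString(", "))
+}
+{% endhighlight %}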
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">

-#### Example: Summary for LogisticRegression
-<div class="codetabs">
-<div data-lang="scala" markdown="1">

 [`LogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionTrainingSummary)
+provides a summary for a
+[`LogisticRegressionModel`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionModel).
+Currently, only binary classification is supported and the
+summary must be explicitly cast to
+[`BinaryLogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary).
+This will likely change when multiclass classification is supported.
+
+Continuing the earlier example:

{% highlight scala %}
// Extract the summary from the returned LogisticRegressionModel instance trained
// in the earlier example.
val trainingSummary = lrModel.summary

// Obtain the loss per iteration.
val objectiveHistory = trainingSummary.objectiveHistory
objectiveHistory.foreach(loss => println(loss))

// Obtain the metrics useful to judge performance on test data.
// We cast the summary to a BinaryLogisticRegressionSummary since the problem is a
// binary classification problem.
val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary]

// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
val roc = binarySummary.roc
roc.show()
roc.select("FPR").show()
println(binarySummary.areaUnderROC)

// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
// this selected threshold.
val fMeasure = binarySummary.fMeasureByThreshold
val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).
  select("threshold").head().getDouble(0)
lr.setThreshold(bestThreshold)
lr.fit(training)
{% endhighlight %}
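Since `roc` is an ordinary `DataFrame` with `FPR` and `TPR` columns, its points can
also be collected to the driver, e.g. to plot the curve with an external tool. A
minimal sketch (not part of the original example; it assumes
`import org.apache.spark.sql.Row` is in scope):

{% highlight scala %}
// A sketch: pull the ROC points to the driver for plotting.
binarySummary.roc.collect().foreach { case Row(fpr: Double, tpr: Double) =>
  println(s"$fpr\t$tpr")
}
{% endhighlight %}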
</div>
<div data-lang="java" markdown="1">
[`LogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/LogisticRegressionTrainingSummary.html)
provides a summary for a
[`LogisticRegressionModel`](api/java/org/apache/spark/ml/classification/LogisticRegressionModel.html).
Currently, only binary classification is supported and the
summary must be explicitly cast to
[`BinaryLogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/BinaryLogisticRegressionTrainingSummary.html).
This will likely change when multiclass classification is supported.

Continuing the earlier example:

{% highlight java %}
// Extract the summary from the returned LogisticRegressionModel instance trained
// in the earlier example.
LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();

// Obtain the loss per iteration.
double[] objectiveHistory = trainingSummary.objectiveHistory();
for (double lossPerIteration : objectiveHistory) {
  System.out.println(lossPerIteration);
}

// Obtain the metrics useful to judge performance on test data.
+// We cast the summary to a BinaryLogisticRegressionSummary since the problem is a
+// binary classification problem.
BinaryLogisticRegressionSummary binarySummary = (BinaryLogisticRegressionSummary) trainingSummary;

// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
DataFrame roc = binarySummary.roc();
roc.show();
roc.select("FPR").show();
System.out.println(binarySummary.areaUnderROC());

// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
// this selected threshold.
DataFrame fMeasure = binarySummary.fMeasureByThreshold();
double maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0);
double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)).
  select("threshold").head().getDouble(0);
lr.setThreshold(bestThreshold);
lr.fit(training);
{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
Logistic regression model summary is not yet supported in Python.
</div>
</div>

# Optimization

The optimization algorithm underlying the implementation is called
[Orthant-Wise Limited-memory
QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
(OWL-QN). It is an extension of L-BFGS that can effectively handle L1
regularization and elastic net.

From 7bf922c53b0e7f6e6d5304107f432b58ad7b93c7 Mon Sep 17 00:00:00 2001
From: MechCoder
Date: Wed, 19 Aug 2015 02:06:14 +0530
Subject: [PATCH 8/8] typo

---
 docs/ml-linear-methods.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md
index 85045db0528c3..2761aeb789621 100644
--- a/docs/ml-linear-methods.md
+++ b/docs/ml-linear-methods.md
@@ -139,8 +139,8 @@ print("Intercept: " + str(lrModel.intercept))

 The `spark.ml` implementation of logistic regression also supports
 extracting a summary of the model over the training set. Note that the
-predictions and metrics which are stored as `Datafram`s in
-`BinaryLogisticRegressionSummary` are annoted `@transient` and hence
+predictions and metrics which are stored as `DataFrame`s in
+`BinaryLogisticRegressionSummary` are annotated `@transient` and hence
 only available on the driver.