From a537c49170ee2b74563689d3048b2ef85168c939 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 2 Jun 2020 11:42:51 -0700
Subject: [PATCH 01/15] [SPARK-31893][ML] Add a generic ClassificationSummary trait

---
 .../ClassificationSummary.scala               | 275 ++++++++++++++++++
 .../classification/LogisticRegression.scala   | 249 +---------------
 2 files changed, 283 insertions(+), 241 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
new file mode 100644
index 0000000000000..5ae93dc5f1aa8
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.classification
+
+import org.apache.spark.annotation.Since
+import org.apache.spark.ml.functions.checkNonNegativeWeight
+import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.functions.{col, lit}
+import org.apache.spark.sql.types.DoubleType
+
+
+/**
+ * Abstraction for multiclass classification results for a given model.
+ */
+private[classification] trait ClassificationSummary extends Serializable {
+
+  /**
+   * Dataframe output by the model's `transform` method.
+   */
+  @Since("3.1.0")
+  def predictions: DataFrame
+
+  /** Field in "predictions" which gives the prediction of each class. */
+  @Since("3.1.0")
+  def predictionCol: String
+
+  /** Field in "predictions" which gives the true label of each instance (if available). */
+  @Since("3.1.0")
+  def labelCol: String
+
+  /** Field in "predictions" which gives the features of each instance as a vector. */
+  @Since("3.1.0")
+  def featuresCol: String
+
+  /** Field in "predictions" which gives the weight of each instance. */
+  @Since("3.1.0")
+  def weightCol: String
+
+  @transient private val multiclassMetrics = {
+    if (predictions.schema.fieldNames.contains(weightCol)) {
+      new MulticlassMetrics(
+        predictions.select(
+          col(predictionCol),
+          col(labelCol).cast(DoubleType),
+          checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map {
+          case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight)
+        })
+    } else {
+      new MulticlassMetrics(
+        predictions.select(
+          col(predictionCol),
+          col(labelCol).cast(DoubleType),
+          lit(1.0)).rdd.map {
+          case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight)
+        })
+    }
+  }
+
+  /**
+   * Returns the sequence of labels in ascending order. This order matches the order used
+   * in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel.
+   *
+   * Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}. However, if the
+   * training set is missing a label, then all of the arrays over labels
+   * (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the
+   * expected numClasses.
+   */
+  @Since("3.1.0")
+  def labels: Array[Double] = multiclassMetrics.labels
+
+  /** Returns true positive rate for each label (category). */
+  @Since("3.1.0")
+  def truePositiveRateByLabel: Array[Double] = recallByLabel
+
+  /** Returns false positive rate for each label (category). */
+  @Since("3.1.0")
+  def falsePositiveRateByLabel: Array[Double] = {
+    multiclassMetrics.labels.map(label => multiclassMetrics.falsePositiveRate(label))
+  }
+
+  /** Returns precision for each label (category). */
+  @Since("3.1.0")
+  def precisionByLabel: Array[Double] = {
+    multiclassMetrics.labels.map(label => multiclassMetrics.precision(label))
+  }
+
+  /** Returns recall for each label (category). */
+  @Since("3.1.0")
+  def recallByLabel: Array[Double] = {
+    multiclassMetrics.labels.map(label => multiclassMetrics.recall(label))
+  }
+
+  /** Returns f-measure for each label (category). */
+  @Since("3.1.0")
+  def fMeasureByLabel(beta: Double): Array[Double] = {
+    multiclassMetrics.labels.map(label => multiclassMetrics.fMeasure(label, beta))
+  }
+
+  /** Returns f1-measure for each label (category). */
+  @Since("3.1.0")
+  def fMeasureByLabel: Array[Double] = fMeasureByLabel(1.0)
+
+  /**
+   * Returns accuracy
+   * (the fraction of instances that are classified correctly
+   * out of the total number of instances).
+   */
+  @Since("3.1.0")
+  def accuracy: Double = multiclassMetrics.accuracy
+
+  /**
+   * Returns weighted true positive rate.
+   * (equals weighted recall)
+   */
+  @Since("3.1.0")
+  def weightedTruePositiveRate: Double = weightedRecall
+
+  /** Returns weighted false positive rate. */
+  @Since("3.1.0")
+  def weightedFalsePositiveRate: Double = multiclassMetrics.weightedFalsePositiveRate
+
+  /**
+   * Returns weighted averaged recall.
+   * (equals weighted true positive rate)
+   */
+  @Since("3.1.0")
+  def weightedRecall: Double = multiclassMetrics.weightedRecall
+
+  /** Returns weighted averaged precision. */
+  @Since("3.1.0")
+  def weightedPrecision: Double = multiclassMetrics.weightedPrecision
+
+  /** Returns weighted averaged f-measure. */
+  @Since("3.1.0")
+  def weightedFMeasure(beta: Double): Double = multiclassMetrics.weightedFMeasure(beta)
+
+  /** Returns weighted averaged f1-measure. */
+  @Since("3.1.0")
+  def weightedFMeasure: Double = multiclassMetrics.weightedFMeasure(1.0)
+
+  /**
+   * Convenient method for casting to binary classification summary.
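+   * A minimal usage sketch (hypothetical, assuming `model` is a fitted binary
+   * classification model whose summary mixes in this trait):
+   * {{{
+   *   val binarySummary = model.summary.asBinary
+   *   println(binarySummary.areaUnderROC)
+   * }}}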
+   * This method will throw an Exception if the summary is not a binary summary.
+   */
+  @Since("3.1.0")
+  def asBinary: BinaryClassificationSummary = this match {
+    case b: BinaryClassificationSummary => b
+    case _ =>
+      throw new RuntimeException("Cannot cast to a binary summary.")
+  }
+}
+
+/**
+ * Abstraction for multiclass classification training results.
+ */
+private[classification] trait ClassificationTrainingSummary extends ClassificationSummary {
+
+  /**
+   * Objective function (scaled loss + regularization) at each iteration.
+   * It contains one more element, the initial state, than number of iterations.
+   */
+  @Since("3.1.0")
+  def objectiveHistory: Array[Double]
+
+  /**
+   * Number of training iterations (one less than the length of objectiveHistory,
+   * which also records the initial state).
+   */
+  @Since("3.1.0")
+  def totalIterations: Int = {
+    assert(objectiveHistory.length > 0, "objectiveHistory length should be greater than 0.")
+    objectiveHistory.length - 1
+  }
+}
+
+/**
+ * Abstraction for binary classification results for a given model.
+ */
+trait BinaryClassificationSummary extends ClassificationSummary {
+
+  private val sparkSession = predictions.sparkSession
+  import sparkSession.implicits._
+
+  /** Field in "predictions" which gives the probability of each class as a vector. */
+  @Since("3.1.0")
+  def probabilityCol: String
+
+  // TODO: Allow the user to vary the number of bins using a setBins method in
+  // BinaryClassificationMetrics. For now the default is set to 100.
+  @transient private val binaryMetrics = if (predictions.schema.fieldNames.contains(weightCol)) {
+    new BinaryClassificationMetrics(
+      predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType),
+        checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map {
+        case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight)
+      }, 100
+    )
+  } else {
+    new BinaryClassificationMetrics(
+      predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType),
+        lit(1.0)).rdd.map {
+        case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight)
+      }, 100
+    )
+  }
+
+  /**
+   * Returns the receiver operating characteristic (ROC) curve,
+   * which is a Dataframe having two fields (FPR, TPR)
+   * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it.
+   * See http://en.wikipedia.org/wiki/Receiver_operating_characteristic
+   */
+  @Since("3.1.0")
+  @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR")
+
+  /**
+   * Computes the area under the receiver operating characteristic (ROC) curve.
+   */
+  @Since("3.1.0")
+  lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC()
+
+  /**
+   * Returns the precision-recall curve, which is a Dataframe containing
+   * two fields recall, precision with (0.0, 1.0) prepended to it.
+   */
+  @Since("3.1.0")
+  @transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", "precision")
+
+  /**
+   * Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0.
+   */
+  @Since("3.1.0")
+  @transient lazy val fMeasureByThreshold: DataFrame = {
+    binaryMetrics.fMeasureByThreshold().toDF("threshold", "F-Measure")
+  }
+
+  /**
+   * Returns a dataframe with two fields (threshold, precision) curve.
+   * Every possible probability obtained in transforming the dataset is used
+   * as a threshold in calculating the precision.
+   */
+  @Since("3.1.0")
+  @transient lazy val precisionByThreshold: DataFrame = {
+    binaryMetrics.precisionByThreshold().toDF("threshold", "precision")
+  }
+
+  /**
+   * Returns a dataframe with two fields (threshold, recall) curve.
+ * Every possible probability obtained in transforming the dataset are used + * as thresholds used in calculating the recall. + */ + @Since("3.1.0") + @transient lazy val recallByThreshold: DataFrame = { + binaryMetrics.recallByThreshold().toDF("threshold", "recall") + } +} + +/** + * Abstraction for binary classification training results. + */ +sealed trait BinaryClassificationTrainingSummary extends BinaryClassificationSummary + with ClassificationTrainingSummary diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 1f5976c59235b..b7e928588e837 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -29,7 +29,6 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.feature._ -import org.apache.spark.ml.functions.checkNonNegativeWeight import org.apache.spark.ml.linalg._ import org.apache.spark.ml.optim.aggregator._ import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction} @@ -38,12 +37,10 @@ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.stat._ import org.apache.spark.ml.util._ import org.apache.spark.ml.util.Instrumentation.instrumented -import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Row} -import org.apache.spark.sql.functions.{col, lit} -import org.apache.spark.sql.types.{DataType, DoubleType, StructType} +import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.storage.StorageLevel import org.apache.spark.util.VersionUtils @@ -1194,7 +1191,7 @@ class LogisticRegressionModel private[spark] ( val (summaryModel, probabilityColName, predictionColName) = findSummaryModel() if (numClasses > 2) { new LogisticRegressionSummaryImpl(summaryModel.transform(dataset), - probabilityColName, predictionColName, $(labelCol), $(featuresCol), weightColName) + predictionColName, $(labelCol), $(featuresCol), weightColName) } else { new BinaryLogisticRegressionSummaryImpl(summaryModel.transform(dataset), probabilityColName, predictionColName, $(labelCol), $(featuresCol), weightColName) @@ -1396,246 +1393,19 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { /** * Abstraction for logistic regression results for a given model. */ -sealed trait LogisticRegressionSummary extends Serializable { - - /** - * Dataframe output by the model's `transform` method. - */ - @Since("1.5.0") - def predictions: DataFrame - - /** Field in "predictions" which gives the probability of each class as a vector. */ - @Since("1.5.0") - def probabilityCol: String - - /** Field in "predictions" which gives the prediction of each class. */ - @Since("2.3.0") - def predictionCol: String - - /** Field in "predictions" which gives the true label of each instance (if available). */ - @Since("1.5.0") - def labelCol: String - - /** Field in "predictions" which gives the features of each instance as a vector. */ - @Since("1.6.0") - def featuresCol: String - - /** Field in "predictions" which gives the weight of each instance as a vector. 
*/ - @Since("3.1.0") - def weightCol: String - - @transient private val multiclassMetrics = { - if (predictions.schema.fieldNames.contains(weightCol)) { - new MulticlassMetrics( - predictions.select( - col(predictionCol), - col(labelCol).cast(DoubleType), - checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map { - case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight) - }) - } else { - new MulticlassMetrics( - predictions.select( - col(predictionCol), - col(labelCol).cast(DoubleType), - lit(1.0)).rdd.map { - case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight) - }) - } - } - - /** - * Returns the sequence of labels in ascending order. This order matches the order used - * in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel. - * - * Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the - * training set is missing a label, then all of the arrays over labels - * (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the - * expected numClasses. - */ - @Since("2.3.0") - def labels: Array[Double] = multiclassMetrics.labels - - /** Returns true positive rate for each label (category). */ - @Since("2.3.0") - def truePositiveRateByLabel: Array[Double] = recallByLabel - - /** Returns false positive rate for each label (category). */ - @Since("2.3.0") - def falsePositiveRateByLabel: Array[Double] = { - multiclassMetrics.labels.map(label => multiclassMetrics.falsePositiveRate(label)) - } - - /** Returns precision for each label (category). */ - @Since("2.3.0") - def precisionByLabel: Array[Double] = { - multiclassMetrics.labels.map(label => multiclassMetrics.precision(label)) - } - - /** Returns recall for each label (category). */ - @Since("2.3.0") - def recallByLabel: Array[Double] = { - multiclassMetrics.labels.map(label => multiclassMetrics.recall(label)) - } - - /** Returns f-measure for each label (category). */ - @Since("2.3.0") - def fMeasureByLabel(beta: Double): Array[Double] = { - multiclassMetrics.labels.map(label => multiclassMetrics.fMeasure(label, beta)) - } - - /** Returns f1-measure for each label (category). */ - @Since("2.3.0") - def fMeasureByLabel: Array[Double] = fMeasureByLabel(1.0) - - /** - * Returns accuracy. - * (equals to the total number of correctly classified instances - * out of the total number of instances.) - */ - @Since("2.3.0") - def accuracy: Double = multiclassMetrics.accuracy - - /** - * Returns weighted true positive rate. - * (equals to precision, recall and f-measure) - */ - @Since("2.3.0") - def weightedTruePositiveRate: Double = weightedRecall - - /** Returns weighted false positive rate. */ - @Since("2.3.0") - def weightedFalsePositiveRate: Double = multiclassMetrics.weightedFalsePositiveRate - - /** - * Returns weighted averaged recall. - * (equals to precision, recall and f-measure) - */ - @Since("2.3.0") - def weightedRecall: Double = multiclassMetrics.weightedRecall - - /** Returns weighted averaged precision. */ - @Since("2.3.0") - def weightedPrecision: Double = multiclassMetrics.weightedPrecision - - /** Returns weighted averaged f-measure. */ - @Since("2.3.0") - def weightedFMeasure(beta: Double): Double = multiclassMetrics.weightedFMeasure(beta) - - /** Returns weighted averaged f1-measure. */ - @Since("2.3.0") - def weightedFMeasure: Double = multiclassMetrics.weightedFMeasure(1.0) - - /** - * Convenient method for casting to binary logistic regression summary. 
- * This method will throw an Exception if the summary is not a binary summary. - */ - @Since("2.3.0") - def asBinary: BinaryLogisticRegressionSummary = this match { - case b: BinaryLogisticRegressionSummary => b - case _ => - throw new RuntimeException("Cannot cast to a binary summary.") - } +sealed trait LogisticRegressionSummary extends ClassificationSummary { } /** * Abstraction for multiclass logistic regression training results. - * Currently, the training summary ignores the training weights except - * for the objective trace. */ -sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary { - - /** - * objective function (scaled loss + regularization) at each iteration. - * It contains one more element, the initial state, than number of iterations. - */ - @Since("1.5.0") - def objectiveHistory: Array[Double] - - /** Number of training iterations. */ - @Since("1.5.0") - def totalIterations: Int = { - assert(objectiveHistory.length > 0, s"objectiveHistory length should be greater than 1.") - objectiveHistory.length - 1 - } - +sealed trait LogisticRegressionTrainingSummary extends ClassificationTrainingSummary { } /** * Abstraction for binary logistic regression results for a given model. */ -sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary { - - private val sparkSession = predictions.sparkSession - import sparkSession.implicits._ - - // TODO: Allow the user to vary the number of bins using a setBins method in - // BinaryClassificationMetrics. For now the default is set to 100. - @transient private val binaryMetrics = if (predictions.schema.fieldNames.contains(weightCol)) { - new BinaryClassificationMetrics( - predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType), - checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map { - case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) - }, 100 - ) - } else { - new BinaryClassificationMetrics( - predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType), - lit(1.0)).rdd.map { - case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) - }, 100 - ) - } - - /** - * Returns the receiver operating characteristic (ROC) curve, - * which is a Dataframe having two fields (FPR, TPR) - * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. - * See http://en.wikipedia.org/wiki/Receiver_operating_characteristic - */ - @Since("1.5.0") - @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR") - - /** - * Computes the area under the receiver operating characteristic (ROC) curve. - */ - @Since("1.5.0") - lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC() - - /** - * Returns the precision-recall curve, which is a Dataframe containing - * two fields recall, precision with (0.0, 1.0) prepended to it. - */ - @Since("1.5.0") - @transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", "precision") - - /** - * Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. - */ - @Since("1.5.0") - @transient lazy val fMeasureByThreshold: DataFrame = { - binaryMetrics.fMeasureByThreshold().toDF("threshold", "F-Measure") - } - - /** - * Returns a dataframe with two fields (threshold, precision) curve. - * Every possible probability obtained in transforming the dataset are used - * as thresholds used in calculating the precision. 
- */ - @Since("1.5.0") - @transient lazy val precisionByThreshold: DataFrame = { - binaryMetrics.precisionByThreshold().toDF("threshold", "precision") - } - - /** - * Returns a dataframe with two fields (threshold, recall) curve. - * Every possible probability obtained in transforming the dataset are used - * as thresholds used in calculating the recall. - */ - @Since("1.5.0") - @transient lazy val recallByThreshold: DataFrame = { - binaryMetrics.recallByThreshold().toDF("threshold", "recall") - } +sealed trait BinaryLogisticRegressionSummary extends BinaryClassificationSummary { } /** @@ -1666,15 +1436,13 @@ private class LogisticRegressionTrainingSummaryImpl( weightCol: String, override val objectiveHistory: Array[Double]) extends LogisticRegressionSummaryImpl( - predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) + predictions, predictionCol, labelCol, featuresCol, weightCol) with LogisticRegressionTrainingSummary /** * Multiclass logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. - * @param probabilityCol field in "predictions" which gives the probability of - * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. * @param labelCol field in "predictions" which gives the true label of each instance. @@ -1683,7 +1451,6 @@ private class LogisticRegressionTrainingSummaryImpl( */ private class LogisticRegressionSummaryImpl( @transient override val predictions: DataFrame, - override val probabilityCol: String, override val predictionCol: String, override val labelCol: String, override val featuresCol: String, @@ -1729,11 +1496,11 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( */ private class BinaryLogisticRegressionSummaryImpl( predictions: DataFrame, - probabilityCol: String, + override val probabilityCol: String, predictionCol: String, labelCol: String, featuresCol: String, weightCol: String) extends LogisticRegressionSummaryImpl( - predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) + predictions, predictionCol, labelCol, featuresCol, weightCol) with BinaryLogisticRegressionSummary From 93c82bd5b47099fc7623c1c8994caebb11deb69e Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 2 Jun 2020 14:32:36 -0700 Subject: [PATCH 02/15] change probabilityCol to scoreCol --- .../ml/classification/ClassificationSummary.scala | 6 +++--- .../spark/ml/classification/LogisticRegression.scala | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 5ae93dc5f1aa8..10588851f6619 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -197,20 +197,20 @@ trait BinaryClassificationSummary extends ClassificationSummary { /** Field in "predictions" which gives the probability of each class as a vector. */ @Since("3.1.0") - def probabilityCol: String + def scoreCol: String // TODO: Allow the user to vary the number of bins using a setBins method in // BinaryClassificationMetrics. For now the default is set to 100. 
@transient private val binaryMetrics = if (predictions.schema.fieldNames.contains(weightCol)) { new BinaryClassificationMetrics( - predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType), + predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map { case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) }, 100 ) } else { new BinaryClassificationMetrics( - predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType), + predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), lit(1.0)).rdd.map { case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) }, 100 diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index b7e928588e837..95e1315ad3fb2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1461,7 +1461,7 @@ private class LogisticRegressionSummaryImpl( * Binary logistic regression training results. * * @param predictions dataframe output by the model's `transform` method. - * @param probabilityCol field in "predictions" which gives the probability of + * @param scoreCol field in "predictions" which gives the probability of * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. @@ -1472,21 +1472,21 @@ private class LogisticRegressionSummaryImpl( */ private class BinaryLogisticRegressionTrainingSummaryImpl( predictions: DataFrame, - probabilityCol: String, + scoreCol: String, predictionCol: String, labelCol: String, featuresCol: String, weightCol: String, override val objectiveHistory: Array[Double]) extends BinaryLogisticRegressionSummaryImpl( - predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) + predictions, scoreCol, predictionCol, labelCol, featuresCol, weightCol) with BinaryLogisticRegressionTrainingSummary /** * Binary logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. - * @param probabilityCol field in "predictions" which gives the probability of + * @param scoreCol field in "predictions" which gives the probability of * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction of * each class as a double. 
@@ -1496,7 +1496,7 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( */ private class BinaryLogisticRegressionSummaryImpl( predictions: DataFrame, - override val probabilityCol: String, + override val scoreCol: String, predictionCol: String, labelCol: String, featuresCol: String, From 981ae656060ed8cf89bea0923532b1606ebf2e33 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 2 Jun 2020 14:59:11 -0700 Subject: [PATCH 03/15] fix MiMa --- project/MimaExcludes.scala | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index addb2d8152189..aec58fc2decc9 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -39,6 +39,7 @@ object MimaExcludes { // [SPARK-31077] Remove ChiSqSelector dependency on mllib.ChiSqSelectorModel // private constructor ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.feature.ChiSqSelectorModel.this"), + // [SPARK-31127] Implement abstract Selector // org.apache.spark.ml.feature.ChiSqSelectorModel type hierarchy change // before: class ChiSqSelector extends Estimator with ChiSqSelectorParams @@ -46,11 +47,36 @@ object MimaExcludes { // false positive, no binary incompatibility ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.feature.ChiSqSelectorModel"), ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.feature.ChiSqSelector"), + //[SPARK-31840] Add instance weight support in LogisticRegressionSummary // weightCol in org.apache.spark.ml.classification.LogisticRegressionSummary is present only in current version ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightCol"), + // [SPARK-24634] Add a new metric regarding number of inputs later than watermark plus allowed delay - ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StateOperatorProgress.$default$4") + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StateOperatorProgress.$default$4"), + + //[SPARK-31893] Add a generic ClassificationSummary trait + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.weightCol"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$_setter_$org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession_="), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$_setter_$org$apache$spark$ml$classification$BinaryClassificationSummary$$binaryMetrics_="), + 
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$binaryMetrics"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.weightCol"),
+    ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.asBinary"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightCol"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$_setter_$org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession_="),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$_setter_$org$apache$spark$ml$classification$BinaryClassificationSummary$$binaryMetrics_="),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$binaryMetrics"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.weightCol")
   )

   // Exclude rules for 3.0.x

From 173a24ebf189ad28cf237e1f8fe0e265af63269c Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 2 Jun 2020 15:28:01 -0700
Subject: [PATCH 04/15] nit

---
 .../spark/ml/classification/ClassificationSummary.scala      | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
index 10588851f6619..3ff7a15a1442a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
@@ -195,7 +195,9 @@ trait BinaryClassificationSummary extends ClassificationSummary {
   private val sparkSession = predictions.sparkSession
   import sparkSession.implicits._
 
-  /** Field in "predictions" which gives the probability of each class as a vector. */
+  /**
+   * Field in "predictions" which gives the probability or rawPrediction of each class as a vector.
+   */
   @Since("3.1.0")
   def scoreCol: String
 
From aa6c00eb845a34d1ae334f2239dd3fbdb72be4d4 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Wed, 3 Jun 2020 14:01:27 -0700
Subject: [PATCH 05/15] keep probabilityCol in LogisticRegression as an alias of scoreCol

---
 .../classification/ClassificationSummary.scala     | 12 ++++++------
 .../ml/classification/LogisticRegression.scala     | 17 ++++++++++++-----
 project/MimaExcludes.scala                         |  4 +++-
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
index 3ff7a15a1442a..049860c81ddad 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
@@ -37,6 +37,12 @@ private[classification] trait ClassificationSummary extends Serializable {
   @Since("3.1.0")
   def predictions: DataFrame
 
+  /**
+   * Field in "predictions" which gives the probability or rawPrediction of each class as a vector.
+   */
+  @Since("3.1.0")
+  def scoreCol: String
+
   /** Field in "predictions" which gives the prediction of each class. */
   @Since("3.1.0")
   def predictionCol: String
@@ -195,12 +201,6 @@ trait BinaryClassificationSummary extends ClassificationSummary {
   private val sparkSession = predictions.sparkSession
   import sparkSession.implicits._
 
-  /**
-   * Field in "predictions" which gives the probability or rawPrediction of each class as a vector.
-   */
-  @Since("3.1.0")
-  def scoreCol: String
-
   // TODO: Allow the user to vary the number of bins using a setBins method in
   // BinaryClassificationMetrics. For now the default is set to 100.
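+  // When no weight column is present in "predictions", the else branch below
+  // assigns every instance a literal weight of 1.0.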
@transient private val binaryMetrics = if (predictions.schema.fieldNames.contains(weightCol)) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 95e1315ad3fb2..2befee50d970d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1191,7 +1191,7 @@ class LogisticRegressionModel private[spark] ( val (summaryModel, probabilityColName, predictionColName) = findSummaryModel() if (numClasses > 2) { new LogisticRegressionSummaryImpl(summaryModel.transform(dataset), - predictionColName, $(labelCol), $(featuresCol), weightColName) + probabilityColName, predictionColName, $(labelCol), $(featuresCol), weightColName) } else { new BinaryLogisticRegressionSummaryImpl(summaryModel.transform(dataset), probabilityColName, predictionColName, $(labelCol), $(featuresCol), weightColName) @@ -1394,6 +1394,10 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { * Abstraction for logistic regression results for a given model. */ sealed trait LogisticRegressionSummary extends ClassificationSummary { + + /** Field in "predictions" which gives the probability of each class as a vector. */ + @Since("1.5.0") + def probabilityCol: String = scoreCol } /** @@ -1436,13 +1440,15 @@ private class LogisticRegressionTrainingSummaryImpl( weightCol: String, override val objectiveHistory: Array[Double]) extends LogisticRegressionSummaryImpl( - predictions, predictionCol, labelCol, featuresCol, weightCol) + predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) with LogisticRegressionTrainingSummary /** * Multiclass logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. + * @param scoreCol field in "predictions" which gives the probability of + * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. * @param labelCol field in "predictions" which gives the true label of each instance. @@ -1451,6 +1457,7 @@ private class LogisticRegressionTrainingSummaryImpl( */ private class LogisticRegressionSummaryImpl( @transient override val predictions: DataFrame, + override val scoreCol: String, override val predictionCol: String, override val labelCol: String, override val featuresCol: String, @@ -1487,7 +1494,7 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( * * @param predictions dataframe output by the model's `transform` method. * @param scoreCol field in "predictions" which gives the probability of - * each class as a vector. + * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction of * each class as a double. * @param labelCol field in "predictions" which gives the true label of each instance. 
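A quick sketch of what the alias above means for callers (hypothetical usage,
assuming `model` is a fitted `LogisticRegressionModel`):

    val summary = model.summary
    // probabilityCol is kept as the stable public name; it simply forwards to scoreCol
    assert(summary.probabilityCol == summary.scoreCol)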
@@ -1496,11 +1503,11 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( */ private class BinaryLogisticRegressionSummaryImpl( predictions: DataFrame, - override val scoreCol: String, + scoreCol: String, predictionCol: String, labelCol: String, featuresCol: String, weightCol: String) extends LogisticRegressionSummaryImpl( - predictions, predictionCol, labelCol, featuresCol, weightCol) + predictions, scoreCol, predictionCol, labelCol, featuresCol, weightCol) with BinaryLogisticRegressionSummary diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index aec58fc2decc9..a1bcfb3db552a 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -76,7 +76,9 @@ object MimaExcludes { ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"), - ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.weightCol") + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.weightCol"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.scoreCol"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.scoreCol") ) // Exclude rules for 3.0.x From d718af73d3db934b4c205a58d37be5846dbd4874 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 3 Jun 2020 14:36:25 -0700 Subject: [PATCH 06/15] nit --- .../ml/classification/LogisticRegression.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 2befee50d970d..b3f7b515909a3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1448,7 +1448,7 @@ private class LogisticRegressionTrainingSummaryImpl( * * @param predictions dataframe output by the model's `transform` method. * @param scoreCol field in "predictions" which gives the probability of - * each class as a vector. + * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. * @param labelCol field in "predictions" which gives the true label of each instance. @@ -1468,7 +1468,7 @@ private class LogisticRegressionSummaryImpl( * Binary logistic regression training results. * * @param predictions dataframe output by the model's `transform` method. - * @param scoreCol field in "predictions" which gives the probability of + * @param probabilityCol field in "predictions" which gives the probability of * each class as a vector. 
* @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. @@ -1479,22 +1479,22 @@ private class LogisticRegressionSummaryImpl( */ private class BinaryLogisticRegressionTrainingSummaryImpl( predictions: DataFrame, - scoreCol: String, + probabilityCol: String, predictionCol: String, labelCol: String, featuresCol: String, weightCol: String, override val objectiveHistory: Array[Double]) extends BinaryLogisticRegressionSummaryImpl( - predictions, scoreCol, predictionCol, labelCol, featuresCol, weightCol) + predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) with BinaryLogisticRegressionTrainingSummary /** * Binary logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. - * @param scoreCol field in "predictions" which gives the probability of - * each class as a vector. + * @param probabilityCol field in "predictions" which gives the probability of + * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction of * each class as a double. * @param labelCol field in "predictions" which gives the true label of each instance. @@ -1503,11 +1503,11 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( */ private class BinaryLogisticRegressionSummaryImpl( predictions: DataFrame, - scoreCol: String, + probabilityCol: String, predictionCol: String, labelCol: String, featuresCol: String, weightCol: String) extends LogisticRegressionSummaryImpl( - predictions, scoreCol, predictionCol, labelCol, featuresCol, weightCol) + predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) with BinaryLogisticRegressionSummary From 0856d4050202030971c6b135f9686c74b79c6f92 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 4 Jun 2020 14:01:46 -0700 Subject: [PATCH 07/15] add python changes --- .../ClassificationSummary.scala | 10 +- .../classification/LogisticRegression.scala | 6 +- .../LogisticRegressionSuite.scala | 9 +- python/pyspark/ml/classification.py | 546 ++++++++++-------- .../pyspark/ml/tests/test_training_summary.py | 2 + 5 files changed, 307 insertions(+), 266 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 049860c81ddad..435ce00a78e5b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -174,9 +174,9 @@ private[classification] trait ClassificationSummary extends Serializable { } /** - * Abstraction for multiclass classification training results. + * Abstraction for training results. */ -private[classification] trait ClassificationTrainingSummary extends ClassificationSummary { +private[classification] trait TrainingSummary { /** * objective function (scaled loss + regularization) at each iteration. @@ -269,9 +269,3 @@ trait BinaryClassificationSummary extends ClassificationSummary { binaryMetrics.recallByThreshold().toDF("threshold", "recall") } } - -/** - * Abstraction for binary classification training results. 
- */ -sealed trait BinaryClassificationTrainingSummary extends BinaryClassificationSummary - with ClassificationTrainingSummary diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index b3f7b515909a3..559a84d2249e3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1403,13 +1403,15 @@ sealed trait LogisticRegressionSummary extends ClassificationSummary { /** * Abstraction for multiclass logistic regression training results. */ -sealed trait LogisticRegressionTrainingSummary extends ClassificationTrainingSummary { +sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary + with TrainingSummary { } /** * Abstraction for binary logistic regression results for a given model. */ -sealed trait BinaryLogisticRegressionSummary extends BinaryClassificationSummary { +sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary + with BinaryClassificationSummary { } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 30c21d8b06670..0caeedfe1d655 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -263,6 +263,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { .setMaxIter(1) val blorModel = lr.fit(smallBinaryDataset) + assert(blorModel.summary.probabilityCol.equals("probability")) assert(blorModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) assert(blorModel.summary.asBinary.isInstanceOf[BinaryLogisticRegressionSummary]) assert(blorModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) @@ -270,6 +271,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { assert(blorModel.binarySummary.totalIterations == 1) val mlorModel = lr.setFamily("multinomial").fit(smallMultinomialDataset) + assert(mlorModel.summary.probabilityCol.equals("probability")) assert(mlorModel.summary.isInstanceOf[LogisticRegressionTrainingSummary]) withClue("cannot get binary summary for multiclass model") { intercept[RuntimeException] { @@ -284,6 +286,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { assert(mlorModel.summary.totalIterations == 1) val mlorBinaryModel = lr.setFamily("multinomial").fit(smallBinaryDataset) + assert(mlorBinaryModel.summary.probabilityCol.equals("probability")) assert(mlorBinaryModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) assert(mlorBinaryModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) @@ -291,6 +294,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val mlorSummary = mlorModel.evaluate(smallMultinomialDataset) assert(blorSummary.isInstanceOf[BinaryLogisticRegressionSummary]) assert(mlorSummary.isInstanceOf[LogisticRegressionSummary]) + assert(blorSummary.probabilityCol.equals("probability")) + assert(mlorSummary.probabilityCol.equals("probability")) // verify instance weight works val lr2 = new LogisticRegression() @@ -313,12 +318,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 
assert(mlorModel2.summary.isInstanceOf[LogisticRegressionTrainingSummary]) withClue("cannot get binary summary for multiclass model") { intercept[RuntimeException] { - mlorModel.binarySummary + mlorModel2.binarySummary } } withClue("cannot cast summary to binary summary multiclass model") { intercept[RuntimeException] { - mlorModel.summary.asBinary + mlorModel2.summary.asBinary } } diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 3f3699ce53b51..92eecde99943a 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -240,6 +240,292 @@ def predictProbability(self, value): return self._call_java("predictProbability", value) +@inherit_doc +class _ClassificationSummary(JavaWrapper): + """ + Abstraction for multiclass classification results for a given model. + + .. versionadded:: 3.1.0 + """ + + @property + @since("3.1.0") + def predictions(self): + """ + Dataframe outputted by the model's `transform` method. + """ + return self._call_java("predictions") + + @property + @since("3.1.0") + def scoreCol(self): + """ + Field in "predictions" which gives the probability or raw prediction + of each class as a vector. + """ + return self._call_java("scoreCol") + + @property + @since("3.1.0") + def predictionCol(self): + """ + Field in "predictions" which gives the prediction of each class. + """ + return self._call_java("predictionCol") + + @property + @since("3.1.0") + def labelCol(self): + """ + Field in "predictions" which gives the true label of each + instance. + """ + return self._call_java("labelCol") + + @property + @since("3.1.0") + def featuresCol(self): + """ + Field in "predictions" which gives the features of each instance + as a vector. + """ + return self._call_java("featuresCol") + + @property + @since("3.1.0") + def weightCol(self): + """ + Field in "predictions" which gives the weight of each instance + as a vector. + """ + return self._call_java("weightCol") + + @property + @since("3.1.0") + def labels(self): + """ + Returns the sequence of labels in ascending order. This order matches the order used + in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel. + + Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the + training set is missing a label, then all of the arrays over labels + (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the + expected numClasses. + """ + return self._call_java("labels") + + @property + @since("3.1.0") + def truePositiveRateByLabel(self): + """ + Returns true positive rate for each label (category). + """ + return self._call_java("truePositiveRateByLabel") + + @property + @since("3.1.0") + def falsePositiveRateByLabel(self): + """ + Returns false positive rate for each label (category). + """ + return self._call_java("falsePositiveRateByLabel") + + @property + @since("3.1.0") + def precisionByLabel(self): + """ + Returns precision for each label (category). + """ + return self._call_java("precisionByLabel") + + @property + @since("3.1.0") + def recallByLabel(self): + """ + Returns recall for each label (category). + """ + return self._call_java("recallByLabel") + + @since("3.1.0") + def fMeasureByLabel(self, beta=1.0): + """ + Returns f-measure for each label (category). + """ + return self._call_java("fMeasureByLabel", beta) + + @property + @since("3.1.0") + def accuracy(self): + """ + Returns accuracy. 
+ (equals to the total number of correctly classified instances + out of the total number of instances.) + """ + return self._call_java("accuracy") + + @property + @since("3.1.0") + def weightedTruePositiveRate(self): + """ + Returns weighted true positive rate. + (equals to precision, recall and f-measure) + """ + return self._call_java("weightedTruePositiveRate") + + @property + @since("3.1.0") + def weightedFalsePositiveRate(self): + """ + Returns weighted false positive rate. + """ + return self._call_java("weightedFalsePositiveRate") + + @property + @since("3.1.0") + def weightedRecall(self): + """ + Returns weighted averaged recall. + (equals to precision, recall and f-measure) + """ + return self._call_java("weightedRecall") + + @property + @since("3.1.0") + def weightedPrecision(self): + """ + Returns weighted averaged precision. + """ + return self._call_java("weightedPrecision") + + @since("3.1.0") + def weightedFMeasure(self, beta=1.0): + """ + Returns weighted averaged f-measure. + """ + return self._call_java("weightedFMeasure", beta) + + +@inherit_doc +class _TrainingSummary(JavaWrapper): + """ + Abstraction for Training results. + + .. versionadded:: 3.1.0 + """ + + @property + @since("3.1.0") + def objectiveHistory(self): + """ + Objective function (scaled loss + regularization) at each + iteration. It contains one more element, the initial state, + than number of iterations. + """ + return self._call_java("objectiveHistory") + + @property + @since("3.1.0") + def totalIterations(self): + """ + Number of training iterations until termination. + """ + return self._call_java("totalIterations") + + +@inherit_doc +class _BinaryClassificationSummary(_ClassificationSummary): + """ + Binary classification results for a given model. + + .. versionadded:: 3.1.0 + """ + + @property + @since("3.1.0") + def roc(self): + """ + Returns the receiver operating characteristic (ROC) curve, + which is a Dataframe having two fields (FPR, TPR) with + (0.0, 0.0) prepended and (1.0, 1.0) appended to it. + + .. seealso:: `Wikipedia reference + `_ + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("roc") + + @property + @since("3.1.0") + def areaUnderROC(self): + """ + Computes the area under the receiver operating characteristic + (ROC) curve. + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("areaUnderROC") + + @property + @since("3.1.0") + def pr(self): + """ + Returns the precision-recall curve, which is a Dataframe + containing two fields recall, precision with (0.0, 1.0) prepended + to it. + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("pr") + + @property + @since("3.1.0") + def fMeasureByThreshold(self): + """ + Returns a dataframe with two fields (threshold, F-Measure) curve + with beta = 1.0. + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("fMeasureByThreshold") + + @property + @since("3.1.0") + def precisionByThreshold(self): + """ + Returns a dataframe with two fields (threshold, precision) curve. 
+ Every possible probability obtained in transforming the dataset + are used as thresholds used in calculating the precision. + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("precisionByThreshold") + + @property + @since("3.1.0") + def recallByThreshold(self): + """ + Returns a dataframe with two fields (threshold, recall) curve. + Every possible probability obtained in transforming the dataset + are used as thresholds used in calculating the recall. + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("recallByThreshold") + + class _LinearSVCParams(_ClassifierParams, HasRegParam, HasMaxIter, HasFitIntercept, HasTol, HasStandardization, HasWeightCol, HasAggregationDepth, HasThreshold, HasBlockSize): @@ -940,21 +1226,13 @@ def evaluate(self, dataset): return LogisticRegressionSummary(java_blr_summary) -class LogisticRegressionSummary(JavaWrapper): +class LogisticRegressionSummary(_ClassificationSummary): """ Abstraction for Logistic Regression Results for a given model. .. versionadded:: 2.0.0 """ - @property - @since("2.0.0") - def predictions(self): - """ - Dataframe outputted by the model's `transform` method. - """ - return self._call_java("predictions") - @property @since("2.0.0") def probabilityCol(self): @@ -964,268 +1242,28 @@ def probabilityCol(self): """ return self._call_java("probabilityCol") - @property - @since("2.3.0") - def predictionCol(self): - """ - Field in "predictions" which gives the prediction of each class. - """ - return self._call_java("predictionCol") - - @property - @since("2.0.0") - def labelCol(self): - """ - Field in "predictions" which gives the true label of each - instance. - """ - return self._call_java("labelCol") - - @property - @since("2.0.0") - def featuresCol(self): - """ - Field in "predictions" which gives the features of each instance - as a vector. - """ - return self._call_java("featuresCol") - - @property - @since("3.1.0") - def weightCol(self): - """ - Field in "predictions" which gives the weight of each instance - as a vector. - """ - return self._call_java("weightCol") - - @property - @since("2.3.0") - def labels(self): - """ - Returns the sequence of labels in ascending order. This order matches the order used - in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel. - - Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the - training set is missing a label, then all of the arrays over labels - (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the - expected numClasses. - """ - return self._call_java("labels") - - @property - @since("2.3.0") - def truePositiveRateByLabel(self): - """ - Returns true positive rate for each label (category). - """ - return self._call_java("truePositiveRateByLabel") - - @property - @since("2.3.0") - def falsePositiveRateByLabel(self): - """ - Returns false positive rate for each label (category). - """ - return self._call_java("falsePositiveRateByLabel") - - @property - @since("2.3.0") - def precisionByLabel(self): - """ - Returns precision for each label (category). - """ - return self._call_java("precisionByLabel") - - @property - @since("2.3.0") - def recallByLabel(self): - """ - Returns recall for each label (category). 
- """ - return self._call_java("recallByLabel") - - @since("2.3.0") - def fMeasureByLabel(self, beta=1.0): - """ - Returns f-measure for each label (category). - """ - return self._call_java("fMeasureByLabel", beta) - - @property - @since("2.3.0") - def accuracy(self): - """ - Returns accuracy. - (equals to the total number of correctly classified instances - out of the total number of instances.) - """ - return self._call_java("accuracy") - - @property - @since("2.3.0") - def weightedTruePositiveRate(self): - """ - Returns weighted true positive rate. - (equals to precision, recall and f-measure) - """ - return self._call_java("weightedTruePositiveRate") - - @property - @since("2.3.0") - def weightedFalsePositiveRate(self): - """ - Returns weighted false positive rate. - """ - return self._call_java("weightedFalsePositiveRate") - - @property - @since("2.3.0") - def weightedRecall(self): - """ - Returns weighted averaged recall. - (equals to precision, recall and f-measure) - """ - return self._call_java("weightedRecall") - - @property - @since("2.3.0") - def weightedPrecision(self): - """ - Returns weighted averaged precision. - """ - return self._call_java("weightedPrecision") - - @since("2.3.0") - def weightedFMeasure(self, beta=1.0): - """ - Returns weighted averaged f-measure. - """ - return self._call_java("weightedFMeasure", beta) - @inherit_doc -class LogisticRegressionTrainingSummary(LogisticRegressionSummary): +class LogisticRegressionTrainingSummary(LogisticRegressionSummary, _TrainingSummary): """ Abstraction for multinomial Logistic Regression Training results. - Currently, the training summary ignores the training weights except - for the objective trace. .. versionadded:: 2.0.0 """ - @property - @since("2.0.0") - def objectiveHistory(self): - """ - Objective function (scaled loss + regularization) at each - iteration. It contains one more element, the initial state, - than number of iterations. - """ - return self._call_java("objectiveHistory") - - @property - @since("2.0.0") - def totalIterations(self): - """ - Number of training iterations until termination. - """ - return self._call_java("totalIterations") + pass @inherit_doc -class BinaryLogisticRegressionSummary(LogisticRegressionSummary): +class BinaryLogisticRegressionSummary(_BinaryClassificationSummary, + LogisticRegressionSummary): """ Binary Logistic regression results for a given model. .. versionadded:: 2.0.0 """ - @property - @since("2.0.0") - def roc(self): - """ - Returns the receiver operating characteristic (ROC) curve, - which is a Dataframe having two fields (FPR, TPR) with - (0.0, 0.0) prepended and (1.0, 1.0) appended to it. - - .. seealso:: `Wikipedia reference - `_ - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("roc") - - @property - @since("2.0.0") - def areaUnderROC(self): - """ - Computes the area under the receiver operating characteristic - (ROC) curve. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("areaUnderROC") - - @property - @since("2.0.0") - def pr(self): - """ - Returns the precision-recall curve, which is a Dataframe - containing two fields recall, precision with (0.0, 1.0) prepended - to it. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. 
This will change in later Spark - versions. - """ - return self._call_java("pr") - - @property - @since("2.0.0") - def fMeasureByThreshold(self): - """ - Returns a dataframe with two fields (threshold, F-Measure) curve - with beta = 1.0. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("fMeasureByThreshold") - - @property - @since("2.0.0") - def precisionByThreshold(self): - """ - Returns a dataframe with two fields (threshold, precision) curve. - Every possible probability obtained in transforming the dataset - are used as thresholds used in calculating the precision. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("precisionByThreshold") - - @property - @since("2.0.0") - def recallByThreshold(self): - """ - Returns a dataframe with two fields (threshold, recall) curve. - Every possible probability obtained in transforming the dataset - are used as thresholds used in calculating the recall. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("recallByThreshold") - + pass @inherit_doc class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, diff --git a/python/pyspark/ml/tests/test_training_summary.py b/python/pyspark/ml/tests/test_training_summary.py index ac944d8397a86..00f68e91a5656 100644 --- a/python/pyspark/ml/tests/test_training_summary.py +++ b/python/pyspark/ml/tests/test_training_summary.py @@ -150,6 +150,7 @@ def test_binary_logistic_regression_summary(self): # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) + self.assertEqual(sameSummary.probabilityCol, "probability") self.assertTrue(isinstance(sameSummary, BinaryLogisticRegressionSummary)) self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) @@ -189,6 +190,7 @@ def test_multiclass_logistic_regression_summary(self): # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) + self.assertEqual(sameSummary.probabilityCol, "probability") self.assertTrue(isinstance(sameSummary, LogisticRegressionSummary)) self.assertFalse(isinstance(sameSummary, BinaryLogisticRegressionSummary)) self.assertAlmostEqual(sameSummary.accuracy, s.accuracy) From ff47581c48643e3b2e90032dca0c3fd99c22c11f Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 4 Jun 2020 14:36:30 -0700 Subject: [PATCH 08/15] fix python style failure --- python/pyspark/ml/classification.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 92eecde99943a..7b12636d11942 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1250,7 +1250,6 @@ class LogisticRegressionTrainingSummary(LogisticRegressionSummary, _TrainingSumm .. versionadded:: 2.0.0 """ - pass @@ -1262,9 +1261,9 @@ class BinaryLogisticRegressionSummary(_BinaryClassificationSummary, .. 
versionadded:: 2.0.0
     """
 
-    pass
+
 
 
 @inherit_doc
 class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary,

From 8b35ce9cfc91080af58ba0d3a6119ebb388d5802 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Wed, 10 Jun 2020 15:42:04 -0700
Subject: [PATCH 09/15] remove unnecessary negative weight check

---
 .../ClassificationSummary.scala | 42 +++++++------------
 1 file changed, 16 insertions(+), 26 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
index 435ce00a78e5b..29cbd45d81fbd 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
@@ -18,7 +18,6 @@ package org.apache.spark.ml.classification
 
 import org.apache.spark.annotation.Since
-import org.apache.spark.ml.functions.checkNonNegativeWeight
 import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
 import org.apache.spark.sql.{DataFrame, Row}
@@ -60,23 +59,16 @@ private[classification] trait ClassificationSummary extends Serializable {
   def weightCol: String
 
   @transient private val multiclassMetrics = {
-    if (predictions.schema.fieldNames.contains(weightCol)) {
-      new MulticlassMetrics(
-        predictions.select(
-          col(predictionCol),
-          col(labelCol).cast(DoubleType),
-          checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map {
-          case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight)
-        })
+    val weightColumn = if (predictions.schema.fieldNames.contains(weightCol)) {
+      col(weightCol).cast(DoubleType)
     } else {
-      new MulticlassMetrics(
-        predictions.select(
-          col(predictionCol),
-          col(labelCol).cast(DoubleType),
-          lit(1.0)).rdd.map {
-          case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight)
-        })
+      lit(1.0)
     }
+    new MulticlassMetrics(
+      predictions.select(col(predictionCol), col(labelCol).cast(DoubleType), weightColumn)
+        .rdd.map {
+        case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight)
+      })
   }
 
 /**
@@ -203,17 +195,15 @@ trait BinaryClassificationSummary extends ClassificationSummary {
 
   // TODO: Allow the user to vary the number of bins using a setBins method in
   // BinaryClassificationMetrics. For now the default is set to 100.
- @transient private val binaryMetrics = if (predictions.schema.fieldNames.contains(weightCol)) { - new BinaryClassificationMetrics( - predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), - checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map { - case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) - }, 100 - ) - } else { + @transient private val binaryMetrics = { + val weightColumn = if (predictions.schema.fieldNames.contains(weightCol)) { + col(weightCol).cast(DoubleType) + } else { + lit(1.0) + } + new BinaryClassificationMetrics( - predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), - lit(1.0)).rdd.map { + predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), weightColumn).rdd.map { case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) }, 100 ) From b1031833209b8a788742e55ed0496f4908763072 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Sat, 13 Jun 2020 11:15:57 -0700 Subject: [PATCH 10/15] address comments --- .../ClassificationSummary.scala | 22 ++++++++----------- .../classification/LogisticRegression.scala | 16 +++++++++++--- .../LogisticRegressionSuite.scala | 5 ----- project/MimaExcludes.scala | 4 +--- .../pyspark/ml/tests/test_training_summary.py | 2 -- 5 files changed, 23 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 29cbd45d81fbd..94bbf707a5e2a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -36,12 +36,6 @@ private[classification] trait ClassificationSummary extends Serializable { @Since("3.1.0") def predictions: DataFrame - /** - * Field in "predictions" which gives the probability or rawPrediction of each class as a vector. - */ - @Since("3.1.0") - def scoreCol: String - /** Field in "predictions" which gives the prediction of each class. */ @Since("3.1.0") def predictionCol: String @@ -50,10 +44,6 @@ private[classification] trait ClassificationSummary extends Serializable { @Since("3.1.0") def labelCol: String - /** Field in "predictions" which gives the features of each instance as a vector. */ - @Since("3.1.0") - def featuresCol: String - /** Field in "predictions" which gives the weight of each instance as a vector. */ @Since("3.1.0") def weightCol: String @@ -193,8 +183,12 @@ trait BinaryClassificationSummary extends ClassificationSummary { private val sparkSession = predictions.sparkSession import sparkSession.implicits._ - // TODO: Allow the user to vary the number of bins using a setBins method in - // BinaryClassificationMetrics. For now the default is set to 100. + /** + * Field in "predictions" which gives the probability or rawPrediction of each class as a + * vector. + */ + def scoreCol: String = null + @transient private val binaryMetrics = { val weightColumn = if (predictions.schema.fieldNames.contains(weightCol)) { col(weightCol).cast(DoubleType) @@ -202,10 +196,12 @@ trait BinaryClassificationSummary extends ClassificationSummary { lit(1.0) } + // TODO: Allow the user to vary the number of bins using a setBins method in + // BinaryClassificationMetrics. For now the default is set to 1000. 
new BinaryClassificationMetrics( predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), weightColumn).rdd.map { case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) - }, 100 + }, 1000 ) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 559a84d2249e3..e2a01000c96fa 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1397,7 +1397,11 @@ sealed trait LogisticRegressionSummary extends ClassificationSummary { /** Field in "predictions" which gives the probability of each class as a vector. */ @Since("1.5.0") - def probabilityCol: String = scoreCol + def probabilityCol: String + + /** Field in "predictions" which gives the features of each instance as a vector. */ + @Since("1.6.0") + def featuresCol: String } /** @@ -1412,6 +1416,12 @@ sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary */ sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary with BinaryClassificationSummary { + + override def scoreCol: String = if (probabilityCol.nonEmpty) { + probabilityCol + } else { + throw new SparkException(s"probabilityCol is required for BinaryLogisticRegressionSummary.") + } } /** @@ -1449,7 +1459,7 @@ private class LogisticRegressionTrainingSummaryImpl( * Multiclass logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. - * @param scoreCol field in "predictions" which gives the probability of + * @param probabilityCol field in "predictions" which gives the probability of * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. 
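
For illustration, a minimal usage sketch of the refactored summary API (a hedged example, not part of the patch; it assumes a training DataFrame `df` with "label" and "features" columns):

    import org.apache.spark.ml.classification.LogisticRegression

    val mlor = new LogisticRegression().setFamily("multinomial").fit(df)
    val summary = mlor.summary
    // probabilityCol stays on LogisticRegressionSummary after the rename;
    // the generic scoreCol only matters for binary summaries.
    println(summary.probabilityCol)   // "probability" by default
    // accuracy and the weighted metrics come from the shared
    // ClassificationSummary trait.
    println(summary.accuracy)
    println(summary.weightedFMeasure(0.5))
    // summary.asBinary would throw here, because the model is multinomial.
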
@@ -1459,7 +1469,7 @@ private class LogisticRegressionTrainingSummaryImpl( */ private class LogisticRegressionSummaryImpl( @transient override val predictions: DataFrame, - override val scoreCol: String, + override val probabilityCol: String, override val predictionCol: String, override val labelCol: String, override val featuresCol: String, diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 0caeedfe1d655..ecee531c88a8f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -263,7 +263,6 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { .setMaxIter(1) val blorModel = lr.fit(smallBinaryDataset) - assert(blorModel.summary.probabilityCol.equals("probability")) assert(blorModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) assert(blorModel.summary.asBinary.isInstanceOf[BinaryLogisticRegressionSummary]) assert(blorModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) @@ -271,7 +270,6 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { assert(blorModel.binarySummary.totalIterations == 1) val mlorModel = lr.setFamily("multinomial").fit(smallMultinomialDataset) - assert(mlorModel.summary.probabilityCol.equals("probability")) assert(mlorModel.summary.isInstanceOf[LogisticRegressionTrainingSummary]) withClue("cannot get binary summary for multiclass model") { intercept[RuntimeException] { @@ -286,7 +284,6 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { assert(mlorModel.summary.totalIterations == 1) val mlorBinaryModel = lr.setFamily("multinomial").fit(smallBinaryDataset) - assert(mlorBinaryModel.summary.probabilityCol.equals("probability")) assert(mlorBinaryModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) assert(mlorBinaryModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) @@ -294,8 +291,6 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val mlorSummary = mlorModel.evaluate(smallMultinomialDataset) assert(blorSummary.isInstanceOf[BinaryLogisticRegressionSummary]) assert(mlorSummary.isInstanceOf[LogisticRegressionSummary]) - assert(blorSummary.probabilityCol.equals("probability")) - assert(mlorSummary.probabilityCol.equals("probability")) // verify instance weight works val lr2 = new LogisticRegression() diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index a1bcfb3db552a..aec58fc2decc9 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -76,9 +76,7 @@ object MimaExcludes { ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"), - 
ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.weightCol"), - ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.scoreCol"), - ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.scoreCol") + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.weightCol") ) // Exclude rules for 3.0.x diff --git a/python/pyspark/ml/tests/test_training_summary.py b/python/pyspark/ml/tests/test_training_summary.py index 00f68e91a5656..ac944d8397a86 100644 --- a/python/pyspark/ml/tests/test_training_summary.py +++ b/python/pyspark/ml/tests/test_training_summary.py @@ -150,7 +150,6 @@ def test_binary_logistic_regression_summary(self): # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) - self.assertEqual(sameSummary.probabilityCol, "probability") self.assertTrue(isinstance(sameSummary, BinaryLogisticRegressionSummary)) self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) @@ -190,7 +189,6 @@ def test_multiclass_logistic_regression_summary(self): # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) - self.assertEqual(sameSummary.probabilityCol, "probability") self.assertTrue(isinstance(sameSummary, LogisticRegressionSummary)) self.assertFalse(isinstance(sameSummary, BinaryLogisticRegressionSummary)) self.assertAlmostEqual(sameSummary.accuracy, s.accuracy) From 137e5b20790f6e0c61e6f64fdcf47b2608e198f3 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 15 Jun 2020 01:37:41 -0700 Subject: [PATCH 11/15] change trait BinaryClassificationSummary to private[classification] --- .../apache/spark/ml/classification/ClassificationSummary.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 94bbf707a5e2a..0b93127ef7462 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -178,7 +178,7 @@ private[classification] trait TrainingSummary { /** * Abstraction for binary classification results for a given model. 
*/ -trait BinaryClassificationSummary extends ClassificationSummary { +private[classification] trait BinaryClassificationSummary extends ClassificationSummary { private val sparkSession = predictions.sparkSession import sparkSession.implicits._ From 61e93d0cab590975b5d4f4551edd43cac6b55ce2 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 15 Jun 2020 18:35:21 -0700 Subject: [PATCH 12/15] update python changes --- python/pyspark/ml/classification.py | 36 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 7b12636d11942..5101b99887fe3 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -256,15 +256,6 @@ def predictions(self): """ return self._call_java("predictions") - @property - @since("3.1.0") - def scoreCol(self): - """ - Field in "predictions" which gives the probability or raw prediction - of each class as a vector. - """ - return self._call_java("scoreCol") - @property @since("3.1.0") def predictionCol(self): @@ -282,15 +273,6 @@ def labelCol(self): """ return self._call_java("labelCol") - @property - @since("3.1.0") - def featuresCol(self): - """ - Field in "predictions" which gives the features of each instance - as a vector. - """ - return self._call_java("featuresCol") - @property @since("3.1.0") def weightCol(self): @@ -440,6 +422,15 @@ class _BinaryClassificationSummary(_ClassificationSummary): .. versionadded:: 3.1.0 """ + @property + @since("3.1.0") + def scoreCol(self): + """ + Field in "predictions" which gives the probability or raw prediction + of each class as a vector. + """ + return self._call_java("scoreCol") + @property @since("3.1.0") def roc(self): @@ -1242,6 +1233,15 @@ def probabilityCol(self): """ return self._call_java("probabilityCol") + @property + @since("3.1.0") + def featuresCol(self): + """ + Field in "predictions" which gives the features of each instance + as a vector. + """ + return self._call_java("featuresCol") + @inherit_doc class LogisticRegressionTrainingSummary(LogisticRegressionSummary, _TrainingSummary): From e844c11939d5523d5d7402f5ee5ed6cabd52f8fb Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 17 Jun 2020 21:58:10 -0700 Subject: [PATCH 13/15] move asBinary to LogisticRegression --- .../ml/classification/ClassificationSummary.scala | 11 ----------- .../spark/ml/classification/LogisticRegression.scala | 11 +++++++++++ project/MimaExcludes.scala | 1 - 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 0b93127ef7462..b7a10af7be3cb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -142,17 +142,6 @@ private[classification] trait ClassificationSummary extends Serializable { /** Returns weighted averaged f1-measure. */ @Since("3.1.0") def weightedFMeasure: Double = multiclassMetrics.weightedFMeasure(1.0) - - /** - * Convenient method for casting to binary classification summary. - * This method will throw an Exception if the summary is not a binary summary. 
- */ - @Since("3.1.0") - def asBinary: BinaryClassificationSummary = this match { - case b: BinaryClassificationSummary => b - case _ => - throw new RuntimeException("Cannot cast to a binary summary.") - } } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index e2a01000c96fa..99861c22e16e2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1402,6 +1402,17 @@ sealed trait LogisticRegressionSummary extends ClassificationSummary { /** Field in "predictions" which gives the features of each instance as a vector. */ @Since("1.6.0") def featuresCol: String + + /** + * Convenient method for casting to binary logistic regression summary. + * This method will throw an Exception if the summary is not a binary summary. + */ + @Since("2.3.0") + def asBinary: BinaryLogisticRegressionSummary = this match { + case b: BinaryLogisticRegressionSummary => b + case _ => + throw new RuntimeException("Cannot cast to a binary summary.") + } } /** diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index aec58fc2decc9..c724e82aa4712 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -66,7 +66,6 @@ object MimaExcludes { ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.weightCol"), - ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.asBinary"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightCol"), From c76d5917b9a491213d7d6f4fd4be6793a6d07108 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 18 Jun 2020 00:25:49 -0700 Subject: [PATCH 14/15] address comments --- .../ClassificationSummary.scala | 2 +- .../classification/LogisticRegression.scala | 2 +- python/pyspark/ml/classification.py | 26 +------------------ 3 files changed, 3 insertions(+), 27 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index b7a10af7be3cb..e9ea38161d3c0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -159,7 +159,7 @@ private[classification] trait TrainingSummary { /** Number of training iterations. */ @Since("3.1.0") def totalIterations: Int = { - assert(objectiveHistory.length > 0, s"objectiveHistory length should be greater than 1.") + assert(objectiveHistory.length > 0, "objectiveHistory length should be greater than 0.") objectiveHistory.length - 1 } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 99861c22e16e2..20d619334f7b9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1431,7 +1431,7 @@ sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary override def scoreCol: String = if (probabilityCol.nonEmpty) { probabilityCol } else { - throw new SparkException(s"probabilityCol is required for BinaryLogisticRegressionSummary.") + throw new SparkException("probabilityCol is required for BinaryLogisticRegressionSummary.") } } diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5101b99887fe3..ff506066519cd 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -441,10 +441,6 @@ def roc(self): .. seealso:: `Wikipedia reference `_ - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. """ return self._call_java("roc") @@ -454,10 +450,6 @@ def areaUnderROC(self): """ Computes the area under the receiver operating characteristic (ROC) curve. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. """ return self._call_java("areaUnderROC") @@ -468,10 +460,6 @@ def pr(self): Returns the precision-recall curve, which is a Dataframe containing two fields recall, precision with (0.0, 1.0) prepended to it. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. """ return self._call_java("pr") @@ -481,10 +469,6 @@ def fMeasureByThreshold(self): """ Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. """ return self._call_java("fMeasureByThreshold") @@ -495,10 +479,6 @@ def precisionByThreshold(self): Returns a dataframe with two fields (threshold, precision) curve. Every possible probability obtained in transforming the dataset are used as thresholds used in calculating the precision. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. """ return self._call_java("precisionByThreshold") @@ -509,10 +489,6 @@ def recallByThreshold(self): Returns a dataframe with two fields (threshold, recall) curve. Every possible probability obtained in transforming the dataset are used as thresholds used in calculating the recall. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. 
""" return self._call_java("recallByThreshold") @@ -1234,7 +1210,7 @@ def probabilityCol(self): return self._call_java("probabilityCol") @property - @since("3.1.0") + @since("2.0.0") def featuresCol(self): """ Field in "predictions" which gives the features of each instance From ead6da2e0e3d1a269e6ef76b23f8e845690f42a3 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 18 Jun 2020 10:10:45 -0700 Subject: [PATCH 15/15] remove outdated MiMaExclude --- project/MimaExcludes.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index c724e82aa4712..0be7b4c1003a7 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -48,10 +48,6 @@ object MimaExcludes { ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.feature.ChiSqSelectorModel"), ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.feature.ChiSqSelector"), - //[SPARK-31840] Add instance weight support in LogisticRegressionSummary - // weightCol in org.apache.spark.ml.classification.LogisticRegressionSummary is present only in current version - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightCol"), - // [SPARK-24634] Add a new metric regarding number of inputs later than watermark plus allowed delay ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StateOperatorProgress.$default$4"),