From a537c49170ee2b74563689d3048b2ef85168c939 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 2 Jun 2020 11:42:51 -0700
Subject: [PATCH 01/15] [SPARK-31893][ML] Add a generic ClassificationSummary trait

---
 .../ClassificationSummary.scala               | 275 ++++++++++++++++++
 .../classification/LogisticRegression.scala   | 249 +---------------
 2 files changed, 283 insertions(+), 241 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
new file mode 100644
index 0000000000000..5ae93dc5f1aa8
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.classification
+
+import org.apache.spark.annotation.Since
+import org.apache.spark.ml.functions.checkNonNegativeWeight
+import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.functions.{col, lit}
+import org.apache.spark.sql.types.DoubleType
+
+
+/**
+ * Abstraction for multiclass classification results for a given model.
+ */
+private[classification] trait ClassificationSummary extends Serializable {
+
+  /**
+   * Dataframe output by the model's `transform` method.
+   */
+  @Since("3.1.0")
+  def predictions: DataFrame
+
+  /** Field in "predictions" which gives the prediction of each class. */
+  @Since("3.1.0")
+  def predictionCol: String
+
+  /** Field in "predictions" which gives the true label of each instance (if available). */
+  @Since("3.1.0")
+  def labelCol: String
+
+  /** Field in "predictions" which gives the features of each instance as a vector. */
+  @Since("3.1.0")
+  def featuresCol: String
+
+  /** Field in "predictions" which gives the weight of each instance. */
+  @Since("3.1.0")
+  def weightCol: String
+
+  @transient private val multiclassMetrics = {
+    if (predictions.schema.fieldNames.contains(weightCol)) {
+      new MulticlassMetrics(
+        predictions.select(
+          col(predictionCol),
+          col(labelCol).cast(DoubleType),
+          checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map {
+          case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight)
+        })
+    } else {
+      new MulticlassMetrics(
+        predictions.select(
+          col(predictionCol),
+          col(labelCol).cast(DoubleType),
+          lit(1.0)).rdd.map {
+          case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight)
+        })
+    }
+  }
+
+  /**
+   * Returns the sequence of labels in ascending order. This order matches the order used
+   * in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel.
+   *
+   * Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}. However, if the
+   * training set is missing a label, then all of the arrays over labels
+   * (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the
+   * expected numClasses.
+   */
+  @Since("3.1.0")
+  def labels: Array[Double] = multiclassMetrics.labels
+
+  /** Returns true positive rate for each label (category). */
+  @Since("3.1.0")
+  def truePositiveRateByLabel: Array[Double] = recallByLabel
+
+  /** Returns false positive rate for each label (category). */
+  @Since("3.1.0")
+  def falsePositiveRateByLabel: Array[Double] = {
+    multiclassMetrics.labels.map(label => multiclassMetrics.falsePositiveRate(label))
+  }
+
+  /** Returns precision for each label (category). */
+  @Since("3.1.0")
+  def precisionByLabel: Array[Double] = {
+    multiclassMetrics.labels.map(label => multiclassMetrics.precision(label))
+  }
+
+  /** Returns recall for each label (category). */
+  @Since("3.1.0")
+  def recallByLabel: Array[Double] = {
+    multiclassMetrics.labels.map(label => multiclassMetrics.recall(label))
+  }
+
+  /** Returns f-measure for each label (category). */
+  @Since("3.1.0")
+  def fMeasureByLabel(beta: Double): Array[Double] = {
+    multiclassMetrics.labels.map(label => multiclassMetrics.fMeasure(label, beta))
+  }
+
+  /** Returns f1-measure for each label (category). */
+  @Since("3.1.0")
+  def fMeasureByLabel: Array[Double] = fMeasureByLabel(1.0)
+
+  /**
+   * Returns accuracy
+   * (the fraction of instances that are classified correctly
+   * out of the total number of instances).
+   */
+  @Since("3.1.0")
+  def accuracy: Double = multiclassMetrics.accuracy
+
+  /**
+   * Returns weighted true positive rate.
+   * (equals weighted recall)
+   */
+  @Since("3.1.0")
+  def weightedTruePositiveRate: Double = weightedRecall
+
+  /** Returns weighted false positive rate. */
+  @Since("3.1.0")
+  def weightedFalsePositiveRate: Double = multiclassMetrics.weightedFalsePositiveRate
+
+  /**
+   * Returns weighted averaged recall.
+   * (equals weighted true positive rate)
+   */
+  @Since("3.1.0")
+  def weightedRecall: Double = multiclassMetrics.weightedRecall
+
+  /** Returns weighted averaged precision. */
+  @Since("3.1.0")
+  def weightedPrecision: Double = multiclassMetrics.weightedPrecision
+
+  /** Returns weighted averaged f-measure. */
+  @Since("3.1.0")
+  def weightedFMeasure(beta: Double): Double = multiclassMetrics.weightedFMeasure(beta)
+
+  /** Returns weighted averaged f1-measure. */
+  @Since("3.1.0")
+  def weightedFMeasure: Double = multiclassMetrics.weightedFMeasure(1.0)
+
+  /**
+   * Convenient method for casting to binary classification summary.
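+   * A minimal usage sketch (hypothetical, assuming `model` is a fitted binary
+   * classification model whose summary mixes in this trait):
+   * {{{
+   *   val binarySummary = model.summary.asBinary
+   *   println(binarySummary.areaUnderROC)
+   * }}}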
+   * This method will throw an Exception if the summary is not a binary summary.
+   */
+  @Since("3.1.0")
+  def asBinary: BinaryClassificationSummary = this match {
+    case b: BinaryClassificationSummary => b
+    case _ =>
+      throw new RuntimeException("Cannot cast to a binary summary.")
+  }
+}
+
+/**
+ * Abstraction for multiclass classification training results.
+ */
+private[classification] trait ClassificationTrainingSummary extends ClassificationSummary {
+
+  /**
+   * Objective function (scaled loss + regularization) at each iteration.
+   * It contains one more element, the initial state, than number of iterations.
+   */
+  @Since("3.1.0")
+  def objectiveHistory: Array[Double]
+
+  /**
+   * Number of training iterations (one less than the length of objectiveHistory,
+   * which also records the initial state).
+   */
+  @Since("3.1.0")
+  def totalIterations: Int = {
+    assert(objectiveHistory.length > 0, "objectiveHistory length should be greater than 0.")
+    objectiveHistory.length - 1
+  }
+}
+
+/**
+ * Abstraction for binary classification results for a given model.
+ */
+trait BinaryClassificationSummary extends ClassificationSummary {
+
+  private val sparkSession = predictions.sparkSession
+  import sparkSession.implicits._
+
+  /** Field in "predictions" which gives the probability of each class as a vector. */
+  @Since("3.1.0")
+  def probabilityCol: String
+
+  // TODO: Allow the user to vary the number of bins using a setBins method in
+  // BinaryClassificationMetrics. For now the default is set to 100.
+  @transient private val binaryMetrics = if (predictions.schema.fieldNames.contains(weightCol)) {
+    new BinaryClassificationMetrics(
+      predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType),
+        checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map {
+        case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight)
+      }, 100
+    )
+  } else {
+    new BinaryClassificationMetrics(
+      predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType),
+        lit(1.0)).rdd.map {
+        case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight)
+      }, 100
+    )
+  }
+
+  /**
+   * Returns the receiver operating characteristic (ROC) curve,
+   * which is a Dataframe having two fields (FPR, TPR)
+   * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it.
+   * See http://en.wikipedia.org/wiki/Receiver_operating_characteristic
+   */
+  @Since("3.1.0")
+  @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR")
+
+  /**
+   * Computes the area under the receiver operating characteristic (ROC) curve.
+   */
+  @Since("3.1.0")
+  lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC()
+
+  /**
+   * Returns the precision-recall curve, which is a Dataframe containing
+   * two fields recall, precision with (0.0, 1.0) prepended to it.
+   */
+  @Since("3.1.0")
+  @transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", "precision")
+
+  /**
+   * Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0.
+   */
+  @Since("3.1.0")
+  @transient lazy val fMeasureByThreshold: DataFrame = {
+    binaryMetrics.fMeasureByThreshold().toDF("threshold", "F-Measure")
+  }
+
+  /**
+   * Returns a dataframe with two fields (threshold, precision) curve.
+   * Every possible probability obtained in transforming the dataset is used
+   * as a threshold in calculating the precision.
+   */
+  @Since("3.1.0")
+  @transient lazy val precisionByThreshold: DataFrame = {
+    binaryMetrics.precisionByThreshold().toDF("threshold", "precision")
+  }
+
+  /**
+   * Returns a dataframe with two fields (threshold, recall) curve.
+ * Every possible probability obtained in transforming the dataset are used + * as thresholds used in calculating the recall. + */ + @Since("3.1.0") + @transient lazy val recallByThreshold: DataFrame = { + binaryMetrics.recallByThreshold().toDF("threshold", "recall") + } +} + +/** + * Abstraction for binary classification training results. + */ +sealed trait BinaryClassificationTrainingSummary extends BinaryClassificationSummary + with ClassificationTrainingSummary diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 1f5976c59235b..b7e928588e837 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -29,7 +29,6 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.feature._ -import org.apache.spark.ml.functions.checkNonNegativeWeight import org.apache.spark.ml.linalg._ import org.apache.spark.ml.optim.aggregator._ import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction} @@ -38,12 +37,10 @@ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.stat._ import org.apache.spark.ml.util._ import org.apache.spark.ml.util.Instrumentation.instrumented -import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Row} -import org.apache.spark.sql.functions.{col, lit} -import org.apache.spark.sql.types.{DataType, DoubleType, StructType} +import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.storage.StorageLevel import org.apache.spark.util.VersionUtils @@ -1194,7 +1191,7 @@ class LogisticRegressionModel private[spark] ( val (summaryModel, probabilityColName, predictionColName) = findSummaryModel() if (numClasses > 2) { new LogisticRegressionSummaryImpl(summaryModel.transform(dataset), - probabilityColName, predictionColName, $(labelCol), $(featuresCol), weightColName) + predictionColName, $(labelCol), $(featuresCol), weightColName) } else { new BinaryLogisticRegressionSummaryImpl(summaryModel.transform(dataset), probabilityColName, predictionColName, $(labelCol), $(featuresCol), weightColName) @@ -1396,246 +1393,19 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { /** * Abstraction for logistic regression results for a given model. */ -sealed trait LogisticRegressionSummary extends Serializable { - - /** - * Dataframe output by the model's `transform` method. - */ - @Since("1.5.0") - def predictions: DataFrame - - /** Field in "predictions" which gives the probability of each class as a vector. */ - @Since("1.5.0") - def probabilityCol: String - - /** Field in "predictions" which gives the prediction of each class. */ - @Since("2.3.0") - def predictionCol: String - - /** Field in "predictions" which gives the true label of each instance (if available). */ - @Since("1.5.0") - def labelCol: String - - /** Field in "predictions" which gives the features of each instance as a vector. */ - @Since("1.6.0") - def featuresCol: String - - /** Field in "predictions" which gives the weight of each instance as a vector. 
*/ - @Since("3.1.0") - def weightCol: String - - @transient private val multiclassMetrics = { - if (predictions.schema.fieldNames.contains(weightCol)) { - new MulticlassMetrics( - predictions.select( - col(predictionCol), - col(labelCol).cast(DoubleType), - checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map { - case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight) - }) - } else { - new MulticlassMetrics( - predictions.select( - col(predictionCol), - col(labelCol).cast(DoubleType), - lit(1.0)).rdd.map { - case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight) - }) - } - } - - /** - * Returns the sequence of labels in ascending order. This order matches the order used - * in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel. - * - * Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the - * training set is missing a label, then all of the arrays over labels - * (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the - * expected numClasses. - */ - @Since("2.3.0") - def labels: Array[Double] = multiclassMetrics.labels - - /** Returns true positive rate for each label (category). */ - @Since("2.3.0") - def truePositiveRateByLabel: Array[Double] = recallByLabel - - /** Returns false positive rate for each label (category). */ - @Since("2.3.0") - def falsePositiveRateByLabel: Array[Double] = { - multiclassMetrics.labels.map(label => multiclassMetrics.falsePositiveRate(label)) - } - - /** Returns precision for each label (category). */ - @Since("2.3.0") - def precisionByLabel: Array[Double] = { - multiclassMetrics.labels.map(label => multiclassMetrics.precision(label)) - } - - /** Returns recall for each label (category). */ - @Since("2.3.0") - def recallByLabel: Array[Double] = { - multiclassMetrics.labels.map(label => multiclassMetrics.recall(label)) - } - - /** Returns f-measure for each label (category). */ - @Since("2.3.0") - def fMeasureByLabel(beta: Double): Array[Double] = { - multiclassMetrics.labels.map(label => multiclassMetrics.fMeasure(label, beta)) - } - - /** Returns f1-measure for each label (category). */ - @Since("2.3.0") - def fMeasureByLabel: Array[Double] = fMeasureByLabel(1.0) - - /** - * Returns accuracy. - * (equals to the total number of correctly classified instances - * out of the total number of instances.) - */ - @Since("2.3.0") - def accuracy: Double = multiclassMetrics.accuracy - - /** - * Returns weighted true positive rate. - * (equals to precision, recall and f-measure) - */ - @Since("2.3.0") - def weightedTruePositiveRate: Double = weightedRecall - - /** Returns weighted false positive rate. */ - @Since("2.3.0") - def weightedFalsePositiveRate: Double = multiclassMetrics.weightedFalsePositiveRate - - /** - * Returns weighted averaged recall. - * (equals to precision, recall and f-measure) - */ - @Since("2.3.0") - def weightedRecall: Double = multiclassMetrics.weightedRecall - - /** Returns weighted averaged precision. */ - @Since("2.3.0") - def weightedPrecision: Double = multiclassMetrics.weightedPrecision - - /** Returns weighted averaged f-measure. */ - @Since("2.3.0") - def weightedFMeasure(beta: Double): Double = multiclassMetrics.weightedFMeasure(beta) - - /** Returns weighted averaged f1-measure. */ - @Since("2.3.0") - def weightedFMeasure: Double = multiclassMetrics.weightedFMeasure(1.0) - - /** - * Convenient method for casting to binary logistic regression summary. 
- * This method will throw an Exception if the summary is not a binary summary. - */ - @Since("2.3.0") - def asBinary: BinaryLogisticRegressionSummary = this match { - case b: BinaryLogisticRegressionSummary => b - case _ => - throw new RuntimeException("Cannot cast to a binary summary.") - } +sealed trait LogisticRegressionSummary extends ClassificationSummary { } /** * Abstraction for multiclass logistic regression training results. - * Currently, the training summary ignores the training weights except - * for the objective trace. */ -sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary { - - /** - * objective function (scaled loss + regularization) at each iteration. - * It contains one more element, the initial state, than number of iterations. - */ - @Since("1.5.0") - def objectiveHistory: Array[Double] - - /** Number of training iterations. */ - @Since("1.5.0") - def totalIterations: Int = { - assert(objectiveHistory.length > 0, s"objectiveHistory length should be greater than 1.") - objectiveHistory.length - 1 - } - +sealed trait LogisticRegressionTrainingSummary extends ClassificationTrainingSummary { } /** * Abstraction for binary logistic regression results for a given model. */ -sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary { - - private val sparkSession = predictions.sparkSession - import sparkSession.implicits._ - - // TODO: Allow the user to vary the number of bins using a setBins method in - // BinaryClassificationMetrics. For now the default is set to 100. - @transient private val binaryMetrics = if (predictions.schema.fieldNames.contains(weightCol)) { - new BinaryClassificationMetrics( - predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType), - checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map { - case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) - }, 100 - ) - } else { - new BinaryClassificationMetrics( - predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType), - lit(1.0)).rdd.map { - case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) - }, 100 - ) - } - - /** - * Returns the receiver operating characteristic (ROC) curve, - * which is a Dataframe having two fields (FPR, TPR) - * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. - * See http://en.wikipedia.org/wiki/Receiver_operating_characteristic - */ - @Since("1.5.0") - @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR") - - /** - * Computes the area under the receiver operating characteristic (ROC) curve. - */ - @Since("1.5.0") - lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC() - - /** - * Returns the precision-recall curve, which is a Dataframe containing - * two fields recall, precision with (0.0, 1.0) prepended to it. - */ - @Since("1.5.0") - @transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", "precision") - - /** - * Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. - */ - @Since("1.5.0") - @transient lazy val fMeasureByThreshold: DataFrame = { - binaryMetrics.fMeasureByThreshold().toDF("threshold", "F-Measure") - } - - /** - * Returns a dataframe with two fields (threshold, precision) curve. - * Every possible probability obtained in transforming the dataset are used - * as thresholds used in calculating the precision. 
- */ - @Since("1.5.0") - @transient lazy val precisionByThreshold: DataFrame = { - binaryMetrics.precisionByThreshold().toDF("threshold", "precision") - } - - /** - * Returns a dataframe with two fields (threshold, recall) curve. - * Every possible probability obtained in transforming the dataset are used - * as thresholds used in calculating the recall. - */ - @Since("1.5.0") - @transient lazy val recallByThreshold: DataFrame = { - binaryMetrics.recallByThreshold().toDF("threshold", "recall") - } +sealed trait BinaryLogisticRegressionSummary extends BinaryClassificationSummary { } /** @@ -1666,15 +1436,13 @@ private class LogisticRegressionTrainingSummaryImpl( weightCol: String, override val objectiveHistory: Array[Double]) extends LogisticRegressionSummaryImpl( - predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) + predictions, predictionCol, labelCol, featuresCol, weightCol) with LogisticRegressionTrainingSummary /** * Multiclass logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. - * @param probabilityCol field in "predictions" which gives the probability of - * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. * @param labelCol field in "predictions" which gives the true label of each instance. @@ -1683,7 +1451,6 @@ private class LogisticRegressionTrainingSummaryImpl( */ private class LogisticRegressionSummaryImpl( @transient override val predictions: DataFrame, - override val probabilityCol: String, override val predictionCol: String, override val labelCol: String, override val featuresCol: String, @@ -1729,11 +1496,11 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( */ private class BinaryLogisticRegressionSummaryImpl( predictions: DataFrame, - probabilityCol: String, + override val probabilityCol: String, predictionCol: String, labelCol: String, featuresCol: String, weightCol: String) extends LogisticRegressionSummaryImpl( - predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) + predictions, predictionCol, labelCol, featuresCol, weightCol) with BinaryLogisticRegressionSummary From 93c82bd5b47099fc7623c1c8994caebb11deb69e Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 2 Jun 2020 14:32:36 -0700 Subject: [PATCH 02/15] change probabilityCol to scoreCol --- .../ml/classification/ClassificationSummary.scala | 6 +++--- .../spark/ml/classification/LogisticRegression.scala | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 5ae93dc5f1aa8..10588851f6619 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -197,20 +197,20 @@ trait BinaryClassificationSummary extends ClassificationSummary { /** Field in "predictions" which gives the probability of each class as a vector. */ @Since("3.1.0") - def probabilityCol: String + def scoreCol: String // TODO: Allow the user to vary the number of bins using a setBins method in // BinaryClassificationMetrics. For now the default is set to 100. 
@transient private val binaryMetrics = if (predictions.schema.fieldNames.contains(weightCol)) { new BinaryClassificationMetrics( - predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType), + predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map { case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) }, 100 ) } else { new BinaryClassificationMetrics( - predictions.select(col(probabilityCol), col(labelCol).cast(DoubleType), + predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), lit(1.0)).rdd.map { case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) }, 100 diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index b7e928588e837..95e1315ad3fb2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1461,7 +1461,7 @@ private class LogisticRegressionSummaryImpl( * Binary logistic regression training results. * * @param predictions dataframe output by the model's `transform` method. - * @param probabilityCol field in "predictions" which gives the probability of + * @param scoreCol field in "predictions" which gives the probability of * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. @@ -1472,21 +1472,21 @@ private class LogisticRegressionSummaryImpl( */ private class BinaryLogisticRegressionTrainingSummaryImpl( predictions: DataFrame, - probabilityCol: String, + scoreCol: String, predictionCol: String, labelCol: String, featuresCol: String, weightCol: String, override val objectiveHistory: Array[Double]) extends BinaryLogisticRegressionSummaryImpl( - predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) + predictions, scoreCol, predictionCol, labelCol, featuresCol, weightCol) with BinaryLogisticRegressionTrainingSummary /** * Binary logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. - * @param probabilityCol field in "predictions" which gives the probability of + * @param scoreCol field in "predictions" which gives the probability of * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction of * each class as a double. 
@@ -1496,7 +1496,7 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( */ private class BinaryLogisticRegressionSummaryImpl( predictions: DataFrame, - override val probabilityCol: String, + override val scoreCol: String, predictionCol: String, labelCol: String, featuresCol: String, From 981ae656060ed8cf89bea0923532b1606ebf2e33 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 2 Jun 2020 14:59:11 -0700 Subject: [PATCH 03/15] fix MiMa --- project/MimaExcludes.scala | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index addb2d8152189..aec58fc2decc9 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -39,6 +39,7 @@ object MimaExcludes { // [SPARK-31077] Remove ChiSqSelector dependency on mllib.ChiSqSelectorModel // private constructor ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.ml.feature.ChiSqSelectorModel.this"), + // [SPARK-31127] Implement abstract Selector // org.apache.spark.ml.feature.ChiSqSelectorModel type hierarchy change // before: class ChiSqSelector extends Estimator with ChiSqSelectorParams @@ -46,11 +47,36 @@ object MimaExcludes { // false positive, no binary incompatibility ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.feature.ChiSqSelectorModel"), ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.feature.ChiSqSelector"), + //[SPARK-31840] Add instance weight support in LogisticRegressionSummary // weightCol in org.apache.spark.ml.classification.LogisticRegressionSummary is present only in current version ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightCol"), + // [SPARK-24634] Add a new metric regarding number of inputs later than watermark plus allowed delay - ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StateOperatorProgress.$default$4") + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StateOperatorProgress.$default$4"), + + //[SPARK-31893] Add a generic ClassificationSummary trait + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.weightCol"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$_setter_$org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession_="), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$_setter_$org$apache$spark$ml$classification$BinaryClassificationSummary$$binaryMetrics_="), + 
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$binaryMetrics"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.weightCol"),
+    ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.asBinary"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightCol"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$_setter_$org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession_="),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$_setter_$org$apache$spark$ml$classification$BinaryClassificationSummary$$binaryMetrics_="),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$binaryMetrics"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"),
+    ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.weightCol")
   )

   // Exclude rules for 3.0.x

From 173a24ebf189ad28cf237e1f8fe0e265af63269c Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 2 Jun 2020 15:28:01 -0700
Subject: [PATCH 04/15] nit

---
 .../spark/ml/classification/ClassificationSummary.scala      | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
index 10588851f6619..3ff7a15a1442a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
@@ -195,7 +195,9 @@ trait BinaryClassificationSummary extends ClassificationSummary {
   private val sparkSession = predictions.sparkSession
   import sparkSession.implicits._
 
-  /** Field in "predictions" which gives the probability of each class as a vector. */
+  /**
+   * Field in "predictions" which gives the probability or rawPrediction of each class as a vector.
+   */
   @Since("3.1.0")
   def scoreCol: String
 
From aa6c00eb845a34d1ae334f2239dd3fbdb72be4d4 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Wed, 3 Jun 2020 14:01:27 -0700
Subject: [PATCH 05/15] keep probabilityCol in LogisticRegression as an alias of scoreCol

---
 .../classification/ClassificationSummary.scala     | 12 ++++++------
 .../ml/classification/LogisticRegression.scala     | 17 ++++++++++++-----
 project/MimaExcludes.scala                         |  4 +++-
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
index 3ff7a15a1442a..049860c81ddad 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
@@ -37,6 +37,12 @@ private[classification] trait ClassificationSummary extends Serializable {
   @Since("3.1.0")
   def predictions: DataFrame
 
+  /**
+   * Field in "predictions" which gives the probability or rawPrediction of each class as a vector.
+   */
+  @Since("3.1.0")
+  def scoreCol: String
+
   /** Field in "predictions" which gives the prediction of each class. */
   @Since("3.1.0")
   def predictionCol: String
@@ -195,12 +201,6 @@ trait BinaryClassificationSummary extends ClassificationSummary {
   private val sparkSession = predictions.sparkSession
   import sparkSession.implicits._
 
-  /**
-   * Field in "predictions" which gives the probability or rawPrediction of each class as a vector.
-   */
-  @Since("3.1.0")
-  def scoreCol: String
-
   // TODO: Allow the user to vary the number of bins using a setBins method in
   // BinaryClassificationMetrics. For now the default is set to 100.
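+  // When no weight column is present in "predictions", the else branch below
+  // assigns every instance a literal weight of 1.0.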
@transient private val binaryMetrics = if (predictions.schema.fieldNames.contains(weightCol)) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 95e1315ad3fb2..2befee50d970d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1191,7 +1191,7 @@ class LogisticRegressionModel private[spark] ( val (summaryModel, probabilityColName, predictionColName) = findSummaryModel() if (numClasses > 2) { new LogisticRegressionSummaryImpl(summaryModel.transform(dataset), - predictionColName, $(labelCol), $(featuresCol), weightColName) + probabilityColName, predictionColName, $(labelCol), $(featuresCol), weightColName) } else { new BinaryLogisticRegressionSummaryImpl(summaryModel.transform(dataset), probabilityColName, predictionColName, $(labelCol), $(featuresCol), weightColName) @@ -1394,6 +1394,10 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { * Abstraction for logistic regression results for a given model. */ sealed trait LogisticRegressionSummary extends ClassificationSummary { + + /** Field in "predictions" which gives the probability of each class as a vector. */ + @Since("1.5.0") + def probabilityCol: String = scoreCol } /** @@ -1436,13 +1440,15 @@ private class LogisticRegressionTrainingSummaryImpl( weightCol: String, override val objectiveHistory: Array[Double]) extends LogisticRegressionSummaryImpl( - predictions, predictionCol, labelCol, featuresCol, weightCol) + predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) with LogisticRegressionTrainingSummary /** * Multiclass logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. + * @param scoreCol field in "predictions" which gives the probability of + * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. * @param labelCol field in "predictions" which gives the true label of each instance. @@ -1451,6 +1457,7 @@ private class LogisticRegressionTrainingSummaryImpl( */ private class LogisticRegressionSummaryImpl( @transient override val predictions: DataFrame, + override val scoreCol: String, override val predictionCol: String, override val labelCol: String, override val featuresCol: String, @@ -1487,7 +1494,7 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( * * @param predictions dataframe output by the model's `transform` method. * @param scoreCol field in "predictions" which gives the probability of - * each class as a vector. + * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction of * each class as a double. * @param labelCol field in "predictions" which gives the true label of each instance. 
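A quick sketch of what the alias above means for callers (hypothetical usage,
assuming `model` is a fitted `LogisticRegressionModel`):

    val summary = model.summary
    // probabilityCol is kept as the stable public name; it simply forwards to scoreCol
    assert(summary.probabilityCol == summary.scoreCol)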
@@ -1496,11 +1503,11 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( */ private class BinaryLogisticRegressionSummaryImpl( predictions: DataFrame, - override val scoreCol: String, + scoreCol: String, predictionCol: String, labelCol: String, featuresCol: String, weightCol: String) extends LogisticRegressionSummaryImpl( - predictions, predictionCol, labelCol, featuresCol, weightCol) + predictions, scoreCol, predictionCol, labelCol, featuresCol, weightCol) with BinaryLogisticRegressionSummary diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index aec58fc2decc9..a1bcfb3db552a 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -76,7 +76,9 @@ object MimaExcludes { ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"), - ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.weightCol") + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.weightCol"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.scoreCol"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.scoreCol") ) // Exclude rules for 3.0.x From d718af73d3db934b4c205a58d37be5846dbd4874 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 3 Jun 2020 14:36:25 -0700 Subject: [PATCH 06/15] nit --- .../ml/classification/LogisticRegression.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 2befee50d970d..b3f7b515909a3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1448,7 +1448,7 @@ private class LogisticRegressionTrainingSummaryImpl( * * @param predictions dataframe output by the model's `transform` method. * @param scoreCol field in "predictions" which gives the probability of - * each class as a vector. + * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. * @param labelCol field in "predictions" which gives the true label of each instance. @@ -1468,7 +1468,7 @@ private class LogisticRegressionSummaryImpl( * Binary logistic regression training results. * * @param predictions dataframe output by the model's `transform` method. - * @param scoreCol field in "predictions" which gives the probability of + * @param probabilityCol field in "predictions" which gives the probability of * each class as a vector. 
* @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. @@ -1479,22 +1479,22 @@ private class LogisticRegressionSummaryImpl( */ private class BinaryLogisticRegressionTrainingSummaryImpl( predictions: DataFrame, - scoreCol: String, + probabilityCol: String, predictionCol: String, labelCol: String, featuresCol: String, weightCol: String, override val objectiveHistory: Array[Double]) extends BinaryLogisticRegressionSummaryImpl( - predictions, scoreCol, predictionCol, labelCol, featuresCol, weightCol) + predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) with BinaryLogisticRegressionTrainingSummary /** * Binary logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. - * @param scoreCol field in "predictions" which gives the probability of - * each class as a vector. + * @param probabilityCol field in "predictions" which gives the probability of + * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction of * each class as a double. * @param labelCol field in "predictions" which gives the true label of each instance. @@ -1503,11 +1503,11 @@ private class BinaryLogisticRegressionTrainingSummaryImpl( */ private class BinaryLogisticRegressionSummaryImpl( predictions: DataFrame, - scoreCol: String, + probabilityCol: String, predictionCol: String, labelCol: String, featuresCol: String, weightCol: String) extends LogisticRegressionSummaryImpl( - predictions, scoreCol, predictionCol, labelCol, featuresCol, weightCol) + predictions, probabilityCol, predictionCol, labelCol, featuresCol, weightCol) with BinaryLogisticRegressionSummary From 0856d4050202030971c6b135f9686c74b79c6f92 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 4 Jun 2020 14:01:46 -0700 Subject: [PATCH 07/15] add python changes --- .../ClassificationSummary.scala | 10 +- .../classification/LogisticRegression.scala | 6 +- .../LogisticRegressionSuite.scala | 9 +- python/pyspark/ml/classification.py | 546 ++++++++++-------- .../pyspark/ml/tests/test_training_summary.py | 2 + 5 files changed, 307 insertions(+), 266 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 049860c81ddad..435ce00a78e5b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -174,9 +174,9 @@ private[classification] trait ClassificationSummary extends Serializable { } /** - * Abstraction for multiclass classification training results. + * Abstraction for training results. */ -private[classification] trait ClassificationTrainingSummary extends ClassificationSummary { +private[classification] trait TrainingSummary { /** * objective function (scaled loss + regularization) at each iteration. @@ -269,9 +269,3 @@ trait BinaryClassificationSummary extends ClassificationSummary { binaryMetrics.recallByThreshold().toDF("threshold", "recall") } } - -/** - * Abstraction for binary classification training results. 
- */ -sealed trait BinaryClassificationTrainingSummary extends BinaryClassificationSummary - with ClassificationTrainingSummary diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index b3f7b515909a3..559a84d2249e3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1403,13 +1403,15 @@ sealed trait LogisticRegressionSummary extends ClassificationSummary { /** * Abstraction for multiclass logistic regression training results. */ -sealed trait LogisticRegressionTrainingSummary extends ClassificationTrainingSummary { +sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary + with TrainingSummary { } /** * Abstraction for binary logistic regression results for a given model. */ -sealed trait BinaryLogisticRegressionSummary extends BinaryClassificationSummary { +sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary + with BinaryClassificationSummary { } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 30c21d8b06670..0caeedfe1d655 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -263,6 +263,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { .setMaxIter(1) val blorModel = lr.fit(smallBinaryDataset) + assert(blorModel.summary.probabilityCol.equals("probability")) assert(blorModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) assert(blorModel.summary.asBinary.isInstanceOf[BinaryLogisticRegressionSummary]) assert(blorModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) @@ -270,6 +271,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { assert(blorModel.binarySummary.totalIterations == 1) val mlorModel = lr.setFamily("multinomial").fit(smallMultinomialDataset) + assert(mlorModel.summary.probabilityCol.equals("probability")) assert(mlorModel.summary.isInstanceOf[LogisticRegressionTrainingSummary]) withClue("cannot get binary summary for multiclass model") { intercept[RuntimeException] { @@ -284,6 +286,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { assert(mlorModel.summary.totalIterations == 1) val mlorBinaryModel = lr.setFamily("multinomial").fit(smallBinaryDataset) + assert(mlorBinaryModel.summary.probabilityCol.equals("probability")) assert(mlorBinaryModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) assert(mlorBinaryModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) @@ -291,6 +294,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val mlorSummary = mlorModel.evaluate(smallMultinomialDataset) assert(blorSummary.isInstanceOf[BinaryLogisticRegressionSummary]) assert(mlorSummary.isInstanceOf[LogisticRegressionSummary]) + assert(blorSummary.probabilityCol.equals("probability")) + assert(mlorSummary.probabilityCol.equals("probability")) // verify instance weight works val lr2 = new LogisticRegression() @@ -313,12 +318,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 
assert(mlorModel2.summary.isInstanceOf[LogisticRegressionTrainingSummary]) withClue("cannot get binary summary for multiclass model") { intercept[RuntimeException] { - mlorModel.binarySummary + mlorModel2.binarySummary } } withClue("cannot cast summary to binary summary multiclass model") { intercept[RuntimeException] { - mlorModel.summary.asBinary + mlorModel2.summary.asBinary } } diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 3f3699ce53b51..92eecde99943a 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -240,6 +240,292 @@ def predictProbability(self, value): return self._call_java("predictProbability", value) +@inherit_doc +class _ClassificationSummary(JavaWrapper): + """ + Abstraction for multiclass classification results for a given model. + + .. versionadded:: 3.1.0 + """ + + @property + @since("3.1.0") + def predictions(self): + """ + Dataframe outputted by the model's `transform` method. + """ + return self._call_java("predictions") + + @property + @since("3.1.0") + def scoreCol(self): + """ + Field in "predictions" which gives the probability or raw prediction + of each class as a vector. + """ + return self._call_java("scoreCol") + + @property + @since("3.1.0") + def predictionCol(self): + """ + Field in "predictions" which gives the prediction of each class. + """ + return self._call_java("predictionCol") + + @property + @since("3.1.0") + def labelCol(self): + """ + Field in "predictions" which gives the true label of each + instance. + """ + return self._call_java("labelCol") + + @property + @since("3.1.0") + def featuresCol(self): + """ + Field in "predictions" which gives the features of each instance + as a vector. + """ + return self._call_java("featuresCol") + + @property + @since("3.1.0") + def weightCol(self): + """ + Field in "predictions" which gives the weight of each instance + as a vector. + """ + return self._call_java("weightCol") + + @property + @since("3.1.0") + def labels(self): + """ + Returns the sequence of labels in ascending order. This order matches the order used + in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel. + + Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the + training set is missing a label, then all of the arrays over labels + (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the + expected numClasses. + """ + return self._call_java("labels") + + @property + @since("3.1.0") + def truePositiveRateByLabel(self): + """ + Returns true positive rate for each label (category). + """ + return self._call_java("truePositiveRateByLabel") + + @property + @since("3.1.0") + def falsePositiveRateByLabel(self): + """ + Returns false positive rate for each label (category). + """ + return self._call_java("falsePositiveRateByLabel") + + @property + @since("3.1.0") + def precisionByLabel(self): + """ + Returns precision for each label (category). + """ + return self._call_java("precisionByLabel") + + @property + @since("3.1.0") + def recallByLabel(self): + """ + Returns recall for each label (category). + """ + return self._call_java("recallByLabel") + + @since("3.1.0") + def fMeasureByLabel(self, beta=1.0): + """ + Returns f-measure for each label (category). + """ + return self._call_java("fMeasureByLabel", beta) + + @property + @since("3.1.0") + def accuracy(self): + """ + Returns accuracy. 
+ (equals to the total number of correctly classified instances + out of the total number of instances.) + """ + return self._call_java("accuracy") + + @property + @since("3.1.0") + def weightedTruePositiveRate(self): + """ + Returns weighted true positive rate. + (equals to precision, recall and f-measure) + """ + return self._call_java("weightedTruePositiveRate") + + @property + @since("3.1.0") + def weightedFalsePositiveRate(self): + """ + Returns weighted false positive rate. + """ + return self._call_java("weightedFalsePositiveRate") + + @property + @since("3.1.0") + def weightedRecall(self): + """ + Returns weighted averaged recall. + (equals to precision, recall and f-measure) + """ + return self._call_java("weightedRecall") + + @property + @since("3.1.0") + def weightedPrecision(self): + """ + Returns weighted averaged precision. + """ + return self._call_java("weightedPrecision") + + @since("3.1.0") + def weightedFMeasure(self, beta=1.0): + """ + Returns weighted averaged f-measure. + """ + return self._call_java("weightedFMeasure", beta) + + +@inherit_doc +class _TrainingSummary(JavaWrapper): + """ + Abstraction for Training results. + + .. versionadded:: 3.1.0 + """ + + @property + @since("3.1.0") + def objectiveHistory(self): + """ + Objective function (scaled loss + regularization) at each + iteration. It contains one more element, the initial state, + than number of iterations. + """ + return self._call_java("objectiveHistory") + + @property + @since("3.1.0") + def totalIterations(self): + """ + Number of training iterations until termination. + """ + return self._call_java("totalIterations") + + +@inherit_doc +class _BinaryClassificationSummary(_ClassificationSummary): + """ + Binary classification results for a given model. + + .. versionadded:: 3.1.0 + """ + + @property + @since("3.1.0") + def roc(self): + """ + Returns the receiver operating characteristic (ROC) curve, + which is a Dataframe having two fields (FPR, TPR) with + (0.0, 0.0) prepended and (1.0, 1.0) appended to it. + + .. seealso:: `Wikipedia reference + `_ + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("roc") + + @property + @since("3.1.0") + def areaUnderROC(self): + """ + Computes the area under the receiver operating characteristic + (ROC) curve. + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("areaUnderROC") + + @property + @since("3.1.0") + def pr(self): + """ + Returns the precision-recall curve, which is a Dataframe + containing two fields recall, precision with (0.0, 1.0) prepended + to it. + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("pr") + + @property + @since("3.1.0") + def fMeasureByThreshold(self): + """ + Returns a dataframe with two fields (threshold, F-Measure) curve + with beta = 1.0. + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("fMeasureByThreshold") + + @property + @since("3.1.0") + def precisionByThreshold(self): + """ + Returns a dataframe with two fields (threshold, precision) curve. 
+ Every possible probability obtained in transforming the dataset + are used as thresholds used in calculating the precision. + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("precisionByThreshold") + + @property + @since("3.1.0") + def recallByThreshold(self): + """ + Returns a dataframe with two fields (threshold, recall) curve. + Every possible probability obtained in transforming the dataset + are used as thresholds used in calculating the recall. + + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. + """ + return self._call_java("recallByThreshold") + + class _LinearSVCParams(_ClassifierParams, HasRegParam, HasMaxIter, HasFitIntercept, HasTol, HasStandardization, HasWeightCol, HasAggregationDepth, HasThreshold, HasBlockSize): @@ -940,21 +1226,13 @@ def evaluate(self, dataset): return LogisticRegressionSummary(java_blr_summary) -class LogisticRegressionSummary(JavaWrapper): +class LogisticRegressionSummary(_ClassificationSummary): """ Abstraction for Logistic Regression Results for a given model. .. versionadded:: 2.0.0 """ - @property - @since("2.0.0") - def predictions(self): - """ - Dataframe outputted by the model's `transform` method. - """ - return self._call_java("predictions") - @property @since("2.0.0") def probabilityCol(self): @@ -964,268 +1242,28 @@ def probabilityCol(self): """ return self._call_java("probabilityCol") - @property - @since("2.3.0") - def predictionCol(self): - """ - Field in "predictions" which gives the prediction of each class. - """ - return self._call_java("predictionCol") - - @property - @since("2.0.0") - def labelCol(self): - """ - Field in "predictions" which gives the true label of each - instance. - """ - return self._call_java("labelCol") - - @property - @since("2.0.0") - def featuresCol(self): - """ - Field in "predictions" which gives the features of each instance - as a vector. - """ - return self._call_java("featuresCol") - - @property - @since("3.1.0") - def weightCol(self): - """ - Field in "predictions" which gives the weight of each instance - as a vector. - """ - return self._call_java("weightCol") - - @property - @since("2.3.0") - def labels(self): - """ - Returns the sequence of labels in ascending order. This order matches the order used - in metrics which are specified as arrays over labels, e.g., truePositiveRateByLabel. - - Note: In most cases, it will be values {0.0, 1.0, ..., numClasses-1}, However, if the - training set is missing a label, then all of the arrays over labels - (e.g., from truePositiveRateByLabel) will be of length numClasses-1 instead of the - expected numClasses. - """ - return self._call_java("labels") - - @property - @since("2.3.0") - def truePositiveRateByLabel(self): - """ - Returns true positive rate for each label (category). - """ - return self._call_java("truePositiveRateByLabel") - - @property - @since("2.3.0") - def falsePositiveRateByLabel(self): - """ - Returns false positive rate for each label (category). - """ - return self._call_java("falsePositiveRateByLabel") - - @property - @since("2.3.0") - def precisionByLabel(self): - """ - Returns precision for each label (category). - """ - return self._call_java("precisionByLabel") - - @property - @since("2.3.0") - def recallByLabel(self): - """ - Returns recall for each label (category). 
- """ - return self._call_java("recallByLabel") - - @since("2.3.0") - def fMeasureByLabel(self, beta=1.0): - """ - Returns f-measure for each label (category). - """ - return self._call_java("fMeasureByLabel", beta) - - @property - @since("2.3.0") - def accuracy(self): - """ - Returns accuracy. - (equals to the total number of correctly classified instances - out of the total number of instances.) - """ - return self._call_java("accuracy") - - @property - @since("2.3.0") - def weightedTruePositiveRate(self): - """ - Returns weighted true positive rate. - (equals to precision, recall and f-measure) - """ - return self._call_java("weightedTruePositiveRate") - - @property - @since("2.3.0") - def weightedFalsePositiveRate(self): - """ - Returns weighted false positive rate. - """ - return self._call_java("weightedFalsePositiveRate") - - @property - @since("2.3.0") - def weightedRecall(self): - """ - Returns weighted averaged recall. - (equals to precision, recall and f-measure) - """ - return self._call_java("weightedRecall") - - @property - @since("2.3.0") - def weightedPrecision(self): - """ - Returns weighted averaged precision. - """ - return self._call_java("weightedPrecision") - - @since("2.3.0") - def weightedFMeasure(self, beta=1.0): - """ - Returns weighted averaged f-measure. - """ - return self._call_java("weightedFMeasure", beta) - @inherit_doc -class LogisticRegressionTrainingSummary(LogisticRegressionSummary): +class LogisticRegressionTrainingSummary(LogisticRegressionSummary, _TrainingSummary): """ Abstraction for multinomial Logistic Regression Training results. - Currently, the training summary ignores the training weights except - for the objective trace. .. versionadded:: 2.0.0 """ - @property - @since("2.0.0") - def objectiveHistory(self): - """ - Objective function (scaled loss + regularization) at each - iteration. It contains one more element, the initial state, - than number of iterations. - """ - return self._call_java("objectiveHistory") - - @property - @since("2.0.0") - def totalIterations(self): - """ - Number of training iterations until termination. - """ - return self._call_java("totalIterations") + pass @inherit_doc -class BinaryLogisticRegressionSummary(LogisticRegressionSummary): +class BinaryLogisticRegressionSummary(_BinaryClassificationSummary, + LogisticRegressionSummary): """ Binary Logistic regression results for a given model. .. versionadded:: 2.0.0 """ - @property - @since("2.0.0") - def roc(self): - """ - Returns the receiver operating characteristic (ROC) curve, - which is a Dataframe having two fields (FPR, TPR) with - (0.0, 0.0) prepended and (1.0, 1.0) appended to it. - - .. seealso:: `Wikipedia reference - `_ - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("roc") - - @property - @since("2.0.0") - def areaUnderROC(self): - """ - Computes the area under the receiver operating characteristic - (ROC) curve. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("areaUnderROC") - - @property - @since("2.0.0") - def pr(self): - """ - Returns the precision-recall curve, which is a Dataframe - containing two fields recall, precision with (0.0, 1.0) prepended - to it. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. 
This will change in later Spark - versions. - """ - return self._call_java("pr") - - @property - @since("2.0.0") - def fMeasureByThreshold(self): - """ - Returns a dataframe with two fields (threshold, F-Measure) curve - with beta = 1.0. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("fMeasureByThreshold") - - @property - @since("2.0.0") - def precisionByThreshold(self): - """ - Returns a dataframe with two fields (threshold, precision) curve. - Every possible probability obtained in transforming the dataset - are used as thresholds used in calculating the precision. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("precisionByThreshold") - - @property - @since("2.0.0") - def recallByThreshold(self): - """ - Returns a dataframe with two fields (threshold, recall) curve. - Every possible probability obtained in transforming the dataset - are used as thresholds used in calculating the recall. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. - """ - return self._call_java("recallByThreshold") - + pass @inherit_doc class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary, diff --git a/python/pyspark/ml/tests/test_training_summary.py b/python/pyspark/ml/tests/test_training_summary.py index ac944d8397a86..00f68e91a5656 100644 --- a/python/pyspark/ml/tests/test_training_summary.py +++ b/python/pyspark/ml/tests/test_training_summary.py @@ -150,6 +150,7 @@ def test_binary_logistic_regression_summary(self): # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) + self.assertEqual(sameSummary.probabilityCol, "probability") self.assertTrue(isinstance(sameSummary, BinaryLogisticRegressionSummary)) self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) @@ -189,6 +190,7 @@ def test_multiclass_logistic_regression_summary(self): # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) + self.assertEqual(sameSummary.probabilityCol, "probability") self.assertTrue(isinstance(sameSummary, LogisticRegressionSummary)) self.assertFalse(isinstance(sameSummary, BinaryLogisticRegressionSummary)) self.assertAlmostEqual(sameSummary.accuracy, s.accuracy) From ff47581c48643e3b2e90032dca0c3fd99c22c11f Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 4 Jun 2020 14:36:30 -0700 Subject: [PATCH 08/15] fix python style failure --- python/pyspark/ml/classification.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 92eecde99943a..7b12636d11942 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1250,7 +1250,6 @@ class LogisticRegressionTrainingSummary(LogisticRegressionSummary, _TrainingSumm .. versionadded:: 2.0.0 """ - pass @@ -1262,9 +1261,9 @@ class BinaryLogisticRegressionSummary(_BinaryClassificationSummary, .. 
versionadded:: 2.0.0
     """
 
-    pass
+
 
 
 @inherit_doc
 class BinaryLogisticRegressionTrainingSummary(BinaryLogisticRegressionSummary,

From 8b35ce9cfc91080af58ba0d3a6119ebb388d5802 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Wed, 10 Jun 2020 15:42:04 -0700
Subject: [PATCH 09/15] remove unnecessary negative weight check

---
 .../ClassificationSummary.scala | 42 +++++++------------
 1 file changed, 16 insertions(+), 26 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
index 435ce00a78e5b..29cbd45d81fbd 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala
@@ -18,7 +18,6 @@ package org.apache.spark.ml.classification
 
 import org.apache.spark.annotation.Since
-import org.apache.spark.ml.functions.checkNonNegativeWeight
 import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
 import org.apache.spark.sql.{DataFrame, Row}
@@ -60,23 +59,16 @@ private[classification] trait ClassificationSummary extends Serializable {
   def weightCol: String
 
   @transient private val multiclassMetrics = {
-    if (predictions.schema.fieldNames.contains(weightCol)) {
-      new MulticlassMetrics(
-        predictions.select(
-          col(predictionCol),
-          col(labelCol).cast(DoubleType),
-          checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map {
-          case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight)
-        })
+    val weightColumn = if (predictions.schema.fieldNames.contains(weightCol)) {
+      col(weightCol).cast(DoubleType)
     } else {
-      new MulticlassMetrics(
-        predictions.select(
-          col(predictionCol),
-          col(labelCol).cast(DoubleType),
-          lit(1.0)).rdd.map {
-          case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight)
-        })
+      lit(1.0)
     }
+    new MulticlassMetrics(
+      predictions.select(col(predictionCol), col(labelCol).cast(DoubleType), weightColumn)
+        .rdd.map {
+        case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight)
+      })
   }
 
 /**
@@ -203,17 +195,15 @@ trait BinaryClassificationSummary extends ClassificationSummary {
 
   // TODO: Allow the user to vary the number of bins using a setBins method in
   // BinaryClassificationMetrics. For now the default is set to 100.
- @transient private val binaryMetrics = if (predictions.schema.fieldNames.contains(weightCol)) { - new BinaryClassificationMetrics( - predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), - checkNonNegativeWeight(col(weightCol).cast(DoubleType))).rdd.map { - case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) - }, 100 - ) - } else { + @transient private val binaryMetrics = { + val weightColumn = if (predictions.schema.fieldNames.contains(weightCol)) { + col(weightCol).cast(DoubleType) + } else { + lit(1.0) + } + new BinaryClassificationMetrics( - predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), - lit(1.0)).rdd.map { + predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), weightColumn).rdd.map { case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) }, 100 ) From b1031833209b8a788742e55ed0496f4908763072 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Sat, 13 Jun 2020 11:15:57 -0700 Subject: [PATCH 10/15] address comments --- .../ClassificationSummary.scala | 22 ++++++++----------- .../classification/LogisticRegression.scala | 16 +++++++++++--- .../LogisticRegressionSuite.scala | 5 ----- project/MimaExcludes.scala | 4 +--- .../pyspark/ml/tests/test_training_summary.py | 2 -- 5 files changed, 23 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 29cbd45d81fbd..94bbf707a5e2a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -36,12 +36,6 @@ private[classification] trait ClassificationSummary extends Serializable { @Since("3.1.0") def predictions: DataFrame - /** - * Field in "predictions" which gives the probability or rawPrediction of each class as a vector. - */ - @Since("3.1.0") - def scoreCol: String - /** Field in "predictions" which gives the prediction of each class. */ @Since("3.1.0") def predictionCol: String @@ -50,10 +44,6 @@ private[classification] trait ClassificationSummary extends Serializable { @Since("3.1.0") def labelCol: String - /** Field in "predictions" which gives the features of each instance as a vector. */ - @Since("3.1.0") - def featuresCol: String - /** Field in "predictions" which gives the weight of each instance as a vector. */ @Since("3.1.0") def weightCol: String @@ -193,8 +183,12 @@ trait BinaryClassificationSummary extends ClassificationSummary { private val sparkSession = predictions.sparkSession import sparkSession.implicits._ - // TODO: Allow the user to vary the number of bins using a setBins method in - // BinaryClassificationMetrics. For now the default is set to 100. + /** + * Field in "predictions" which gives the probability or rawPrediction of each class as a + * vector. + */ + def scoreCol: String = null + @transient private val binaryMetrics = { val weightColumn = if (predictions.schema.fieldNames.contains(weightCol)) { col(weightCol).cast(DoubleType) @@ -202,10 +196,12 @@ trait BinaryClassificationSummary extends ClassificationSummary { lit(1.0) } + // TODO: Allow the user to vary the number of bins using a setBins method in + // BinaryClassificationMetrics. For now the default is set to 1000. 
new BinaryClassificationMetrics( predictions.select(col(scoreCol), col(labelCol).cast(DoubleType), weightColumn).rdd.map { case Row(score: Vector, label: Double, weight: Double) => (score(1), label, weight) - }, 100 + }, 1000 ) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 559a84d2249e3..e2a01000c96fa 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1397,7 +1397,11 @@ sealed trait LogisticRegressionSummary extends ClassificationSummary { /** Field in "predictions" which gives the probability of each class as a vector. */ @Since("1.5.0") - def probabilityCol: String = scoreCol + def probabilityCol: String + + /** Field in "predictions" which gives the features of each instance as a vector. */ + @Since("1.6.0") + def featuresCol: String } /** @@ -1412,6 +1416,12 @@ sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary */ sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary with BinaryClassificationSummary { + + override def scoreCol: String = if (probabilityCol.nonEmpty) { + probabilityCol + } else { + throw new SparkException(s"probabilityCol is required for BinaryLogisticRegressionSummary.") + } } /** @@ -1449,7 +1459,7 @@ private class LogisticRegressionTrainingSummaryImpl( * Multiclass logistic regression results for a given model. * * @param predictions dataframe output by the model's `transform` method. - * @param scoreCol field in "predictions" which gives the probability of + * @param probabilityCol field in "predictions" which gives the probability of * each class as a vector. * @param predictionCol field in "predictions" which gives the prediction for a data instance as a * double. 
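
For illustration, a minimal usage sketch of the refactored summary API (a hedged example, not part of the patch; it assumes a training DataFrame `df` with "label" and "features" columns):

    import org.apache.spark.ml.classification.LogisticRegression

    val mlor = new LogisticRegression().setFamily("multinomial").fit(df)
    val summary = mlor.summary
    // probabilityCol stays on LogisticRegressionSummary after the rename;
    // the generic scoreCol only matters for binary summaries.
    println(summary.probabilityCol)   // "probability" by default
    // accuracy and the weighted metrics come from the shared
    // ClassificationSummary trait.
    println(summary.accuracy)
    println(summary.weightedFMeasure(0.5))
    // summary.asBinary would throw here, because the model is multinomial.
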
@@ -1459,7 +1469,7 @@ private class LogisticRegressionTrainingSummaryImpl( */ private class LogisticRegressionSummaryImpl( @transient override val predictions: DataFrame, - override val scoreCol: String, + override val probabilityCol: String, override val predictionCol: String, override val labelCol: String, override val featuresCol: String, diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 0caeedfe1d655..ecee531c88a8f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -263,7 +263,6 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { .setMaxIter(1) val blorModel = lr.fit(smallBinaryDataset) - assert(blorModel.summary.probabilityCol.equals("probability")) assert(blorModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) assert(blorModel.summary.asBinary.isInstanceOf[BinaryLogisticRegressionSummary]) assert(blorModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) @@ -271,7 +270,6 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { assert(blorModel.binarySummary.totalIterations == 1) val mlorModel = lr.setFamily("multinomial").fit(smallMultinomialDataset) - assert(mlorModel.summary.probabilityCol.equals("probability")) assert(mlorModel.summary.isInstanceOf[LogisticRegressionTrainingSummary]) withClue("cannot get binary summary for multiclass model") { intercept[RuntimeException] { @@ -286,7 +284,6 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { assert(mlorModel.summary.totalIterations == 1) val mlorBinaryModel = lr.setFamily("multinomial").fit(smallBinaryDataset) - assert(mlorBinaryModel.summary.probabilityCol.equals("probability")) assert(mlorBinaryModel.summary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) assert(mlorBinaryModel.binarySummary.isInstanceOf[BinaryLogisticRegressionTrainingSummary]) @@ -294,8 +291,6 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val mlorSummary = mlorModel.evaluate(smallMultinomialDataset) assert(blorSummary.isInstanceOf[BinaryLogisticRegressionSummary]) assert(mlorSummary.isInstanceOf[LogisticRegressionSummary]) - assert(blorSummary.probabilityCol.equals("probability")) - assert(mlorSummary.probabilityCol.equals("probability")) // verify instance weight works val lr2 = new LogisticRegression() diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index a1bcfb3db552a..aec58fc2decc9 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -76,9 +76,7 @@ object MimaExcludes { ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$BinaryClassificationSummary$$sparkSession"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"), - 
ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.weightCol"), - ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.scoreCol"), - ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.scoreCol") + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionSummary.weightCol") ) // Exclude rules for 3.0.x diff --git a/python/pyspark/ml/tests/test_training_summary.py b/python/pyspark/ml/tests/test_training_summary.py index 00f68e91a5656..ac944d8397a86 100644 --- a/python/pyspark/ml/tests/test_training_summary.py +++ b/python/pyspark/ml/tests/test_training_summary.py @@ -150,7 +150,6 @@ def test_binary_logistic_regression_summary(self): # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) - self.assertEqual(sameSummary.probabilityCol, "probability") self.assertTrue(isinstance(sameSummary, BinaryLogisticRegressionSummary)) self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) @@ -190,7 +189,6 @@ def test_multiclass_logistic_regression_summary(self): # test evaluation (with training dataset) produces a summary with same values # one check is enough to verify a summary is returned, Scala version runs full test sameSummary = model.evaluate(df) - self.assertEqual(sameSummary.probabilityCol, "probability") self.assertTrue(isinstance(sameSummary, LogisticRegressionSummary)) self.assertFalse(isinstance(sameSummary, BinaryLogisticRegressionSummary)) self.assertAlmostEqual(sameSummary.accuracy, s.accuracy) From 137e5b20790f6e0c61e6f64fdcf47b2608e198f3 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 15 Jun 2020 01:37:41 -0700 Subject: [PATCH 11/15] change trait BinaryClassificationSummary to private[classification] --- .../apache/spark/ml/classification/ClassificationSummary.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 94bbf707a5e2a..0b93127ef7462 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -178,7 +178,7 @@ private[classification] trait TrainingSummary { /** * Abstraction for binary classification results for a given model. 
*/ -trait BinaryClassificationSummary extends ClassificationSummary { +private[classification] trait BinaryClassificationSummary extends ClassificationSummary { private val sparkSession = predictions.sparkSession import sparkSession.implicits._ From 61e93d0cab590975b5d4f4551edd43cac6b55ce2 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 15 Jun 2020 18:35:21 -0700 Subject: [PATCH 12/15] update python changes --- python/pyspark/ml/classification.py | 36 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 7b12636d11942..5101b99887fe3 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -256,15 +256,6 @@ def predictions(self): """ return self._call_java("predictions") - @property - @since("3.1.0") - def scoreCol(self): - """ - Field in "predictions" which gives the probability or raw prediction - of each class as a vector. - """ - return self._call_java("scoreCol") - @property @since("3.1.0") def predictionCol(self): @@ -282,15 +273,6 @@ def labelCol(self): """ return self._call_java("labelCol") - @property - @since("3.1.0") - def featuresCol(self): - """ - Field in "predictions" which gives the features of each instance - as a vector. - """ - return self._call_java("featuresCol") - @property @since("3.1.0") def weightCol(self): @@ -440,6 +422,15 @@ class _BinaryClassificationSummary(_ClassificationSummary): .. versionadded:: 3.1.0 """ + @property + @since("3.1.0") + def scoreCol(self): + """ + Field in "predictions" which gives the probability or raw prediction + of each class as a vector. + """ + return self._call_java("scoreCol") + @property @since("3.1.0") def roc(self): @@ -1242,6 +1233,15 @@ def probabilityCol(self): """ return self._call_java("probabilityCol") + @property + @since("3.1.0") + def featuresCol(self): + """ + Field in "predictions" which gives the features of each instance + as a vector. + """ + return self._call_java("featuresCol") + @inherit_doc class LogisticRegressionTrainingSummary(LogisticRegressionSummary, _TrainingSummary): From e844c11939d5523d5d7402f5ee5ed6cabd52f8fb Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 17 Jun 2020 21:58:10 -0700 Subject: [PATCH 13/15] move asBinary to LogisticRegression --- .../ml/classification/ClassificationSummary.scala | 11 ----------- .../spark/ml/classification/LogisticRegression.scala | 11 +++++++++++ project/MimaExcludes.scala | 1 - 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index 0b93127ef7462..b7a10af7be3cb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -142,17 +142,6 @@ private[classification] trait ClassificationSummary extends Serializable { /** Returns weighted averaged f1-measure. */ @Since("3.1.0") def weightedFMeasure: Double = multiclassMetrics.weightedFMeasure(1.0) - - /** - * Convenient method for casting to binary classification summary. - * This method will throw an Exception if the summary is not a binary summary. 
- */ - @Since("3.1.0") - def asBinary: BinaryClassificationSummary = this match { - case b: BinaryClassificationSummary => b - case _ => - throw new RuntimeException("Cannot cast to a binary summary.") - } } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index e2a01000c96fa..99861c22e16e2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1402,6 +1402,17 @@ sealed trait LogisticRegressionSummary extends ClassificationSummary { /** Field in "predictions" which gives the features of each instance as a vector. */ @Since("1.6.0") def featuresCol: String + + /** + * Convenient method for casting to binary logistic regression summary. + * This method will throw an Exception if the summary is not a binary summary. + */ + @Since("2.3.0") + def asBinary: BinaryLogisticRegressionSummary = this match { + case b: BinaryLogisticRegressionSummary => b + case _ => + throw new RuntimeException("Cannot cast to a binary summary.") + } } /** diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index aec58fc2decc9..c724e82aa4712 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -66,7 +66,6 @@ object MimaExcludes { ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary.weightCol"), - ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.asBinary"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$_setter_$org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics_="), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.org$apache$spark$ml$classification$ClassificationSummary$$multiclassMetrics"), ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightCol"), From c76d5917b9a491213d7d6f4fd4be6793a6d07108 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 18 Jun 2020 00:25:49 -0700 Subject: [PATCH 14/15] address comments --- .../ClassificationSummary.scala | 2 +- .../classification/LogisticRegression.scala | 2 +- python/pyspark/ml/classification.py | 26 +------------------ 3 files changed, 3 insertions(+), 27 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala index b7a10af7be3cb..e9ea38161d3c0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/ClassificationSummary.scala @@ -159,7 +159,7 @@ private[classification] trait TrainingSummary { /** Number of training iterations. */ @Since("3.1.0") def totalIterations: Int = { - assert(objectiveHistory.length > 0, s"objectiveHistory length should be greater than 1.") + assert(objectiveHistory.length > 0, "objectiveHistory length should be greater than 0.") objectiveHistory.length - 1 } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 99861c22e16e2..20d619334f7b9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1431,7 +1431,7 @@ sealed trait BinaryLogisticRegressionSummary extends LogisticRegressionSummary override def scoreCol: String = if (probabilityCol.nonEmpty) { probabilityCol } else { - throw new SparkException(s"probabilityCol is required for BinaryLogisticRegressionSummary.") + throw new SparkException("probabilityCol is required for BinaryLogisticRegressionSummary.") } } diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5101b99887fe3..ff506066519cd 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -441,10 +441,6 @@ def roc(self): .. seealso:: `Wikipedia reference `_ - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. """ return self._call_java("roc") @@ -454,10 +450,6 @@ def areaUnderROC(self): """ Computes the area under the receiver operating characteristic (ROC) curve. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. """ return self._call_java("areaUnderROC") @@ -468,10 +460,6 @@ def pr(self): Returns the precision-recall curve, which is a Dataframe containing two fields recall, precision with (0.0, 1.0) prepended to it. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. """ return self._call_java("pr") @@ -481,10 +469,6 @@ def fMeasureByThreshold(self): """ Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. """ return self._call_java("fMeasureByThreshold") @@ -495,10 +479,6 @@ def precisionByThreshold(self): Returns a dataframe with two fields (threshold, precision) curve. Every possible probability obtained in transforming the dataset are used as thresholds used in calculating the precision. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. """ return self._call_java("precisionByThreshold") @@ -509,10 +489,6 @@ def recallByThreshold(self): Returns a dataframe with two fields (threshold, recall) curve. Every possible probability obtained in transforming the dataset are used as thresholds used in calculating the recall. - - .. note:: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. 
""" return self._call_java("recallByThreshold") @@ -1234,7 +1210,7 @@ def probabilityCol(self): return self._call_java("probabilityCol") @property - @since("3.1.0") + @since("2.0.0") def featuresCol(self): """ Field in "predictions" which gives the features of each instance From ead6da2e0e3d1a269e6ef76b23f8e845690f42a3 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Thu, 18 Jun 2020 10:10:45 -0700 Subject: [PATCH 15/15] remove outdated MiMaExclude --- project/MimaExcludes.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index c724e82aa4712..0be7b4c1003a7 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -48,10 +48,6 @@ object MimaExcludes { ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.feature.ChiSqSelectorModel"), ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.ml.feature.ChiSqSelector"), - //[SPARK-31840] Add instance weight support in LogisticRegressionSummary - // weightCol in org.apache.spark.ml.classification.LogisticRegressionSummary is present only in current version - ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionSummary.weightCol"), - // [SPARK-24634] Add a new metric regarding number of inputs later than watermark plus allowed delay ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StateOperatorProgress.$default$4"),