
Commit f135e0c

[SPARK-49038][SQL] SQLMetric should report the raw value in the accumulator update event
Some `SQLMetrics` set the initial value to `-1` so that no-update metrics (e.g. a metric that is never updated because there is no input data) can be recognized and filtered out later in the UI. However, there is a bug here: Spark turns accumulator updates into `AccumulableInfo` using `AccumulatorV2#value`, and to avoid exposing the internal `-1` value to end users, `SQLMetric#value` turns `-1` into `0` before returning it (see apache#39311 for more details). The UI can therefore no longer see `-1` and filter it out.

This PR fixes the bug by using the raw value of the `SQLMetric` to create the `AccumulableInfo`, so that the UI can still see `-1` and filter it out. This is needed to avoid reporting a wrong min value for certain SQL metrics when some partitions have no data.

This is a user-facing change: people who write Spark listeners to watch the `SparkListenerExecutorMetricsUpdate` event now see the correct value of SQL metrics.

Tested with manual UI tests; we do not have an end-to-end UI test framework for SQL metrics yet. No generative AI tooling was used.

Closes apache#47721 from cloud-fan/metrics.

Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 5463bfc commit f135e0c
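
To see why the raw `-1` matters, here is a minimal, self-contained sketch (illustrative only, with made-up numbers, not code from this commit) of how a "min" aggregation over per-task metric updates goes wrong once the no-update marker `-1` has been rewritten to `0` upstream:

    object MinMetricSketch {
      def main(args: Array[String]): Unit = {
        // -1 marks a task that never updated the metric (e.g. an empty partition).
        val perTaskUpdates = Seq(-1L, 42L, 7L)

        // What the UI intends to do: drop the -1 markers, then aggregate.
        val correctMin = perTaskUpdates.filter(_ >= 0).min                   // 7

        // What happened with the bug: -1 was already turned into 0 upstream,
        // so the empty partition shows up as a real 0 and wins the min.
        val buggyMin = perTaskUpdates.map(v => if (v == -1L) 0L else v).min  // 0

        println(s"correct min = $correctMin, buggy min = $buggyMin")
      }
    }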

9 files changed, +27 -7 lines changed

core/src/main/scala/org/apache/spark/executor/Executor.scala

Lines changed: 1 addition & 1 deletion

@@ -530,7 +530,7 @@ private[spark] class Executor(
       // Collect latest accumulator values to report back to the driver
       val accums: Seq[AccumulatorV2[_, _]] =
         Option(task).map(_.collectAccumulatorUpdates(taskFailed = true)).getOrElse(Seq.empty)
-      val accUpdates = accums.map(acc => acc.toInfo(Some(acc.value), None))
+      val accUpdates = accums.map(acc => acc.toInfoUpdate)

      setTaskFinishedAndClearInterruptStatus()
      (accums, accUpdates)

core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

Lines changed: 1 addition & 1 deletion

@@ -901,7 +901,7 @@ private[spark] class TaskSchedulerImpl(
            executorRunTime = acc.value.asInstanceOf[Long]
          }
        }
-       acc.toInfo(Some(acc.value), None)
+       acc.toInfoUpdate
      }
      val taskProcessRate = if (efficientTaskCalcualtionEnabled) {
        getTaskProcessRate(recordsRead, executorRunTime)

core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala

Lines changed: 9 additions & 1 deletion

@@ -102,16 +102,24 @@ abstract class AccumulatorV2[IN, OUT] extends Serializable {
     metadata.countFailedValues
   }

+  private def isInternal = name.exists(_.startsWith(InternalAccumulator.METRICS_PREFIX))
+
   /**
    * Creates an [[AccumulableInfo]] representation of this [[AccumulatorV2]] with the provided
    * values.
    */
   private[spark] def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
-    val isInternal = name.exists(_.startsWith(InternalAccumulator.METRICS_PREFIX))
     AccumulableInfo(id, name, internOption(update), internOption(value), isInternal,
       countFailedValues)
   }

+  /**
+   * Creates an [[AccumulableInfo]] representation of this [[AccumulatorV2]] as an update.
+   */
+  private[spark] def toInfoUpdate: AccumulableInfo = {
+    AccumulableInfo(id, name, internOption(Some(value)), None, isInternal, countFailedValues)
+  }
+
   final private[spark] def isAtDriverSide: Boolean = atDriverSide

   /**
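
For comparison, a small usage sketch of the two methods on a plain `LongAccumulator` (a sketch only: both methods are `private[spark]`, so this compiles only inside Spark's own `org.apache.spark` packages, and `sc` stands for an assumed, already-created `SparkContext`):

    val acc = sc.longAccumulator("rows")
    acc.add(3L)

    val asUpdate = acc.toInfoUpdate                    // update = Some(3), value = None
    val asFinal  = acc.toInfo(None, Some(acc.value))   // update = None,   value = Some(3)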

core/src/main/scala/org/apache/spark/util/JsonProtocol.scala

Lines changed: 1 addition & 1 deletion

@@ -1350,7 +1350,7 @@ private[spark] object JsonProtocol extends JsonUtils {
         val accumUpdates = jsonOption(json.get("Accumulator Updates"))
           .map(_.extractElements.map(accumulableInfoFromJson).toArray.toSeq)
           .getOrElse(taskMetricsFromJson(json.get("Metrics")).accumulators().map(acc => {
-            acc.toInfo(Some(acc.value), None)
+            acc.toInfoUpdate
           }).toArray.toSeq)
         ExceptionFailure(className, description, stackTrace, fullStackTrace, None, accumUpdates)
       case `taskResultLost` => TaskResultLost

core/src/test/scala/org/apache/spark/AccumulatorSuite.scala

Lines changed: 1 addition & 1 deletion

@@ -147,7 +147,7 @@ private[spark] object AccumulatorSuite {
   * Make an `AccumulableInfo` out of an `AccumulatorV2` with the intent to use the
   * info as an accumulator update.
   */
-  def makeInfo(a: AccumulatorV2[_, _]): AccumulableInfo = a.toInfo(Some(a.value), None)
+  def makeInfo(a: AccumulatorV2[_, _]): AccumulableInfo = a.toInfoUpdate

  /**
   * Run one or more Spark jobs and verify that in at least one job the peak execution memory

sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala

Lines changed: 7 additions & 0 deletions

@@ -90,6 +90,13 @@ class SQLMetric(val metricType: String, initValue: Long = 0L) extends Accumulato
     AccumulableInfo(id, name, internOption(update), internOption(value), true, true,
       SQLMetrics.cachedSQLAccumIdentifier)
   }
+
+  // We should provide the raw value which can be -1, so that `SQLMetrics.stringValue` can correctly
+  // filter out the invalid -1 values.
+  override def toInfoUpdate: AccumulableInfo = {
+    AccumulableInfo(id, name, internOption(Some(_value)), None, true, true,
+      SQLMetrics.cachedSQLAccumIdentifier)
+  }
 }

 object SQLMetrics {

sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala

Lines changed: 1 addition & 1 deletion

@@ -181,7 +181,7 @@ class SQLAppStatusListener(
     event.taskMetrics.withExternalAccums(_.flatMap { a =>
       // This call may fail if the accumulator is gc'ed, so account for that.
       try {
-        Some(a.toInfo(Some(a.value), None))
+        Some(a.toInfoUpdate)
       } catch {
         case _: IllegalAccessError => None
       }
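
As the commit message notes, listeners watching accumulator updates now receive the raw values. A minimal sketch of such a listener (illustrative; it uses only the public `SparkListener`, `SparkListenerExecutorMetricsUpdate`, and `AccumulableInfo` APIs, and the class name is made up):

    import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorMetricsUpdate}

    // Prints each accumulator update reported by executors. After this fix, a SQL
    // metric that was never updated arrives as -1 instead of 0.
    class RawMetricUpdateListener extends SparkListener {
      override def onExecutorMetricsUpdate(event: SparkListenerExecutorMetricsUpdate): Unit = {
        event.accumUpdates.foreach { case (taskId, stageId, stageAttemptId, updates) =>
          updates.foreach { info =>
            println(s"task=$taskId stage=$stageId.$stageAttemptId " +
              s"metric=${info.name.getOrElse("?")} update=${info.update.getOrElse("?")}")
          }
        }
      }
    }

    // Register it from application code, e.g.: sc.addSparkListener(new RawMetricUpdateListener)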

sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala

Lines changed: 5 additions & 0 deletions

@@ -960,6 +960,11 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils
     assert(SQLMetrics.createNanoTimingMetric(sparkContext, name = "m", initValue = -1).isZero())
     assert(SQLMetrics.createNanoTimingMetric(sparkContext, name = "m", initValue = 5).isZero())
   }
+
+  test("SQLMetric#toInfoUpdate") {
+    assert(SQLMetrics.createSizeMetric(sparkContext, name = "m").toInfoUpdate.update === Some(-1))
+    assert(SQLMetrics.createMetric(sparkContext, name = "m").toInfoUpdate.update === Some(0))
+  }
 }

 case class CustomFileCommitProtocol(

sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala

Lines changed: 1 addition & 1 deletion

@@ -312,7 +312,7 @@ object InputOutputMetricsHelper {

     var maxOutputRows = 0L
     taskEnd.taskMetrics.withExternalAccums(_.foreach { accum =>
-      val info = accum.toInfo(Some(accum.value), None)
+      val info = accum.toInfoUpdate
       if (info.name.toString.contains("number of output rows")) {
         info.update match {
           case Some(n: Number) =>
