
Commit 3f3201a

beliefer authored and cloud-fan committed
[SPARK-37203][SQL] Fix NotSerializableException when observe with TypedImperativeAggregate
### What changes were proposed in this pull request?

Currently,

```
val namedObservation = Observation("named")
val df = spark.range(100)
val observed_df = df.observe(
  namedObservation, percentile_approx($"id", lit(0.5), lit(100)).as("percentile_approx_val"))
observed_df.collect()
namedObservation.get
```

throws the following exception:

```
15:16:27.994 ERROR org.apache.spark.util.Utils: Exception encountered
java.io.NotSerializableException: org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile$PercentileDigest
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1184)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.writeArray(ObjectOutputStream.java:1378)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1174)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
    at org.apache.spark.scheduler.DirectTaskResult.$anonfun$writeExternal$2(TaskResult.scala:55)
    at org.apache.spark.scheduler.DirectTaskResult.$anonfun$writeExternal$2$adapted(TaskResult.scala:55)
    at scala.collection.Iterator.foreach(Iterator.scala:943)
    at scala.collection.Iterator.foreach$(Iterator.scala:943)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
    at scala.collection.IterableLike.foreach(IterableLike.scala:74)
    at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
    at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
    at org.apache.spark.scheduler.DirectTaskResult.$anonfun$writeExternal$1(TaskResult.scala:55)
    at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
    at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1434)
    at org.apache.spark.scheduler.DirectTaskResult.writeExternal(TaskResult.scala:51)
    at java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1459)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1430)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:44)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:616)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
```

This PR fixes the issue.

After the change, `assert(namedObservation.get === Map("percentile_approx_val" -> 49))` passes and the `java.io.NotSerializableException` no longer occurs.

### Why are the changes needed?

Fixes the `NotSerializableException` thrown when `observe` is used with a `TypedImperativeAggregate`.

### Does this PR introduce _any_ user-facing change?

No. This PR changes the implementation of `AggregatingAccumulator`, which now uses the serialize and deserialize support of `TypedImperativeAggregate`.

### How was this patch tested?

New tests.

Closes apache#34474 from beliefer/SPARK-37203.

Authored-by: Jiaan Geng <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
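For background on the mechanism the patch relies on: a `TypedImperativeAggregate` keeps an arbitrary JVM object (here `ApproximatePercentile$PercentileDigest`) in its aggregation buffer, and that object only becomes serializable once it is explicitly flattened to bytes. Before this fix, the accumulator's buffer still held the live `PercentileDigest` when the task result was serialized, so `ObjectOutputStream` recursed into it and threw the exception shown above. The following is a minimal sketch of the general pattern, using hypothetical `Digest` and `MedianBuffer` types (not Spark classes): the live object is flattened to a byte array before leaving the executor and decoded again on the driver before merging.

```scala
import java.nio.ByteBuffer

// Stand-in for PercentileDigest: plain mutable state, NOT java.io.Serializable.
final class Digest {
  var values: Vector[Long] = Vector.empty
  def add(v: Long): Unit = values = values :+ v
}

// Hypothetical buffer holder showing the serialize-in-place round trip.
final class MedianBuffer extends Serializable {
  // The live object; transient so Java serialization never touches it.
  @transient var digest: Digest = new Digest
  // Flattened form, populated just before the buffer leaves the executor.
  var flat: Array[Byte] = Array.emptyByteArray

  // Executor side, analogous to serializeAggregateBufferInPlace.
  def serializeInPlace(): Unit = {
    val buf = ByteBuffer.allocate(8 * digest.values.size)
    digest.values.foreach(v => buf.putLong(v))
    flat = buf.array()
  }

  // Driver side: rebuild the live object from bytes before merging.
  def deserialize(): Digest = {
    val d = new Digest
    val buf = ByteBuffer.wrap(flat)
    while (buf.remaining() >= 8) d.add(buf.getLong)
    d
  }
}
```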
1 parent 2b4099f commit 3f3201a

File tree

5 files changed: +46 −8 lines changed

core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala

Lines changed: 5 additions & 1 deletion
```diff
@@ -157,6 +157,10 @@ abstract class AccumulatorV2[IN, OUT] extends Serializable {
    */
   def value: OUT
 
+  // Serialize the buffer of this accumulator before sending back this accumulator to the driver.
+  // By default this method does nothing.
+  protected def withBufferSerialized(): AccumulatorV2[IN, OUT] = this
+
   // Called by Java when serializing an object
   final protected def writeReplace(): Any = {
     if (atDriverSide) {
@@ -179,7 +183,7 @@ abstract class AccumulatorV2[IN, OUT] extends Serializable {
       }
       copyAcc
     } else {
-      this
+      withBufferSerialized()
     }
   }
 
```
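The hook is deliberately placed in `writeReplace()`: that method runs on the executor just before Java serialization captures the accumulator into the task result, so a subclass gets one last chance to flatten non-serializable state. Below is a hedged sketch of how a custom subclass might use it; `SketchAcc` and `RunningSketch` are illustrative names, not part of Spark.

```scala
import org.apache.spark.util.AccumulatorV2

// Stand-in for a working buffer that Java serialization cannot handle.
final class RunningSketch { // deliberately NOT Serializable
  var sum: Long = 0L
  var count: Long = 0L
}

class SketchAcc extends AccumulatorV2[Long, Double] {
  // Live working state on the executor; never serialized directly.
  @transient private var sketch: RunningSketch = new RunningSketch
  // Serializable mirror filled in by withBufferSerialized().
  private var flat: Array[Long] = Array(0L, 0L)

  // Rebuild the live object after deserialization on the driver.
  private def live: RunningSketch = {
    if (sketch == null) {
      sketch = new RunningSketch
      sketch.sum = flat(0)
      sketch.count = flat(1)
    }
    sketch
  }

  override def isZero: Boolean = live.count == 0L
  override def copy(): SketchAcc = {
    val acc = new SketchAcc
    acc.live.sum = live.sum
    acc.live.count = live.count
    acc
  }
  override def reset(): Unit = sketch = new RunningSketch
  override def add(v: Long): Unit = { live.sum += v; live.count += 1L }
  override def merge(other: AccumulatorV2[Long, Double]): Unit = other match {
    case o: SketchAcc => live.sum += o.live.sum; live.count += o.live.count
    case _ => throw new UnsupportedOperationException("incompatible accumulator")
  }
  override def value: Double =
    if (live.count == 0L) 0.0 else live.sum.toDouble / live.count

  // Called from writeReplace() on the executor: flatten the live object into
  // the serializable mirror, then let serialization proceed on `this`.
  override protected def withBufferSerialized(): AccumulatorV2[Long, Double] = {
    flat = Array(live.sum, live.count)
    this
  }
}
```

Design-wise, the default no-op implementation keeps all existing accumulators untouched; only accumulators that hold object buffers need to override the hook.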

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -284,7 +284,7 @@ object ApproximatePercentile {
   }
 
   /**
-   * Serializer  for class [[PercentileDigest]]
+   * Serializer for class [[PercentileDigest]]
    *
    * This class is thread safe.
    */
```

sql/core/src/main/scala/org/apache/spark/sql/execution/AggregatingAccumulator.scala

Lines changed: 23 additions & 3 deletions
```diff
@@ -163,9 +163,18 @@ class AggregatingAccumulator private(
           i += 1
         }
         i = 0
-        while (i < typedImperatives.length) {
-          typedImperatives(i).mergeBuffersObjects(buffer, otherBuffer)
-          i += 1
+        if (isAtDriverSide) {
+          while (i < typedImperatives.length) {
+            // The input buffer stores serialized data
+            typedImperatives(i).merge(buffer, otherBuffer)
+            i += 1
+          }
+        } else {
+          while (i < typedImperatives.length) {
+            // The input buffer stores deserialized object
+            typedImperatives(i).mergeBuffersObjects(buffer, otherBuffer)
+            i += 1
+          }
         }
       case _ =>
         throw QueryExecutionErrors.cannotMergeClassWithOtherClassError(
@@ -188,6 +197,17 @@ class AggregatingAccumulator private(
     resultProjection(input)
   }
 
+  override def withBufferSerialized(): AggregatingAccumulator = {
+    assert(!isAtDriverSide)
+    var i = 0
+    // AggregatingAccumulator runs on executor, we should serialize all TypedImperativeAggregate.
+    while (i < typedImperatives.length) {
+      typedImperatives(i).serializeAggregateBufferInPlace(buffer)
+      i += 1
+    }
+    this
+  }
+
   /**
    * Get the output schema of the aggregating accumulator.
    */
```
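The `isAtDriverSide` branch exists because the two sides of a merge see different buffer representations: on executors both buffers hold live objects, while an accumulator that arrives at the driver has already had `withBufferSerialized()` applied, so its buffer slots contain bytes. A simplified sketch of the distinction, using hypothetical types rather than Spark's internal API:

```scala
import java.nio.ByteBuffer

// Two states a TypedImperativeAggregate-style buffer slot can be in.
sealed trait Slot
final case class Live(values: Vector[Long]) extends Slot        // executor side
final case class Serialized(payload: Array[Byte]) extends Slot  // shipped to driver

def decode(payload: Array[Byte]): Vector[Long] = {
  val buf = ByteBuffer.wrap(payload)
  Vector.fill(payload.length / 8)(buf.getLong)
}

// Executor-side merge: both slots are live objects
// (the mergeBuffersObjects path in the diff above).
def mergeObjects(a: Live, b: Live): Live = Live(a.values ++ b.values)

// Driver-side merge: the incoming slot is serialized and must be decoded
// first (the merge path in the diff above).
def mergeSerialized(a: Live, b: Serialized): Live =
  Live(a.values ++ decode(b.payload))
```

This mirrors the `TypedImperativeAggregate` contract reflected in the diff's comments: `merge` expects the incoming buffer to hold serialized data, while `mergeBuffersObjects` expects a live object.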

sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala

Lines changed: 13 additions & 0 deletions
```diff
@@ -754,6 +754,19 @@ class DatasetSuite extends QueryTest
     assert(err2.getMessage.contains("Name must not be empty"))
   }
 
+  test("SPARK-37203: Fix NotSerializableException when observe with TypedImperativeAggregate") {
+    def observe[T](df: Dataset[T], expected: Map[String, _]): Unit = {
+      val namedObservation = Observation("named")
+      val observed_df = df.observe(
+        namedObservation, percentile_approx($"id", lit(0.5), lit(100)).as("percentile_approx_val"))
+      observed_df.collect()
+      assert(namedObservation.get === expected)
+    }
+
+    observe(spark.range(100), Map("percentile_approx_val" -> 49))
+    observe(spark.range(0), Map("percentile_approx_val" -> null))
+  }
+
   test("sample with replacement") {
     val n = 100
     val data = sparkContext.parallelize(1 to n, 2).toDS()
```

sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala

Lines changed: 4 additions & 3 deletions
```diff
@@ -417,7 +417,8 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter {
           min($"value").as("min_val"),
           max($"value").as("max_val"),
           sum($"value").as("sum_val"),
-          count(when($"value" % 2 === 0, 1)).as("num_even"))
+          count(when($"value" % 2 === 0, 1)).as("num_even"),
+          percentile_approx($"value", lit(0.5), lit(100)).as("percentile_approx_val"))
         .observe(
           name = "other_event",
           avg($"value").cast("int").as("avg_val"))
@@ -444,15 +445,15 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter {
       AddData(inputData, 1, 2),
       AdvanceManualClock(100),
       checkMetrics { metrics =>
-        assert(metrics.get("my_event") === Row(1, 2, 3L, 1L))
+        assert(metrics.get("my_event") === Row(1, 2, 3L, 1L, 1))
         assert(metrics.get("other_event") === Row(1))
       },
 
       // Batch 2
       AddData(inputData, 10, 30, -10, 5),
       AdvanceManualClock(100),
       checkMetrics { metrics =>
-        assert(metrics.get("my_event") === Row(-10, 30, 35L, 3L))
+        assert(metrics.get("my_event") === Row(-10, 30, 35L, 3L, 5))
         assert(metrics.get("other_event") === Row(8))
       },
 
```
