@@ -23,7 +23,8 @@ import java.util.concurrent.LinkedBlockingQueue
import scala.collection.JavaConverters._
import scala.collection.concurrent.TrieMap
import scala.collection.mutable
import scala.concurrent.ExecutionContext
import scala.concurrent.{ExecutionContext, Future}
import scala.concurrent.duration.Duration
import scala.util.control.NonFatal

import org.apache.spark.SparkException
@@ -190,7 +191,36 @@ case class AdaptiveSparkPlanExec(
executionId.foreach(onUpdatePlan(_, result.newStages.map(_.plan)))

// Start materialization of all new stages and fail fast if any stages failed eagerly
result.newStages.foreach { stage =>

// SPARK-33933: we should materialize broadcast stages first and wait for their
// materialization to finish before materializing other stages, to avoid waiting
// for broadcast tasks to be scheduled, which can lead to broadcast timeout.
val broadcastMaterializationFutures = result.newStages
  .filter(_.isInstanceOf[BroadcastQueryStageExec])
  .map { stage =>
    var future: Future[Any] = null
@LuciferYang (Contributor), Jan 13, 2021:
indent: line 201 ~ 216

@zhongyu09 (Contributor, author), Jan 14, 2021:
I am not sure lines 201~215 should have 2 more spaces of indentation. The behavior is just the same as lines 225~236 (the old code).

Contributor:
should be :)

    try {
      future = stage.materialize()
      future.onComplete { res =>
        if (res.isSuccess) {
          events.offer(StageSuccess(stage, res.get))
        } else {
          events.offer(StageFailure(stage, res.failed.get))
        }
      }(AdaptiveSparkPlanExec.executionContext)
    } catch {
      case e: Throwable =>
        cleanUpAndThrowException(Seq(e), Some(stage.id))
    }
    future
  }

// Wait for the materialization of all broadcast stages to finish
broadcastMaterializationFutures.foreach(ThreadUtils.awaitReady(_, Duration.Inf))
@LuciferYang (Contributor), Jan 13, 2021:
Is it necessary to wait until all BroadcastQueryStageExec stages are materialized? This may cause a waste of resources, as @cloud-fan said.

@zhongyu09 (author):
Indeed, there will be a little waste of resources. This is the same behavior as non-AQE. Given how lightweight broadcast is, it should not cost too much time, a few seconds in the normal case, so I think that's acceptable. If we do not wait, there is still a probability that the situation in #30998 will occur and cause a broadcast timeout.
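For illustration only, here is a minimal standalone sketch (not code from this PR) of how the blocking wait could instead be expressed by chaining the non-broadcast submissions onto the broadcast futures; materializeBroadcastStages and materializeOtherStages are hypothetical stand-ins for the stage submissions above:

import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

object NonBlockingOrderingSketch {
  implicit val ec: ExecutionContext = ExecutionContext.global

  // Hypothetical stand-ins for submitting the two groups of query stages.
  def materializeBroadcastStages(): Seq[Future[Any]] =
    Seq(Future { Thread.sleep(100); "broadcast-exchange-0" })
  def materializeOtherStages(): Unit =
    println("submitting non-broadcast stages")

  def main(args: Array[String]): Unit = {
    // Broadcast stages are still submitted first, but the driver thread is not
    // parked in a blocking wait while they run; the remaining submissions fire
    // once every broadcast future has completed.
    val done = Future.sequence(materializeBroadcastStages()).map(_ => materializeOtherStages())
    Await.ready(done, Duration.Inf) // keep the demo JVM alive until the chain finishes
  }
}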

Contributor:
@zhongyu09 It might be better to provide a benchmark comparing the performance before and after this change.

@zhongyu09 (author):
Yes. Do we have a benchmark testing framework?

Contributor:
A micro-benchmark can be based on BenchmarkBase or SqlBasedBenchmark, like DataSourceReadBenchmark. But for this scenario, I would prefer a description of the test process and a comparison of the benchmark numbers; some screenshots may be needed.
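For reference, a rough standalone timing sketch of the kind of before/after comparison being suggested (it does not use BenchmarkBase or SqlBasedBenchmark; the query mirrors the slow-map-task reproduction in this PR, and the master, parallelism and sleep values are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.sum

object BroadcastTimeoutBench {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[4]")
      .config("spark.sql.adaptive.enabled", "true")
      .getOrCreate()
    import spark.implicits._

    // Slow map tasks on the fact side, tiny dimension table on the build side.
    val fact = spark.sparkContext.parallelize(0 until 100, 100)
      .flatMap { x => Thread.sleep(50); (0 until 100).map(_ => (x % 26, x % 10)) }
      .toDF("index", "pv")
    val dim = (0 until 26).map(x => (x, ('a' + x).toChar.toString)).toDF("index", "name")

    // Run once on each branch and compare the printed wall-clock times.
    val start = System.nanoTime()
    fact.groupBy("index")
      .agg(sum($"pv").as("pv"))
      .join(dim, Seq("index"))
      .collect()
    println(s"elapsed: ${(System.nanoTime() - start) / 1e9} s")

    spark.stop()
  }
}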

@zhongyu09 (author):
I am fine with a partial fix like #30998. I wonder whether it is too heavy to add a new event just for a UT. I would prefer to fix the problem without a perf regression, but we can also let the partial fix go in first.

Contributor:
We can also log the stage submission and then write a test to verify the log.

@zhongyu09 (author):
That's an idea. I will have a look at how to do this. Do we have any UT that verifies the log?

Contributor:
Yeah, a lot, e.g. the "test log level" test in AdaptiveQueryExecSuite.
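For reference, one possible shape of such a log-based check, sketched with a plain log4j 1.x appender rather than any existing test helper; the logger name and the asserted message are hypothetical:

import scala.collection.mutable
import org.apache.log4j.{AppenderSkeleton, Logger}
import org.apache.log4j.spi.LoggingEvent

// A tiny appender that keeps rendered messages in memory so a test can assert
// on what was logged during stage submission.
class CapturingAppender extends AppenderSkeleton {
  val messages = mutable.ArrayBuffer[String]()
  override def append(event: LoggingEvent): Unit = messages += event.getRenderedMessage
  override def close(): Unit = {}
  override def requiresLayout(): Boolean = false
}

object LogCaptureDemo {
  def main(args: Array[String]): Unit = {
    val appender = new CapturingAppender
    val logger = Logger.getLogger("demo.stage.submission") // hypothetical logger name
    logger.addAppender(appender)
    try {
      // In the real test this would be the AQE query execution; here a stand-in
      // message takes the place of the stage-submission log line.
      logger.info("Submitting shuffle query stage 1")
      assert(appender.messages.exists(_.contains("Submitting shuffle query stage")))
    } finally {
      logger.removeAppender(appender)
    }
  }
}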

@zhongyu09 (author):
I put up a partial fix as discussed in #31269. cc @viirya


// Start materialization of non-broadcast stages
result.newStages.filter(!_.isInstanceOf[BroadcastQueryStageExec])
  .foreach { stage =>
    try {
      stage.materialize().onComplete { res =>
        if (res.isSuccess) {
@@ -22,7 +22,7 @@ import java.net.URI

import org.apache.log4j.Level

import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent, SparkListenerJobStart}
import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent, SparkListenerJobStart, SparkListenerStageSubmitted, StageInfo}
import org.apache.spark.sql.{Dataset, QueryTest, Row, SparkSession, Strategy}
import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight}
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
@@ -1460,4 +1460,42 @@ class AdaptiveQueryExecSuite
}
}
}

test("SPARK-33933: AQE broadcast should not timeout with slow map tasks") {

val broadcastTimeoutInSec = 2
val shuffleMapTaskParallsm = 100

val input = spark.sparkContext.parallelize(Range(0, 100), shuffleMapTaskParallsm)
.flatMap(x => {
Thread.sleep(50)
for (i <- Range(0, 100)) yield (x % 26, x % 10)
}).toDF("index", "pv")
val dim = Range(0, 26)
.map(x => (x, ('a' + x).toChar.toString))
.toDF("index", "name")
.coalesce(1)
val testDf = input.groupBy("index")
.agg(sum($"pv").alias("pv"))
.join(dim, Seq("index"))

val stageInfos = scala.collection.mutable.ArrayBuffer[StageInfo]()
val listener = new SparkListener {
override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = {
stageInfos += stageSubmitted.stageInfo
}
}
spark.sparkContext.addSparkListener(listener)

withSQLConf(SQLConf.BROADCAST_TIMEOUT.key -> broadcastTimeoutInSec.toString,
SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") {
val result = testDf.collect()
assert(result.length == 26)
val sortedStageInfos = stageInfos.sortBy(_.submissionTime)
assert(sortedStageInfos.size > 2)
assert(sortedStageInfos(0).numTasks == 1)
assert(sortedStageInfos(1).numTasks == shuffleMapTaskParallsm)
}

}
}