Commit 48f1ef4

update
1 parent 0733bfb commit 48f1ef4

2 files changed (+105, −34 lines)

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala (23 additions, 12 deletions)

@@ -341,14 +341,18 @@ class DAGScheduler(
   }
 
   /**
-   * Check to make sure we are not launching a barrier stage that contains PartitionPruningRDD,
-   * which may launch tasks on partial partitions.
+   * Check to make sure we don't launch a barrier stage with unsupported RDD chain pattern. The
+   * following patterns are not supported:
+   * 1. Ancestor RDDs that have different number of partitions from the resulting RDD (eg.
+   *    union()/coalesce()/first()/PartitionPruningRDD);
+   * 2. An RDD that depends on multiple barrier RDDs (eg. barrierRdd1.zip(barrierRdd2)).
    */
-  private def checkBarrierStageWithPartitionPruningRDD(rdd: RDD[_]): Unit = {
-    if (rdd.isBarrier() &&
-        !traverseParentRDDsWithinStage(rdd, (r => !r.isInstanceOf[PartitionPruningRDD[_]]))) {
-      throw new SparkException("Don't support run a barrier stage that contains " +
-        "PartitionPruningRDD, because PartitionPruningRDD may launch tasks on partial partitions.")
+  private def checkBarrierStageWithRDDChainPattern(rdd: RDD[_], numPartitions: Int): Unit = {
+    val predicate: RDD[_] => Boolean = (r =>
+      r.getNumPartitions == numPartitions && r.dependencies.filter(_.rdd.isBarrier()).size <= 1)
+    if (rdd.isBarrier() && !traverseParentRDDsWithinStage(rdd, predicate)) {
+      throw new SparkException(
+        DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN)
     }
   }
 
@@ -360,7 +364,7 @@ class DAGScheduler(
    */
   def createShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _], jobId: Int): ShuffleMapStage = {
     val rdd = shuffleDep.rdd
-    checkBarrierStageWithPartitionPruningRDD(rdd)
+    checkBarrierStageWithRDDChainPattern(rdd, rdd.getNumPartitions)
     val numTasks = rdd.partitions.length
     val parents = getOrCreateParentStages(rdd, jobId)
     val id = nextStageId.getAndIncrement()
@@ -389,7 +393,7 @@
       partitions: Array[Int],
       jobId: Int,
       callSite: CallSite): ResultStage = {
-    checkBarrierStageWithPartitionPruningRDD(rdd)
+    checkBarrierStageWithRDDChainPattern(rdd, partitions.toSet.size)
     val parents = getOrCreateParentStages(rdd, jobId)
     val id = nextStageId.getAndIncrement()
     val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
@@ -466,8 +470,8 @@
   }
 
   /**
-   * Traverse all the parent RDDs within the same stage with the given RDD, check whether all the
-   * parent RDDs satisfy a given predicate.
+   * Traverses the given RDD and its ancestors within the same stage and checks whether all of the
+   * RDDs satisfy a given predicate.
    */
   private def traverseParentRDDsWithinStage(rdd: RDD[_], predicate: RDD[_] => Boolean): Boolean = {
     val visited = new HashSet[RDD[_]]
@@ -481,7 +485,7 @@
       }
       visited += toVisit
       toVisit.dependencies.foreach {
-        case shuffleDep: ShuffleDependency[_, _, _] =>
+        case _: ShuffleDependency[_, _, _] =>
           // Not within the same stage with current rdd, do nothing.
         case dependency =>
           waitingForVisit.push(dependency.rdd)
@@ -1986,4 +1990,11 @@ private[spark] object DAGScheduler {
 
   // Number of consecutive stage attempts allowed before a stage is aborted
   val DEFAULT_MAX_CONSECUTIVE_STAGE_ATTEMPTS = 4
+
+  // Error message when running a barrier stage that have unsupported RDD chain pattern.
+  val ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN =
+    "[SPARK-24820][SPARK-24821]: Barrier execution mode does not allow the following pattern of " +
+    "RDD chain within a barrier stage:\n1. Ancestor RDDs that have different number of " +
+    "partitions from the resulting RDD (eg. union()/coalesce()/first()/PartitionPruningRDD);\n" +
+    "2. An RDD that depends on multiple barrier RDDs (eg. barrierRdd1.zip(barrierRdd2))."
 }
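To make the new check concrete, here is a minimal, self-contained sketch of the traversal and predicate above. This is not Spark code: Node and Dep are hypothetical stand-ins for RDD and Dependency, and the barrier flag is set by hand rather than propagated the way RDD.isBarrier() propagates it in Spark.

object BarrierChainCheckSketch {
  // Hypothetical stand-ins: a Node is an "RDD" with a partition count, a barrier
  // flag, and dependencies; isShuffle marks a stage boundary.
  final case class Dep(rdd: Node, isShuffle: Boolean)
  final class Node(val numPartitions: Int, val isBarrier: Boolean, val deps: Seq[Dep])

  // Mirrors traverseParentRDDsWithinStage: visit the RDD and every ancestor reachable
  // without crossing a shuffle dependency, and require the predicate to hold for each.
  def withinStageAllSatisfy(root: Node, predicate: Node => Boolean): Boolean = {
    val visited = scala.collection.mutable.HashSet.empty[Node]
    def visit(n: Node): Boolean =
      visited.contains(n) || {
        visited += n
        predicate(n) && n.deps.forall(d => d.isShuffle || visit(d.rdd))
      }
    visit(root)
  }

  def main(args: Array[String]): Unit = {
    // Models barrier().mapPartitions(...) over 4 partitions followed by coalesce(1):
    // the ancestor keeps 4 partitions while the resulting RDD has only 1.
    val barrier = new Node(4, isBarrier = true, Nil)
    val coalesced = new Node(1, isBarrier = true, Seq(Dep(barrier, isShuffle = false)))
    // Same shape as the predicate built in checkBarrierStageWithRDDChainPattern.
    val ok = withinStageAllSatisfy(coalesced, r =>
      r.numPartitions == coalesced.numPartitions && r.deps.count(_.rdd.isBarrier) <= 1)
    println(ok) // false -> DAGScheduler would fail the job at submission
  }
}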

core/src/test/scala/org/apache/spark/BarrierStageOnSubmittedSuite.scala (82 additions, 22 deletions)

@@ -20,74 +20,134 @@ package org.apache.spark
 import scala.concurrent.duration._
 import scala.language.postfixOps
 
+import org.scalatest.BeforeAndAfterEach
+
 import org.apache.spark.rdd.{PartitionPruningRDD, RDD}
+import org.apache.spark.scheduler.DAGScheduler
 import org.apache.spark.util.ThreadUtils
 
 /**
  * This test suite covers all the cases that shall fail fast on job submitted that contains one
  * of more barrier stages.
  */
-class BarrierStageOnSubmittedSuite extends SparkFunSuite with LocalSparkContext {
+class BarrierStageOnSubmittedSuite extends SparkFunSuite with BeforeAndAfterEach
+  with LocalSparkContext {
+
+  override def beforeEach(): Unit = {
+    super.beforeEach()
+
+    val conf = new SparkConf()
+      .setMaster("local[4]")
+      .setAppName("test")
+    sc = new SparkContext(conf)
+  }
 
-  private def testSubmitJob(sc: SparkContext, rdd: RDD[Int], message: String): Unit = {
+  private def testSubmitJob(
+      sc: SparkContext,
+      rdd: RDD[Int],
+      partitions: Option[Seq[Int]] = None,
+      message: String): Unit = {
     val futureAction = sc.submitJob(
       rdd,
       (iter: Iterator[Int]) => iter.toArray,
-      0 until rdd.partitions.length,
+      partitions.getOrElse(0 until rdd.partitions.length),
       { case (_, _) => return }: (Int, Array[Int]) => Unit,
       { return }
     )
 
     val error = intercept[SparkException] {
-      ThreadUtils.awaitResult(futureAction, 1 seconds)
+      ThreadUtils.awaitResult(futureAction, 5 seconds)
     }.getCause.getMessage
     assert(error.contains(message))
   }
 
   test("submit a barrier ResultStage that contains PartitionPruningRDD") {
-    val conf = new SparkConf()
-      .setMaster("local[4]")
-      .setAppName("test")
-    sc = new SparkContext(conf)
-
     val prunedRdd = new PartitionPruningRDD(sc.parallelize(1 to 10, 4), index => index > 1)
     val rdd = prunedRdd
       .barrier()
       .mapPartitions((iter, context) => iter)
     testSubmitJob(sc, rdd,
-      "Don't support run a barrier stage that contains PartitionPruningRDD")
+      message = DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN)
   }
 
   test("submit a barrier ShuffleMapStage that contains PartitionPruningRDD") {
-    val conf = new SparkConf()
-      .setMaster("local[4]")
-      .setAppName("test")
-    sc = new SparkContext(conf)
-
     val prunedRdd = new PartitionPruningRDD(sc.parallelize(1 to 10, 4), index => index > 1)
     val rdd = prunedRdd
       .barrier()
       .mapPartitions((iter, context) => iter)
       .repartition(2)
       .map(x => x + 1)
     testSubmitJob(sc, rdd,
-      "Don't support run a barrier stage that contains PartitionPruningRDD")
+      message = DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN)
   }
 
   test("submit a barrier stage that doesn't contain PartitionPruningRDD") {
-    val conf = new SparkConf()
-      .setMaster("local[4]")
-      .setAppName("test")
-    sc = new SparkContext(conf)
-
     val prunedRdd = new PartitionPruningRDD(sc.parallelize(1 to 10, 4), index => index > 1)
     val rdd = prunedRdd
       .repartition(2)
       .barrier()
       .mapPartitions((iter, context) => iter)
-
     // Should be able to submit job and run successfully.
     val result = rdd.collect().sorted
     assert(result === Seq(6, 7, 8, 9, 10))
   }
+
+  test("submit a barrier stage with partial partitions") {
+    val rdd = sc.parallelize(1 to 10, 4)
+      .barrier()
+      .mapPartitions((iter, context) => iter)
+    testSubmitJob(sc, rdd, Some(Seq(1, 3)),
+      message = DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN)
+  }
+
+  test("submit a barrier stage with union()") {
+    val rdd1 = sc.parallelize(1 to 10, 2)
+      .barrier()
+      .mapPartitions((iter, context) => iter)
+    val rdd2 = sc.parallelize(1 to 20, 2)
+    val rdd3 = rdd1
+      .union(rdd2)
+      .map(x => x * 2)
+    // Fail the job on submit because the barrier RDD (rdd1) may be not assigned Task 0.
+    testSubmitJob(sc, rdd3,
+      message = DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN)
+  }
+
+  test("submit a barrier stage with coalesce()") {
+    val rdd = sc.parallelize(1 to 10, 4)
+      .barrier()
+      .mapPartitions((iter, context) => iter)
+      .coalesce(1)
+    // Fail the job on submit because the barrier RDD requires to run on 4 tasks, but the stage
+    // only launches 1 task.
+    testSubmitJob(sc, rdd,
+      message = DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN)
+  }
+
+  test("submit a barrier stage that contains an RDD that depends on multiple barrier RDDs") {
+    val rdd1 = sc.parallelize(1 to 10, 4)
+      .barrier()
+      .mapPartitions((iter, context) => iter)
+    val rdd2 = sc.parallelize(11 to 20, 4)
+      .barrier()
+      .mapPartitions((iter, context) => iter)
+    val rdd3 = rdd1
+      .zip(rdd2)
+      .map(x => x._1 + x._2)
+    testSubmitJob(sc, rdd3,
+      message = DAGScheduler.ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN)
+  }
+
+  test("submit a barrier stage with zip()") {
+    val rdd1 = sc.parallelize(1 to 10, 4)
+      .barrier()
+      .mapPartitions((iter, context) => iter)
+    val rdd2 = sc.parallelize(11 to 20, 4)
+    val rdd3 = rdd1
+      .zip(rdd2)
+      .map(x => x._1 + x._2)
+    // Should be able to submit job and run successfully.
+    val result = rdd3.collect().sorted
+    assert(result === Seq(12, 14, 16, 18, 20, 22, 24, 26, 28, 30))
+  }
 }
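Of the new cases, "partial partitions" is the one users are most likely to hit indirectly: actions such as first() and take() submit a job on a subset of partitions, which is also why createResultStage passes partitions.toSet.size into the check. A short illustrative snippet of how that surfaces, assuming a local SparkContext named sc as in the suite's beforeEach:

val rdd = sc.parallelize(1 to 10, 4)
  .barrier()
  .mapPartitions((iter, context) => iter)
// first() initially runs a job on a single partition, so the barrier stage would
// launch fewer tasks than the 4 the barrier RDD expects; with this commit the job
// fails fast at submission with
// ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN.
rdd.first()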
