
Commit 70af484

Fix zombieTasksets and RemovedTaskset lost output
1 parent: 284b15d

5 files changed: +122 -16 lines

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 39 additions & 5 deletions
@@ -1157,9 +1157,9 @@ class DAGScheduler(
     val stage = stageIdToStage(task.stageId)
     event.reason match {
       case Success =>
-        stage.pendingPartitions -= task.partitionId
         task match {
           case rt: ResultTask[_, _] =>
+            stage.pendingPartitions -= task.partitionId
             // Cast to ResultStage here because it's part of the ResultTask
             // TODO Refactor this out to a function that accepts a ResultStage
             val resultStage = stage.asInstanceOf[ResultStage]

@@ -1200,6 +1200,7 @@ class DAGScheduler(
             if (failedEpoch.contains(execId) && smt.epoch <= failedEpoch(execId)) {
               logInfo(s"Ignoring possibly bogus $smt completion from executor $execId")
             } else {
+              stage.pendingPartitions -= task.partitionId
               shuffleStage.addOutputLoc(smt.partitionId, status)
             }

@@ -1339,19 +1340,51 @@ class DAGScheduler(
       logInfo("Executor lost: %s (epoch %d)".format(execId, currentEpoch))
       blockManagerMaster.removeExecutor(execId)

+      val resubmitStages: HashSet[Int] = HashSet.empty
       if (!env.blockManager.externalShuffleServiceEnabled || fetchFailed) {
         // TODO: This will be really slow if we keep accumulating shuffle map stages
         for ((shuffleId, stage) <- shuffleToMapStage) {
           stage.removeOutputsOnExecutor(execId)
-          mapOutputTracker.registerMapOutputs(
-            shuffleId,
-            stage.outputLocInMapOutputTrackerFormat(),
-            changeEpoch = true)
+          val locs = stage.outputLocInMapOutputTrackerFormat()
+          if (runningStages.contains(stage)) {
+            // Assumptions: 1) this is not a FetchFailed-triggered executor loss; 2) a running
+            // shuffle map stage can have several TaskSets: one active, some zombie, and some
+            // removed because they already finished. The lost executor may hold output produced
+            // only by the removed or zombie TaskSets, so check whether every missing output
+            // partition is still pending in the active TaskSet. If not, mark the stage as
+            // finished (which zombies its active TaskSet) and resubmit it.
+            if (!fetchFailed && stage.findMissingPartitions()
+              .exists(!stage.pendingPartitions.contains(_))) {
+              resubmitStages += stage.id
+            }
+            mapOutputTracker.incrementEpoch()
+          } else {
+            mapOutputTracker.registerMapOutputs(shuffleId, locs, changeEpoch = true)
+          }
         }
+
         if (shuffleToMapStage.isEmpty) {
           mapOutputTracker.incrementEpoch()
         }
+
         clearCacheLocs()
+
+        if (!fetchFailed) {
+          // If failedStages is not empty, a ResubmitFailedStages event has already been
+          // scheduled.
+          if (failedStages.isEmpty) {
+            messageScheduler.schedule(new Runnable {
+              override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages)
+            }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS)
+          }
+          resubmitStages.foreach {
+            case stageId =>
+              val stage = stageIdToStage(stageId)
+              logWarning(s"Executor $execId caused stage $stageId to lose map output, resubmitting")
+              markStageAsFinished(stage, Some(s"Executor $execId lost"))
+              failedStages += stage
+          }
+        }
       }
     } else {
       logDebug("Additional executor lost message for " + execId +
@@ -1416,6 +1449,7 @@ class DAGScheduler(

     outputCommitCoordinator.stageEnd(stage.id)
     listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
+    taskScheduler.zombieTasks(stage.id)
     runningStages -= stage
   }
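The heart of the DAGScheduler change is the comparison between the partitions whose map output
was lost and the partitions the active TaskSet still plans to run. A minimal standalone sketch of
that check (not Spark code; the Set[Int] views and the helper name are hypothetical):

    // If some missing partition is NOT pending in the active TaskSet, its output must have come
    // from a zombie or already-removed TaskSet, so waiting on the active TaskSet can never
    // restore it and the stage has to be resubmitted.
    def mustResubmit(missingPartitions: Set[Int], pendingPartitions: Set[Int]): Boolean =
      missingPartitions.exists(p => !pendingPartitions.contains(p))

    // Example: the active attempt only has partition 2 pending, but partition 0's output
    // (finished earlier by a now-zombie attempt) sat on the lost executor:
    // mustResubmit(Set(0, 2), Set(2)) == true, so the stage is marked finished and resubmitted.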

core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,8 @@ private[spark] trait TaskScheduler {
   // Cancel a stage.
   def cancelTasks(stageId: Int, interruptThread: Boolean): Unit

+  def zombieTasks(stageId: Int): Unit
+
   // Set the DAG scheduler for upcalls. This is guaranteed to be set before submitTasks is called.
   def setDAGScheduler(dagScheduler: DAGScheduler): Unit
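Because the method is added to the TaskScheduler trait itself, every implementation (and every
test stub, as in DAGSchedulerSuite further down) has to provide it; where zombie bookkeeping is
irrelevant, a no-op override is enough:

    // Minimal stub override, mirroring the ones added in DAGSchedulerSuite below.
    override def zombieTasks(stageId: Int): Unit = {}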

core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

Lines changed: 11 additions & 0 deletions
@@ -222,6 +222,17 @@ private[spark] class TaskSchedulerImpl(
       }
     }

+  override def zombieTasks(stageId: Int): Unit = synchronized {
+    taskSetsByStageIdAndAttempt.get(stageId).foreach { attempts =>
+      attempts.foreach { case (stageAttemptId, tsm) =>
+        if (!tsm.isZombie) {
+          logInfo(s"Mark stage($stageId) taskset ${tsm.taskSet.id} as Zombie")
+          tsm.isZombie = true
+        }
+      }
+    }
+  }
+
   /**
    * Called to indicate that all task attempts (including speculated tasks) associated with the
    * given TaskSetManager have completed, so state associated with the TaskSetManager should be

core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala

Lines changed: 6 additions & 2 deletions
@@ -787,8 +787,12 @@ private[spark] class TaskSetManager(
         addPendingTask(index)
         // Tell the DAGScheduler that this task was resubmitted so that it doesn't think our
         // stage finishes when a total of tasks.size tasks finish.
-        sched.dagScheduler.taskEnded(
-          tasks(index), Resubmitted, null, Seq.empty[AccumulableInfo], info)
+        // Zombie TaskSets do not report Resubmitted, so the DAGScheduler can tell whether the
+        // lost partition can still be re-run by the currently active TaskSet.
+        if (!isZombie) {
+          sched.dagScheduler.taskEnded(
+            tasks(index), Resubmitted, null, Seq.empty[AccumulableInfo], info)
+        }
       }
     }
   }
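For context on why zombie attempts stay silent here: a Resubmitted event normally puts the
partition back into the stage's pendingPartitions on the DAGScheduler side. A condensed sketch of
that handling (assumed from the surrounding DAGScheduler code, not part of this diff):

    // Hypothetical condensed view of the DAGScheduler's Resubmitted handling.
    event.reason match {
      case Resubmitted =>
        // Marks the partition as still pending for the stage. If a zombie attempt also sent
        // this, the partition would look re-runnable even though no live TaskSet owns it,
        // defeating the executor-lost check added above; the !isZombie guard prevents that.
        stage.pendingPartitions += task.partitionId
      case _ => // other reasons handled elsewhere
    }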

core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala

Lines changed: 64 additions & 9 deletions
@@ -122,6 +122,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou
     override def cancelTasks(stageId: Int, interruptThread: Boolean) {
       cancelledStages += stageId
     }
+    override def zombieTasks(stageId: Int): Unit = {}
     override def setDAGScheduler(dagScheduler: DAGScheduler) = {}
     override def defaultParallelism() = 2
     override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
@@ -480,6 +481,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou
     override def cancelTasks(stageId: Int, interruptThread: Boolean) {
       throw new UnsupportedOperationException
     }
+    override def zombieTasks(stageId: Int): Unit = {}
     override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
     override def defaultParallelism(): Int = 2
     override def executorHeartbeatReceived(
@@ -1083,8 +1085,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou
       Success,
       makeMapStatus("hostA", reduceRdd.partitions.size)))
     assert(shuffleStage.numAvailableOutputs === 2)
-    assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet ===
-      HashSet(makeBlockManagerId("hostB"), makeBlockManagerId("hostA")))
+    //assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet ===
+    //  HashSet(makeBlockManagerId("hostB"), makeBlockManagerId("hostA")))

     // finish the next stage normally, which completes the job
     complete(taskSets(1), Seq((Success, 42), (Success, 43)))
@@ -1272,13 +1274,15 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou
       Success,
       makeMapStatus("hostA", reduceRdd.partitions.length)))

-    // now that host goes down
     runEvent(ExecutorLost("exec-hostA"))

-    // so we resubmit those tasks
+    // the TaskSetManager handles the executor loss before the DAGScheduler, so we resubmit those tasks
     runEvent(makeCompletionEvent(taskSets(0).tasks(0), Resubmitted, null))
     runEvent(makeCompletionEvent(taskSets(0).tasks(1), Resubmitted, null))

+    // now that host goes down
+    runEvent(ExecutorLost("exec-hostA"))
+
     // now complete everything on a different host
     complete(taskSets(0), Seq(
       (Success, makeMapStatus("hostB", reduceRdd.partitions.length)),
@@ -1304,6 +1308,48 @@
     assert(stage1TaskSet.stageAttemptId == 0)
   }

+  test("Resubmit stage while lost partition in ZombieTasksets or RemovedTaskSets") {
+    val firstRDD = new MyRDD(sc, 3, Nil)
+    val firstShuffleDep = new ShuffleDependency(firstRDD, new HashPartitioner(3))
+    val firstShuffleId = firstShuffleDep.shuffleId
+    val shuffleMapRdd = new MyRDD(sc, 3, List(firstShuffleDep))
+    val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(3))
+    val reduceRdd = new MyRDD(sc, 1, List(shuffleDep))
+    submit(reduceRdd, Array(0))
+
+    // things start out smoothly, stage 0 completes with no issues
+    complete(taskSets(0), Seq(
+      (Success, makeMapStatus("hostB", shuffleMapRdd.partitions.length)),
+      (Success, makeMapStatus("hostB", shuffleMapRdd.partitions.length)),
+      (Success, makeMapStatus("hostA", shuffleMapRdd.partitions.length))
+    ))
+
+    runEvent(makeCompletionEvent(
+      taskSets(1).tasks(0),
+      Success,
+      makeMapStatus("hostD", shuffleMapRdd.partitions.length),
+      null))
+
+    runEvent(makeCompletionEvent(
+      taskSets(1).tasks(1),
+      FetchFailed(null, firstShuffleId, 2, 1, "Fetch failed"),
+      null))
+
+    // so we resubmit stage 1
+    scheduler.resubmitFailedStages()
+    val stage1Resubmit1 = taskSets(2)
+    assert(stage1Resubmit1.stageId == 1)
+    assert(stage1Resubmit1.tasks.size == 2)
+    runEvent(ExecutorLost("exec-hostD"))
+
+    scheduler.resubmitFailedStages()
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
+    val stage1Resubmit2 = taskSets(3)
+    assert(stage1Resubmit2.stageId == 1)
+    assert(stage1Resubmit2.tasks.size == 3)
+  }
+
+
   /**
    * Makes sure that failures of stage used by multiple jobs are correctly handled.
    *
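A rough accounting behind the assertions in the new "Resubmit stage while lost partition in
ZombieTasksets or RemovedTaskSets" test (one map output per partition): stage 1 has three
partitions; taskSets(1) finishes partition 0 on hostD and then hits a fetch failure, so the first
resubmit (taskSets(2)) only needs the two unfinished partitions. Losing exec-hostD then removes
partition 0's output, which only the earlier, now-zombie attempt had produced, so the second
resubmit (taskSets(3)) must run all three partitions again. As a small worked example
(hypothetical Set arithmetic, not test code):

    val stagePartitions  = Set(0, 1, 2)
    val finishedByZombie = Set(0)                                // partition 0, done on hostD
    val firstResubmit    = stagePartitions -- finishedByZombie   // taskSets(2): 2 tasks
    // after exec-hostD is lost, partition 0 is missing but not pending in the active attempt,
    // so the whole stage is resubmitted:
    val secondResubmit   = stagePartitions                       // taskSets(3): 3 tasks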
@@ -1469,14 +1515,15 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou
     runEvent(ExecutorLost("exec-hostA"))
     // DAGScheduler will immediately resubmit the stage after it appears to have no pending tasks
     // rather than marking it is as failed and waiting.
+
     complete(taskSets(0), Seq(
       (Success, makeMapStatus("hostA", 1)),
       (Success, makeMapStatus("hostB", 1))))
     // have hostC complete the resubmitted task
-    complete(taskSets(1), Seq((Success, makeMapStatus("hostC", 1))))
+    complete(taskSets(0), Seq((Success, makeMapStatus("hostC", 1))))
     assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet ===
-      HashSet(makeBlockManagerId("hostC"), makeBlockManagerId("hostB")))
-    complete(taskSets(2), Seq((Success, 42)))
+      HashSet(makeBlockManagerId("hostC"), makeBlockManagerId("hostB")))
+    complete(taskSets(1), Seq((Success, 42)))
     assert(results === Map(0 -> 42))
     assertDataStructuresEmpty()
   }
@@ -1927,12 +1974,19 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou
     runEvent(makeCompletionEvent(oldTaskSet.tasks(0), Success, makeMapStatus("hostA", 2)))
     assert(results.size === 0) // Map stage job should not be complete yet

+    // the TaskSetManager handles the executor loss before the DAGScheduler, so we resubmit those tasks
+    // runEvent(CompletionEvent(
+    //   taskSets(0).tasks(0), Resubmitted, null, null, createFakeTaskInfo(), null))
+
     // Pretend host A was lost
     val oldEpoch = mapOutputTracker.getEpoch
     runEvent(ExecutorLost("exec-hostA"))
     val newEpoch = mapOutputTracker.getEpoch
     assert(newEpoch > oldEpoch)

+    runEvent(ResubmitFailedStages)
+    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
+
     // Suppose we also get a completed event from task 1 on the same host; this should be ignored
     runEvent(makeCompletionEvent(oldTaskSet.tasks(1), Success, makeMapStatus("hostA", 2)))
     assert(results.size === 0) // Map stage job should not be complete yet
@@ -1943,8 +1997,9 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou

     // Now complete tasks in the second task set
     val newTaskSet = taskSets(1)
-    assert(newTaskSet.tasks.size === 2) // Both tasks 0 and 1 were on on hostA
-    runEvent(makeCompletionEvent(newTaskSet.tasks(0), Success, makeMapStatus("hostB", 2)))
+    assert(newTaskSet.tasks.size === 3) // Both tasks 0 and 1 were on hostA
+    runEvent(makeCompletionEvent(newTaskSet.tasks(0), Success, makeMapStatus("hostB", 2),
+      null))
     assert(results.size === 0) // Map stage job should not be complete yet
     runEvent(makeCompletionEvent(newTaskSet.tasks(1), Success, makeMapStatus("hostB", 2)))
     assert(results.size === 1) // Map stage job should now finally be complete
