Closed
Changes from all commits
66 commits
3445646
SPARK-51016. Fixing the code of isInDeterminate boolean for a Stage
Feb 20, 2025
da11f69
Merge branch 'upstream-master' into SPARK-51016
Feb 20, 2025
3ce5ad3
SPARK-51016. updating comment
Feb 20, 2025
c537b3a
SPARK-51272. Fix for the race condition in Scheduler causing failure …
Feb 21, 2025
765ae55
SPARK-51272. added a reliable bug test demonstrating the race conditi…
Feb 22, 2025
e9a4659
SPARK-51272. fixed comment
Feb 23, 2025
5de7160
Merge branch 'upstream-master' into SPARK-51016
Feb 27, 2025
1da65be
Merge branch 'upstream-master' into SPARK-51272
Feb 27, 2025
85d66c2
Merge branch 'upstream-master' into SPARK-51016
Feb 28, 2025
038e14d
SPARK-51016. Refactored the code so as to pass the boolean hasInDeter…
Feb 28, 2025
7f2d2bc
SPARK-51016. Refactored the code so as to pass the boolean hasInDeter…
Feb 28, 2025
9cec75e
SPARK-51016. fixed test failures after refactoring
Feb 28, 2025
c715348
Merge branch 'upstream-master' into SPARK-51016
Mar 1, 2025
7878a60
SPARK-51016. refactored the tests to use inner joins instead of Left …
Mar 3, 2025
fc918af
SPARK-51272: Reworked the test reproducing the race condition, by not…
Mar 4, 2025
d204a67
SPARK-51272: formatting change
Mar 4, 2025
5a940a7
SPARK-51272: added some explanation to what test is doing to get the …
Mar 4, 2025
ddeeff8
SPARK-51272: Given the insight by Attila that DagEventProcessing will…
Mar 4, 2025
c4e1fee
SPARK-51272: formatting changes.
Mar 4, 2025
f84af25
initial changes
Mar 6, 2025
ac963bf
SPARK-51272. incorporating review feedback
Mar 6, 2025
d652319
SPARK-51272. incorporating review feedback. removing explicit invocati…
Mar 6, 2025
d8b4079
initial changes
Mar 6, 2025
a0c700c
SPARK-51272. rectified a comment and renamed the test
Mar 6, 2025
a930c9d
initial changes
Mar 7, 2025
9b153a3
SPARK-51272. incorporated review feedback to refactor test
Mar 7, 2025
70b7910
SPARK-51272. incorporated review feedback to refactor test
Mar 7, 2025
a5ddc9e
initial changes
Mar 7, 2025
001258a
initial changes
Mar 10, 2025
a819f4f
initial changes
Mar 10, 2025
d8eb1c3
SPARK-51272. implemented review feedback. refactored code. no longer …
Mar 10, 2025
1ec98f0
SPARK-51272. Fixed the situation where ResultStage is inDeterminate b…
Mar 11, 2025
69b66aa
initial config
Mar 12, 2025
c95a34d
Merge branch 'SPARK-51016-functional-bug-repro' into SPARK-51272
Mar 12, 2025
c5fbb45
Merge branch 'upstream-master' into SPARK-51272
Mar 12, 2025
3439bf9
SPARK-51272. added disabled functional test to reproduce the issue. N…
Mar 12, 2025
abcd133
SPARK-51272. added disabled functional test to reproduce the issue. N…
Mar 12, 2025
b8865af
SPARK-51272. added disabled functional test to reproduce the issue. N…
Mar 12, 2025
784aad3
SPARK-51272. added another unit test which checks for all partitions …
Mar 12, 2025
cd041ee
SPARK-51272. formatting change
Mar 13, 2025
b4c541d
Merge branch 'SPARK-51016' into SPARK-51272--51016-combined
Mar 13, 2025
f63138b
Merge branch 'SPARK-51272' into SPARK-51272--51016-combined
Mar 13, 2025
d77bcd8
SPARK-51272. SPARK-51016. combined PRs with HA Test enabled
Mar 13, 2025
2c95271
SPARK-51272. Increased the number of iterations. Using two positions …
Mar 13, 2025
2ec8333
Merge branch 'SPARK-51272' into SPARK-51272--51016-combined
Mar 13, 2025
1a93cef
Merge branch 'upstream-master' into SPARK-51016
Mar 13, 2025
1014c6b
SPARK-51272. fixed scalastyle issue
Mar 13, 2025
c1faef4
Merge branch 'SPARK-51272' into SPARK-51272--51016-combined
Mar 13, 2025
b1a0593
SPARK-51272. Implementing review feedback. Making sql dependency in …
Mar 14, 2025
dc08a7b
SPARK-51272. Implementing review feedback. Making sql dependency in …
Mar 14, 2025
4f263d1
SPARK-51272. refactored the code so that findMissingPartitions code r…
Mar 14, 2025
e5eb08f
Merge branch 'SPARK-51272' into SPARK-51272--51016-combined
Mar 14, 2025
3895fec
SPARK-51272. refactored the code, based on the review feedback. Refac…
Mar 15, 2025
012081f
Merge branch 'SPARK-51272' into SPARK-51272--51016-combined
Mar 15, 2025
2498169
SPARK-51272. Aborting the result stage if its number of missing parti…
Mar 15, 2025
b417a67
Merge branch 'SPARK-51272' into SPARK-51272--51016-combined
Mar 15, 2025
9fd7114
SPARK-51272. reverted previous change of throwing abort exception
Mar 17, 2025
c79518e
Merge branch 'SPARK-51272' into SPARK-51272--51016-combined
Mar 17, 2025
54f18fe
SPARK-51016. Implemented review feedback. Thank you @squito
Mar 19, 2025
56e572a
SPARK-51016. Implemented review feedback from @squito and @mridulm
Mar 20, 2025
b9bd30b
SPARK-51016. Implemented review feedback from @squito and @mridulm
Mar 20, 2025
f7a6721
SPARK-51016. Implemented review feedback from @squito and @mridulm
Mar 20, 2025
914123f
SPARK-51272. Fixed the tests code as per feedback to use direct depen…
Apr 3, 2025
00a4aad
SPARK-51272. Fixed the race in test causing inconsistent pass
Apr 3, 2025
c416ff6
Merge branch 'SPARK-51016' into SPARK-51272--51016-combined
Apr 3, 2025
1c8350c
Merge branch 'SPARK-51272' into SPARK-51272--51016-combined
Apr 3, 2025
90 changes: 64 additions & 26 deletions core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -1554,6 +1554,7 @@ private[spark] class DAGScheduler(
case sms: ShuffleMapStage if stage.isIndeterminate && !sms.isAvailable =>
mapOutputTracker.unregisterAllMapAndMergeOutput(sms.shuffleDep.shuffleId)
sms.shuffleDep.newShuffleMergeState()

case _ =>
}

@@ -1873,15 +1874,28 @@ private[spark] class DAGScheduler(
private[scheduler] def handleTaskCompletion(event: CompletionEvent): Unit = {
val task = event.task
val stageId = task.stageId
val stageOption = stageIdToStage.get(task.stageId)
val isIndeterministicZombie = event.reason match {
case Success if stageOption.isDefined =>
val stage = stageOption.get
(task.stageAttemptId < stage.latestInfo.attemptNumber()
&& stage.isIndeterminate) || stage.shouldDiscardResult(task.stageAttemptId)

case _ => false
}

outputCommitCoordinator.taskCompleted(
stageId,
task.stageAttemptId,
task.partitionId,
event.taskInfo.attemptNumber, // this is a task attempt number
event.reason)
if (isIndeterministicZombie) {
TaskKilled(reason = "Indeterminate stage needs all tasks to be retried")
} else {
event.reason
})

if (!stageIdToStage.contains(task.stageId)) {
if (stageOption.isEmpty) {
// The stage may have already finished when we get this event -- e.g. maybe it was a
// speculative task. It is important that we send the TaskEnd event in any case, so listeners
are properly notified and can choose to handle it. For instance, some listeners are
@@ -1893,34 +1907,37 @@
return
}

val stage = stageIdToStage(task.stageId)
val stage = stageOption.get

// Make sure the task's accumulators are updated before any other processing happens, so that
// we can post a task end event before any jobs or stages are updated. The accumulators are
// only updated in certain cases.
event.reason match {
case Success =>
task match {
case rt: ResultTask[_, _] =>
val resultStage = stage.asInstanceOf[ResultStage]
resultStage.activeJob match {
case Some(job) =>
// Only update the accumulator once for each result task.
if (!job.finished(rt.outputId)) {
updateAccumulators(event)
}
case None => // Ignore update if task's job has finished.
}
case _ =>
updateAccumulators(event)
if (!isIndeterministicZombie) {
task match {
case rt: ResultTask[_, _] =>
val resultStage = stage.asInstanceOf[ResultStage]
resultStage.activeJob match {
case Some(job) =>
// Only update the accumulator once for each result task.
if (!job.finished(rt.outputId)) {
updateAccumulators(event)
}
case _ => // Ignore update if task's job has finished.
}
case _ => updateAccumulators(event)
}
}

case _: ExceptionFailure | _: TaskKilled => updateAccumulators(event)

case _ =>
}
if (trackingCacheVisibility) {
// Update rdd blocks' visibility status.
blockManagerMaster.updateRDDBlockVisibility(
event.taskInfo.taskId, visible = event.reason == Success)
event.taskInfo.taskId, visible = event.reason == Success && !isIndeterministicZombie)
}

postTaskEnd(event)
@@ -1936,7 +1953,7 @@
}

task match {
case rt: ResultTask[_, _] =>
case rt: ResultTask[_, _] if !isIndeterministicZombie =>
// Cast to ResultStage here because it's part of the ResultTask
// TODO Refactor this out to a function that accepts a ResultStage
val resultStage = stage.asInstanceOf[ResultStage]
@@ -1984,7 +2001,7 @@
logInfo(log"Ignoring result from ${MDC(RESULT, rt)} because its job has finished")
}

case smt: ShuffleMapTask =>
case smt: ShuffleMapTask if !isIndeterministicZombie =>
val shuffleStage = stage.asInstanceOf[ShuffleMapStage]
// Ignore task completion for old attempt of indeterminate stage
val ignoreIndeterminate = stage.isIndeterminate &&
@@ -2017,6 +2034,8 @@
processShuffleMapStageCompletion(shuffleStage)
}
}

case _ => // ignore
}

case FetchFailed(bmAddress, shuffleId, _, mapIndex, reduceId, failureMessage) =>
@@ -2121,6 +2140,12 @@
failedStages += failedStage
failedStages += mapStage
if (noResubmitEnqueued) {
def generateErrorMessage(stage: Stage): String = {
"A shuffle map stage with indeterminate output was failed and retried. " +
s"However, Spark cannot rollback the $stage to re-process the input data, " +
"and has to fail this job. Please eliminate the indeterminacy by " +
"checkpointing the RDD before repartition and try again."
}
// If the map stage is INDETERMINATE, which means the map tasks may return
// different result when re-try, we need to re-try all the tasks of the failed
// stage and its succeeding stages, because the input data will be changed after the
@@ -2147,13 +2172,6 @@
}
}

def generateErrorMessage(stage: Stage): String = {
"A shuffle map stage with indeterminate output was failed and retried. " +
s"However, Spark cannot rollback the $stage to re-process the input data, " +
"and has to fail this job. Please eliminate the indeterminacy by " +
"checkpointing the RDD before repartition and try again."
}

activeJobs.foreach(job => collectStagesToRollback(job.finalStage :: Nil))

// The stages will be rolled back after checking
@@ -2171,21 +2189,41 @@
abortStage(mapStage, reason, None)
} else {
rollingBackStages += mapStage
mapOutputTracker.unregisterAllMapAndMergeOutput(
mapStage.shuffleDep.shuffleId)
}
} else {
mapOutputTracker.unregisterAllMapAndMergeOutput(
mapStage.shuffleDep.shuffleId)
}

case resultStage: ResultStage if resultStage.activeJob.isDefined =>
val numMissingPartitions = resultStage.findMissingPartitions().length
if (numMissingPartitions < resultStage.numTasks) {
// TODO: support to rollback result tasks.
abortStage(resultStage, generateErrorMessage(resultStage), None)
} else {
resultStage.markAllPartitionsMissing()
}

case _ =>
}
logInfo(log"The shuffle map stage ${MDC(SHUFFLE_ID, mapStage)} with indeterminate output was failed, " +
log"we will roll back and rerun below stages which include itself and all its " +
log"indeterminate child stages: ${MDC(STAGES, rollingBackStages)}")
} else if (failedStage.isIndeterminate) {
failedStage match {
case resultStage: ResultStage if resultStage.activeJob.isDefined =>
val numMissingPartitions = resultStage.findMissingPartitions().length
if (numMissingPartitions < resultStage.numTasks) {
// TODO: support to rollback result tasks.
abortStage(resultStage, generateErrorMessage(resultStage), None)
} else {
resultStage.markAllPartitionsMissing()
}

case _ =>
}
}

// We expect one executor failure to trigger many FetchFailures in rapid succession,
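The error message introduced above advises eliminating the indeterminacy by checkpointing the RDD before repartitioning it. A minimal user-side sketch of that workaround follows; it is not part of this PR, and the application name and checkpoint directory are assumptions.

// Hypothetical user job illustrating the workaround named in the error message
// above: checkpoint the RDD before repartitioning it, so that a retried shuffle
// map stage re-reads identical input data.
import org.apache.spark.sql.SparkSession

object CheckpointBeforeRepartition {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("checkpoint-before-repartition").getOrCreate()
    val sc = spark.sparkContext
    sc.setCheckpointDir("/tmp/spark-checkpoints") // assumed path

    // An RDD whose partition contents can differ between retries, e.g. one
    // derived from a shuffle with no deterministic ordering.
    val rdd = sc.parallelize(1 to 1000000, numSlices = 8)
      .map(i => (i % 100, i))
      .groupByKey()
      .flatMap { case (_, values) => values }

    rdd.checkpoint() // pin the data to reliable storage
    rdd.count()      // run an action so the checkpoint is actually materialized

    // The repartition now reads from the checkpointed, deterministic data.
    println(rdd.repartition(16).map(_ * 2L).sum())
    spark.stop()
  }
}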
22 changes: 22 additions & 0 deletions core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala
@@ -38,6 +38,9 @@ private[spark] class ResultStage(
resourceProfileId: Int)
extends Stage(id, rdd, partitions.length, parents, firstJobId, callSite, resourceProfileId) {

@volatile
private var discardResultsForAttemptId: Int = -1

/**
* The active job for this result stage. Will be empty if the job has already finished
* (e.g., because the job was cancelled).
@@ -54,6 +57,14 @@
_activeJob = None
}

override def makeNewStageAttempt(
numPartitionsToCompute: Int,
taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty): Unit = {
super.makeNewStageAttempt(numPartitionsToCompute, taskLocalityPreferences)
// clear the attemptId set in the attemptIdAllPartitionsMissing
discardResultsForAttemptId = -1
}

/**
* Returns the sequence of partition ids that are missing (i.e. needs to be computed).
*
@@ -64,5 +75,16 @@
(0 until job.numPartitions).filter(id => !job.finished(id))
}

def markAllPartitionsMissing(): Unit = {
this.discardResultsForAttemptId = this.latestInfo.attemptNumber()
val job = activeJob.get
for (id <- 0 until job.numPartitions) {
job.finished(id) = false
}
}

override def shouldDiscardResult(attemptId: Int): Boolean =
this.discardResultsForAttemptId >= attemptId

override def toString: String = "ResultStage " + id
}
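The new discardResultsForAttemptId field drives the rollback of an indeterminate ResultStage: markAllPartitionsMissing records the attempt being rolled back so its late results are dropped, and makeNewStageAttempt clears the marker so the next attempt's results are accepted. A rough standalone sketch of that lifecycle, using a simplified stand-in for the real class:

// Simplified model of the ResultStage change above; the names mirror the PR
// but this class is only an illustration, not scheduler code.
object ResultStageRollbackSketch {
  final class StageSketch(numPartitions: Int) {
    private var latestAttempt: Int = 0
    private val finished = Array.fill(numPartitions)(false)
    @volatile private var discardResultsForAttemptId: Int = -1

    def markAllPartitionsMissing(): Unit = {
      // Remember which attempt was rolled back so its straggler results are ignored.
      discardResultsForAttemptId = latestAttempt
      (0 until numPartitions).foreach(i => finished(i) = false)
    }

    def makeNewStageAttempt(): Unit = {
      latestAttempt += 1
      discardResultsForAttemptId = -1 // results of the new attempt count again
    }

    def shouldDiscardResult(attemptId: Int): Boolean =
      discardResultsForAttemptId >= attemptId
  }

  def main(args: Array[String]): Unit = {
    val stage = new StageSketch(numPartitions = 4)
    stage.markAllPartitionsMissing()
    assert(stage.shouldDiscardResult(0))  // late results from attempt 0 are dropped
    stage.makeNewStageAttempt()
    assert(!stage.shouldDiscardResult(1)) // attempt 1 results are accepted
  }
}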
2 changes: 2 additions & 0 deletions core/src/main/scala/org/apache/spark/scheduler/Stage.scala
@@ -131,4 +131,6 @@ private[scheduler] abstract class Stage(
def isIndeterminate: Boolean = {
rdd.outputDeterministicLevel == DeterministicLevel.INDETERMINATE
}

def shouldDiscardResult(attemptId: Int): Boolean = false
}
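Putting the pieces together: handleTaskCompletion in DAGScheduler.scala combines the stage attempt comparison, the stage's indeterminacy, and the new shouldDiscardResult hook into a single zombie check. Below is a standalone sketch of that predicate with the decision inputs passed in explicitly; it is an illustration, not the actual scheduler code.

// A successful task is treated as an "indeterministic zombie" -- and its result
// discarded -- if either of the two conditions below holds.
object ZombieCheckSketch {
  def isIndeterministicZombie(
      taskSucceeded: Boolean,
      taskStageAttemptId: Int,
      latestStageAttempt: Int,
      stageIsIndeterminate: Boolean,
      shouldDiscardResult: Int => Boolean): Boolean =
    taskSucceeded &&
      // The task belongs to an older attempt of an indeterminate stage ...
      ((taskStageAttemptId < latestStageAttempt && stageIsIndeterminate) ||
        // ... or the ResultStage has marked this attempt's results as discardable.
        shouldDiscardResult(taskStageAttemptId))

  def main(args: Array[String]): Unit = {
    // A task from attempt 0 finishes after an indeterminate stage was retried as attempt 1.
    assert(isIndeterministicZombie(
      taskSucceeded = true, taskStageAttemptId = 0, latestStageAttempt = 1,
      stageIsIndeterminate = true, shouldDiscardResult = _ => false))
    // A task from the current attempt of a deterministic stage is kept.
    assert(!isIndeterministicZombie(
      taskSucceeded = true, taskStageAttemptId = 1, latestStageAttempt = 1,
      stageIsIndeterminate = false, shouldDiscardResult = _ => false))
  }
}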