[SPARK-51272][CORE] Fix for the race condition in Scheduler causing failure in retrying all partitions in case of indeterminate shuffle keys #50033
Changes from all commits
core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

@@ -1873,15 +1873,28 @@ private[spark] class DAGScheduler(
   private[scheduler] def handleTaskCompletion(event: CompletionEvent): Unit = {
     val task = event.task
     val stageId = task.stageId
+    val stageOption = stageIdToStage.get(task.stageId)
+    val isIndeterministicZombie = event.reason match {
+      case Success if stageOption.isDefined =>
+        val stage = stageOption.get
+        (task.stageAttemptId < stage.latestInfo.attemptNumber()
+          && stage.isIndeterminate) || stage.shouldDiscardResult(task.stageAttemptId)
+
+      case _ => false
+    }

     outputCommitCoordinator.taskCompleted(
       stageId,
       task.stageAttemptId,
       task.partitionId,
       event.taskInfo.attemptNumber, // this is a task attempt number
-      event.reason)
+      if (isIndeterministicZombie) {
+        TaskKilled(reason = "Indeterminate stage needs all tasks to be retried")
+      } else {
+        event.reason
+      })

-    if (!stageIdToStage.contains(task.stageId)) {
+    if (stageOption.isEmpty) {
       // The stage may have already finished when we get this event -- e.g. maybe it was a
       // speculative task. It is important that we send the TaskEnd event in any case, so listeners
       // are properly notified and can chose to handle it. For instance, some listeners are

@@ -1893,34 +1906,37 @@ private[spark] class DAGScheduler(
       return
     }

-    val stage = stageIdToStage(task.stageId)
+    val stage = stageOption.get

     // Make sure the task's accumulators are updated before any other processing happens, so that
     // we can post a task end event before any jobs or stages are updated. The accumulators are
     // only updated in certain cases.
     event.reason match {
       case Success =>
-        task match {
-          case rt: ResultTask[_, _] =>
-            val resultStage = stage.asInstanceOf[ResultStage]
-            resultStage.activeJob match {
-              case Some(job) =>
-                // Only update the accumulator once for each result task.
-                if (!job.finished(rt.outputId)) {
-                  updateAccumulators(event)
-                }
-              case None => // Ignore update if task's job has finished.
-            }
-          case _ =>
-            updateAccumulators(event)
-        }
+        if (!isIndeterministicZombie) {
+          task match {
+            case rt: ResultTask[_, _] =>
+              val resultStage = stage.asInstanceOf[ResultStage]
+              resultStage.activeJob match {
+                case Some(job) =>
+                  // Only update the accumulator once for each result task.
+                  if (!job.finished(rt.outputId)) {
+                    updateAccumulators(event)
+                  }
+                case _ => // Ignore update if task's job has finished.
+              }
+            case _ => updateAccumulators(event)
+          }
+        }

       case _: ExceptionFailure | _: TaskKilled => updateAccumulators(event)

       case _ =>
     }
     if (trackingCacheVisibility) {
       // Update rdd blocks' visibility status.
       blockManagerMaster.updateRDDBlockVisibility(
-        event.taskInfo.taskId, visible = event.reason == Success)
+        event.taskInfo.taskId, visible = event.reason == Success && !isIndeterministicZombie)
     }

     postTaskEnd(event)

@@ -1936,7 +1952,7 @@ private[spark] class DAGScheduler(
        }

        task match {
-          case rt: ResultTask[_, _] =>
+          case rt: ResultTask[_, _] if !isIndeterministicZombie =>
            // Cast to ResultStage here because it's part of the ResultTask
            // TODO Refactor this out to a function that accepts a ResultStage
            val resultStage = stage.asInstanceOf[ResultStage]

@@ -1984,7 +2000,7 @@ private[spark] class DAGScheduler(
              logInfo(log"Ignoring result from ${MDC(RESULT, rt)} because its job has finished")
            }

-          case smt: ShuffleMapTask =>
+          case smt: ShuffleMapTask if !isIndeterministicZombie =>
            val shuffleStage = stage.asInstanceOf[ShuffleMapStage]
            // Ignore task completion for old attempt of indeterminate stage
            val ignoreIndeterminate = stage.isIndeterminate &&

@@ -2017,6 +2033,8 @@ private[spark] class DAGScheduler(
                processShuffleMapStageCompletion(shuffleStage)
              }
            }
+
+          case _ => // ignore
        }

      case FetchFailed(bmAddress, shuffleId, _, mapIndex, reduceId, failureMessage) =>

@@ -2121,6 +2139,12 @@ private[spark] class DAGScheduler(
            failedStages += failedStage
            failedStages += mapStage
            if (noResubmitEnqueued) {
+              def generateErrorMessage(stage: Stage): String = {
+                "A shuffle map stage with indeterminate output was failed and retried. " +
+                  s"However, Spark cannot rollback the $stage to re-process the input data, " +
+                  "and has to fail this job. Please eliminate the indeterminacy by " +
+                  "checkpointing the RDD before repartition and try again."
+              }
              // If the map stage is INDETERMINATE, which means the map tasks may return
              // different result when re-try, we need to re-try all the tasks of the failed
              // stage and its succeeding stages, because the input data will be changed after the

@@ -2147,13 +2171,6 @@ private[spark] class DAGScheduler(
                }
              }

-              def generateErrorMessage(stage: Stage): String = {
-                "A shuffle map stage with indeterminate output was failed and retried. " +
-                  s"However, Spark cannot rollback the $stage to re-process the input data, " +
-                  "and has to fail this job. Please eliminate the indeterminacy by " +
-                  "checkpointing the RDD before repartition and try again."
-              }
-
              activeJobs.foreach(job => collectStagesToRollback(job.finalStage :: Nil))

              // The stages will be rolled back after checking

@@ -2171,21 +2188,41 @@ private[spark] class DAGScheduler(
                      abortStage(mapStage, reason, None)
                    } else {
                      rollingBackStages += mapStage
+                      mapOutputTracker.unregisterAllMapAndMergeOutput(
+                        mapStage.shuffleDep.shuffleId)
                    }
-                  }
+                  } else {
+                    mapOutputTracker.unregisterAllMapAndMergeOutput(
+                      mapStage.shuffleDep.shuffleId)
+                  }

                case resultStage: ResultStage if resultStage.activeJob.isDefined =>
                  val numMissingPartitions = resultStage.findMissingPartitions().length
                  if (numMissingPartitions < resultStage.numTasks) {
                    // TODO: support to rollback result tasks.
                    abortStage(resultStage, generateErrorMessage(resultStage), None)
-                  }
+                  } else {
+                    resultStage.markAllPartitionsMissing()
Contributor
(Here and in other places) If the ResultStage does not have any missing tasks, why are we failing it?

Author
We reach this line, as per my understanding, because the first task of the current (result) stage has failed, and the ResultStage depends directly or indirectly on an indeterminate stage (which makes the ResultStage itself an indeterminate stage). The call resultStage.markAllPartitionsMissing() sets a flag on the stage so that, while the stage is waiting to be resubmitted for re-execution (a separate thread adds it back to the event queue), any successful task that arrives for some other partition in that window is rejected. If such a result were accepted, that would be the race: it would cause only some of the partitions to be refetched instead of all of them.
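To make that window concrete, here is a small self-contained sketch (simplified, invented names; not the actual Spark classes) of the bookkeeping described above: once all partitions are marked missing, a late Success from the discarded attempt is ignored instead of being recorded, and results are accepted again only after the new stage attempt starts.

```scala
// Standalone model of the race-window guard (illustrative only; the real logic
// lives in ResultStage and DAGScheduler.handleTaskCompletion).
object RaceWindowSketch {
  final class StageModel(numPartitions: Int) {
    var latestAttempt: Int = 0
    val finished: Array[Boolean] = Array.fill(numPartitions)(false)
    @volatile private var discardResultsForAttemptId: Int = -1

    // A fetch failure on an indeterminate parent forces a full re-run of this stage.
    def markAllPartitionsMissing(): Unit = {
      discardResultsForAttemptId = latestAttempt
      (0 until finished.length).foreach(i => finished(i) = false)
    }

    // Called when the scheduler loop actually resubmits the stage.
    def makeNewStageAttempt(): Unit = {
      latestAttempt += 1
      discardResultsForAttemptId = -1
    }

    def shouldDiscardResult(attemptId: Int): Boolean = discardResultsForAttemptId >= attemptId

    // A Success is recorded only if it is not a zombie from a discarded attempt.
    def onTaskSuccess(partition: Int, attemptId: Int): Unit = {
      if (!shouldDiscardResult(attemptId)) finished(partition) = true
    }
  }

  def main(args: Array[String]): Unit = {
    val stage = new StageModel(numPartitions = 2)
    stage.markAllPartitionsMissing()                  // fetch failure: everything must re-run
    stage.onTaskSuccess(partition = 1, attemptId = 0) // late success lands in the resubmit window
    assert(!stage.finished(1))                        // discarded: all partitions stay missing
    stage.makeNewStageAttempt()                       // resubmission clears the discard marker
    stage.onTaskSuccess(partition = 1, attemptId = 1)
    assert(stage.finished(1))                         // results of the new attempt are accepted
  }
}
```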
+                  }

                case _ =>
              }
              logInfo(log"The shuffle map stage ${MDC(SHUFFLE_ID, mapStage)} with indeterminate output was failed, " +
                log"we will roll back and rerun below stages which include itself and all its " +
                log"indeterminate child stages: ${MDC(STAGES, rollingBackStages)}")
+            } else if (failedStage.isIndeterminate) {
+              failedStage match {
+                case resultStage: ResultStage if resultStage.activeJob.isDefined =>
+                  val numMissingPartitions = resultStage.findMissingPartitions().length
+                  if (numMissingPartitions < resultStage.numTasks) {
+                    // TODO: support to rollback result tasks.
+                    abortStage(resultStage, generateErrorMessage(resultStage), None)
+                  } else {
+                    resultStage.markAllPartitionsMissing()
+                  }
+
+                case _ =>
Contributor
Given mapStage is not indeterminate, but only

Author
This is the case where the ResultStage is failing on its first task. The shuffle whose fetch failure caused the result stage to fail is determinate, but the ResultStage (in the case of a join) depends on two shuffle stages, and the other shuffle stage is indeterminate, which makes the ResultStage itself indeterminate.
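As an illustration of the lineage being described (a sketch only; it assumes an existing SparkSession named `spark`, and the variable names are made up), the result stage of the join below reads from two shuffles, and the repartition() on one side makes that side indeterminate across retries:

```scala
// Illustrative only: `spark` is assumed to be an existing SparkSession (e.g. in spark-shell).
val sc = spark.sparkContext

// Left side: a plain keyed RDD; its shuffle output is determinate.
val left = sc.parallelize(1 to 1000).map(i => (i % 100, i))

// Right side: repartition() distributes rows round-robin from a random start position,
// so a re-run can produce different partition contents, making this side indeterminate.
val right = sc.parallelize(1 to 1000).repartition(10).map(i => (i % 100, i))

// The join's result stage depends on both shuffles; the indeterminate side makes the
// result stage itself indeterminate, even though the other shuffle is determinate.
left.join(right).count()
```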
+              }
            }

            // We expect one executor failure to trigger many FetchFailures in rapid succession,
core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala

@@ -38,6 +38,9 @@ private[spark] class ResultStage(
     resourceProfileId: Int)
   extends Stage(id, rdd, partitions.length, parents, firstJobId, callSite, resourceProfileId) {

+  @volatile
+  private var discardResultsForAttemptId: Int = -1
+
   /**
    * The active job for this result stage. Will be empty if the job has already finished
    * (e.g., because the job was cancelled).

@@ -54,6 +57,14 @@ private[spark] class ResultStage(
     _activeJob = None
   }

+  override def makeNewStageAttempt(
+      numPartitionsToCompute: Int,
+      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty): Unit = {
+    super.makeNewStageAttempt(numPartitionsToCompute, taskLocalityPreferences)
+    // clear the attemptId set in the attemptIdAllPartitionsMissing
+    discardResultsForAttemptId = -1
+  }
+
   /**
    * Returns the sequence of partition ids that are missing (i.e. needs to be computed).
    *

@@ -64,5 +75,16 @@ private[spark] class ResultStage(
     (0 until job.numPartitions).filter(id => !job.finished(id))
   }

+  def markAllPartitionsMissing(): Unit = {
+    this.discardResultsForAttemptId = this.latestInfo.attemptNumber()
+    val job = activeJob.get
+    for (id <- 0 until job.numPartitions) {
+      job.finished(id) = false
+    }
+  }
+
+  override def shouldDiscardResult(attemptId: Int): Boolean =
+    this.discardResultsForAttemptId >= attemptId
Contributor
Assuming both usecases for

Author
Unless we decide to aggressively abort the query even when it is the first task that is failing (and the resultStage is indeterminate), this piece of code and the two above are needed, IMO, to avoid the race condition.
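For reference, a self-contained sketch (simplified model, not the actual Stage classes) of the two usecases the zombie check combines: a Success is discarded either because it comes from an older attempt of an indeterminate stage, or because markAllPartitionsMissing() has flagged the current attempt's results as discardable.

```scala
// Standalone model of the isIndeterministicZombie predicate (illustrative only).
object ZombieCheckSketch {
  final case class StageState(
      latestAttemptNumber: Int,
      isIndeterminate: Boolean,
      discardResultsForAttemptId: Int = -1) {
    def shouldDiscardResult(attemptId: Int): Boolean = discardResultsForAttemptId >= attemptId
  }

  def isIndeterministicZombie(stage: StageState, taskStageAttemptId: Int): Boolean =
    (taskStageAttemptId < stage.latestAttemptNumber && stage.isIndeterminate) ||
      stage.shouldDiscardResult(taskStageAttemptId)

  def main(args: Array[String]): Unit = {
    // Usecase 1: a success from an older attempt of an already-retried indeterminate stage.
    assert(isIndeterministicZombie(StageState(latestAttemptNumber = 1, isIndeterminate = true), 0))
    // Usecase 2: same attempt number, but the stage has marked all partitions missing.
    assert(isIndeterministicZombie(
      StageState(latestAttemptNumber = 0, isIndeterminate = true, discardResultsForAttemptId = 0), 0))
    // Neither applies: the result is kept.
    assert(!isIndeterministicZombie(StageState(latestAttemptNumber = 0, isIndeterminate = true), 0))
  }
}
```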
+
   override def toString: String = "ResultStage " + id
 }
Add if/else for SHUFFLE_USE_OLD_FETCH_PROTOCOL. Orthogonally, given we are in 4.0, perhaps it is time to drop SHUFFLE_USE_OLD_FETCH_PROTOCOL. Thoughts @attilapiros?
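One possible shape of the suggested guard (a sketch under stated assumptions, not the reviewer's actual proposal), mirroring how the ShuffleMapStage branch already aborts when the legacy fetch protocol is enabled. Only the SparkConf lookup is a real API here; the printed strings stand in for the DAGScheduler calls:

```scala
import org.apache.spark.SparkConf

object OldFetchProtocolGateSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // SHUFFLE_USE_OLD_FETCH_PROTOCOL maps to this key and defaults to false.
    val useOldProtocol = conf.getBoolean("spark.shuffle.useOldFetchProtocol", false)
    if (useOldProtocol) {
      // With the old fetch protocol the stage cannot safely be rolled back and rerun,
      // so the job would have to be aborted, as the ShuffleMapStage branch already does.
      println("abortStage(resultStage, generateErrorMessage(resultStage), None)")
    } else {
      println("resultStage.markAllPartitionsMissing()")
    }
  }
}
```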