Skip to content

Commit e89dadf

Browse files
wangyum authored and committed (GitHub Enterprise)
[HADP-55144] Fix Spark failed due to Attempt recovered after RM restart (apache#575)

1 parent a41e7e4 commit e89dadf

File tree

1 file changed: +5 additions, -10 deletions

resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -583,17 +583,12 @@ private[spark] class ApplicationMaster(
           allocator.allocateResources()
         }
       } else {
-        if (allocator.isAllNodeExcluded) {
-          finish(FinalApplicationStatus.FAILED,
-            ApplicationMaster.EXIT_MAX_EXECUTOR_FAILURES,
-            "Due to executor failures all available nodes are excluded")
-        } else {
-          if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) {
-            logError(s"Max number of executor failures ($maxNumExecutorFailures) reached")
-          }
-          logDebug("Sending progress")
-          allocator.allocateResources()
+        // allocator.isAllNodeExcluded may be true if failing over RM, so we should not exit here
+        if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) {
+          logError(s"Max number of executor failures ($maxNumExecutorFailures) reached")
         }
+        logDebug("Sending progress")
+        allocator.allocateResources()
       }
       failureCount = 0
     } catch {

0 commit comments

Comments (0)