Skip to content

Commit e89dadf

Browse files
wangyum authored and committed (GitHub Enterprise)
[HADP-55144] Fix Spark failed due to Attempt recovered after RM restart (apache#575)

1 parent a41e7e4 commit e89dadf

File tree

1 file changed: +5 additions, -10 deletions

resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -583,17 +583,12 @@ private[spark] class ApplicationMaster(
           allocator.allocateResources()
         }
       } else {
-        if (allocator.isAllNodeExcluded) {
-          finish(FinalApplicationStatus.FAILED,
-            ApplicationMaster.EXIT_MAX_EXECUTOR_FAILURES,
-            "Due to executor failures all available nodes are excluded")
-        } else {
-          if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) {
-            logError(s"Max number of executor failures ($maxNumExecutorFailures) reached")
-          }
-          logDebug("Sending progress")
-          allocator.allocateResources()
+        // allocator.isAllNodeExcluded may be true if failing over RM, so we should not exit here
+        if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) {
+          logError(s"Max number of executor failures ($maxNumExecutorFailures) reached")
         }
+        logDebug("Sending progress")
+        allocator.allocateResources()
       }
       failureCount = 0
     } catch {

0 commit comments

Comments (0)