diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala
index 37aaca7e8ceeb..cb4d8810e5c38 100644
--- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala
+++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala
@@ -41,7 +41,7 @@ private[spark] case class ExecutorPodsSnapshot(executorPods: Map[Long, ExecutorP
 
 object ExecutorPodsSnapshot extends Logging {
   private var shouldCheckAllContainers: Boolean = _
-  private var sparkContainerName: String = _
+  private var sparkContainerName: String = DEFAULT_EXECUTOR_CONTAINER_NAME
 
   def apply(executorPods: Seq[Pod]): ExecutorPodsSnapshot = {
     ExecutorPodsSnapshot(toStatesByExecutorId(executorPods))
@@ -80,24 +80,21 @@ object ExecutorPodsSnapshot extends Logging {
               .anyMatch(t => t != null && t.getExitCode != 0)) {
             PodFailed(pod)
           } else {
-            // Otherwise look for the Spark container
-            val sparkContainerStatusOpt = pod.getStatus.getContainerStatuses.asScala
-              .find(_.getName() == sparkContainerName)
-            sparkContainerStatusOpt match {
-              case Some(sparkContainerStatus) =>
-                sparkContainerStatus.getState.getTerminated match {
-                  case t if t.getExitCode != 0 =>
-                    PodFailed(pod)
-                  case t if t.getExitCode == 0 =>
+            // Otherwise look for the Spark container and get the exit code if present.
+            val sparkContainerExitCode = pod.getStatus.getContainerStatuses.asScala
+              .find(_.getName() == sparkContainerName).flatMap(x => Option(x.getState))
+              .flatMap(x => Option(x.getTerminated)).flatMap(x => Option(x.getExitCode))
+              .map(_.toInt)
+            sparkContainerExitCode match {
+              case Some(t) =>
+                t match {
+                  case 0 =>
                     PodSucceeded(pod)
                   case _ =>
-                    PodRunning(pod)
+                    PodFailed(pod)
                 }
-              // If we can't find the Spark container status, fall back to the pod status. This is
-              // expected to occur during pod startup and other situations.
+              // No exit code means we are running.
               case _ =>
-                logDebug(s"Unable to find container ${sparkContainerName} in pod ${pod} " +
-                  "defaulting to entire pod status (running).")
                 PodRunning(pod)
             }
           }
diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorLifecycleTestUtils.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorLifecycleTestUtils.scala
index 225278c2aad71..41cba573d89c2 100644
--- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorLifecycleTestUtils.scala
+++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorLifecycleTestUtils.scala
@@ -115,13 +115,16 @@ object ExecutorLifecycleTestUtils {
       .editOrNewStatus()
         .withPhase("running")
        .addNewContainerStatus()
+          .withName(DEFAULT_EXECUTOR_CONTAINER_NAME)
           .withNewState()
             .withNewTerminated()
+              .withMessage("message")
               .withExitCode(exitCode)
             .endTerminated()
           .endState()
         .endContainerStatus()
         .addNewContainerStatus()
+          .withName("SIDECARFRIEND")
           .withNewState()
             .withNewRunning()
             .endRunning()
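
The key change above replaces a nested pattern match, which could throw a NullPointerException whenever the Spark container existed but had not yet terminated (getState.getTerminated returns null, so t.getExitCode would NPE), with a null-safe Option/flatMap chain. Below is a minimal standalone sketch of that pattern, exercised against fabric8 ContainerStatus objects built the same way as in the test-utils hunk; the ExitCodeSketch object and its helper are hypothetical names introduced here, and the container names are illustrative, not taken from the patch:

// Sketch (not part of the patch): the Option/flatMap extraction pattern adopted
// by ExecutorPodsSnapshot, shown in isolation with hypothetical names.
import io.fabric8.kubernetes.api.model.{ContainerStatus, ContainerStatusBuilder}

object ExitCodeSketch {
  // Hypothetical helper mirroring the patched logic: every fabric8 getter on the
  // path may return null, so each step is wrapped in Option(...) and flatMapped.
  def sparkContainerExitCode(
      statuses: Seq[ContainerStatus],
      sparkContainerName: String): Option[Int] =
    statuses
      .find(_.getName == sparkContainerName) // the Spark container may be absent
      .flatMap(s => Option(s.getState))      // status may carry no state yet
      .flatMap(s => Option(s.getTerminated)) // null while the container runs
      .flatMap(t => Option(t.getExitCode))   // exit code may be unset
      .map(_.toInt)                          // java.lang.Integer -> Int

  def main(args: Array[String]): Unit = {
    val spark = new ContainerStatusBuilder()
      .withName("spark-executor") // illustrative container name
      .withNewState().withNewTerminated().withExitCode(0).endTerminated().endState()
      .build()
    val sidecar = new ContainerStatusBuilder()
      .withName("SIDECARFRIEND")
      .withNewState().withNewRunning().endRunning().endState()
      .build()

    // Some(0): the Spark container exited cleanly, so the pod counts as succeeded.
    println(sparkContainerExitCode(Seq(spark, sidecar), "spark-executor"))
    // None: no terminated state yet, so the pod is still treated as running.
    println(sparkContainerExitCode(Seq(sidecar), "spark-executor"))
  }
}

Because any null along the path collapses the chain to None, the caller can read None as "still running", which is exactly the fallback the patch encodes in its final case _ => PodRunning(pod).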