Skip to content

Commit cb0cc86

Browse files
committed
[SPARK-937] adding EXITED executor state and not relaunching cleanly exited executors
Updated code comments; minor comment wording change.
1 parent bd67551 commit cb0cc86

File tree

3 files changed

+8
-8
lines changed

3 files changed

+8
-8
lines changed

core/src/main/scala/org/apache/spark/deploy/ExecutorState.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ package org.apache.spark.deploy
1919

2020
private[spark] object ExecutorState extends Enumeration {
2121

22-
val LAUNCHING, LOADING, RUNNING, KILLED, FAILED, LOST = Value
22+
val LAUNCHING, LOADING, RUNNING, KILLED, FAILED, LOST, EXITED = Value
2323

2424
type ExecutorState = Value
2525

26-
def isFinished(state: ExecutorState): Boolean = Seq(KILLED, FAILED, LOST).contains(state)
26+
def isFinished(state: ExecutorState): Boolean = Seq(KILLED, FAILED, LOST, EXITED).contains(state)
2727
}

core/src/main/scala/org/apache/spark/deploy/master/Master.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,10 +294,11 @@ private[spark] class Master(
294294
appInfo.removeExecutor(exec)
295295
exec.worker.removeExecutor(exec)
296296

297+
val normalExit = exitStatus.exists(_ == 0)
297298
// Only retry a certain number of times so we don't go into an infinite loop.
298-
if (appInfo.incrementRetryCount < ApplicationState.MAX_NUM_RETRY) {
299+
if (!normalExit && appInfo.incrementRetryCount < ApplicationState.MAX_NUM_RETRY) {
299300
schedule()
300-
} else {
301+
} else if (!normalExit) {
301302
logError("Application %s with ID %s failed %d times, removing it".format(
302303
appInfo.desc.name, appInfo.id, appInfo.retryCount))
303304
removeApplication(appInfo, ApplicationState.FAILED)

core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,11 +138,10 @@ private[spark] class ExecutorRunner(
138138
Files.write(header, stderr, Charsets.UTF_8)
139139
CommandUtils.redirectStream(process.getErrorStream, stderr)
140140

141-
// Wait for it to exit; this is actually a bad thing if it happens, because we expect to run
142-
// long-lived processes only. However, in the future, we might restart the executor a few
143-
// times on the same machine.
141+
// Wait for it to exit; the executor may exit with code 0 (when the driver instructs it to shut down)
142+
// or with a nonzero exit code.
144143
val exitCode = process.waitFor()
145-
state = ExecutorState.FAILED
144+
state = ExecutorState.EXITED
146145
val message = "Command exited with code " + exitCode
147146
worker ! ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode))
148147
} catch {

0 commit comments

Comments
 (0)