[SPARK-17929] [CORE] Fix deadlock when CoarseGrainedSchedulerBackend reset #15481
Changes from 4 commits
@@ -145,6 +145,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: RpcEnv)
         // Ignoring the task kill since the executor is not registered.
         logWarning(s"Attempted to kill task $taskId for unknown executor $executorId.")
       }
+
+    case RemoveExecutor(executorId, reason) =>
+      removeExecutor(executorId, reason)
   }

   override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {

@@ -386,14 +389,17 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: RpcEnv)
    * Reset the state of CoarseGrainedSchedulerBackend to the initial state. Currently it will only
    * be called in the yarn-client mode when AM re-registers after a failure.
    * */
-  protected def reset(): Unit = synchronized {
-    numPendingExecutors = 0
-    executorsPendingToRemove.clear()
+  protected def reset(): Unit = {
+    val executors = synchronized {
+      numPendingExecutors = 0
+      executorsPendingToRemove.clear()
+      Set() ++ executorDataMap.keys
+    }

     // Remove all the lingering executors that should be removed but not yet. The reason might be
     // because (1) disconnected event is not yet received; (2) executors die silently.
-    executorDataMap.toMap.foreach { case (eid, _) =>
-      driverEndpoint.askWithRetry[Boolean](
+    executors.foreach { eid =>
+      driverEndpoint.send(
         RemoveExecutor(eid, SlaveLost("Stale executor after cluster manager re-registered.")))
     }
   }
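The heart of the fix is moving the blocking RPC out of the synchronized block: the executor ids are snapshotted while holding the lock, and the RemoveExecutor messages are then sent fire-and-forget after the lock is released. As a rough, standalone illustration (not Spark code; every name below, such as ResetDeadlockSketch, endpointPool, and the local removeExecutor, is invented for the sketch), the following Scala program models the old reset() deadlock with a plain object lock and a single-threaded "endpoint", and the snapshot-then-send shape that avoids it:

```scala
import java.util.concurrent.{Executors, TimeUnit}
import scala.collection.mutable

object ResetDeadlockSketch {
  // Stand-in for the backend's executor bookkeeping (hypothetical data).
  private val executorDataMap = mutable.HashMap("exec-1" -> "host-a", "exec-2" -> "host-b")

  // Stand-in for the driver endpoint's single message-handling thread.
  private val endpointPool = Executors.newSingleThreadExecutor()

  // Models the endpoint handling RemoveExecutor: it needs the same lock as reset().
  private def removeExecutor(id: String): Unit = this.synchronized {
    executorDataMap.remove(id)
  }

  // Old shape of reset(): holds the lock while blocking on the endpoint's reply.
  // removeExecutor() can never take the lock we are holding, so reply.get() never returns.
  def resetDeadlocking(): Unit = this.synchronized {
    executorDataMap.keys.toSeq.foreach { id =>
      val reply = endpointPool.submit(new Runnable { def run(): Unit = removeExecutor(id) })
      reply.get() // analogous to askWithRetry: waits for an answer -> deadlock
    }
  }

  // New shape of reset(): snapshot the executor ids under the lock, release it,
  // then fire-and-forget the removals (analogous to driverEndpoint.send).
  def resetFixed(): Unit = {
    val executors = this.synchronized { Set() ++ executorDataMap.keys }
    executors.foreach { id =>
      endpointPool.submit(new Runnable { def run(): Unit = removeExecutor(id) })
    }
  }

  def main(args: Array[String]): Unit = {
    resetFixed() // calling resetDeadlocking() here would hang forever
    endpointPool.shutdown()
    endpointPool.awaitTermination(5, TimeUnit.SECONDS)
    println(s"executors left after reset: ${executorDataMap.keySet}")
  }
}
```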
Review comment: Why remove `executorDataMap.get(executorId).foreach(_.executorEndpoint.send(StopExecutor))`?