-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-54088][CORE] When TaskSchedulerImpl searches for an executor to schedule an unschedulable task, it should exclude executors that are pending to remove #52793
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
7dc12da
2573651
2e51fa8
b6f9e83
b3f541f
be886ad
fd79d92
7ddfca4
c887fbc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -164,6 +164,8 @@ private[spark] class TaskSchedulerImpl( | |
| // in turn is used to decide when we can attain data locality on a given host | ||
| protected val hostToExecutors = new HashMap[String, HashSet[String]] | ||
|
|
||
| protected val availableHostToExecutors = new HashMap[String, HashSet[String]] | ||
|
|
||
| protected val hostsByRack = new HashMap[String, HashSet[String]] | ||
|
|
||
| protected val executorIdToHost = new HashMap[String, String] | ||
|
|
@@ -495,8 +497,12 @@ private[spark] class TaskSchedulerImpl( | |
| if (!hostToExecutors.contains(o.host)) { | ||
| hostToExecutors(o.host) = new HashSet[String]() | ||
| } | ||
| if (!availableHostToExecutors.contains(o.host)) { | ||
| availableHostToExecutors(o.host) = new HashSet[String]() | ||
| } | ||
| if (!executorIdToRunningTaskIds.contains(o.executorId)) { | ||
| hostToExecutors(o.host) += o.executorId | ||
| availableHostToExecutors(o.host) += o.executorId | ||
| executorAdded(o.executorId, o.host) | ||
| executorIdToHost(o.executorId) = o.host | ||
| executorIdToRunningTaskIds(o.executorId) = HashSet[Long]() | ||
|
|
@@ -589,7 +595,7 @@ private[spark] class TaskSchedulerImpl( | |
| } | ||
|
|
||
| if (!launchedAnyTask) { | ||
| taskSet.getCompletelyExcludedTaskIfAny(hostToExecutors).foreach { taskIndex => | ||
| taskSet.getCompletelyExcludedTaskIfAny(availableHostToExecutors).foreach { taskIndex => | ||
|
||
| // If the taskSet is unschedulable we try to find an existing idle excluded | ||
| // executor and kill the idle executor and kick off an abortTimer which if it doesn't | ||
| // schedule a task within the timeout will abort the taskSet if we were unable to | ||
|
|
@@ -605,7 +611,8 @@ private[spark] class TaskSchedulerImpl( | |
| // If there are no idle executors and dynamic allocation is enabled, then we would | ||
| // notify ExecutorAllocationManager to allocate more executors to schedule the | ||
| // unschedulable tasks else we will abort immediately. | ||
| executorIdToRunningTaskIds.find(x => !isExecutorBusy(x._1)) match { | ||
| executorIdToRunningTaskIds.find( | ||
| x => isExecutorAvailable(x._1) && !isExecutorBusy(x._1)) match { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we just extend "busy" to include the executors we are planning on removing?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If we could directly use CoarseGrainedSchedulerBackend's executorsPendingToRemove, that would be better, but it seems we can't? |
||
| case Some ((executorId, _)) => | ||
| if (!unschedulableTaskSetToExpiryTime.contains(taskSet)) { | ||
| healthTrackerOpt.foreach(blt => blt.killExcludedIdleExecutor(executorId)) | ||
|
|
@@ -967,6 +974,7 @@ private[spark] class TaskSchedulerImpl( | |
| if (executorIdToHost.contains(executorId)) { | ||
| executorsPendingDecommission(executorId) = | ||
| ExecutorDecommissionState(clock.getTimeMillis(), decommissionInfo.workerHost) | ||
| removeAvailableExecutor(executorId) | ||
| } | ||
| } | ||
| rootPool.executorDecommission(executorId) | ||
|
|
@@ -1084,6 +1092,7 @@ private[spark] class TaskSchedulerImpl( | |
| } | ||
| } | ||
|
|
||
| removeAvailableExecutor(executorId) | ||
| executorsPendingDecommission.remove(executorId) | ||
| .foreach(executorsRemovedByDecom.put(executorId, _)) | ||
|
|
||
|
|
@@ -1120,6 +1129,11 @@ private[spark] class TaskSchedulerImpl( | |
| executorIdToRunningTaskIds.get(execId).exists(_.nonEmpty) | ||
| } | ||
|
|
||
| def isExecutorAvailable(execId: String): Boolean = synchronized { | ||
| executorIdToHost.get(execId) | ||
| .exists(availableHostToExecutors.get(_).exists(_.contains(execId))) | ||
| } | ||
|
|
||
| // exposed for test | ||
| protected final def isExecutorDecommissioned(execId: String): Boolean = | ||
| getExecutorDecommissionState(execId).isDefined | ||
|
|
@@ -1190,6 +1204,20 @@ private[spark] class TaskSchedulerImpl( | |
| manager | ||
| } | ||
| } | ||
|
|
||
| override def markExecutorPendingToRemove(executorId: String): Unit = synchronized { | ||
| removeAvailableExecutor(executorId) | ||
| } | ||
|
|
||
| private def removeAvailableExecutor(executorId: String): Unit = { | ||
| executorIdToHost.get(executorId).foreach { host => | ||
| val execs = availableHostToExecutors.getOrElse(host, new HashSet) | ||
| execs -= executorId | ||
| if (execs.isEmpty) { | ||
| availableHostToExecutors -= host | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
|
|
@@ -1311,4 +1339,5 @@ private[spark] object TaskSchedulerImpl { | |
| } | ||
| } | ||
|
|
||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -975,7 +975,10 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp | |
| val executorsToKill = knownExecutors | ||
| .filter { id => !executorsPendingToRemove.contains(id) } | ||
| .filter { id => force || !scheduler.isExecutorBusy(id) } | ||
| executorsToKill.foreach { id => executorsPendingToRemove(id) = !countFailures } | ||
| executorsToKill.foreach { id => { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What about decommissioned executors? |
||
| scheduler.markExecutorPendingToRemove(id) | ||
| executorsPendingToRemove(id) = !countFailures | ||
| } } | ||
|
|
||
| logInfo(log"Actual list of executor(s) to be killed is " + | ||
| log"${MDC(LogKeys.EXECUTOR_IDS, executorsToKill.mkString(", "))}") | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.