[SPARK-33799][CORE] Handle excluded executors/nodes in ExecutorMonitor #30795
Closed
Changes from all commits
Commits (6):
- fa5f141 (Ngone51): .
- c6301f8 (Ngone51): add JIRA ID
- f46a9a2 (Ngone51): rename isExcluded to excluded
- f8d8495 (Ngone51): rename increase.. to increment..
- 6239a99 (Ngone51): extract the variable & handle exec being null
- 75cdd1e (Ngone51): fix tests
@@ -52,6 +52,11 @@ private[spark] class ExecutorMonitor(
   private val shuffleTrackingEnabled = !conf.get(SHUFFLE_SERVICE_ENABLED) &&
     conf.get(DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED)

+  // We only track the `excluded` status of an executor's tracker when the exclusion feature
+  // is enabled but killing excluded executors is disabled.
+  private val exclusionButNotKillEnabled = conf.get(EXCLUDE_ON_FAILURE_ENABLED).getOrElse(false) &&
+    !conf.get(EXCLUDE_ON_FAILURE_KILL_ENABLED)
+
   private val executors = new ConcurrentHashMap[String, Tracker]()
   private val execResourceProfileCount = new ConcurrentHashMap[Int, Int]()
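For context, the state captured by this new flag is Spark running with executor exclusion enabled while the kill behavior stays off. Assuming the Spark 3.1 configuration keys that back EXCLUDE_ON_FAILURE_ENABLED and EXCLUDE_ON_FAILURE_KILL_ENABLED, a minimal sketch of that combination (not part of the diff):

    import org.apache.spark.SparkConf

    // Sketch: configuration under which exclusionButNotKillEnabled evaluates to true.
    val conf = new SparkConf()
      .set("spark.dynamicAllocation.enabled", "true")
      .set("spark.excludeOnFailure.enabled", "true")                 // exclusion feature on
      .set("spark.excludeOnFailure.killExcludedExecutors", "false")  // excluded executors stay alive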
@@ -166,6 +171,7 @@ private[spark] class ExecutorMonitor(

   def executorCount: Int = executors.size()

+  // Executors that are available for running tasks; excluded executors are not counted here.
   def executorCountWithResourceProfile(id: Int): Int = {
     execResourceProfileCount.getOrDefault(id, 0)
   }
@@ -180,6 +186,14 @@ private[spark] class ExecutorMonitor(
     }
   }

+  def excludedExecutorCount: Int = {
+    if (exclusionButNotKillEnabled) {
+      executors.values().asScala.count(_.excluded)
+    } else {
+      0
+    }
+  }
+
   def pendingRemovalCount: Int = executors.asScala.count { case (_, exec) => exec.pendingRemoval }

   def pendingRemovalCountPerResourceProfileId(id: Int): Int = {
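To show how the new counter composes with the existing ones, here is a hypothetical helper (not part of the change): the executors usable for scheduling are the tracked ones minus those currently excluded.

    // Hypothetical helper for illustration only; `monitor` is the ExecutorMonitor above.
    def usableExecutorCount(monitor: ExecutorMonitor): Int =
      monitor.executorCount - monitor.excludedExecutorCount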
@@ -307,7 +321,8 @@ private[spark] class ExecutorMonitor(
     val executorId = event.taskInfo.executorId
     // Guard against a late arriving task start event (SPARK-26927).
     if (client.isExecutorActive(executorId)) {
-      val exec = ensureExecutorIsTracked(executorId, UNKNOWN_RESOURCE_PROFILE_ID)
+      val exec = ensureExecutorIsTracked(
+        executorId, event.taskInfo.host, UNKNOWN_RESOURCE_PROFILE_ID)
       exec.updateRunningTasks(1)
     }
   }
@@ -337,7 +352,8 @@ private[spark] class ExecutorMonitor(
   }

   override def onExecutorAdded(event: SparkListenerExecutorAdded): Unit = {
-    val exec = ensureExecutorIsTracked(event.executorId, event.executorInfo.resourceProfileId)
+    val exec = ensureExecutorIsTracked(
+      event.executorId, event.executorInfo.executorHost, event.executorInfo.resourceProfileId)
     exec.updateRunningTasks(0)
     logInfo(s"New executor ${event.executorId} has registered (new total is ${executors.size()})")
   }
@@ -348,6 +364,11 @@ private[spark] class ExecutorMonitor(
     execResourceProfileCount.remove(rpId, 0)
   }

+  private def incrementExecResourceProfileCount(rpId: Int): Unit = {
+    val count = execResourceProfileCount.computeIfAbsent(rpId, _ => 0)
+    execResourceProfileCount.replace(rpId, count, count + 1)
+  }
+
   override def onExecutorRemoved(event: SparkListenerExecutorRemoved): Unit = {
     val removed = executors.remove(event.executorId)
     if (removed != null) {
@@ -359,7 +380,9 @@ private[spark] class ExecutorMonitor(
   }

   override def onBlockUpdated(event: SparkListenerBlockUpdated): Unit = {
-    val exec = ensureExecutorIsTracked(event.blockUpdatedInfo.blockManagerId.executorId,
+    val exec = ensureExecutorIsTracked(
+      event.blockUpdatedInfo.blockManagerId.executorId,
+      event.blockUpdatedInfo.blockManagerId.host,
       UNKNOWN_RESOURCE_PROFILE_ID)

     // Check if it is a shuffle file, or RDD to pick the correct codepath for update
@@ -418,6 +441,42 @@ private[spark] class ExecutorMonitor(
     }
   }

+  override def onExecutorExcluded(executorExcluded: SparkListenerExecutorExcluded): Unit = {
+    if (exclusionButNotKillEnabled) {
+      Option(executors.get(executorExcluded.executorId)).foreach { exec =>
+        exec.excluded = true
+        decrementExecResourceProfileCount(exec.resourceProfileId)
+      }
+    }
+  }
+
+  override def onExecutorUnexcluded(executorUnexcluded: SparkListenerExecutorUnexcluded): Unit = {
+    if (exclusionButNotKillEnabled) {
+      Option(executors.get(executorUnexcluded.executorId)).foreach { exec =>
+        exec.excluded = false
+        incrementExecResourceProfileCount(exec.resourceProfileId)
+      }
+    }
+  }
+
+  override def onNodeExcluded(nodeExcluded: SparkListenerNodeExcluded): Unit = {
+    if (exclusionButNotKillEnabled) {
+      executors.values().asScala.filter(_.host == nodeExcluded.hostId).foreach { exec =>
+        exec.excluded = true
+        decrementExecResourceProfileCount(exec.resourceProfileId)
+      }
+    }
+  }
+
+  override def onNodeUnexcluded(nodeUnexcluded: SparkListenerNodeUnexcluded): Unit = {
+    if (exclusionButNotKillEnabled) {
+      executors.values().asScala.filter(_.host == nodeUnexcluded.hostId).foreach { exec =>
+        exec.excluded = false
+        incrementExecResourceProfileCount(exec.resourceProfileId)
+      }
+    }
+  }
+
   override def onOtherEvent(event: SparkListenerEvent): Unit = event match {
     case ShuffleCleanedEvent(id) => cleanupShuffle(id)
     case _ =>
@@ -467,23 +526,21 @@ private[spark] class ExecutorMonitor(
    * which the `SparkListenerTaskStart` event is posted before the `SparkListenerBlockManagerAdded`
    * event, which is possible because these events are posted in different threads. (see SPARK-4951)
    */
-  private def ensureExecutorIsTracked(id: String, resourceProfileId: Int): Tracker = {
-    val numExecsWithRpId = execResourceProfileCount.computeIfAbsent(resourceProfileId, _ => 0)
+  private def ensureExecutorIsTracked(id: String, host: String, resourceProfileId: Int): Tracker = {
     val execTracker = executors.computeIfAbsent(id, _ => {
-      val newcount = numExecsWithRpId + 1
-      execResourceProfileCount.put(resourceProfileId, newcount)
-      logDebug(s"Executor added with ResourceProfile id: $resourceProfileId " +
-        s"count is now $newcount")
-      new Tracker(resourceProfileId)
-    })
+      incrementExecResourceProfileCount(resourceProfileId)
+      logDebug(s"Executor added with ResourceProfile id: $resourceProfileId " +
+        s"count is now ${execResourceProfileCount.get(resourceProfileId)}")
+      new Tracker(resourceProfileId, host)
+    })
     // if we had added executor before without knowing the resource profile id, fix it up
     if (execTracker.resourceProfileId == UNKNOWN_RESOURCE_PROFILE_ID &&
         resourceProfileId != UNKNOWN_RESOURCE_PROFILE_ID) {
       logDebug(s"Executor: $id, resource profile id was unknown, setting " +
         s"it to $resourceProfileId")
       execTracker.resourceProfileId = resourceProfileId
       // fix up the counts for each resource profile id
-      execResourceProfileCount.put(resourceProfileId, numExecsWithRpId + 1)
+      incrementExecResourceProfileCount(resourceProfileId)
       decrementExecResourceProfileCount(UNKNOWN_RESOURCE_PROFILE_ID)
     }
     execTracker
@@ -506,7 +563,7 @@ private[spark] class ExecutorMonitor(
     }
   }

-  private class Tracker(var resourceProfileId: Int) {
+  private class Tracker(var resourceProfileId: Int, val host: String) {
     @volatile var timeoutAt: Long = Long.MaxValue

     // Tracks whether this executor is thought to be timed out. It's used to detect when the list
@@ -516,6 +573,8 @@ private[spark] class ExecutorMonitor(
     var pendingRemoval: Boolean = false
     var decommissioning: Boolean = false
     var hasActiveShuffle: Boolean = false
+    // whether the executor is temporarily excluded by the `HealthTracker`
+    var excluded: Boolean = false

     private var idleStart: Long = -1
     private var runningTasks: Int = 0

Contributor, on the new `excluded` comment: We should expand this comment to state that the executor is excluded for the entire application (I realize HealthTracker implies this, but I would like it to be more explicit); it does not include executors excluded at the stage level.
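If the comment were expanded as suggested, one possible wording (illustrative, not the committed text) could be:

    // Whether this executor has been excluded for the entire application by the `HealthTracker`.
    // Executors excluded only within a stage are not tracked here.
    var excluded: Boolean = false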
Review comment: So I don't agree with this, at least not as it's defined. The user defined the maximum number of executors to use, and this gets more than that. I realize that some are excluded, but this also comes down to a resource-utilization question: in a multi-tenant environment, I want to make sure one job doesn't take over the entire cluster, and max is one way to do that. I think we would either need to redefine max, which isn't great for backwards compatibility and could result in unexpected behavior, or add another config around the excluded nodes: either simply allow going over the max, or allow going over by X. The downside is that the default would be 0 or false, so you would have to configure it if you set max and want to use this feature. But I don't see a lot of jobs setting max unless they are trying to be nice in a multi-tenant setup, so it seems OK as long as it's in the release notes, etc.
You will notice the other logic for unschedulableTaskSets does not increase this; it just increases the number we ask for.
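A rough sketch of the "allow to go over by X" idea; the overage key below is hypothetical and is not an existing Spark configuration:

    import org.apache.spark.SparkConf

    // Hypothetical config key, shown only to illustrate the proposal.
    val EXCLUDED_EXECUTOR_OVERAGE_KEY = "spark.dynamicAllocation.excludedExecutorOverage"

    def effectiveMaxExecutors(conf: SparkConf, excludedCount: Int): Int = {
      // spark.dynamicAllocation.maxExecutors defaults to Int.MaxValue.
      val max = conf.getInt("spark.dynamicAllocation.maxExecutors", Int.MaxValue)
      val overage = conf.getInt(EXCLUDED_EXECUTOR_OVERAGE_KEY, 0)  // default 0 keeps today's behavior
      val headroom = math.min(excludedCount, overage)
      // Guard against overflow when maxExecutors is left at its Int.MaxValue default.
      if (max > Int.MaxValue - headroom) Int.MaxValue else max + headroom
    }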
Reply: Makes sense to me; adding an extra conf would be a good choice.
Although, I'm rethinking this change. It only takes effect when users set the max explicitly and the cluster reaches the max (by default, max is Int.MaxValue, so we won't normally reach it). However, we still want to replace those excluded executors even if the cluster doesn't reach the max. For example, max/2 may be enough for task scheduling, and the TaskScheduler also thinks there are max/2 executors without realizing that X of them are actually excluded.
So I think what we actually need here is to forcibly replace excluded executors when dynamic allocation and exclusion (but not kill) are both enabled, and it should not be tied to the max value.
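A minimal sketch of that "forcibly replace" idea: count excluded-but-alive executors on top of what the tasks need, so the allocation manager asks for replacements regardless of whether the maximum has been reached. Names are illustrative, not the eventual implementation:

    // Illustration only; excludedCount would come from ExecutorMonitor.excludedExecutorCount.
    // Whether this should still be capped at maxExecutors is the open question discussed above.
    def adjustedExecutorTarget(neededForTasks: Int, excludedCount: Int): Int =
      neededForTasks + excludedCount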