[SPARK-11334][CORE] Fix bug in Executor allocation manager in running tasks calculation #19580
Changes from all commits
@@ -267,6 +267,10 @@ private[spark] class ExecutorAllocationManager(
     (numRunningOrPendingTasks + tasksPerExecutor - 1) / tasksPerExecutor
   }

+  private def totalRunningTasks(): Int = synchronized {
+    listener.totalRunningTasks
+  }
+
   /**
    * This is called at a fixed interval to regulate the number of pending executor requests
    * and number of executors running.
|
|
@@ -602,12 +606,11 @@ private[spark] class ExecutorAllocationManager(
   private class ExecutorAllocationListener extends SparkListener {

     private val stageIdToNumTasks = new mutable.HashMap[Int, Int]
+    // Number of running tasks per stage including speculative tasks.
+    // Should be 0 when no stages are active.
+    private val stageIdToNumRunningTask = new mutable.HashMap[Int, Int]
     private val stageIdToTaskIndices = new mutable.HashMap[Int, mutable.HashSet[Int]]
     private val executorIdToTaskIds = new mutable.HashMap[String, mutable.HashSet[Long]]
-    // Number of tasks currently running on the cluster including speculative tasks.
-    // Should be 0 when no stages are active.
-    private var numRunningTasks: Int = _

     // Number of speculative tasks to be scheduled in each stage
     private val stageIdToNumSpeculativeTasks = new mutable.HashMap[Int, Int]
     // The speculative tasks started in each stage
|
|
@@ -625,6 +628,7 @@ private[spark] class ExecutorAllocationManager(
       val numTasks = stageSubmitted.stageInfo.numTasks
       allocationManager.synchronized {
         stageIdToNumTasks(stageId) = numTasks
+        stageIdToNumRunningTask(stageId) = 0
         allocationManager.onSchedulerBacklogged()

         // Compute the number of tasks requested by the stage on each host
|
|
@@ -651,6 +655,7 @@ private[spark] class ExecutorAllocationManager(
       val stageId = stageCompleted.stageInfo.stageId
       allocationManager.synchronized {
         stageIdToNumTasks -= stageId
+        stageIdToNumRunningTask -= stageId
         stageIdToNumSpeculativeTasks -= stageId
         stageIdToTaskIndices -= stageId
         stageIdToSpeculativeTaskIndices -= stageId
|
|
@@ -663,10 +668,6 @@ private[spark] class ExecutorAllocationManager(
         // This is needed in case the stage is aborted for any reason
         if (stageIdToNumTasks.isEmpty && stageIdToNumSpeculativeTasks.isEmpty) {
           allocationManager.onSchedulerQueueEmpty()
-          if (numRunningTasks != 0) {
-            logWarning("No stages are running, but numRunningTasks != 0")
-            numRunningTasks = 0
-          }
         }
       }
     }
|
|
@@ -678,7 +679,9 @@ private[spark] class ExecutorAllocationManager(
       val executorId = taskStart.taskInfo.executorId

       allocationManager.synchronized {
-        numRunningTasks += 1
+        if (stageIdToNumRunningTask.contains(stageId)) {
+          stageIdToNumRunningTask(stageId) += 1
+        }
         // This guards against the race condition in which the `SparkListenerTaskStart`
         // event is posted before the `SparkListenerBlockManagerAdded` event, which is
         // possible because these events are posted in different threads. (see SPARK-4951)
|
|
@@ -709,7 +712,9 @@ private[spark] class ExecutorAllocationManager(
       val taskIndex = taskEnd.taskInfo.index
       val stageId = taskEnd.stageId
       allocationManager.synchronized {
-        numRunningTasks -= 1
+        if (stageIdToNumRunningTask.contains(stageId)) {
+          stageIdToNumRunningTask(stageId) -= 1
+        }
         // If the executor is no longer running any scheduled tasks, mark it as idle
         if (executorIdToTaskIds.contains(executorId)) {
           executorIdToTaskIds(executorId) -= taskId
|
|
@@ -787,7 +792,9 @@ private[spark] class ExecutorAllocationManager(
     /**
      * The number of tasks currently running across all stages.
      */
-    def totalRunningTasks(): Int = numRunningTasks
+    def totalRunningTasks(): Int = {
+      stageIdToNumRunningTask.values.sum
+    }

     /**
      * Return true if an executor is not currently running a task, and false otherwise.

Contributor
This needs to be inside a synchronized block.

Contributor
Nevermind, this is called from a synchronized context. Except in your unit tests, that is (which call the private method directly).

Contributor
It'd be nice to make the other method calling this synchronized, just to be paranoid.
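As a minimal sketch of the locking pattern discussed in this thread (RunningTaskTracker and its methods are illustrative stand-ins, not Spark's actual classes): every read and write of the per-stage counters takes the same lock, mirroring how the listener's handlers run inside allocationManager.synchronized and how the manager-side totalRunningTasks() wrapper added above takes the lock before delegating to the listener.

import scala.collection.mutable

// Illustrative stand-in for the listener's bookkeeping; not Spark's actual classes.
class RunningTaskTracker {
  private val stageIdToNumRunningTask = new mutable.HashMap[Int, Int]

  // Stage submitted: start counting its running tasks at zero.
  def stageSubmitted(stageId: Int): Unit = synchronized {
    stageIdToNumRunningTask(stageId) = 0
  }

  // Stage completed: dropping the entry also drops its contribution to the total.
  def stageCompleted(stageId: Int): Unit = synchronized {
    stageIdToNumRunningTask -= stageId
  }

  // Writers, analogous to onTaskStart/onTaskEnd, mutate under the lock.
  def taskStarted(stageId: Int): Unit = synchronized {
    if (stageIdToNumRunningTask.contains(stageId)) {
      stageIdToNumRunningTask(stageId) += 1
    }
  }

  def taskEnded(stageId: Int): Unit = synchronized {
    if (stageIdToNumRunningTask.contains(stageId)) {
      stageIdToNumRunningTask(stageId) -= 1
    }
  }

  // Reader takes the same lock, so a concurrent sum never sees a half-updated map.
  def totalRunningTasks(): Int = synchronized {
    stageIdToNumRunningTask.values.sum
  }
}

Because the reader shares the writers' lock, the sum can never interleave with an increment or decrement, which is the "just to be paranoid" point above.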
|
|
Contributor
Looks like no one invokes this method?

Contributor
This is being called from the test.

Contributor
I'm not sure why we need to add a method that is only used for unit tests. If we want to verify the behavior of totalRunningTasks, I think maxNumExecutorsNeeded can also be used indirectly for verification.

Contributor
It's okay to add a method which is used for unit testing purposes only. I am not inclined towards the idea of using maxNumExecutorsNeeded to indirectly verify totalRunningTasks, for the following reason: currently, the test case is testing what it is supposed to. If you check for maxNumExecutorsNeeded instead, it might not be clear what we are testing.
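To illustrate the trade-off in this thread, here is a hedged sketch of the direct-style check, written against the illustrative RunningTaskTracker from the sketch above rather than Spark's actual ExecutorAllocationManagerSuite (which posts real SparkListener events); only the shape of the assertion is the point.

// Builds on the illustrative RunningTaskTracker sketch above; none of these
// names are Spark's real test helpers.
object TotalRunningTasksCheck extends App {
  val tracker = new RunningTaskTracker

  tracker.stageSubmitted(stageId = 0)
  tracker.taskStarted(stageId = 0)
  tracker.taskStarted(stageId = 0)

  // Direct assertion: the intent ("two tasks are currently running") is explicit,
  // which is the argument for exposing totalRunningTasks to the test rather than
  // checking maxNumExecutorsNeeded, whose expected value also depends on pending
  // tasks and tasks-per-executor.
  assert(tracker.totalRunningTasks() == 2)

  // Completing the stage drops its entry, so the total returns to zero without
  // any "numRunningTasks != 0" reset-and-warn logic.
  tracker.stageCompleted(stageId = 0)
  assert(tracker.totalRunningTasks() == 0)

  println("direct totalRunningTasks assertions passed")
}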