Bugfixes/improvements to scheduler #159
@@ -59,6 +59,15 @@ private[spark] class TaskSetManager( | |
| // CPUs to request per task | ||
| val CPUS_PER_TASK = conf.getInt("spark.task.cpus", 1) | ||
|
|
||
| /* | ||
| * Sometimes if an executor is dead or in an otherwise invalid state, the driver | ||
| * does not realize right away leading to repeated task failures. If enabled, | ||
| * this temporarily prevents a task from re-launching on an executor where | ||
| * it just failed. | ||
| */ | ||
| private[this] val EXECUTOR_TASK_BLACKLIST_TIMEOUT = | ||
| conf.getLong("spark.task.executorBlacklistTimeout", 0L) | ||
|
|
||
| // Quantile of tasks at which to start speculation | ||
| val SPECULATION_QUANTILE = conf.getDouble("spark.speculation.quantile", 0.75) | ||
| val SPECULATION_MULTIPLIER = conf.getDouble("spark.speculation.multiplier", 1.5) | ||
|
|
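As context for the new setting documented in the block above, here is a minimal sketch of how an application might turn it on. The configuration key and its 0L default come from the diff (a value of 0 effectively disables the check); the SparkConf usage, app name, and 60-second value are illustrative assumptions, not part of this PR.

```scala
import org.apache.spark.SparkConf

// Illustrative only: blacklist a failed (executor, task) pair for 60 seconds.
val conf = new SparkConf()
  .setAppName("blacklist-timeout-example")               // hypothetical app name
  .set("spark.task.executorBlacklistTimeout", "60000")   // milliseconds, read via conf.getLong(..., 0L)
```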
@@ -71,7 +80,9 @@ private[spark] class TaskSetManager( | |
| val numTasks = tasks.length | ||
| val copiesRunning = new Array[Int](numTasks) | ||
| val successful = new Array[Boolean](numTasks) | ||
| val numFailures = new Array[Int](numTasks) | ||
| private val numFailures = new Array[Int](numTasks) | ||
| // key is taskId, value is a Map of executor id to when it failed | ||
| private[this] val failedExecutors = new HashMap[Int, HashMap[String, Long]]() | ||
|
Contributor

One small thing: this might be better as a map keyed by (Int, String).

Contributor
Author

When a task is successful, we need to remove all blacklisted executors we know of for that task; moving to a key of (Int, String) would mean iterating over all keys in failedExecutors, which can be expensive.
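The trade-off described in the reply can be seen in a small standalone sketch (illustrative values only, not code from this PR): with the nested map used here, clearing a finished task's entry is a single remove, while a composite (Int, String) key would force a scan of every entry.

```scala
import scala.collection.mutable.HashMap

// Nested map, as in the PR: task index -> (executor id -> failure time).
val failedExecutors = new HashMap[Int, HashMap[String, Long]]()
failedExecutors.getOrElseUpdate(3, new HashMap[String, Long]()).put("exec-1", 1000L)
failedExecutors.remove(3)  // task 3 succeeded: one constant-time removal

// Alternative composite key (task index, executor id) -> failure time.
val flat = new HashMap[(Int, String), Long]()
flat.put((3, "exec-1"), 1000L)
flat.retain { case ((taskIndex, _), _) => taskIndex != 3 }  // must visit every key
```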
||
| val taskAttempts = Array.fill[List[TaskInfo]](numTasks)(Nil) | ||
| var tasksSuccessful = 0 | ||
|
|
||
|
|
@@ -228,12 +239,18 @@ private[spark] class TaskSetManager( | |
| * This method also cleans up any tasks in the list that have already | ||
| * been launched, since we want that to happen lazily. | ||
| */ | ||
| private def findTaskFromList(list: ArrayBuffer[Int]): Option[Int] = { | ||
| while (!list.isEmpty) { | ||
| val index = list.last | ||
| list.trimEnd(1) | ||
| if (copiesRunning(index) == 0 && !successful(index)) { | ||
| return Some(index) | ||
| private def findTaskFromList(execId: String, list: ArrayBuffer[Int]): Option[Int] = { | ||
| var indexOffset = list.size | ||
|
|
||
| while (indexOffset > 0) { | ||
| indexOffset -= 1 | ||
| val index = list(indexOffset) | ||
| if (!executorIsBlacklisted(execId, index)) { | ||
| // This should almost always be list.trimEnd(1) to remove tail | ||
| list.remove(indexOffset) | ||
| if (copiesRunning(index) == 0 && !successful(index)) { | ||
| return Some(index) | ||
| } | ||
| } | ||
| } | ||
| None | ||
|
|
@@ -244,6 +261,21 @@ private[spark] class TaskSetManager( | |
| taskAttempts(taskIndex).exists(_.host == host) | ||
| } | ||
|
|
||
| /** | ||
| * Is this re-execution of a failed task on an executor it already failed in before | ||
| * EXECUTOR_TASK_BLACKLIST_TIMEOUT has elapsed ? | ||
| */ | ||
| private[this] def executorIsBlacklisted(execId: String, taskId: Int): Boolean = { | ||
| if (failedExecutors.contains(taskId)) { | ||
| val failed = failedExecutors.get(taskId).get | ||
|
Contributor

You can just write this as
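The elided suggestion is presumably the map's apply method, failedExecutors(taskId), the usual shorthand for .get(key).get on a key known to be present; that reading is an assumption, not text from the PR. A standalone sketch of the equivalence:

```scala
import scala.collection.mutable.HashMap

val failedExecutors = new HashMap[Int, HashMap[String, Long]]()
val taskId = 3
failedExecutors(taskId) = new HashMap[String, Long]()

val viaOption = failedExecutors.get(taskId).get  // as written in the diff
val viaApply  = failedExecutors(taskId)          // shorthand; same value
// Both throw NoSuchElementException if taskId is absent, which is why the
// surrounding code checks failedExecutors.contains(taskId) first.
```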
||
|
|
||
| return failed.contains(execId) && | ||
| clock.getTime() - failed.get(execId).get < EXECUTOR_TASK_BLACKLIST_TIMEOUT | ||
| } | ||
|
|
||
| false | ||
| } | ||
|
|
||
| /** | ||
| * Return a speculative task for a given executor if any are available. The task should not have | ||
| * an attempt running on this host, in case the host is slow. In addition, the task should meet | ||
|
|
@@ -254,10 +286,13 @@ private[spark] class TaskSetManager( | |
| { | ||
| speculatableTasks.retain(index => !successful(index)) // Remove finished tasks from set | ||
|
|
||
| def canRunOnHost(index: Int): Boolean = | ||
| !hasAttemptOnHost(index, host) && !executorIsBlacklisted(execId, index) | ||
|
|
||
| if (!speculatableTasks.isEmpty) { | ||
| // Check for process-local or preference-less tasks; note that tasks can be process-local | ||
| // on multiple nodes when we replicate cached blocks, as in Spark Streaming | ||
| for (index <- speculatableTasks if !hasAttemptOnHost(index, host)) { | ||
| for (index <- speculatableTasks if canRunOnHost(index)) { | ||
| val prefs = tasks(index).preferredLocations | ||
| val executors = prefs.flatMap(_.executorId) | ||
| if (prefs.size == 0 || executors.contains(execId)) { | ||
|
|
@@ -268,7 +303,7 @@ private[spark] class TaskSetManager( | |
|
|
||
| // Check for node-local tasks | ||
| if (TaskLocality.isAllowed(locality, TaskLocality.NODE_LOCAL)) { | ||
| for (index <- speculatableTasks if !hasAttemptOnHost(index, host)) { | ||
| for (index <- speculatableTasks if canRunOnHost(index)) { | ||
| val locations = tasks(index).preferredLocations.map(_.host) | ||
| if (locations.contains(host)) { | ||
| speculatableTasks -= index | ||
|
|
@@ -280,7 +315,7 @@ private[spark] class TaskSetManager( | |
| // Check for rack-local tasks | ||
| if (TaskLocality.isAllowed(locality, TaskLocality.RACK_LOCAL)) { | ||
| for (rack <- sched.getRackForHost(host)) { | ||
| for (index <- speculatableTasks if !hasAttemptOnHost(index, host)) { | ||
| for (index <- speculatableTasks if canRunOnHost(index)) { | ||
| val racks = tasks(index).preferredLocations.map(_.host).map(sched.getRackForHost) | ||
| if (racks.contains(rack)) { | ||
| speculatableTasks -= index | ||
|
|
@@ -292,7 +327,7 @@ private[spark] class TaskSetManager( | |
|
|
||
| // Check for non-local tasks | ||
| if (TaskLocality.isAllowed(locality, TaskLocality.ANY)) { | ||
| for (index <- speculatableTasks if !hasAttemptOnHost(index, host)) { | ||
| for (index <- speculatableTasks if canRunOnHost(index)) { | ||
| speculatableTasks -= index | ||
| return Some((index, TaskLocality.ANY)) | ||
| } | ||
|
|
@@ -309,32 +344,32 @@ private[spark] class TaskSetManager( | |
| private def findTask(execId: String, host: String, locality: TaskLocality.Value) | ||
| : Option[(Int, TaskLocality.Value)] = | ||
| { | ||
| for (index <- findTaskFromList(getPendingTasksForExecutor(execId))) { | ||
| for (index <- findTaskFromList(execId, getPendingTasksForExecutor(execId))) { | ||
| return Some((index, TaskLocality.PROCESS_LOCAL)) | ||
| } | ||
|
|
||
| if (TaskLocality.isAllowed(locality, TaskLocality.NODE_LOCAL)) { | ||
| for (index <- findTaskFromList(getPendingTasksForHost(host))) { | ||
| for (index <- findTaskFromList(execId, getPendingTasksForHost(host))) { | ||
| return Some((index, TaskLocality.NODE_LOCAL)) | ||
| } | ||
| } | ||
|
|
||
| if (TaskLocality.isAllowed(locality, TaskLocality.RACK_LOCAL)) { | ||
| for { | ||
| rack <- sched.getRackForHost(host) | ||
| index <- findTaskFromList(getPendingTasksForRack(rack)) | ||
| index <- findTaskFromList(execId, getPendingTasksForRack(rack)) | ||
| } { | ||
| return Some((index, TaskLocality.RACK_LOCAL)) | ||
| } | ||
| } | ||
|
|
||
| // Look for no-pref tasks after rack-local tasks since they can run anywhere. | ||
| for (index <- findTaskFromList(pendingTasksWithNoPrefs)) { | ||
| for (index <- findTaskFromList(execId, pendingTasksWithNoPrefs)) { | ||
| return Some((index, TaskLocality.PROCESS_LOCAL)) | ||
| } | ||
|
|
||
| if (TaskLocality.isAllowed(locality, TaskLocality.ANY)) { | ||
| for (index <- findTaskFromList(allPendingTasks)) { | ||
| for (index <- findTaskFromList(execId, allPendingTasks)) { | ||
| return Some((index, TaskLocality.ANY)) | ||
| } | ||
| } | ||
|
|
@@ -460,6 +495,7 @@ private[spark] class TaskSetManager( | |
| logInfo("Ignorning task-finished event for TID " + tid + " because task " + | ||
| index + " has already completed successfully") | ||
| } | ||
| failedExecutors.remove(index) | ||
| maybeFinishTaskSet() | ||
| } | ||
|
|
||
|
|
@@ -480,17 +516,19 @@ private[spark] class TaskSetManager( | |
| logWarning("Lost TID %s (task %s:%d)".format(tid, taskSet.id, index)) | ||
| } | ||
| var taskMetrics : TaskMetrics = null | ||
| var failureReason = "unknown" | ||
| var failureReason: String = null | ||
| reason match { | ||
| case fetchFailed: FetchFailed => | ||
| logWarning("Loss was due to fetch failure from " + fetchFailed.bmAddress) | ||
| if (!successful(index)) { | ||
| successful(index) = true | ||
| tasksSuccessful += 1 | ||
| } | ||
| // Not adding to failed executors for FetchFailed. | ||
| isZombie = true | ||
|
|
||
| case TaskKilled => | ||
| // Not adding to failed executors for TaskKilled. | ||
| logWarning("Task %d was killed.".format(tid)) | ||
|
|
||
| case ef: ExceptionFailure => | ||
|
|
@@ -504,7 +542,8 @@ private[spark] class TaskSetManager( | |
| return | ||
| } | ||
| val key = ef.description | ||
| failureReason = "Exception failure: %s".format(ef.description) | ||
| failureReason = "Exception failure in TID %s on host %s: %s".format( | ||
| tid, info.host, ef.description) | ||
| val now = clock.getTime() | ||
| val (printFull, dupCount) = { | ||
| if (recentExceptions.contains(key)) { | ||
|
|
@@ -533,11 +572,16 @@ private[spark] class TaskSetManager( | |
| failureReason = "Lost result for TID %s on host %s".format(tid, info.host) | ||
| logWarning(failureReason) | ||
|
|
||
| case _ => {} | ||
| case _ => | ||
| failureReason = "TID %s on host %s failed for unknown reason".format(tid, info.host) | ||
|
Contributor

Can you move this to line 519 instead (or, if you prefer, remove the default setting on line 519, which now won't ever be used)?

Contributor
Author

Will make the default null.
||
| } | ||
| // always add to failed executors | ||
| failedExecutors.getOrElseUpdate(index, new HashMap[String, Long]()). | ||
| put(info.executorId, clock.getTime()) | ||
| sched.dagScheduler.taskEnded(tasks(index), reason, null, null, info, taskMetrics) | ||
| addPendingTask(index) | ||
| if (!isZombie && state != TaskState.KILLED) { | ||
| assert (null != failureReason) | ||
| numFailures(index) += 1 | ||
| if (numFailures(index) >= maxTaskFailures) { | ||
| logError("Task %s:%d failed %d times; aborting job".format( | ||
|
|
||
Contributor

Maybe better to rename this to spark.scheduler.executorBlacklistTimeout or just spark.scheduler.blacklistTimeout.
Contributor

Also, I'm not sure if it needs to be private[this] instead of just private; we don't usually use private[this].
Contributor
Author

This was @markhamstra's recommendation - private[this] as better than private.
Contributor
Author

Regarding the timeout variable name: I was considering a very specific variable name to allow for a better/future approach to handling this issue, and at that time allow us to retire this variable without potential variable-name conflicts (spark.scheduler.blacklistTimeout implies more general blacklist handling, which this unfortunately is not); IMO, this is a stopgap solution until we add support for a better blacklist approach which handles both executors and blocks.

But until we have that, this will at least unblock us - thankfully, this is not something which a lot of users are hitting (but it is fairly common in our case, unfortunately).

Given this, should we expose this in the documentation?
Contributor

Regarding private[this], we don't really use it elsewhere, so I'd skip it here. It would be a bigger change to convert the whole codebase to private[this]. Its main benefit seems to be in terms of the code generated, but that kind of stuff usually gets inlined anyway.

I now see that this is per-task, not a blacklist for the whole executor, so it makes sense for the variable name to contain "task". In that case, maybe call this spark.scheduler.executorTaskBlacklistTime -- I think it's good to put "scheduler" as the first component there, since it's not really a property of tasks.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Converting the entire codebase to
private[this]might not be an unreasonable thing to do. Only in very rare instances doesprivate[this]not work where plainprivatedoes (it wouldn't surprise me if every instance ofprivatein Spark could safely be madeprivate[this]), and it can provide some performance benefits (not everything that you would expect to get inlined away actually does), so arguablyprivate[this]should be the default instead ofprivate.If someone has a few spare cycles, doing the search and replace and running the spark-perf numbers could at least give us some idea of how much we are giving away by using the slightly simpler syntax. Digging out just when
private[this]makes a difference would be more work, and maybe not worth doing.There was a problem hiding this comment.
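For readers following the private vs. private[this] thread, here is a small standalone sketch of the distinction being discussed (an illustrative class, not Spark code): a private member is visible to other instances of the same class (and its companion object), and the compiler generally emits an accessor method for a private field, while private[this] restricts access to the enclosing instance and allows a direct field access.

```scala
// Standalone illustration; not Spark code.
class Counter(step: Int) {
  private var total = 0              // class-private: other Counter instances can read it
  private[this] var count = 0        // object-private: only this instance can touch it

  def merge(other: Counter): Unit = {
    total += other.total             // fine: private allows cross-instance access
    // total += other.count          // would not compile: count is private[this]
  }

  def tick(): Int = { count += step; count }
}
```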
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In any case that seems like something for a separate change, and for now @mridulm should just keep it consistent
Contributor

Yup, that's entirely reasonable.

And FWIW, I gave the naive replace-all-private-with-private[this] approach a shot, and it is more complicated than I anticipated. I may stick with this for a while and may end up with that separate PR -- if for no other reason than that it is turning out to be a somewhat interesting way to probe Spark's internals.