-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21219][Core] Task retry occurs on same executor due to race condition with blacklisting #18427
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-21219][Core] Task retry occurs on same executor due to race condition with blacklisting #18427
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,7 +23,7 @@ import scala.collection.mutable | |
| import scala.collection.mutable.ArrayBuffer | ||
|
|
||
| import org.mockito.Matchers.{any, anyInt, anyString} | ||
| import org.mockito.Mockito.{mock, never, spy, verify, when} | ||
| import org.mockito.Mockito.{mock, never, spy, times, verify, when} | ||
| import org.mockito.invocation.InvocationOnMock | ||
| import org.mockito.stubbing.Answer | ||
|
|
||
|
|
@@ -1172,6 +1172,50 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg | |
| assert(blacklistTracker.isNodeBlacklisted("host1")) | ||
| } | ||
|
|
||
| test("update blacklist before adding pending task to avoid race condition") { | ||
| // When a task fails, it should apply the blacklist policy prior to | ||
| // retrying the task otherwise there's a race condition where run on | ||
| // the same executor that it was intended to be black listed from. | ||
| val conf = new SparkConf(). | ||
| set(config.BLACKLIST_ENABLED, true). | ||
| set(config.MAX_TASK_ATTEMPTS_PER_EXECUTOR, 1) | ||
|
|
||
| // Create a task with two executors. | ||
| sc = new SparkContext("local", "test", conf) | ||
| val exec = "executor1" | ||
| val host = "host1" | ||
| val exec2 = "executor2" | ||
| val host2 = "host2" | ||
| sched = new FakeTaskScheduler(sc, (exec, host), (exec2, host2)) | ||
| val taskSet = FakeTask.createTaskSet(1) | ||
|
|
||
| val clock = new ManualClock | ||
| val mockListenerBus = mock(classOf[LiveListenerBus]) | ||
| val blacklistTracker = new BlacklistTracker(mockListenerBus, conf, None, clock) | ||
| val taskSetManager = new TaskSetManager(sched, taskSet, 1, Some(blacklistTracker)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are we using
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems all the tests in this file are using ManualClock so was following convention here. This test doesn't validate anything specifically dependent on the clock/time. |
||
| val taskSetManagerSpy = spy(taskSetManager) | ||
|
|
||
| val taskDesc = taskSetManagerSpy.resourceOffer(exec, host, TaskLocality.ANY) | ||
|
|
||
| // Assert the task has been black listed on the executor it was last executed on. | ||
| when(taskSetManagerSpy.addPendingTask(anyInt())).thenAnswer( | ||
| new Answer[Unit] { | ||
| override def answer(invocationOnMock: InvocationOnMock): Unit = { | ||
| val task = invocationOnMock.getArgumentAt(0, classOf[Int]) | ||
| assert(taskSetManager.taskSetBlacklistHelperOpt.get. | ||
| isExecutorBlacklistedForTask(exec, task)) | ||
| } | ||
| } | ||
| ) | ||
|
|
||
| // Simulate an out of memory error | ||
| val e = new OutOfMemoryError | ||
| taskSetManagerSpy.handleFailedTask( | ||
| taskDesc.get.taskId, TaskState.FAILED, new ExceptionFailure(e, Seq())) | ||
|
||
|
|
||
| verify(taskSetManagerSpy, times(1)).addPendingTask(anyInt()) | ||
| } | ||
|
|
||
| private def createTaskResult( | ||
| id: Int, | ||
| accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty): DirectTaskResult[Int] = { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The default value of
config.MAX_TASK_ATTEMPTS_PER_EXECUTORis 1, so we don't have to set it here.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I added it to make the test code (configuration) inputs more explicit, but I can remove if it's a default unlikely to change.