From a6e94d7dac23431462b576b059de701eac548643 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 10 May 2016 15:31:57 -0500 Subject: [PATCH 01/35] basic test framework for entire spark scheduler --- .../scala/org/apache/spark/SparkContext.scala | 9 + .../apache/spark/scheduler/DAGScheduler.scala | 19 +- .../spark/scheduler/TaskResultGetter.scala | 12 + .../org/apache/spark/util/EventLoop.scala | 9 +- .../spark/scheduler/DAGSchedulerSuite.scala | 16 +- .../scheduler/SchedulerIntegrationSuite.scala | 390 ++++++++++++++++++ 6 files changed, 446 insertions(+), 9 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e391599336074..c3f51923e73a7 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2420,6 +2420,14 @@ object SparkContext extends Logging { scheduler.initialize(backend) (backend, scheduler) + case MOCK_REGEX(backendClassName) => + val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) + val backendClass = Utils.classForName(backendClassName) + val ctor = backendClass.getConstructor(classOf[SparkConf], classOf[TaskSchedulerImpl]) + val backend = ctor.newInstance(sc.getConf, scheduler).asInstanceOf[SchedulerBackend] + scheduler.initialize(backend) + (backend, scheduler) + case LOCAL_N_REGEX(threads) => def localCpuCount: Int = Runtime.getRuntime.availableProcessors() // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads. @@ -2520,6 +2528,7 @@ object SparkContext extends Logging { * A collection of regexes for extracting information from the master string. */ private object SparkMasterRegex { + val MOCK_REGEX = """mock\[(.*)\]""".r // Regular expression used for local[N] and local[*] master formats val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r // Regular expression for local[N, maxRetries], used in tests with failing tasks diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 5291b663667ea..709514f4327c4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -183,6 +183,14 @@ class DAGScheduler( private val messageScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("dag-scheduler-message") + private val msgsScheduled = new AtomicInteger(0) + + /** + * Visible for testing, to know if the DAGScheduler is still "busy" + */ + private[scheduler] def msgSchedulerEmpty: Boolean = { + msgsScheduled.get() == 0 + } private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this) taskScheduler.setDAGScheduler(this) @@ -1283,8 +1291,15 @@ class DAGScheduler( // TODO: Cancel running tasks in the stage logInfo(s"Resubmitting $mapStage (${mapStage.name}) and " + s"$failedStage (${failedStage.name}) due to fetch failure") + // We might get lots of fetch failed for this stage, from lots of executors. + // Its better if we can resubmit for all the failed executors at one time, so lets + // just wait a *bit* before we resubmit. 
+ msgsScheduled.incrementAndGet() messageScheduler.schedule(new Runnable { - override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages) + override def run(): Unit = { + eventProcessLoop.post(ResubmitFailedStages) + msgsScheduled.decrementAndGet() + } }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS) } failedStages += failedStage @@ -1411,7 +1426,7 @@ class DAGScheduler( stage.clearFailures() } else { stage.latestInfo.stageFailed(errorMessage.get) - logInfo("%s (%s) failed in %s s".format(stage, stage.name, serviceTime)) + logInfo(s"$stage (${stage.name}) failed in $serviceTime s due to ${errorMessage.get}") } outputCommitCoordinator.stageEnd(stage.id) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index 685ef55c66876..bc7b32ad0b5c5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -19,6 +19,7 @@ package org.apache.spark.scheduler import java.nio.ByteBuffer import java.util.concurrent.{ExecutorService, RejectedExecutionException} +import java.util.concurrent.atomic.AtomicInteger import scala.language.existentials import scala.util.control.NonFatal @@ -37,6 +38,11 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul private val THREADS = sparkEnv.conf.getInt("spark.resultGetter.threads", 4) + private val nTasks = new AtomicInteger(0) + def isEmpty: Boolean = { + nTasks.get() == 0 + } + // Exposed for testing. protected val getTaskResultExecutor: ExecutorService = ThreadUtils.newDaemonFixedThreadPool(THREADS, "task-result-getter") @@ -52,6 +58,7 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul taskSetManager: TaskSetManager, tid: Long, serializedData: ByteBuffer): Unit = { + nTasks.incrementAndGet() getTaskResultExecutor.execute(new Runnable { override def run(): Unit = Utils.logUncaughtExceptions { try { @@ -111,6 +118,8 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul case NonFatal(ex) => logError("Exception while getting task result", ex) taskSetManager.abort("Exception while getting task result: %s".format(ex)) + } finally { + nTasks.decrementAndGet() } } }) @@ -119,6 +128,7 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul def enqueueFailedTask(taskSetManager: TaskSetManager, tid: Long, taskState: TaskState, serializedData: ByteBuffer) { var reason : TaskEndReason = UnknownReason + nTasks.incrementAndGet() try { getTaskResultExecutor.execute(new Runnable { override def run(): Unit = Utils.logUncaughtExceptions { @@ -142,6 +152,8 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul } catch { case e: RejectedExecutionException if sparkEnv.isStopped => // ignore it + } finally { + nTasks.decrementAndGet() } } diff --git a/core/src/main/scala/org/apache/spark/util/EventLoop.scala b/core/src/main/scala/org/apache/spark/util/EventLoop.scala index 3ea9139e11027..d5ece57b88c9e 100644 --- a/core/src/main/scala/org/apache/spark/util/EventLoop.scala +++ b/core/src/main/scala/org/apache/spark/util/EventLoop.scala @@ -18,7 +18,7 @@ package org.apache.spark.util import java.util.concurrent.{BlockingQueue, LinkedBlockingDeque} -import java.util.concurrent.atomic.AtomicBoolean +import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger} import scala.util.control.NonFatal @@ -36,6 +36,7 @@ 
private[spark] abstract class EventLoop[E](name: String) extends Logging { private val eventQueue: BlockingQueue[E] = new LinkedBlockingDeque[E]() private val stopped = new AtomicBoolean(false) + private val nMsgs = new AtomicInteger(0) private val eventThread = new Thread(name) { setDaemon(true) @@ -46,6 +47,7 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { val event = eventQueue.take() try { onReceive(event) + nMsgs.decrementAndGet() } catch { case NonFatal(e) => try { @@ -99,6 +101,7 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { * Put the event into the event queue. The event thread will process it later. */ def post(event: E): Unit = { + nMsgs.incrementAndGet() eventQueue.put(event) } @@ -107,6 +110,10 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { */ def isActive: Boolean = eventThread.isAlive + def isEmpty: Boolean = { + nMsgs.get() == 0 + } + /** * Invoked when `start()` is called but before the event thread starts. */ diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 088a476086217..9c004e0dd76dc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -98,6 +98,8 @@ class DAGSchedulerSuiteDummyException extends Exception class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeouts { + import DAGSchedulerSuite._ + val conf = new SparkConf /** Set of TaskSets the DAGScheduler has requested executed. */ val taskSets = scala.collection.mutable.Buffer[TaskSet]() @@ -2027,12 +2029,6 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou } } - private def makeMapStatus(host: String, reduces: Int, sizes: Byte = 2): MapStatus = - MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes)) - - private def makeBlockManagerId(host: String): BlockManagerId = - BlockManagerId("exec-" + host, host, 12345) - private def assertDataStructuresEmpty(): Unit = { assert(scheduler.activeJobs.isEmpty) assert(scheduler.failedStages.isEmpty) @@ -2072,5 +2068,13 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou } CompletionEvent(task, reason, result, accumUpdates ++ extraAccumUpdates, taskInfo) } +} + +object DAGSchedulerSuite { + def makeMapStatus(host: String, reduces: Int, sizes: Byte = 2): MapStatus = + MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes)) + + def makeBlockManagerId(host: String): BlockManagerId = + BlockManagerId("exec-" + host, host, 12345) } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala new file mode 100644 index 0000000000000..2c55d50aa0cd0 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler + +import java.util.concurrent.TimeoutException + +import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} + +import org.scalactic.TripleEquals +import org.scalatest.Assertions.AssertionsHelper +import org.scalatest.BeforeAndAfter + +import org.apache.spark._ +import org.apache.spark.TaskState._ +import org.apache.spark.executor.TaskMetrics +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.scheduler.DAGSchedulerSuite._ +import org.apache.spark.util.CallSite + +/** + * Tests for the entire scheduler code -- DAGScheduler, TaskSchedulerImpl, TaskSets, + * TaskSetManagers. + * + * Test cases are configured by providing a set of jobs to submit, and then simulating interaction + * with spark's executors via a mocked backend (eg., task completion, task failure, executors + * disconnecting, etc.). + */ +class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext { + val conf = new SparkConf + + /** Set of TaskSets the DAGScheduler has requested executed. */ + val runningTaskSets = HashSet[TaskSet]() + + var taskScheduler: TaskSchedulerImpl = null + var scheduler: DAGScheduler = null + var backend: SingleCoreMockBackend = null + + before { + runningTaskSets.clear() + results.clear() + sc = new SparkContext("mock[org.apache.spark.scheduler.SingleCoreMockBackend]", + "SchedulerIntegrationSuite") + backend = sc.schedulerBackend.asInstanceOf[SingleCoreMockBackend] + taskScheduler = new TaskSchedulerImpl(sc) { + override def submitTasks(taskSet: TaskSet): Unit = { + runningTaskSets += taskSet + super.submitTasks(taskSet) + } + override def taskSetFinished(manager: TaskSetManager): Unit = { + runningTaskSets -= manager.taskSet + super.taskSetFinished(manager) + } + } + taskScheduler.initialize(sc.schedulerBackend) + backend.taskScheduler = taskScheduler + scheduler = new DAGScheduler(sc, taskScheduler) + taskScheduler.setDAGScheduler(scheduler) + } + + after { + taskScheduler.stop() + backend.stop() + scheduler.stop() + } + + /** + * Process the supplied event as if it were the top of the DAGScheduler event queue, expecting + * the scheduler not to exit. + */ + private def runEvent(event: DAGSchedulerEvent) { + scheduler.eventProcessLoop.post(event) + } + + val results = new HashMap[Int, Any]() + var failure: Exception = _ + val jobListener = new JobListener() { + override def taskSucceeded(index: Int, result: Any) = results.put(index, result) + override def jobFailed(exception: Exception) = { failure = exception } + } + + /** + * When we submit dummy Jobs, this is the compute function we supply. + */ + private val jobComputeFunc: (TaskContext, scala.Iterator[_]) => Any = { + (context: TaskContext, it: Iterator[(_)]) => + throw new RuntimeException("jobComputeFunc shouldn't get called in this mock") + } + + /** Sends the rdd to the scheduler for scheduling and returns the job id. 
*/ + private def submit( + rdd: RDD[_], + partitions: Array[Int], + func: (TaskContext, Iterator[_]) => _ = jobComputeFunc, + listener: JobListener = jobListener): Int = { + val jobId = scheduler.nextJobId.getAndIncrement() + runEvent(JobSubmitted(jobId, rdd, func, partitions, CallSite("", ""), listener)) + jobId + } + + /** + * Return true if the backend has more work to do, false otherwise. It will block until it has + * a definitive answer either way -- eg., if the backend does not appear to have any work, but + * the dag scheduler has some events left to process, this will wait until the dag scheduler is + * done processing enough events to say for sure. + */ + private def backendHasWorkToDo: Boolean = { + // the ordering is somewhat important here -- avoid waiting if we can (both to speed up test, + // and also to test with more concurrency inside scheduler) + if (backend.runningTasks.nonEmpty) { + true + } else if (runningTaskSets.isEmpty && scheduler.msgSchedulerEmpty && + scheduler.eventProcessLoop.isEmpty && taskScheduler.taskResultGetter.isEmpty ) { + false + } else if (runningTaskSets.nonEmpty) { + // need to get all task results, as they might lead to finishing a taskSet + waitUntil(() => taskScheduler.taskResultGetter.isEmpty) + backendHasWorkToDo + } else { + waitUntil(() => taskScheduler.taskResultGetter.isEmpty) + waitUntil(() => scheduler.eventProcessLoop.isEmpty) + backendHasWorkToDo + } + } + + private def waitUntil(condition: () => Boolean): Unit = { + val timeoutMillis = 1000L + val finishTime = System.currentTimeMillis + timeoutMillis + while (!condition()) { + if (System.currentTimeMillis > finishTime) { + throw new TimeoutException( + s"Not ready after $timeoutMillis milliseconds") + } + /* Sleep rather than using wait/notify, because this is used only for testing and + * wait/notify add overhead in the general case. */ + Thread.sleep(10) + } + } + + private def assertDataStructuresEmpty(): Unit = { + assert(!backendHasWorkToDo) + assert(runningTaskSets.isEmpty) + assert(backend.runningTasks.isEmpty) + } + + /** + * Looks at all shuffleMapOutputs that are dependencies of the given RDD, and makes sure + * they are all registered + */ + private def assertMapOutputAvailable(targetRdd: MockRDD): Unit = { + val shuffleIds = targetRdd.shuffleDeps.map{_.shuffleId} + val nParts = targetRdd.numPartitions + for { + shuffleId <- shuffleIds + reduceIdx <- (0 until nParts) + } { + val statuses = taskScheduler.mapOutputTracker.getMapSizesByExecutorId(shuffleId, reduceIdx) + // really we should have already thrown an exception rather than fail either of these + // asserts, but just to be extra defensive let's double check the statuses are OK + assert(statuses != null) + assert(statuses.nonEmpty) + } + } + + + /** models a stage boundary with a single dependency, like a shuffle */ + def shuffle(nParts: Int, input: MockRDD): MockRDD = { + val partitioner = new HashPartitioner(nParts) + val shuffleDep = new ShuffleDependency(input, partitioner) + new MockRDD(sc, nParts, List(shuffleDep)) + } + + /** models a stage boundary with multiple dependencies, like a join */ + def join(nParts: Int, inputs: MockRDD*): MockRDD = { + val partitioner = new HashPartitioner(nParts) + val shuffleDeps = inputs.map { inputRDD => + new ShuffleDependency(inputRDD, partitioner) + } + new MockRDD(sc, nParts, shuffleDeps) + } + + /** + * Very simple one stage job. 
Backend successfully completes each task, one by one + */ + test("super simple job") { + submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) + while (backendHasWorkToDo) { + val task = backend.runningTasks.last + backend.taskSuccess(task, 42) + } + assert(results === (0 until 10).map { _ -> 42 }.toMap) + assertDataStructuresEmpty() + } + + /** + * 5 stage job, diamond dependencies. + * + * a ----> b ----> d --> result + * \--> c --/ + * + * Backend successfully completes each task + */ + test("multi-stage job") { + val a = new MockRDD(sc, 2, Nil) + val b = shuffle(10, a) + val c = shuffle(20, a) + val d = join(30, b, c) + submit(d, (0 until 30).toArray) + + def stageToOutputParts(stageId: Int): Int = { + stageId match { + case 0 => 10 + case 2 => 20 + case _ => 30 + } + } + + while (backendHasWorkToDo) { + assert(backend.runningTasks.nonEmpty) + val taskDescription = backend.runningTasks.last + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + + // make sure the required map output is available + task.stageId match { + case 1 => assertMapOutputAvailable(b) + case 3 => assertMapOutputAvailable(c) + case 4 => assertMapOutputAvailable(d) + case _ => // no shuffle map input, nothing to check + } + + (task.stageId, task.stageAttemptId, task.partitionId) match { + case (stage, 0, _) if stage < 4 => + backend.taskSuccess(taskDescription, makeMapStatus("hostA", stageToOutputParts(stage))) + case (4, 0, partition) => + backend.taskSuccess(taskDescription, 4321 + partition) + } + } + assert(results === (0 until 30).map { idx => idx -> (4321 + idx) }.toMap) + assertDataStructuresEmpty() + } + + /** + * 2 stage job, with a fetch failure. Make sure that: + * (a) map output is available whenever we run stage 1 + * (b) we get a second attempt for stage 0 & stage 1 + */ + test("job with fetch failure") { + val input = new MockRDD(sc, 2, Nil) + val shuffledRdd = shuffle(10, input) + val shuffleId = shuffledRdd.shuffleDeps.head.shuffleId + submit(shuffledRdd, (0 until 10).toArray) + + val stageToAttempts = new HashMap[Int, HashSet[Int]]() + + while (backendHasWorkToDo) { + val taskDescription = backend.runningTasks.last + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + stageToAttempts.getOrElseUpdate(task.stageId, new HashSet()) += task.stageAttemptId + + // make sure the required map output is available + task.stageId match { + case 1 => assertMapOutputAvailable(shuffledRdd) + case _ => // no shuffle map input, nothing to check + } + + (task.stageId, task.stageAttemptId, task.partitionId) match { + case (0, _, _) => + backend.taskSuccess(taskDescription, makeMapStatus("hostA", 10)) + case (1, 0, 0) => + val fetchFailed = FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored") + backend.failTask(taskDescription, TaskState.FAILED, fetchFailed) + case (1, _, partition) => + backend.taskSuccess(taskDescription, 42 + partition) + } + } + assert(results === (0 until 10).map { idx => idx -> (42 + idx) }.toMap) + assert(stageToAttempts === Map(0 -> Set(0, 1), 1 -> Set(0, 1))) + assertDataStructuresEmpty() + } +} + +/** + * A very simple mock backend that can just run one task at a time. 
+ */ +private[spark] class SingleCoreMockBackend( + conf: SparkConf, + var taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { + + val cores = 1 + + override def start(): Unit = {} + + override def stop(): Unit = {} + + override def defaultParallelism(): Int = conf.getInt("spark.default.parallelism", cores) + + var freeCores = cores + val localExecutorId = SparkContext.DRIVER_IDENTIFIER + val localExecutorHostname = "localhost" + val env = SparkEnv.get + + val runningTasks = ArrayBuffer[TaskDescription]() + + /** + * This is called by the scheduler whenever it has tasks it would like to schedule + */ + override def reviveOffers(): Unit = { + val offers = Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores)) + val newTasks = taskScheduler.resourceOffers(offers).flatten + synchronized { + freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK + runningTasks ++= newTasks + } + } + + def taskSuccess(task: TaskDescription, result: Any): Unit = { + val ser = env.serializer.newInstance() + val resultBytes = ser.serialize(result) + val metrics = new TaskMetrics + val directResult = new DirectTaskResult(resultBytes, Seq()) // no accumulator updates + val serializedDirectResult = ser.serialize(directResult) + taskScheduler.statusUpdate(task.taskId, TaskState.FINISHED, serializedDirectResult) + synchronized { + freeCores += taskScheduler.CPUS_PER_TASK + runningTasks -= task + } + reviveOffers() + } + + def failTask(task: TaskDescription, state: TaskState, result: Any): Unit = { + val ser = env.serializer.newInstance() + val resultBytes = ser.serialize(result) + taskScheduler.statusUpdate(task.taskId, state, resultBytes) + if (TaskState.isFinished(state)) { + synchronized { + freeCores += taskScheduler.CPUS_PER_TASK + runningTasks -= task + } + reviveOffers() + } + } + +} + +class MockRDD( + sc: SparkContext, + val numPartitions: Int, + val shuffleDeps: Seq[ShuffleDependency[Int, Int, Nothing]] +) extends RDD[(Int, Int)](sc, shuffleDeps) with Serializable { + + MockRDD.validate(numPartitions, shuffleDeps) + + override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = + throw new RuntimeException("should not be reached") + override def getPartitions: Array[Partition] = (0 until numPartitions).map(i => new Partition { + override def index: Int = i + }).toArray + override def getPreferredLocations(split: Partition): Seq[String] = Nil + override def toString: String = "MockRDD " + id +} + +object MockRDD extends AssertionsHelper with TripleEquals { + /** + * make sure all the shuffle dependencies have a consistent number of output partitions + * (mostly to make sure the test setup makes sense, not that Spark itself would get this wrong) + */ + def validate(numPartitions: Int, dependencies: Seq[ShuffleDependency[_, _, _]]): Unit = { + dependencies.foreach { dependency => + val partitioner = dependency.partitioner + assert(partitioner != null) + assert(partitioner.numPartitions === numPartitions) + } + } +} From 20fb3e98ea08cd3cb777227d0b08e35ea3408de3 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 10 May 2016 16:19:55 -0500 Subject: [PATCH 02/35] TaskResultGetter now expects there to always be non-null accum updates --- core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala index 80f2bf41224b5..77fda6fcff959 100644 --- 
a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala @@ -59,7 +59,7 @@ private[spark] class DirectTaskResult[T]( val numUpdates = in.readInt if (numUpdates == 0) { - accumUpdates = null + accumUpdates = Seq() } else { val _accumUpdates = new ArrayBuffer[AccumulatorV2[_, _]] for (i <- 0 until numUpdates) { From 0ca981547832780de4405278fce8314f8be73a84 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 13 May 2016 13:13:59 -0500 Subject: [PATCH 03/35] switch to making backend run in another thread --- .../apache/spark/scheduler/DAGScheduler.scala | 10 - .../spark/scheduler/TaskResultGetter.scala | 12 - .../org/apache/spark/util/EventLoop.scala | 7 - .../scheduler/SchedulerIntegrationSuite.scala | 463 +++++++++++------- 4 files changed, 289 insertions(+), 203 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 709514f4327c4..f9e9be40ab8e1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -183,14 +183,6 @@ class DAGScheduler( private val messageScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("dag-scheduler-message") - private val msgsScheduled = new AtomicInteger(0) - - /** - * Visible for testing, to know if the DAGScheduler is still "busy" - */ - private[scheduler] def msgSchedulerEmpty: Boolean = { - msgsScheduled.get() == 0 - } private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this) taskScheduler.setDAGScheduler(this) @@ -1294,11 +1286,9 @@ class DAGScheduler( // We might get lots of fetch failed for this stage, from lots of executors. // Its better if we can resubmit for all the failed executors at one time, so lets // just wait a *bit* before we resubmit. - msgsScheduled.incrementAndGet() messageScheduler.schedule(new Runnable { override def run(): Unit = { eventProcessLoop.post(ResubmitFailedStages) - msgsScheduled.decrementAndGet() } }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index bc7b32ad0b5c5..685ef55c66876 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -19,7 +19,6 @@ package org.apache.spark.scheduler import java.nio.ByteBuffer import java.util.concurrent.{ExecutorService, RejectedExecutionException} -import java.util.concurrent.atomic.AtomicInteger import scala.language.existentials import scala.util.control.NonFatal @@ -38,11 +37,6 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul private val THREADS = sparkEnv.conf.getInt("spark.resultGetter.threads", 4) - private val nTasks = new AtomicInteger(0) - def isEmpty: Boolean = { - nTasks.get() == 0 - } - // Exposed for testing. 
protected val getTaskResultExecutor: ExecutorService = ThreadUtils.newDaemonFixedThreadPool(THREADS, "task-result-getter") @@ -58,7 +52,6 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul taskSetManager: TaskSetManager, tid: Long, serializedData: ByteBuffer): Unit = { - nTasks.incrementAndGet() getTaskResultExecutor.execute(new Runnable { override def run(): Unit = Utils.logUncaughtExceptions { try { @@ -118,8 +111,6 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul case NonFatal(ex) => logError("Exception while getting task result", ex) taskSetManager.abort("Exception while getting task result: %s".format(ex)) - } finally { - nTasks.decrementAndGet() } } }) @@ -128,7 +119,6 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul def enqueueFailedTask(taskSetManager: TaskSetManager, tid: Long, taskState: TaskState, serializedData: ByteBuffer) { var reason : TaskEndReason = UnknownReason - nTasks.incrementAndGet() try { getTaskResultExecutor.execute(new Runnable { override def run(): Unit = Utils.logUncaughtExceptions { @@ -152,8 +142,6 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul } catch { case e: RejectedExecutionException if sparkEnv.isStopped => // ignore it - } finally { - nTasks.decrementAndGet() } } diff --git a/core/src/main/scala/org/apache/spark/util/EventLoop.scala b/core/src/main/scala/org/apache/spark/util/EventLoop.scala index d5ece57b88c9e..eefe934c63883 100644 --- a/core/src/main/scala/org/apache/spark/util/EventLoop.scala +++ b/core/src/main/scala/org/apache/spark/util/EventLoop.scala @@ -36,7 +36,6 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { private val eventQueue: BlockingQueue[E] = new LinkedBlockingDeque[E]() private val stopped = new AtomicBoolean(false) - private val nMsgs = new AtomicInteger(0) private val eventThread = new Thread(name) { setDaemon(true) @@ -47,7 +46,6 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { val event = eventQueue.take() try { onReceive(event) - nMsgs.decrementAndGet() } catch { case NonFatal(e) => try { @@ -101,7 +99,6 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { * Put the event into the event queue. The event thread will process it later. */ def post(event: E): Unit = { - nMsgs.incrementAndGet() eventQueue.put(event) } @@ -110,10 +107,6 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { */ def isActive: Boolean = eventThread.isAlive - def isEmpty: Boolean = { - nMsgs.get() == 0 - } - /** * Invoked when `start()` is called but before the event thread starts. 
*/ diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 2c55d50aa0cd0..40a1bb4ce7c3b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -16,9 +16,13 @@ */ package org.apache.spark.scheduler -import java.util.concurrent.TimeoutException +import java.util.Properties +import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} +import scala.concurrent.{Await, Future} +import scala.concurrent.duration.{Duration, SECONDS} +import scala.reflect.ClassTag import org.scalactic.TripleEquals import org.scalatest.Assertions.AssertionsHelper @@ -40,7 +44,8 @@ import org.apache.spark.util.CallSite * with spark's executors via a mocked backend (eg., task completion, task failure, executors * disconnecting, etc.). */ -class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext { +abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends SparkFunSuite + with BeforeAndAfter with LocalSparkContext { val conf = new SparkConf /** Set of TaskSets the DAGScheduler has requested executed. */ @@ -48,19 +53,21 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L var taskScheduler: TaskSchedulerImpl = null var scheduler: DAGScheduler = null - var backend: SingleCoreMockBackend = null + var backend: T = _ before { runningTaskSets.clear() results.clear() - sc = new SparkContext("mock[org.apache.spark.scheduler.SingleCoreMockBackend]", - "SchedulerIntegrationSuite") - backend = sc.schedulerBackend.asInstanceOf[SingleCoreMockBackend] + failure = null + val backendClassName = implicitly[ClassTag[T]].runtimeClass.getName() + sc = new SparkContext(s"mock[${backendClassName}]", this.getClass().getSimpleName()) + backend = sc.schedulerBackend.asInstanceOf[T] taskScheduler = new TaskSchedulerImpl(sc) { override def submitTasks(taskSet: TaskSet): Unit = { runningTaskSets += taskSet super.submitTasks(taskSet) } + override def taskSetFinished(manager: TaskSetManager): Unit = { runningTaskSets -= manager.taskSet super.taskSetFinished(manager) @@ -78,20 +85,8 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L scheduler.stop() } - /** - * Process the supplied event as if it were the top of the DAGScheduler event queue, expecting - * the scheduler not to exit. - */ - private def runEvent(event: DAGSchedulerEvent) { - scheduler.eventProcessLoop.post(event) - } - val results = new HashMap[Int, Any]() - var failure: Exception = _ - val jobListener = new JobListener() { - override def taskSucceeded(index: Int, result: Any) = results.put(index, result) - override def jobFailed(exception: Exception) = { failure = exception } - } + var failure: Throwable = _ /** * When we submit dummy Jobs, this is the compute function we supply. @@ -101,67 +96,33 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L throw new RuntimeException("jobComputeFunc shouldn't get called in this mock") } - /** Sends the rdd to the scheduler for scheduling and returns the job id. */ - private def submit( + /** Submits a job to the scheduler, and returns a future which does a bit of error handling. 
*/ + protected def submit( rdd: RDD[_], partitions: Array[Int], - func: (TaskContext, Iterator[_]) => _ = jobComputeFunc, - listener: JobListener = jobListener): Int = { - val jobId = scheduler.nextJobId.getAndIncrement() - runEvent(JobSubmitted(jobId, rdd, func, partitions, CallSite("", ""), listener)) - jobId - } - - /** - * Return true if the backend has more work to do, false otherwise. It will block until it has - * a definitive answer either way -- eg., if the backend does not appear to have any work, but - * the dag scheduler has some events left to process, this will wait until the dag scheduler is - * done processing enough events to say for sure. - */ - private def backendHasWorkToDo: Boolean = { - // the ordering is somewhat important here -- avoid waiting if we can (both to speed up test, - // and also to test with more concurrency inside scheduler) - if (backend.runningTasks.nonEmpty) { - true - } else if (runningTaskSets.isEmpty && scheduler.msgSchedulerEmpty && - scheduler.eventProcessLoop.isEmpty && taskScheduler.taskResultGetter.isEmpty ) { - false - } else if (runningTaskSets.nonEmpty) { - // need to get all task results, as they might lead to finishing a taskSet - waitUntil(() => taskScheduler.taskResultGetter.isEmpty) - backendHasWorkToDo - } else { - waitUntil(() => taskScheduler.taskResultGetter.isEmpty) - waitUntil(() => scheduler.eventProcessLoop.isEmpty) - backendHasWorkToDo + func: (TaskContext, Iterator[_]) => _ = jobComputeFunc): Future[Any] = { + val waiter: JobWaiter[Any] = scheduler.submitJob(rdd, func, partitions.toSeq, CallSite("", ""), + (index, res) => results(index) = res, new Properties()) + import scala.concurrent.ExecutionContext.Implicits.global + waiter.completionFuture.recover { case ex => + failure = ex } } - private def waitUntil(condition: () => Boolean): Unit = { - val timeoutMillis = 1000L - val finishTime = System.currentTimeMillis + timeoutMillis - while (!condition()) { - if (System.currentTimeMillis > finishTime) { - throw new TimeoutException( - s"Not ready after $timeoutMillis milliseconds") - } - /* Sleep rather than using wait/notify, because this is used only for testing and - * wait/notify add overhead in the general case. 
*/ - Thread.sleep(10) + protected def assertDataStructuresEmpty(noFailure: Boolean = true): Unit = { + if (noFailure) { + assert(failure === null) } - } - - private def assertDataStructuresEmpty(): Unit = { - assert(!backendHasWorkToDo) + assert(scheduler.activeJobs.isEmpty) assert(runningTaskSets.isEmpty) - assert(backend.runningTasks.isEmpty) + assert(!backend.hasTasks) } /** * Looks at all shuffleMapOutputs that are dependencies of the given RDD, and makes sure * they are all registered */ - private def assertMapOutputAvailable(targetRdd: MockRDD): Unit = { + def assertMapOutputAvailable(targetRdd: MockRDD): Unit = { val shuffleIds = targetRdd.shuffleDeps.map{_.shuffleId} val nParts = targetRdd.numPartitions for { @@ -176,11 +137,10 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L } } - /** models a stage boundary with a single dependency, like a shuffle */ def shuffle(nParts: Int, input: MockRDD): MockRDD = { val partitioner = new HashPartitioner(nParts) - val shuffleDep = new ShuffleDependency(input, partitioner) + val shuffleDep = new ShuffleDependency[Int, Int, Nothing](input, partitioner) new MockRDD(sc, nParts, List(shuffleDep)) } @@ -188,20 +148,245 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L def join(nParts: Int, inputs: MockRDD*): MockRDD = { val partitioner = new HashPartitioner(nParts) val shuffleDeps = inputs.map { inputRDD => - new ShuffleDependency(inputRDD, partitioner) + new ShuffleDependency[Int, Int, Nothing](inputRDD, partitioner) } new MockRDD(sc, nParts, shuffleDeps) } + /** + * Helper which makes it a little easier to setup a test, which starts a mock backend in another + * thread, responding to tasks with your custom function. You also supply the "body" of your + * test, where you submit jobs to your backend, wait for them to complete, then check + * whatever conditions you want. 
+ */ + def withBackend(backendFunc: () => Unit)(testBody: => Unit): Unit = { + val backendContinue = new AtomicBoolean(true) + val backendThread = new Thread("mock backend thread") { + override def run(): Unit = { + while (backendContinue.get()) { + if (backend.hasTasksWaitingToRun) { + backendFunc() + } else { + Thread.sleep(10) + } + } + } + } + try { + backendThread.start() + testBody + } finally { + backendContinue.set(false) + backendThread.join() + } + } + +} + +private[spark] abstract class MockBackend( + conf: SparkConf, + var taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { + + private val assignedTasksWaitingToRun = ArrayBuffer[TaskDescription]() + private val runningTasks = ArrayBuffer[TaskDescription]() + + def assignTasks(tasks: Seq[TaskDescription]): Unit = assignedTasksWaitingToRun.synchronized { + assignedTasksWaitingToRun ++= tasks + } + + def endTask(task: TaskDescription): Unit = runningTasks.synchronized { + runningTasks -= task + } + + def beginTask(): TaskDescription = { + val toRun = assignedTasksWaitingToRun.synchronized { + assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) + } + runningTasks.synchronized { runningTasks += toRun } + toRun + } + + def hasTasks: Boolean = { + assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty + } + + def hasTasksWaitingToRun: Boolean = { + assignedTasksWaitingToRun.nonEmpty + } + + override def start(): Unit = {} + + override def stop(): Unit = {} + + var freeCores: Int = _ + val env = SparkEnv.get + + def executorIdToExecutor: Map[String, ExecutorTaskStatus] + + def generateOffers(): Seq[WorkerOffer] + + /** + * This is called by the scheduler whenever it has tasks it would like to schedule + */ + override def reviveOffers(): Unit = { + val offers: Seq[WorkerOffer] = generateOffers() + val newTasks = taskScheduler.resourceOffers(offers).flatten + synchronized { + newTasks.foreach { task => + executorIdToExecutor(task.executorId).freeCores -= taskScheduler.CPUS_PER_TASK + } + freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK + assignedTasksWaitingToRun ++= newTasks + } + } + + /** + * Tell the scheduler the task completed successfully, with the given result. Also + * updates some internal state for this mock. + */ + def taskSuccess(task: TaskDescription, result: Any): Unit = { + endTask(task) + val ser = env.serializer.newInstance() + val resultBytes = ser.serialize(result) + val metrics = new TaskMetrics + val directResult = new DirectTaskResult(resultBytes, Seq()) // no accumulator updates + val serializedDirectResult = ser.serialize(directResult) + taskScheduler.statusUpdate(task.taskId, TaskState.FINISHED, serializedDirectResult) + synchronized { + executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK + freeCores += taskScheduler.CPUS_PER_TASK + assignedTasksWaitingToRun -= task + } + reviveOffers() + } + + /** + * Tell the scheduler the task failed, with the given state and result (probably ExceptionFailure + * or FetchFailed). Also updates some internal state for this mock. 
+ */ + def failTask(task: TaskDescription, state: TaskState, result: Any): Unit = { + endTask(task) + val ser = env.serializer.newInstance() + val resultBytes = ser.serialize(result) + taskScheduler.statusUpdate(task.taskId, state, resultBytes) + if (TaskState.isFinished(state)) { + synchronized { + executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK + freeCores += taskScheduler.CPUS_PER_TASK + assignedTasksWaitingToRun -= task + } + reviveOffers() + } + } +} + +/** + * A very simple mock backend that can just run one task at a time. + */ +private[spark] class SingleCoreMockBackend( + conf: SparkConf, + taskScheduler: TaskSchedulerImpl) extends MockBackend(conf, taskScheduler) { + + val cores = 1 + + override def defaultParallelism(): Int = conf.getInt("spark.default.parallelism", cores) + + freeCores = cores + val localExecutorId = SparkContext.DRIVER_IDENTIFIER + val localExecutorHostname = "localhost" + + val executorIdToExecutor: Map[String, ExecutorTaskStatus] = Map( + localExecutorId -> new ExecutorTaskStatus(localExecutorHostname, localExecutorId, freeCores) + ) + + override def generateOffers(): Seq[WorkerOffer] = { + Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores)) + } +} + +class MultiExecutorBackend( + conf: SparkConf, + taskScheduler: TaskSchedulerImpl) extends MockBackend(conf, taskScheduler) { + + val nHosts = 10 + val nExecutorsPerHost = 4 + val nCoresPerExecutor = 2 + + val executorIdToExecutor: Map[String, ExecutorTaskStatus] = (0 until nHosts).flatMap{ hostIdx => + val hostName = s"host-$hostIdx" + (0 until nExecutorsPerHost).map { execIdx => + val executorId = (hostIdx * nExecutorsPerHost + execIdx).toString + executorId -> new ExecutorTaskStatus(hostName, executorId, nCoresPerExecutor) + } + }.toMap + + val totalCores = nHosts * nExecutorsPerHost * nCoresPerExecutor + freeCores = totalCores + + override def generateOffers(): Seq[WorkerOffer] = { + // always offer all cores available on all executors + executorIdToExecutor.values.filter { exec => + exec.freeCores > taskScheduler.CPUS_PER_TASK + }.map { exec => + new WorkerOffer(exec.executorId, exec.host, exec.freeCores) + }.toSeq + } + + override def defaultParallelism(): Int = conf.getInt("spark.default.parallelism", totalCores) +} + +class ExecutorTaskStatus(val host: String, val executorId: String, var freeCores: Int) + +class MockRDD( + sc: SparkContext, + val numPartitions: Int, + val shuffleDeps: Seq[ShuffleDependency[Int, Int, Nothing]] +) extends RDD[(Int, Int)](sc, shuffleDeps) with Serializable { + + MockRDD.validate(numPartitions, shuffleDeps) + + override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = + throw new RuntimeException("should not be reached") + override def getPartitions: Array[Partition] = (0 until numPartitions).map(i => new Partition { + override def index: Int = i + }).toArray + override def getPreferredLocations(split: Partition): Seq[String] = Nil + override def toString: String = "MockRDD " + id +} + +object MockRDD extends AssertionsHelper with TripleEquals { + /** + * make sure all the shuffle dependencies have a consistent number of output partitions + * (mostly to make sure the test setup makes sense, not that Spark itself would get this wrong) + */ + def validate(numPartitions: Int, dependencies: Seq[ShuffleDependency[_, _, _]]): Unit = { + dependencies.foreach { dependency => + val partitioner = dependency.partitioner + assert(partitioner != null) + assert(partitioner.numPartitions === numPartitions) + } + 
} +} + +/** + * Some very basic tests just to demonstrate the use of the test framework (and verify that it + * works). + */ +class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCoreMockBackend] { + /** * Very simple one stage job. Backend successfully completes each task, one by one */ test("super simple job") { - submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) - while (backendHasWorkToDo) { - val task = backend.runningTasks.last + def runBackend(): Unit = { + val task = backend.beginTask() backend.taskSuccess(task, 42) } + withBackend(runBackend _) { + val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + } assert(results === (0 until 10).map { _ -> 42 }.toMap) assertDataStructuresEmpty() } @@ -215,11 +400,6 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L * Backend successfully completes each task */ test("multi-stage job") { - val a = new MockRDD(sc, 2, Nil) - val b = shuffle(10, a) - val c = shuffle(20, a) - val d = join(30, b, c) - submit(d, (0 until 30).toArray) def stageToOutputParts(stageId: Int): Int = { stageId match { @@ -229,9 +409,13 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L } } - while (backendHasWorkToDo) { - assert(backend.runningTasks.nonEmpty) - val taskDescription = backend.runningTasks.last + val a = new MockRDD(sc, 2, Nil) + val b = shuffle(10, a) + val c = shuffle(20, a) + val d = join(30, b, c) + + def runBackend(): Unit = { + val taskDescription = backend.beginTask() val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet val task = taskSet.tasks(taskDescription.index) @@ -250,6 +434,11 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L backend.taskSuccess(taskDescription, 4321 + partition) } } + withBackend(runBackend _) { + val jobFuture = submit(d, (0 until 30).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + } assert(results === (0 until 30).map { idx => idx -> (4321 + idx) }.toMap) assertDataStructuresEmpty() } @@ -263,12 +452,11 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L val input = new MockRDD(sc, 2, Nil) val shuffledRdd = shuffle(10, input) val shuffleId = shuffledRdd.shuffleDeps.head.shuffleId - submit(shuffledRdd, (0 until 10).toArray) val stageToAttempts = new HashMap[Int, HashSet[Int]]() - while (backendHasWorkToDo) { - val taskDescription = backend.runningTasks.last + def runBackend(): Unit = { + val taskDescription = backend.beginTask() val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet val task = taskSet.tasks(taskDescription.index) stageToAttempts.getOrElseUpdate(task.stageId, new HashSet()) += task.stageAttemptId @@ -289,102 +477,29 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L backend.taskSuccess(taskDescription, 42 + partition) } } + withBackend(runBackend _) { + val jobFuture = submit(shuffledRdd, (0 until 10).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + } assert(results === (0 until 10).map { idx => idx -> (42 + idx) }.toMap) assert(stageToAttempts === Map(0 -> Set(0, 1), 1 -> Set(0, 1))) assertDataStructuresEmpty() } -} - -/** - * A very simple mock backend that can just run one task at a time. 
- */ -private[spark] class SingleCoreMockBackend( - conf: SparkConf, - var taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { - - val cores = 1 - - override def start(): Unit = {} - - override def stop(): Unit = {} - - override def defaultParallelism(): Int = conf.getInt("spark.default.parallelism", cores) - - var freeCores = cores - val localExecutorId = SparkContext.DRIVER_IDENTIFIER - val localExecutorHostname = "localhost" - val env = SparkEnv.get - - val runningTasks = ArrayBuffer[TaskDescription]() - - /** - * This is called by the scheduler whenever it has tasks it would like to schedule - */ - override def reviveOffers(): Unit = { - val offers = Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores)) - val newTasks = taskScheduler.resourceOffers(offers).flatten - synchronized { - freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK - runningTasks ++= newTasks - } - } - - def taskSuccess(task: TaskDescription, result: Any): Unit = { - val ser = env.serializer.newInstance() - val resultBytes = ser.serialize(result) - val metrics = new TaskMetrics - val directResult = new DirectTaskResult(resultBytes, Seq()) // no accumulator updates - val serializedDirectResult = ser.serialize(directResult) - taskScheduler.statusUpdate(task.taskId, TaskState.FINISHED, serializedDirectResult) - synchronized { - freeCores += taskScheduler.CPUS_PER_TASK - runningTasks -= task - } - reviveOffers() - } - def failTask(task: TaskDescription, state: TaskState, result: Any): Unit = { - val ser = env.serializer.newInstance() - val resultBytes = ser.serialize(result) - taskScheduler.statusUpdate(task.taskId, state, resultBytes) - if (TaskState.isFinished(state)) { - synchronized { - freeCores += taskScheduler.CPUS_PER_TASK - runningTasks -= task - } - reviveOffers() + test("job failure after 4 attempts") { + def runBackend(): Unit = { + val task = backend.beginTask() + val failure = new ExceptionFailure(new RuntimeException("test task failure"), Seq()) + backend.failTask(task, TaskState.FAILED, failure) } - } - -} - -class MockRDD( - sc: SparkContext, - val numPartitions: Int, - val shuffleDeps: Seq[ShuffleDependency[Int, Int, Nothing]] -) extends RDD[(Int, Int)](sc, shuffleDeps) with Serializable { - - MockRDD.validate(numPartitions, shuffleDeps) - - override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = - throw new RuntimeException("should not be reached") - override def getPartitions: Array[Partition] = (0 until numPartitions).map(i => new Partition { - override def index: Int = i - }).toArray - override def getPreferredLocations(split: Partition): Seq[String] = Nil - override def toString: String = "MockRDD " + id -} - -object MockRDD extends AssertionsHelper with TripleEquals { - /** - * make sure all the shuffle dependencies have a consistent number of output partitions - * (mostly to make sure the test setup makes sense, not that Spark itself would get this wrong) - */ - def validate(numPartitions: Int, dependencies: Seq[ShuffleDependency[_, _, _]]): Unit = { - dependencies.foreach { dependency => - val partitioner = dependency.partitioner - assert(partitioner != null) - assert(partitioner.numPartitions === numPartitions) + withBackend(runBackend _) { + val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + failure.getMessage.contains("test task failure") } + assert(results.isEmpty) + assertDataStructuresEmpty(noFailure = false) } } From 
421c2a18c1a5799c75884f51c73205a7b92a6166 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 13 May 2016 17:13:07 -0500 Subject: [PATCH 04/35] remove MultiExecutorBackend for now --- .../scheduler/SchedulerIntegrationSuite.scala | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 40a1bb4ce7c3b..9b4bd2809dd24 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -304,37 +304,6 @@ private[spark] class SingleCoreMockBackend( } } -class MultiExecutorBackend( - conf: SparkConf, - taskScheduler: TaskSchedulerImpl) extends MockBackend(conf, taskScheduler) { - - val nHosts = 10 - val nExecutorsPerHost = 4 - val nCoresPerExecutor = 2 - - val executorIdToExecutor: Map[String, ExecutorTaskStatus] = (0 until nHosts).flatMap{ hostIdx => - val hostName = s"host-$hostIdx" - (0 until nExecutorsPerHost).map { execIdx => - val executorId = (hostIdx * nExecutorsPerHost + execIdx).toString - executorId -> new ExecutorTaskStatus(hostName, executorId, nCoresPerExecutor) - } - }.toMap - - val totalCores = nHosts * nExecutorsPerHost * nCoresPerExecutor - freeCores = totalCores - - override def generateOffers(): Seq[WorkerOffer] = { - // always offer all cores available on all executors - executorIdToExecutor.values.filter { exec => - exec.freeCores > taskScheduler.CPUS_PER_TASK - }.map { exec => - new WorkerOffer(exec.executorId, exec.host, exec.freeCores) - }.toSeq - } - - override def defaultParallelism(): Int = conf.getInt("spark.default.parallelism", totalCores) -} - class ExecutorTaskStatus(val host: String, val executorId: String, var freeCores: Int) class MockRDD( From c0911874783bdfadd7749a66a935cba2669f4ffa Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 17 May 2016 10:58:50 -0500 Subject: [PATCH 05/35] remove uncertain comment about messageScheduler --- .../scala/org/apache/spark/scheduler/DAGScheduler.scala | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index f9e9be40ab8e1..0c67becbc1b75 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1283,13 +1283,8 @@ class DAGScheduler( // TODO: Cancel running tasks in the stage logInfo(s"Resubmitting $mapStage (${mapStage.name}) and " + s"$failedStage (${failedStage.name}) due to fetch failure") - // We might get lots of fetch failed for this stage, from lots of executors. - // Its better if we can resubmit for all the failed executors at one time, so lets - // just wait a *bit* before we resubmit. 
messageScheduler.schedule(new Runnable { - override def run(): Unit = { - eventProcessLoop.post(ResubmitFailedStages) - } + override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages) }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS) } failedStages += failedStage From 3b67b2a950d75bc7f532c7a1151aaee864dc541f Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 17 May 2016 13:19:17 -0500 Subject: [PATCH 06/35] cleanup --- .../org/apache/spark/util/EventLoop.scala | 2 +- .../spark/scheduler/DAGSchedulerSuite.scala | 1 - .../scheduler/SchedulerIntegrationSuite.scala | 120 +++++++++--------- 3 files changed, 64 insertions(+), 59 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/EventLoop.scala b/core/src/main/scala/org/apache/spark/util/EventLoop.scala index eefe934c63883..3ea9139e11027 100644 --- a/core/src/main/scala/org/apache/spark/util/EventLoop.scala +++ b/core/src/main/scala/org/apache/spark/util/EventLoop.scala @@ -18,7 +18,7 @@ package org.apache.spark.util import java.util.concurrent.{BlockingQueue, LinkedBlockingDeque} -import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger} +import java.util.concurrent.atomic.AtomicBoolean import scala.util.control.NonFatal diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 9c004e0dd76dc..60051ef1f0d08 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -2070,7 +2070,6 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou } } - object DAGSchedulerSuite { def makeMapStatus(host: String, reduces: Int, sizes: Byte = 2): MapStatus = MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes)) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 9b4bd2809dd24..fb4c0578a4cfd 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -33,7 +33,6 @@ import org.apache.spark.TaskState._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.scheduler.DAGSchedulerSuite._ import org.apache.spark.util.CallSite /** @@ -157,7 +156,9 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa * Helper which makes it a little easier to setup a test, which starts a mock backend in another * thread, responding to tasks with your custom function. You also supply the "body" of your * test, where you submit jobs to your backend, wait for them to complete, then check - * whatever conditions you want. + * whatever conditions you want. Note that this is *not* safe to all bad backends -- + * in particular, your `backendFunc` has to return quickly, it can't throw errors, (instead + * it should send back the right TaskEndReason */ def withBackend(backendFunc: () => Unit)(testBody: => Unit): Unit = { val backendContinue = new AtomicBoolean(true) @@ -183,61 +184,23 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa } +/** + * Helper for running a backend in integration tests, does a bunch of the book-keeping + * so individual tests can focus on just responding to tasks. 
Individual tests will use + * [[beginTask]], [[taskSuccess]], and [[taskFailed]]. + */ private[spark] abstract class MockBackend( conf: SparkConf, var taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { - private val assignedTasksWaitingToRun = ArrayBuffer[TaskDescription]() - private val runningTasks = ArrayBuffer[TaskDescription]() - - def assignTasks(tasks: Seq[TaskDescription]): Unit = assignedTasksWaitingToRun.synchronized { - assignedTasksWaitingToRun ++= tasks - } - - def endTask(task: TaskDescription): Unit = runningTasks.synchronized { - runningTasks -= task - } - - def beginTask(): TaskDescription = { - val toRun = assignedTasksWaitingToRun.synchronized { - assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) - } - runningTasks.synchronized { runningTasks += toRun } - toRun - } - - def hasTasks: Boolean = { - assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty - } - - def hasTasksWaitingToRun: Boolean = { - assignedTasksWaitingToRun.nonEmpty - } - - override def start(): Unit = {} - - override def stop(): Unit = {} - - var freeCores: Int = _ - val env = SparkEnv.get - - def executorIdToExecutor: Map[String, ExecutorTaskStatus] - - def generateOffers(): Seq[WorkerOffer] - /** - * This is called by the scheduler whenever it has tasks it would like to schedule + * Test backends should call this to get a task that has been assigned to them by the scheduler. + * Each task should be responded to with either [[taskSuccess]] or [[taskFailed]]. */ - override def reviveOffers(): Unit = { - val offers: Seq[WorkerOffer] = generateOffers() - val newTasks = taskScheduler.resourceOffers(offers).flatten - synchronized { - newTasks.foreach { task => - executorIdToExecutor(task.executorId).freeCores -= taskScheduler.CPUS_PER_TASK - } - freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun ++= newTasks - } + def beginTask(): TaskDescription = synchronized { + val toRun = assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) + runningTasks += toRun + toRun } /** @@ -264,7 +227,7 @@ private[spark] abstract class MockBackend( * Tell the scheduler the task failed, with the given state and result (probably ExceptionFailure * or FetchFailed). Also updates some internal state for this mock. 
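 * As an illustrative sketch only (the shuffleId and "hostA" values are whatever the test
 * itself set up), a fetch failure can be simulated with:
 * {{{
 *   val fetchFailed = FetchFailed(
 *     DAGSchedulerSuite.makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored")
 *   backend.taskFailed(taskDescription, TaskState.FAILED, fetchFailed)
 * }}}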
*/ - def failTask(task: TaskDescription, state: TaskState, result: Any): Unit = { + def taskFailed(task: TaskDescription, state: TaskState, result: Any): Unit = { endTask(task) val ser = env.serializer.newInstance() val resultBytes = ser.serialize(result) @@ -278,6 +241,47 @@ private[spark] abstract class MockBackend( reviveOffers() } } + + private val assignedTasksWaitingToRun = ArrayBuffer[TaskDescription]() + private val runningTasks = ArrayBuffer[TaskDescription]() + + def endTask(task: TaskDescription): Unit = synchronized { + runningTasks -= task + } + + def hasTasks: Boolean = synchronized { + assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty + } + + def hasTasksWaitingToRun: Boolean = synchronized { + assignedTasksWaitingToRun.nonEmpty + } + + override def start(): Unit = {} + + override def stop(): Unit = {} + + var freeCores: Int = _ + val env = SparkEnv.get + + def executorIdToExecutor: Map[String, ExecutorTaskStatus] + + def generateOffers(): Seq[WorkerOffer] + + /** + * This is called by the scheduler whenever it has tasks it would like to schedule + */ + override def reviveOffers(): Unit = { + val offers: Seq[WorkerOffer] = generateOffers() + val newTasks = taskScheduler.resourceOffers(offers).flatten + synchronized { + newTasks.foreach { task => + executorIdToExecutor(task.executorId).freeCores -= taskScheduler.CPUS_PER_TASK + } + freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK + assignedTasksWaitingToRun ++= newTasks + } + } } /** @@ -398,7 +402,8 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor (task.stageId, task.stageAttemptId, task.partitionId) match { case (stage, 0, _) if stage < 4 => - backend.taskSuccess(taskDescription, makeMapStatus("hostA", stageToOutputParts(stage))) + backend.taskSuccess(taskDescription, + DAGSchedulerSuite.makeMapStatus("hostA", stageToOutputParts(stage))) case (4, 0, partition) => backend.taskSuccess(taskDescription, 4321 + partition) } @@ -438,10 +443,11 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor (task.stageId, task.stageAttemptId, task.partitionId) match { case (0, _, _) => - backend.taskSuccess(taskDescription, makeMapStatus("hostA", 10)) + backend.taskSuccess(taskDescription, DAGSchedulerSuite.makeMapStatus("hostA", 10)) case (1, 0, 0) => - val fetchFailed = FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored") - backend.failTask(taskDescription, TaskState.FAILED, fetchFailed) + val fetchFailed = FetchFailed( + DAGSchedulerSuite.makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored") + backend.taskFailed(taskDescription, TaskState.FAILED, fetchFailed) case (1, _, partition) => backend.taskSuccess(taskDescription, 42 + partition) } @@ -460,7 +466,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor def runBackend(): Unit = { val task = backend.beginTask() val failure = new ExceptionFailure(new RuntimeException("test task failure"), Seq()) - backend.failTask(task, TaskState.FAILED, failure) + backend.taskFailed(task, TaskState.FAILED, failure) } withBackend(runBackend _) { val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) From 79bc38416a914cebb33beaef3ec3179528848bca Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 09:10:46 -0500 Subject: [PATCH 07/35] add BlacklistIntegrationSuite and corresponding refactoring --- .../scheduler/BlacklistIntegrationSuite.scala | 145 ++++++++++++++++++ .../scheduler/SchedulerIntegrationSuite.scala | 67 ++++++-- 2 files changed, 
196 insertions(+), 16 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala new file mode 100644 index 0000000000000..3225866e317dd --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler + +import scala.concurrent.Await +import scala.concurrent.duration._ + +import org.apache.spark._ + +class BlacklistIntegrationSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend]{ + + val badHost = "host-0" + + /** + * This backend just always fails if the task is executed on a bad host, but otherwise succeeds + * all tasks. + */ + def badHostBackend(): Unit = { + val task = backend.beginTask() + val host = backend.executorIdToExecutor(task.executorId).host + if (host == badHost) { + val failure = new ExceptionFailure(new RuntimeException("I'm a bad host!"), Seq()) + backend.taskFailed(task, TaskState.FAILED, failure) + } else { + backend.taskSuccess(task, 42) + } + } + + // Test demonstrating the issue -- without a config change, the scheduler keeps scheduling + // according to locality preferences, and so the job fails + testScheduler("If preferred node is bad, without blacklist job will fail") { + val rdd = new MockRDDWithLocalityPrefs(sc, 10, Nil, badHost) + withBackend(badHostBackend _) { + val jobFuture = submit(rdd, (0 until 10).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + } + assert(results.isEmpty) + assertDataStructuresEmpty(noFailure = false) + } + + // even with the blacklist turned on, if maxTaskFailures is not more than the number + // of executors on the bad node, then locality preferences will lead to us cycling through + // the executors on the bad node, and still failing the job + testScheduler( + "With blacklist on, job will still fail if there are too many bad executors on bad host", + extraConfs = Seq( + // just set this to something much longer than the test duration + ("spark.scheduler.executorTaskBlacklistTime", "10000000") + ) + ) { + val rdd = new MockRDDWithLocalityPrefs(sc, 10, Nil, badHost) + withBackend(badHostBackend _) { + val jobFuture = submit(rdd, (0 until 10).toArray) + val duration = Duration(3, SECONDS) + Await.ready(jobFuture, duration) + } + assert(results.isEmpty) + assertDataStructuresEmpty(noFailure = false) + } + + // Here we run with the blacklist on, and maxTaskFailures high enough that we'll eventually + // schedule on a good node and succeed the job + testScheduler( + "Bad node with 
multiple executors, job will still succeed with the right confs", + extraConfs = Seq( + // just set this to something much longer than the test duration + ("spark.scheduler.executorTaskBlacklistTime", "10000000"), + // this has to be higher than the number of executors on the bad host + ("spark.task.maxFailures", "5"), + // just to avoid this test taking too long + ("spark.locality.wait", "10ms") + ) + ) { + val rdd = new MockRDDWithLocalityPrefs(sc, 10, Nil, badHost) + withBackend(badHostBackend _) { + val jobFuture = submit(rdd, (0 until 10).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + } + assert(results === (0 until 10).map { _ -> 42 }.toMap) + assertDataStructuresEmpty(noFailure = true) + } + +} + +class MultiExecutorMockBackend( + conf: SparkConf, + taskScheduler: TaskSchedulerImpl) extends MockBackend(conf, taskScheduler) { + + val nHosts = conf.getInt("spark.testing.nHosts", 5) + val nExecutorsPerHost = conf.getInt("spark.testing.nExecutorsPerHost", 4) + val nCoresPerExecutor = conf.getInt("spark.testing.nCoresPerExecutor", 2) + + override val executorIdToExecutor: Map[String, ExecutorTaskStatus] = { + (0 until nHosts).flatMap { hostIdx => + val hostName = "host-" + hostIdx + (0 until nExecutorsPerHost).map { subIdx => + val executorId = (hostIdx * nExecutorsPerHost + subIdx).toString + executorId -> + ExecutorTaskStatus(host = hostName, executorId = executorId, nCoresPerExecutor) + } + }.toMap + } + + override def generateOffers(): Seq[WorkerOffer] = { + executorIdToExecutor.values.map { exec => + WorkerOffer(executorId = exec.executorId, host = exec.host, + cores = exec.freeCores) + }.toSeq + } + + override def defaultParallelism(): Int = nHosts * nExecutorsPerHost * nCoresPerExecutor + + override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { + // Its OK for this to be a no-op, because even if a backend does implement killTask, + // it really can only be "best-effort" in any case, and the scheduler should be robust to that. + // And in fact its reasonably simulating a case where a real backend finishes tasks in between + // the time when the scheduler sends the msg to kill tasks, and the backend receives the msg. + } +} + +class MockRDDWithLocalityPrefs( + sc: SparkContext, + numPartitions: Int, + shuffleDeps: Seq[ShuffleDependency[Int, Int, Nothing]], + val preferredLoc: String) extends MockRDD(sc, numPartitions, shuffleDeps) { + override def getPreferredLocations(split: Partition): Seq[String] = { + Seq(preferredLoc) + } +} diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index fb4c0578a4cfd..63820979b309c 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.TaskState._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.util.CallSite +import org.apache.spark.util.{CallSite, Utils} /** * Tests for the entire scheduler code -- DAGScheduler, TaskSchedulerImpl, TaskSets, @@ -44,7 +44,7 @@ import org.apache.spark.util.CallSite * disconnecting, etc.). 
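 * A minimal test built on this harness looks roughly like the following sketch (the RDD
 * size and the result value 42 are arbitrary):
 * {{{
 *   def runBackend(): Unit = {
 *     val task = backend.beginTask()
 *     backend.taskSuccess(task, 42)
 *   }
 *   withBackend(runBackend _) {
 *     val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray)
 *     Await.ready(jobFuture, Duration(1, SECONDS))
 *   }
 *   assertDataStructuresEmpty()
 * }}}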
*/ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends SparkFunSuite - with BeforeAndAfter with LocalSparkContext { + with LocalSparkContext { val conf = new SparkConf /** Set of TaskSets the DAGScheduler has requested executed. */ @@ -54,12 +54,25 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa var scheduler: DAGScheduler = null var backend: T = _ - before { + override def beforeEach(): Unit = { runningTaskSets.clear() results.clear() failure = null + super.beforeEach() + } + + override def afterEach(): Unit = { + super.afterEach() + taskScheduler.stop() + backend.stop() + scheduler.stop() + } + + def setupScheduler(conf: SparkConf): Unit = { + conf.setAppName(this.getClass().getSimpleName()) val backendClassName = implicitly[ClassTag[T]].runtimeClass.getName() - sc = new SparkContext(s"mock[${backendClassName}]", this.getClass().getSimpleName()) + conf.setMaster(s"mock[${backendClassName}]") + sc = new SparkContext(conf) backend = sc.schedulerBackend.asInstanceOf[T] taskScheduler = new TaskSchedulerImpl(sc) { override def submitTasks(taskSet: TaskSet): Unit = { @@ -78,10 +91,17 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa taskScheduler.setDAGScheduler(scheduler) } - after { - taskScheduler.stop() - backend.stop() - scheduler.stop() + def testScheduler(name: String)(testBody: => Unit): Unit = { + testScheduler(name, Seq())(testBody) + } + + def testScheduler(name: String, extraConfs: Seq[(String, String)])(testBody: => Unit): Unit = { + test(name) { + val conf = new SparkConf() + extraConfs.foreach{ case (k, v) => conf.set(k, v)} + setupScheduler(conf) + testBody + } } val results = new HashMap[Int, Any]() @@ -110,11 +130,26 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa protected def assertDataStructuresEmpty(noFailure: Boolean = true): Unit = { if (noFailure) { - assert(failure === null) + // When a job fails, we terminate before waiting for all the task end events to come in, + // so there might still be a running task set + assert(runningTaskSets.isEmpty) + assert(!backend.hasTasks) + if (failure != null) { + // if there is a job failure, it can be a bit hard to tease the job failure msg apart + // from the test failure msg, so we do a little extra formatting + val msg = + raw""" + | There was a failed job. + | ----- Begin Job Failure Msg ----- + | ${Utils.exceptionString(failure)} + + | ----- End Job Failure Msg ---- + """. + stripMargin + fail(msg) + } } assert(scheduler.activeJobs.isEmpty) - assert(runningTaskSets.isEmpty) - assert(!backend.hasTasks) } /** @@ -308,7 +343,7 @@ private[spark] class SingleCoreMockBackend( } } -class ExecutorTaskStatus(val host: String, val executorId: String, var freeCores: Int) +case class ExecutorTaskStatus(host: String, executorId: String, var freeCores: Int) class MockRDD( sc: SparkContext, @@ -350,7 +385,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor /** * Very simple one stage job. 
Backend successfully completes each task, one by one */ - test("super simple job") { + testScheduler("super simple job") { def runBackend(): Unit = { val task = backend.beginTask() backend.taskSuccess(task, 42) @@ -372,7 +407,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor * * Backend successfully completes each task */ - test("multi-stage job") { + testScheduler("multi-stage job") { def stageToOutputParts(stageId: Int): Int = { stageId match { @@ -422,7 +457,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor * (a) map output is available whenever we run stage 1 * (b) we get a second attempt for stage 0 & stage 1 */ - test("job with fetch failure") { + testScheduler("job with fetch failure") { val input = new MockRDD(sc, 2, Nil) val shuffledRdd = shuffle(10, input) val shuffleId = shuffledRdd.shuffleDeps.head.shuffleId @@ -462,7 +497,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor assertDataStructuresEmpty() } - test("job failure after 4 attempts") { + testScheduler("job failure after 4 attempts") { def runBackend(): Unit = { val task = backend.beginTask() val failure = new ExceptionFailure(new RuntimeException("test task failure"), Seq()) From 8349b76ada807a9ce351cebbf4eddb88f67ca138 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 10:37:19 -0500 Subject: [PATCH 08/35] cleanup --- .../spark/scheduler/BlacklistIntegrationSuite.scala | 7 ------- .../spark/scheduler/SchedulerIntegrationSuite.scala | 11 ++++++----- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index 3225866e317dd..5283fcb1d8892 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -117,13 +117,6 @@ class MultiExecutorMockBackend( }.toMap } - override def generateOffers(): Seq[WorkerOffer] = { - executorIdToExecutor.values.map { exec => - WorkerOffer(executorId = exec.executorId, host = exec.host, - cores = exec.freeCores) - }.toSeq - } - override def defaultParallelism(): Int = nHosts * nExecutorsPerHost * nCoresPerExecutor override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 63820979b309c..77d308b16514b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -301,7 +301,12 @@ private[spark] abstract class MockBackend( def executorIdToExecutor: Map[String, ExecutorTaskStatus] - def generateOffers(): Seq[WorkerOffer] + def generateOffers(): Seq[WorkerOffer] = { + executorIdToExecutor.values.map { exec => + WorkerOffer(executorId = exec.executorId, host = exec.host, + cores = exec.freeCores) + }.toSeq + } /** * This is called by the scheduler whenever it has tasks it would like to schedule @@ -337,10 +342,6 @@ private[spark] class SingleCoreMockBackend( val executorIdToExecutor: Map[String, ExecutorTaskStatus] = Map( localExecutorId -> new ExecutorTaskStatus(localExecutorHostname, localExecutorId, freeCores) ) - - override def generateOffers(): Seq[WorkerOffer] = { - 
Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores)) - } } case class ExecutorTaskStatus(host: String, executorId: String, var freeCores: Int) From 7050b49b4dfd6e7b4e3e966e9e15e3558a343d67 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 10:51:13 -0500 Subject: [PATCH 09/35] comments --- core/src/main/scala/org/apache/spark/SparkContext.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index c3f51923e73a7..33a5cce8d37c8 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2421,6 +2421,8 @@ object SparkContext extends Logging { (backend, scheduler) case MOCK_REGEX(backendClassName) => + // This is a Scheduler integration test, so we setup a mock backend. Not a documented + // feature or meant to be publicly visible at all. val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) val backendClass = Utils.classForName(backendClassName) val ctor = backendClass.getConstructor(classOf[SparkConf], classOf[TaskSchedulerImpl]) @@ -2528,6 +2530,7 @@ object SparkContext extends Logging { * A collection of regexes for extracting information from the master string. */ private object SparkMasterRegex { + /** Used for Scheduler integration tests, to plug in a mock backend */ val MOCK_REGEX = """mock\[(.*)\]""".r // Regular expression used for local[N] and local[*] master formats val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r From 00953764f8d6a3807329ddff028c64a913cba662 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 10:58:04 -0500 Subject: [PATCH 10/35] move dummy killTask to MockBackend, otherwise occasional problems even in SingleCoreMockBackend when killTask is unsupported --- .../apache/spark/scheduler/BlacklistIntegrationSuite.scala | 7 ------- .../apache/spark/scheduler/SchedulerIntegrationSuite.scala | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index 5283fcb1d8892..3cb07a404d39b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -118,13 +118,6 @@ class MultiExecutorMockBackend( } override def defaultParallelism(): Int = nHosts * nExecutorsPerHost * nCoresPerExecutor - - override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { - // Its OK for this to be a no-op, because even if a backend does implement killTask, - // it really can only be "best-effort" in any case, and the scheduler should be robust to that. - // And in fact its reasonably simulating a case where a real backend finishes tasks in between - // the time when the scheduler sends the msg to kill tasks, and the backend receives the msg. 
- } } class MockRDDWithLocalityPrefs( diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 77d308b16514b..a37d2f06a4f0e 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -322,6 +322,13 @@ private[spark] abstract class MockBackend( assignedTasksWaitingToRun ++= newTasks } } + + override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { + // Its OK for this to be a no-op, because even if a backend does implement killTask, + // it really can only be "best-effort" in any case, and the scheduler should be robust to that. + // And in fact its reasonably simulating a case where a real backend finishes tasks in between + // the time when the scheduler sends the msg to kill tasks, and the backend receives the msg. + } } /** From cb5860ffe8a995ac7566e80a774cf57c498f6182 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 10:58:04 -0500 Subject: [PATCH 11/35] move dummy killTask to MockBackend, otherwise occasional problems even in SingleCoreMockBackend when killTask is unsupported --- .../apache/spark/scheduler/BlacklistIntegrationSuite.scala | 7 ------- .../apache/spark/scheduler/SchedulerIntegrationSuite.scala | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index 5283fcb1d8892..3cb07a404d39b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -118,13 +118,6 @@ class MultiExecutorMockBackend( } override def defaultParallelism(): Int = nHosts * nExecutorsPerHost * nCoresPerExecutor - - override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { - // Its OK for this to be a no-op, because even if a backend does implement killTask, - // it really can only be "best-effort" in any case, and the scheduler should be robust to that. - // And in fact its reasonably simulating a case where a real backend finishes tasks in between - // the time when the scheduler sends the msg to kill tasks, and the backend receives the msg. - } } class MockRDDWithLocalityPrefs( diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 77d308b16514b..a37d2f06a4f0e 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -322,6 +322,13 @@ private[spark] abstract class MockBackend( assignedTasksWaitingToRun ++= newTasks } } + + override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { + // Its OK for this to be a no-op, because even if a backend does implement killTask, + // it really can only be "best-effort" in any case, and the scheduler should be robust to that. + // And in fact its reasonably simulating a case where a real backend finishes tasks in between + // the time when the scheduler sends the msg to kill tasks, and the backend receives the msg. 
+ } } /** From 8034995249e16e80fa7db5e709c38d1444b98f08 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 12:30:09 -0500 Subject: [PATCH 12/35] take advantage of ExternalClusteManager extension --- .../scala/org/apache/spark/SparkContext.scala | 12 ---- ...che.spark.scheduler.ExternalClusterManager | 3 +- .../scheduler/SchedulerIntegrationSuite.scala | 69 ++++++++++++++----- 3 files changed, 53 insertions(+), 31 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 33a5cce8d37c8..e391599336074 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2420,16 +2420,6 @@ object SparkContext extends Logging { scheduler.initialize(backend) (backend, scheduler) - case MOCK_REGEX(backendClassName) => - // This is a Scheduler integration test, so we setup a mock backend. Not a documented - // feature or meant to be publicly visible at all. - val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) - val backendClass = Utils.classForName(backendClassName) - val ctor = backendClass.getConstructor(classOf[SparkConf], classOf[TaskSchedulerImpl]) - val backend = ctor.newInstance(sc.getConf, scheduler).asInstanceOf[SchedulerBackend] - scheduler.initialize(backend) - (backend, scheduler) - case LOCAL_N_REGEX(threads) => def localCpuCount: Int = Runtime.getRuntime.availableProcessors() // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads. @@ -2530,8 +2520,6 @@ object SparkContext extends Logging { * A collection of regexes for extracting information from the master string. */ private object SparkMasterRegex { - /** Used for Scheduler integration tests, to plug in a mock backend */ - val MOCK_REGEX = """mock\[(.*)\]""".r // Regular expression used for local[N] and local[*] master formats val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r // Regular expression for local[N, maxRetries], used in tests with failing tasks diff --git a/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager b/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager index 3c570ffd8f566..757c6d2296aff 100644 --- a/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager +++ b/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager @@ -1 +1,2 @@ -org.apache.spark.scheduler.DummyExternalClusterManager \ No newline at end of file +org.apache.spark.scheduler.DummyExternalClusterManager +org.apache.spark.scheduler.MockExternalClusterManager \ No newline at end of file diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index a37d2f06a4f0e..204a2bef610e5 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -26,7 +26,6 @@ import scala.reflect.ClassTag import org.scalactic.TripleEquals import org.scalatest.Assertions.AssertionsHelper -import org.scalatest.BeforeAndAfter import org.apache.spark._ import org.apache.spark.TaskState._ @@ -47,15 +46,14 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa with LocalSparkContext { val conf = new SparkConf - /** Set of TaskSets the 
DAGScheduler has requested executed. */ - val runningTaskSets = HashSet[TaskSet]() - - var taskScheduler: TaskSchedulerImpl = null + var taskScheduler: TestTaskScheduler = null var scheduler: DAGScheduler = null var backend: T = _ override def beforeEach(): Unit = { - runningTaskSets.clear() + if (taskScheduler != null) { + taskScheduler.runningTaskSets.clear() + } results.clear() failure = null super.beforeEach() @@ -74,17 +72,7 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa conf.setMaster(s"mock[${backendClassName}]") sc = new SparkContext(conf) backend = sc.schedulerBackend.asInstanceOf[T] - taskScheduler = new TaskSchedulerImpl(sc) { - override def submitTasks(taskSet: TaskSet): Unit = { - runningTaskSets += taskSet - super.submitTasks(taskSet) - } - - override def taskSetFinished(manager: TaskSetManager): Unit = { - runningTaskSets -= manager.taskSet - super.taskSetFinished(manager) - } - } + taskScheduler = sc.taskScheduler.asInstanceOf[TestTaskScheduler] taskScheduler.initialize(sc.schedulerBackend) backend.taskScheduler = taskScheduler scheduler = new DAGScheduler(sc, taskScheduler) @@ -132,7 +120,7 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa if (noFailure) { // When a job fails, we terminate before waiting for all the task end events to come in, // so there might still be a running task set - assert(runningTaskSets.isEmpty) + assert(taskScheduler.runningTaskSets.isEmpty) assert(!backend.hasTasks) if (failure != null) { // if there is a job failure, it can be a bit hard to tease the job failure msg apart @@ -384,6 +372,51 @@ object MockRDD extends AssertionsHelper with TripleEquals { } } +/** Simple cluster manager that wires up our mock backend. */ +private class MockExternalClusterManager extends ExternalClusterManager { + + val MOCK_REGEX = """mock\[(.*)\]""".r + def canCreate(masterURL: String): Boolean = MOCK_REGEX.findFirstIn(masterURL).isDefined + + def createTaskScheduler( + sc: SparkContext, + masterURL: String): TaskScheduler = { + new TestTaskScheduler(sc) + } + + def createSchedulerBackend( + sc: SparkContext, + masterURL: String, + scheduler: TaskScheduler): SchedulerBackend = { + masterURL match { + case MOCK_REGEX(backendClassName) => + val backendClass = Utils.classForName(backendClassName) + val ctor = backendClass.getConstructor(classOf[SparkConf], classOf[TaskSchedulerImpl]) + ctor.newInstance(sc.getConf, scheduler).asInstanceOf[SchedulerBackend] + } + } + + def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { + scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) + } +} + +/** TaskSchedulerImpl that just tracks a tiny bit more state to enable checks in tests. */ +class TestTaskScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) { + /** Set of TaskSets the DAGScheduler has requested executed. */ + val runningTaskSets = HashSet[TaskSet]() + + override def submitTasks(taskSet: TaskSet): Unit = { + runningTaskSets += taskSet + super.submitTasks(taskSet) + } + + override def taskSetFinished(manager: TaskSetManager): Unit = { + runningTaskSets -= manager.taskSet + super.taskSetFinished(manager) + } +} + /** * Some very basic tests just to demonstrate the use of the test framework (and verify that it * works). 
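With the ExternalClusterManager wiring above, any master URL of the form
mock[FullBackendClassName] is picked up by MockExternalClusterManager (registered through the
META-INF/services file), which creates a TestTaskScheduler and reflectively instantiates the
named backend. A rough sketch of how a concrete suite plugs in (the suite and test names below
are illustrative, not part of the patches):

    // The type parameter selects the backend; setupScheduler builds the master URL from its
    // runtime class name (mock[...SingleCoreMockBackend]), and the service-loaded
    // MockExternalClusterManager resolves that to a TestTaskScheduler plus that backend.
    class ExampleSchedulerSuite extends SchedulerIntegrationSuite[SingleCoreMockBackend] {
      testScheduler("illustrative test") {
        // drive `backend` via beginTask()/taskSuccess()/taskFailed() and submit jobs here
      }
    }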
From 360c7cdf7731fd173bbfaf5e66d413f743604e60 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 12:39:52 -0500 Subject: [PATCH 13/35] cleanup --- .../org/apache/spark/scheduler/SchedulerIntegrationSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 204a2bef610e5..dc2749cfbe4c3 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -130,7 +130,6 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa | There was a failed job. | ----- Begin Job Failure Msg ----- | ${Utils.exceptionString(failure)} - | ----- End Job Failure Msg ---- """. stripMargin @@ -312,6 +311,7 @@ private[spark] abstract class MockBackend( } override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { + // We have to implement this b/c of SPARK-15385. // Its OK for this to be a no-op, because even if a backend does implement killTask, // it really can only be "best-effort" in any case, and the scheduler should be robust to that. // And in fact its reasonably simulating a case where a real backend finishes tasks in between From c7a78b0df04e86ab942617f84106085c3e750e00 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Thu, 19 May 2016 12:58:01 -0500 Subject: [PATCH 14/35] performance updates to mock backend + some utils --- .../scheduler/SchedulerIntegrationSuite.scala | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index dc2749cfbe4c3..9f06d7902ef6a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -17,9 +17,11 @@ package org.apache.spark.scheduler import java.util.Properties +import java.util.concurrent.ArrayBlockingQueue import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} +import scala.collection.JavaConverters._ import scala.concurrent.{Await, Future} import scala.concurrent.duration.{Duration, SECONDS} import scala.reflect.ClassTag @@ -118,10 +120,6 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa protected def assertDataStructuresEmpty(noFailure: Boolean = true): Unit = { if (noFailure) { - // When a job fails, we terminate before waiting for all the task end events to come in, - // so there might still be a running task set - assert(taskScheduler.runningTaskSets.isEmpty) - assert(!backend.hasTasks) if (failure != null) { // if there is a job failure, it can be a bit hard to tease the job failure msg apart // from the test failure msg, so we do a little extra formatting @@ -135,6 +133,11 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa stripMargin fail(msg) } + // When a job fails, we terminate before waiting for all the task end events to come in, + // so there might still be a running task set. 
That is why we only check these conditions + // when the job succeeds + assert(taskScheduler.runningTaskSets.isEmpty) + assert(!backend.hasTasks) } assert(scheduler.activeJobs.isEmpty) } @@ -182,7 +185,7 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa * in particular, your `backendFunc` has to return quickly, it can't throw errors, (instead * it should send back the right TaskEndReason */ - def withBackend(backendFunc: () => Unit)(testBody: => Unit): Unit = { + def withBackend[T](backendFunc: () => Unit)(testBody: => T): T = { val backendContinue = new AtomicBoolean(true) val backendThread = new Thread("mock backend thread") { override def run(): Unit = { @@ -219,8 +222,8 @@ private[spark] abstract class MockBackend( * Test backends should call this to get a task that has been assigned to them by the scheduler. * Each task should be responded to with either [[taskSuccess]] or [[taskFailed]]. */ - def beginTask(): TaskDescription = synchronized { - val toRun = assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) + def beginTask(): TaskDescription = { + val toRun = assignedTasksWaitingToRun.take() runningTasks += toRun toRun } @@ -240,11 +243,14 @@ private[spark] abstract class MockBackend( synchronized { executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK freeCores += taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun -= task } reviveOffers() } + def taskFailedWithException(task: TaskDescription, state: TaskState, exc: Exception): Unit = { + taskFailed(task, state, new ExceptionFailure(exc, Seq())) + } + /** * Tell the scheduler the task failed, with the given state and result (probably ExceptionFailure * or FetchFailed). Also updates some internal state for this mock. 
@@ -258,13 +264,12 @@ private[spark] abstract class MockBackend( synchronized { executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK freeCores += taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun -= task } reviveOffers() } } - private val assignedTasksWaitingToRun = ArrayBuffer[TaskDescription]() + private val assignedTasksWaitingToRun = new ArrayBlockingQueue[TaskDescription](10000) private val runningTasks = ArrayBuffer[TaskDescription]() def endTask(task: TaskDescription): Unit = synchronized { @@ -272,11 +277,11 @@ private[spark] abstract class MockBackend( } def hasTasks: Boolean = synchronized { - assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty + !assignedTasksWaitingToRun.isEmpty() || runningTasks.nonEmpty } - def hasTasksWaitingToRun: Boolean = synchronized { - assignedTasksWaitingToRun.nonEmpty + def hasTasksWaitingToRun: Boolean = { + !assignedTasksWaitingToRun.isEmpty() } override def start(): Unit = {} @@ -289,7 +294,9 @@ private[spark] abstract class MockBackend( def executorIdToExecutor: Map[String, ExecutorTaskStatus] def generateOffers(): Seq[WorkerOffer] = { - executorIdToExecutor.values.map { exec => + executorIdToExecutor.values.filter { exec => + exec.freeCores > 0 + }.map { exec => WorkerOffer(executorId = exec.executorId, host = exec.host, cores = exec.freeCores) }.toSeq @@ -306,7 +313,7 @@ private[spark] abstract class MockBackend( executorIdToExecutor(task.executorId).freeCores -= taskScheduler.CPUS_PER_TASK } freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun ++= newTasks + assignedTasksWaitingToRun.addAll(newTasks.asJava) } } From ee59913ac216f44b9e0b46a2b662f42b26f48da8 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Thu, 19 May 2016 12:58:40 -0500 Subject: [PATCH 15/35] add performance tests --- .../scheduler/SchedulerPerformanceSuite.scala | 228 ++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala new file mode 100644 index 0000000000000..0489719b6c94e --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.scheduler + +import scala.concurrent.duration.Duration + +import org.apache.spark.TaskState +import org.apache.spark.util.Utils + +class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend] { + + def simpleWorkload(N: Int): MockRDD = { + val a = new MockRDD(sc, N, Nil) + val b = shuffle(N, a) + val c = shuffle(N, a) + join(N, b, c) + } + + def goodBackend(N: Int): Unit = { + val taskDescription = backend.beginTask() + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + + // every 5th stage is a ResultStage -- the rest are ShuffleMapStages + (task.stageId, task.partitionId) match { + case (stage, _) if stage % 5 != 4 => + backend.taskSuccess(taskDescription, + DAGSchedulerSuite.makeMapStatus(host, N)) + case (_, _) => + backend.taskSuccess(taskDescription, 42) + } + } + + def runJobWithBackend(N: Int, backend: () => Unit): Unit = { + // run as many jobs as we can in 10 seconds + var itrs = 0 + val totalMs = withBackend(backend) { + val start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < 10000 ) { + withClue(s"failure in iteration = $itrs") { + val jobFuture = submit(simpleWorkload(N), new Array[Int](N)) + // Note: Do not call Await.ready(future) because that calls `scala.concurrent.blocking`, + // which causes concurrent SQL executions to fail if a fork-join pool is used. Note that + // due to idiosyncrasies in Scala, `awaitPermission` is not actually used anywhere so it's + // safe to pass in null here. For more detail, see SPARK-13747. + val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait] + jobFuture.ready(Duration.Inf)(awaitPermission) + assertDataStructuresEmpty(noFailure = true) + itrs += 1 + } + } + System.currentTimeMillis() - start + } + + val msPerItr = Utils.msDurationToString((totalMs.toDouble / itrs).toLong) + // scalastyle:off println + println(s"ran $itrs iterations in ${Utils.msDurationToString(totalMs)} ($msPerItr per itr)") + // scalastyle:on println + } + + def runSuccessfulJob(N: Int): Unit = { + runJobWithBackend(N, () => goodBackend(N)) + } + + testScheduler("Scheduling speed -- small job on a small cluster") { + runSuccessfulJob(40) + } + + testScheduler("Scheduling speed -- large job on a small cluster") { + runSuccessfulJob(3000) + } + + + testScheduler( + "Scheduling speed -- large job on a super node", + extraConfs = Seq( + "spark.testing.nHosts" -> "1", + "spark.testing.nExecutorsPerHost" -> "1", + "spark.testing.nCoresPerExecutor" -> "20000" + ) + ) { + runSuccessfulJob(3000) + } + + testScheduler( + // 4 execs per node, 2 cores per exec, so 400 cores + "Scheduling speed -- large job on 50 node cluster", + extraConfs = Seq( + "spark.testing.nHosts" -> "50" + ) + ) { + runSuccessfulJob(3000) + } + + testScheduler( + // 4 execs per node, 2 cores per exec, so 800 cores + "Scheduling speed -- large job on 100 node cluster", + extraConfs = Seq( + "spark.testing.nHosts" -> "100" + ) + ) { + runSuccessfulJob(3000) + } + + Seq(200, 300, 400, 450, 500, 550).foreach { nodes => + /* +ran 1 iterations in 12.9 s (12.9 s per itr) +[info] - COMPARE A: Scheduling speed -- large job on 200 node cluster (13 seconds, 861 milliseconds) +ran 1 iterations in 25.0 s (25.0 s per itr) +[info] - COMPARE A: Scheduling speed -- large job on 300 node cluster (25 seconds, 50 milliseconds) +ran 1 iterations in 34.6 s (34.6 s per 
itr) +[info] - COMPARE A: Scheduling speed -- large job on 400 node cluster (34 seconds, 668 milliseconds) +ran 1 iterations in 54.0 s (54.0 s per itr) +[info] - COMPARE A: Scheduling speed -- large job on 450 node cluster (53 seconds, 991 milliseconds) +ran 1 iterations in 1.8 m (1.8 m per itr) +[info] - COMPARE A: Scheduling speed -- large job on 500 node cluster (1 minute, 48 seconds) +ran 1 iterations in 2.3 m (2.3 m per itr) +[info] - COMPARE A: Scheduling speed -- large job on 550 node cluster (2 minutes, 19 seconds) + */ + testScheduler( + s"COMPARE A: Scheduling speed -- large job on ${nodes} node cluster", + extraConfs = Seq( + "spark.testing.nHosts" -> s"$nodes" + ) + ) { + runSuccessfulJob(3000) + } + } + + /* +nHosts = 400; nExecutorsPerHost = 1; nCores = 800 +ran 2 iterations in 11.7 s (5.9 s per itr) +[info] - COMPARE B: Lots of nodes (12 seconds, 679 milliseconds) +nHosts = 1; nExecutorsPerHost = 400; nCores = 800 +ran 3 iterations in 14.2 s (4.7 s per itr) +[info] - COMPARE B: Lots of executors, one node (14 seconds, 290 milliseconds) +nHosts = 1; nExecutorsPerHost = 1; nCores = 800 +ran 3 iterations in 11.0 s (3.7 s per itr) +[info] - COMPARE B: Super executor (11 seconds, 6 milliseconds) + */ + testScheduler( + s"COMPARE B: Lots of nodes", + extraConfs = Seq( + "spark.testing.nHosts" -> "400", + "spark.testing.nExecutorsPerHost" -> "1" + ) + ) { + runSuccessfulJob(3000) + } + + testScheduler( + s"COMPARE B: Lots of executors, one node", + extraConfs = Seq( + "spark.testing.nHosts" -> "1", + "spark.testing.nExecutorsPerHost" -> "400" + ) + ) { + runSuccessfulJob(3000) + } + + testScheduler( + s"COMPARE B: Super executor", + extraConfs = Seq( + "spark.testing.nHosts" -> "1", + "spark.testing.nExecutorsPerHost" -> "1", + "spark.testing.nCoresPerExecutor" -> "800" + ) + ) { + runSuccessfulJob(3000) + } + + def backendWithBadExecs(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { + val taskDescription = backend.beginTask() + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + if (badExecs(taskDescription.executorId)) { + val exc = new RuntimeException(s"bad exec ${taskDescription.executorId}") + backend.taskFailedWithException(taskDescription, TaskState.FAILED, exc) + } else if (badHosts(host)) { + val exc = new RuntimeException(s"bad host ${host}") + backend.taskFailedWithException(taskDescription, TaskState.FAILED, exc) + } else { + // every 5th stage is a ResultStage -- the rest are ShuffleMapStages + (task.stageId, task.partitionId) match { + case (stage, _) if stage % 5 != 4 => + backend.taskSuccess(taskDescription, + DAGSchedulerSuite.makeMapStatus(host, N)) + case (_, _) => + backend.taskSuccess(taskDescription, 42) + } + } + } + + def runBadExecJob(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { + runJobWithBackend(N, () => backendWithBadExecs(N, badExecs, badHosts)) + } + + val badExecs = (0 until 2).map{_.toString}.toSet + val badHosts = Set[String]() + + // note this is *very* unlikely to succeed without blacklisting, even though its only + // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess + // which is the only executor which has a free slot? 
Bingo, the one it just failed on + testScheduler( + "bad execs, no blacklist", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000" + ) + ) { + runBadExecJob(3000, badExecs, badHosts) + } +} From 4fcbc1da351d050d88357f47b98c85e0cbc0eefc Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Thu, 19 May 2016 13:29:38 -0500 Subject: [PATCH 16/35] bug fix in mock scheduler --- .../org/apache/spark/scheduler/SchedulerPerformanceSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 0489719b6c94e..5539962fe45ca 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -53,7 +53,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM val start = System.currentTimeMillis() while (System.currentTimeMillis() - start < 10000 ) { withClue(s"failure in iteration = $itrs") { - val jobFuture = submit(simpleWorkload(N), new Array[Int](N)) + val jobFuture = submit(simpleWorkload(N), (0 until N).toArray) // Note: Do not call Await.ready(future) because that calls `scala.concurrent.blocking`, // which causes concurrent SQL executions to fail if a fork-join pool is used. Note that // due to idiosyncrasies in Scala, `awaitPermission` is not actually used anywhere so it's From 6ed19aeec4cccfcca8de7848dd9ff8a0160d920a Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 11:45:43 -0500 Subject: [PATCH 17/35] style --- .../scala/org/apache/spark/scheduler/BlacklistTracker.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index f2c73710c000c..5fe77bb0c0ba8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -176,8 +176,8 @@ private[spark] class BlacklistTracker( private def executorsOnBlacklistedNode( sched: TaskSchedulerImpl, atomTask: StageAndPartition): Set[String] = { - nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) - .getOrElse(Set.empty[String])).toSet + nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) + .getOrElse(Set.empty[String])) } private def reEvaluateExecutorBlacklistAndUpdateCache( From 67acce9a56a8acc24c0f2cf2ca76378277ec24d3 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 12:19:25 -0500 Subject: [PATCH 18/35] simplification and comments --- .../scheduler/BlacklistIntegrationSuite.scala | 3 +- .../scheduler/SchedulerIntegrationSuite.scala | 96 ++++++++++--------- 2 files changed, 53 insertions(+), 46 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index 3cb07a404d39b..6c9d4fb6f3bcc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -33,8 +33,7 @@ class BlacklistIntegrationSuite extends SchedulerIntegrationSuite[MultiExecutorM val task = backend.beginTask() val host = 
backend.executorIdToExecutor(task.executorId).host if (host == badHost) { - val failure = new ExceptionFailure(new RuntimeException("I'm a bad host!"), Seq()) - backend.taskFailed(task, TaskState.FAILED, failure) + backend.taskFailed(task, new RuntimeException("I'm a bad host!")) } else { backend.taskSuccess(task, 42) } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index dc2749cfbe4c3..02aa5caa731ff 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -29,7 +29,6 @@ import org.scalatest.Assertions.AssertionsHelper import org.apache.spark._ import org.apache.spark.TaskState._ -import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.util.{CallSite, Utils} @@ -44,7 +43,6 @@ import org.apache.spark.util.{CallSite, Utils} */ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends SparkFunSuite with LocalSparkContext { - val conf = new SparkConf var taskScheduler: TestTaskScheduler = null var scheduler: DAGScheduler = null @@ -74,7 +72,6 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa backend = sc.schedulerBackend.asInstanceOf[T] taskScheduler = sc.taskScheduler.asInstanceOf[TestTaskScheduler] taskScheduler.initialize(sc.schedulerBackend) - backend.taskScheduler = taskScheduler scheduler = new DAGScheduler(sc, taskScheduler) taskScheduler.setDAGScheduler(scheduler) } @@ -118,10 +115,6 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa protected def assertDataStructuresEmpty(noFailure: Boolean = true): Unit = { if (noFailure) { - // When a job fails, we terminate before waiting for all the task end events to come in, - // so there might still be a running task set - assert(taskScheduler.runningTaskSets.isEmpty) - assert(!backend.hasTasks) if (failure != null) { // if there is a job failure, it can be a bit hard to tease the job failure msg apart // from the test failure msg, so we do a little extra formatting @@ -135,6 +128,11 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa stripMargin fail(msg) } + // When a job fails, we terminate before waiting for all the task end events to come in, + // so there might still be a running task set. So we only check these conditions + // when the job succeeds + assert(taskScheduler.runningTaskSets.isEmpty) + assert(!backend.hasTasks) } assert(scheduler.activeJobs.isEmpty) } @@ -180,9 +178,9 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa * test, where you submit jobs to your backend, wait for them to complete, then check * whatever conditions you want. 
Note that this is *not* safe to all bad backends -- * in particular, your `backendFunc` has to return quickly, it can't throw errors, (instead - * it should send back the right TaskEndReason + * it should send back the right TaskEndReason) */ - def withBackend(backendFunc: () => Unit)(testBody: => Unit): Unit = { + def withBackend[T](backendFunc: () => Unit)(testBody: => T): T = { val backendContinue = new AtomicBoolean(true) val backendThread = new Thread("mock backend thread") { override def run(): Unit = { @@ -213,16 +211,18 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa */ private[spark] abstract class MockBackend( conf: SparkConf, - var taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { + val taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { /** * Test backends should call this to get a task that has been assigned to them by the scheduler. * Each task should be responded to with either [[taskSuccess]] or [[taskFailed]]. */ - def beginTask(): TaskDescription = synchronized { - val toRun = assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) - runningTasks += toRun - toRun + def beginTask(): TaskDescription = { + synchronized { + val toRun = assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) + runningTasks += toRun + toRun + } } /** @@ -230,52 +230,49 @@ private[spark] abstract class MockBackend( * updates some internal state for this mock. */ def taskSuccess(task: TaskDescription, result: Any): Unit = { - endTask(task) val ser = env.serializer.newInstance() val resultBytes = ser.serialize(result) - val metrics = new TaskMetrics val directResult = new DirectTaskResult(resultBytes, Seq()) // no accumulator updates - val serializedDirectResult = ser.serialize(directResult) - taskScheduler.statusUpdate(task.taskId, TaskState.FINISHED, serializedDirectResult) - synchronized { - executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK - freeCores += taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun -= task - } - reviveOffers() + taskUpdate(task, TaskState.FINISHED, directResult) } /** * Tell the scheduler the task failed, with the given state and result (probably ExceptionFailure * or FetchFailed). Also updates some internal state for this mock. 
*/ - def taskFailed(task: TaskDescription, state: TaskState, result: Any): Unit = { - endTask(task) + def taskFailed(task: TaskDescription, exc: Exception): Unit = { + taskUpdate(task, TaskState.FAILED, new ExceptionFailure(exc, Seq())) + } + + def taskFailed(task: TaskDescription, reason: TaskFailedReason): Unit = { + taskUpdate(task, TaskState.FAILED, reason) + } + + def taskUpdate(task: TaskDescription, state: TaskState, result: Any): Unit = { val ser = env.serializer.newInstance() val resultBytes = ser.serialize(result) + // statusUpdate is safe to call from multiple threads, its protected inside taskScheduler taskScheduler.statusUpdate(task.taskId, state, resultBytes) if (TaskState.isFinished(state)) { synchronized { + runningTasks -= task executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK freeCores += taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun -= task } reviveOffers() } } - private val assignedTasksWaitingToRun = ArrayBuffer[TaskDescription]() + // protected by this + private val assignedTasksWaitingToRun = new ArrayBuffer[TaskDescription](10000) + // protected by this private val runningTasks = ArrayBuffer[TaskDescription]() - def endTask(task: TaskDescription): Unit = synchronized { - runningTasks -= task - } - def hasTasks: Boolean = synchronized { assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty } - def hasTasksWaitingToRun: Boolean = synchronized { + def hasTasksWaitingToRun: Boolean = { assignedTasksWaitingToRun.nonEmpty } @@ -283,20 +280,30 @@ private[spark] abstract class MockBackend( override def stop(): Unit = {} - var freeCores: Int = _ val env = SparkEnv.get + /** Accessed by both scheduling and backend thread, so should be protected by this. */ + var freeCores: Int = _ + + /** + * Accessed by both scheduling and backend thread, so should be protected by this. + * Most likely the only thing that needs to be protected are the inidividual ExecutorTaskStatus, + * but for simplicity in this mock just lock the whole backend. + */ def executorIdToExecutor: Map[String, ExecutorTaskStatus] - def generateOffers(): Seq[WorkerOffer] = { - executorIdToExecutor.values.map { exec => + private def generateOffers(): Seq[WorkerOffer] = { + executorIdToExecutor.values.filter { exec => + exec.freeCores > 0 + }.map { exec => WorkerOffer(executorId = exec.executorId, host = exec.host, cores = exec.freeCores) }.toSeq } /** - * This is called by the scheduler whenever it has tasks it would like to schedule + * This is called by the scheduler whenever it has tasks it would like to schedule. It gets + * called in the scheduling thread, not the backend thread. 
*/ override def reviveOffers(): Unit = { val offers: Seq[WorkerOffer] = generateOffers() @@ -334,7 +341,7 @@ private[spark] class SingleCoreMockBackend( val localExecutorId = SparkContext.DRIVER_IDENTIFIER val localExecutorHostname = "localhost" - val executorIdToExecutor: Map[String, ExecutorTaskStatus] = Map( + override val executorIdToExecutor: Map[String, ExecutorTaskStatus] = Map( localExecutorId -> new ExecutorTaskStatus(localExecutorHostname, localExecutorId, freeCores) ) } @@ -351,9 +358,11 @@ class MockRDD( override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = throw new RuntimeException("should not be reached") - override def getPartitions: Array[Partition] = (0 until numPartitions).map(i => new Partition { - override def index: Int = i - }).toArray + override def getPartitions: Array[Partition] = { + (0 until numPartitions).map(i => new Partition { + override def index: Int = i + }).toArray + } override def getPreferredLocations(split: Partition): Seq[String] = Nil override def toString: String = "MockRDD " + id } @@ -523,7 +532,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor case (1, 0, 0) => val fetchFailed = FetchFailed( DAGSchedulerSuite.makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored") - backend.taskFailed(taskDescription, TaskState.FAILED, fetchFailed) + backend.taskFailed(taskDescription, fetchFailed) case (1, _, partition) => backend.taskSuccess(taskDescription, 42 + partition) } @@ -541,8 +550,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor testScheduler("job failure after 4 attempts") { def runBackend(): Unit = { val task = backend.beginTask() - val failure = new ExceptionFailure(new RuntimeException("test task failure"), Seq()) - backend.taskFailed(task, TaskState.FAILED, failure) + backend.taskFailed(task, new RuntimeException("test task failure")) } withBackend(runBackend _) { val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) From 17fcc9ec89ef9f4f32905fd33dcbe8f84c05eed7 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 12:21:58 -0500 Subject: [PATCH 19/35] fix merge --- .../spark/scheduler/SchedulerIntegrationSuite.scala | 10 ++-------- .../spark/scheduler/SchedulerPerformanceSuite.scala | 5 ++--- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 4306de0e67756..02aa5caa731ff 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -17,11 +17,9 @@ package org.apache.spark.scheduler import java.util.Properties -import java.util.concurrent.ArrayBlockingQueue import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} -import scala.collection.JavaConverters._ import scala.concurrent.{Await, Future} import scala.concurrent.duration.{Duration, SECONDS} import scala.reflect.ClassTag @@ -238,10 +236,6 @@ private[spark] abstract class MockBackend( taskUpdate(task, TaskState.FINISHED, directResult) } - def taskFailedWithException(task: TaskDescription, state: TaskState, exc: Exception): Unit = { - taskFailed(task, state, new ExceptionFailure(exc, Seq())) - } - /** * Tell the scheduler the task failed, with the given state and result (probably ExceptionFailure * or FetchFailed). 
Also updates some internal state for this mock. @@ -275,7 +269,7 @@ private[spark] abstract class MockBackend( private val runningTasks = ArrayBuffer[TaskDescription]() def hasTasks: Boolean = synchronized { - !assignedTasksWaitingToRun.isEmpty() || runningTasks.nonEmpty + assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty } def hasTasksWaitingToRun: Boolean = { @@ -319,7 +313,7 @@ private[spark] abstract class MockBackend( executorIdToExecutor(task.executorId).freeCores -= taskScheduler.CPUS_PER_TASK } freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun.addAll(newTasks.asJava) + assignedTasksWaitingToRun ++= newTasks } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 0489719b6c94e..76d12eded3f94 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.scheduler import scala.concurrent.duration.Duration -import org.apache.spark.TaskState import org.apache.spark.util.Utils class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend] { @@ -191,10 +190,10 @@ ran 3 iterations in 11.0 s (3.7 s per itr) val task = taskSet.tasks(taskDescription.index) if (badExecs(taskDescription.executorId)) { val exc = new RuntimeException(s"bad exec ${taskDescription.executorId}") - backend.taskFailedWithException(taskDescription, TaskState.FAILED, exc) + backend.taskFailed(taskDescription, exc) } else if (badHosts(host)) { val exc = new RuntimeException(s"bad host ${host}") - backend.taskFailedWithException(taskDescription, TaskState.FAILED, exc) + backend.taskFailed(taskDescription, exc) } else { // every 5th stage is a ResultStage -- the rest are ShuffleMapStages (task.stageId, task.partitionId) match { From b12b563d4890766d0d3ea31ccdf14d3da8fc8f82 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 12:33:31 -0500 Subject: [PATCH 20/35] comments --- .../spark/scheduler/SchedulerPerformanceSuite.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 76d12eded3f94..2299ed2bbe3a0 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -23,6 +23,8 @@ import org.apache.spark.util.Utils class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend] { def simpleWorkload(N: Int): MockRDD = { + // relatively simple job with 5 stages, so scheduling includes some aspects of submitting stages + // in addition to tasks val a = new MockRDD(sc, N, Nil) val b = shuffle(N, a) val c = shuffle(N, a) @@ -46,7 +48,12 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } def runJobWithBackend(N: Int, backend: () => Unit): Unit = { - // run as many jobs as we can in 10 seconds + // Try to run as many jobs as we can in 10 seconds, get the time per job. 
The idea here is to + // balance: + // 1) have a big enough job that we're not effected by delays just from waiting for job + // completion to propagate to the user thread (probably minor) + // 2) run enough iterations to get some reliable data + // 3) not wait toooooo long var itrs = 0 val totalMs = withBackend(backend) { val start = System.currentTimeMillis() From 5d547f4840856667d3e22abb15756d76a7f2407d Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 16:37:13 -0500 Subject: [PATCH 21/35] more tests --- .../scheduler/SchedulerPerformanceSuite.scala | 79 +++++++++++++------ 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index fa8c4c27a7578..3b437c728ae70 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -87,10 +87,19 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM runSuccessfulJob(40) } - testScheduler("Scheduling speed -- large job on a small cluster") { + testScheduler("COMPARE C Scheduling speed -- large job on a small cluster") { runSuccessfulJob(3000) } + testScheduler( + "COMPARE C Scheduling speed -- large job on a small cluster with advanced blacklist", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000", + "spark.scheduler.blacklist.advancedStrategy" -> "true" + ) + ) { + runSuccessfulJob(3000) + } testScheduler( "Scheduling speed -- large job on a super node", @@ -125,18 +134,22 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM Seq(200, 300, 400, 450, 500, 550).foreach { nodes => /* -ran 1 iterations in 12.9 s (12.9 s per itr) -[info] - COMPARE A: Scheduling speed -- large job on 200 node cluster (13 seconds, 861 milliseconds) -ran 1 iterations in 25.0 s (25.0 s per itr) -[info] - COMPARE A: Scheduling speed -- large job on 300 node cluster (25 seconds, 50 milliseconds) -ran 1 iterations in 34.6 s (34.6 s per itr) -[info] - COMPARE A: Scheduling speed -- large job on 400 node cluster (34 seconds, 668 milliseconds) -ran 1 iterations in 54.0 s (54.0 s per itr) -[info] - COMPARE A: Scheduling speed -- large job on 450 node cluster (53 seconds, 991 milliseconds) -ran 1 iterations in 1.8 m (1.8 m per itr) -[info] - COMPARE A: Scheduling speed -- large job on 500 node cluster (1 minute, 48 seconds) -ran 1 iterations in 2.3 m (2.3 m per itr) -[info] - COMPARE A: Scheduling speed -- large job on 550 node cluster (2 minutes, 19 seconds) + ran 1 iterations in 12.9 s (12.9 s per itr) + [info] - COMPARE A: Scheduling speed -- large job on 200 node cluster (13 seconds, 861 + milliseconds) + ran 1 iterations in 25.0 s (25.0 s per itr) + [info] - COMPARE A: Scheduling speed -- large job on 300 node cluster (25 seconds, 50 + milliseconds) + ran 1 iterations in 34.6 s (34.6 s per itr) + [info] - COMPARE A: Scheduling speed -- large job on 400 node cluster (34 seconds, + 668 milliseconds) + ran 1 iterations in 54.0 s (54.0 s per itr) + [info] - COMPARE A: Scheduling speed -- large job on 450 node cluster (53 seconds, + 991 milliseconds) + ran 1 iterations in 1.8 m (1.8 m per itr) + [info] - COMPARE A: Scheduling speed -- large job on 500 node cluster (1 minute, 48 seconds) + ran 1 iterations in 2.3 m (2.3 m per itr) + [info] - COMPARE A: Scheduling speed -- large job on 550 node 
cluster (2 minutes, 19 seconds) */ testScheduler( s"COMPARE A: Scheduling speed -- large job on ${nodes} node cluster", @@ -149,15 +162,15 @@ ran 1 iterations in 2.3 m (2.3 m per itr) } /* -nHosts = 400; nExecutorsPerHost = 1; nCores = 800 -ran 2 iterations in 11.7 s (5.9 s per itr) -[info] - COMPARE B: Lots of nodes (12 seconds, 679 milliseconds) -nHosts = 1; nExecutorsPerHost = 400; nCores = 800 -ran 3 iterations in 14.2 s (4.7 s per itr) -[info] - COMPARE B: Lots of executors, one node (14 seconds, 290 milliseconds) -nHosts = 1; nExecutorsPerHost = 1; nCores = 800 -ran 3 iterations in 11.0 s (3.7 s per itr) -[info] - COMPARE B: Super executor (11 seconds, 6 milliseconds) + nHosts = 400; nExecutorsPerHost = 1; nCores = 800 + ran 2 iterations in 11.7 s (5.9 s per itr) + [info] - COMPARE B: Lots of nodes (12 seconds, 679 milliseconds) + nHosts = 1; nExecutorsPerHost = 400; nCores = 800 + ran 3 iterations in 14.2 s (4.7 s per itr) + [info] - COMPARE B: Lots of executors, one node (14 seconds, 290 milliseconds) + nHosts = 1; nExecutorsPerHost = 1; nCores = 800 + ran 3 iterations in 11.0 s (3.7 s per itr) + [info] - COMPARE B: Super executor (11 seconds, 6 milliseconds) */ testScheduler( s"COMPARE B: Lots of nodes", @@ -224,11 +237,31 @@ ran 3 iterations in 11.0 s (3.7 s per itr) // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess // which is the only executor which has a free slot? Bingo, the one it just failed on testScheduler( - "bad execs, no blacklist", + "bad execs with blacklist", extraConfs = Seq( "spark.scheduler.executorTaskBlacklistTime" -> "10000000" ) ) { runBadExecJob(3000, badExecs, badHosts) } + + testScheduler( + "COMPARE D bad execs with advanced blacklist", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000", + "spark.scheduler.blacklist.advancedStrategy" -> "true" + ) + ) { + runBadExecJob(3000, badExecs, badHosts) + } + + testScheduler( + "COMPARE D bad execs with simple blacklist", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000", + "spark.scheduler.blacklist.advancedStrategy" -> "false" + ) + ) { + runBadExecJob(3000, badExecs, badHosts) + } } From d46c65d90a8cb6dae0160029d7c8e4e035682d56 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 16:45:32 -0500 Subject: [PATCH 22/35] smaller demo of performance difference --- .../apache/spark/scheduler/BlacklistTracker.scala | 15 +++++++++++---- .../scheduler/SchedulerPerformanceSuite.scala | 4 ++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 5fe77bb0c0ba8..54a71b017950c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -22,6 +22,7 @@ import java.util.concurrent.TimeUnit import scala.collection.mutable import org.apache.spark.SparkConf +import org.apache.spark.internal.Logging import org.apache.spark.util.Clock import org.apache.spark.util.SystemClock import org.apache.spark.util.ThreadUtils @@ -37,7 +38,7 @@ import org.apache.spark.util.Utils */ private[spark] class BlacklistTracker( sparkConf: SparkConf, - clock: Clock = new SystemClock()) extends BlacklistCache{ + clock: Clock = new SystemClock()) extends BlacklistCache with Logging { // maintain a ExecutorId --> FailureStatus HashMap private val executorIdToFailureStatus: 
mutable.HashMap[String, FailureStatus] = mutable.HashMap() @@ -69,6 +70,7 @@ private[spark] class BlacklistTracker( // The actual implementation is delegated to strategy private[scheduler] def expireExecutorsInBlackList(): Unit = synchronized { val updated = strategy.expireExecutorsInBlackList(executorIdToFailureStatus, clock) + logInfo(s"Checked for expired blacklist: ${updated}") if (updated) { invalidateCache() } @@ -76,12 +78,17 @@ private[spark] class BlacklistTracker( // The actual implementation is delegated to strategy def executorBlacklist( - sched: TaskSchedulerImpl, stageId: Int, partition: Int): Set[String] = synchronized { + sched: TaskSchedulerImpl, + stageId: Int, + partition: Int): Set[String] = synchronized { + // note that this is NOT only called from the dag scheduler event loop val atomTask = StageAndPartition(stageId, partition) if (!isBlacklistExecutorCacheValid) { reEvaluateExecutorBlacklistAndUpdateCache(sched, atomTask, clock) } else { +// getExecutorBlacklistFromCache(atomTask).getOrElse(Set.empty[String]) getExecutorBlacklistFromCache(atomTask).getOrElse { + // TODO Why is this necessary? reEvaluateExecutorBlacklistAndUpdateCache(sched, atomTask, clock) } } @@ -200,8 +207,7 @@ private[spark] class BlacklistTracker( /** * Hide cache details in this trait to make code clean and avoid operation mistake */ -private[scheduler] trait BlacklistCache { - +private[scheduler] trait BlacklistCache extends Logging { // local cache to minimize the the work when query blacklisted executor and node private val blacklistExecutorCache = mutable.HashMap.empty[StageAndPartition, Set[String]] private val blacklistNodeCache = mutable.Set.empty[String] @@ -249,6 +255,7 @@ private[scheduler] trait BlacklistCache { } protected def invalidateCache(): Unit = cacheLock.synchronized { + logInfo("invalidatinig blacklist cache") _isBlacklistExecutorCacheValid = false _isBlacklistNodeCacheValid = false _isBlacklistNodeForStageCacheValid = false diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 3b437c728ae70..ede52d09b39db 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -252,7 +252,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM "spark.scheduler.blacklist.advancedStrategy" -> "true" ) ) { - runBadExecJob(3000, badExecs, badHosts) + runBadExecJob(50, badExecs, badHosts) } testScheduler( @@ -262,6 +262,6 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM "spark.scheduler.blacklist.advancedStrategy" -> "false" ) ) { - runBadExecJob(3000, badExecs, badHosts) + runBadExecJob(50, badExecs, badHosts) } } From a394ab72d90207529a74debd4d9113eca60a5838 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Mon, 23 May 2016 09:10:02 -0500 Subject: [PATCH 23/35] labels --- .../apache/spark/scheduler/SchedulerPerformanceSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index ede52d09b39db..713ecc97b4e68 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -102,7 +102,7 @@ 
class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } testScheduler( - "Scheduling speed -- large job on a super node", + "COMPARE A Scheduling speed -- large job on a super node", extraConfs = Seq( "spark.testing.nHosts" -> "1", "spark.testing.nExecutorsPerHost" -> "1", @@ -114,7 +114,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM testScheduler( // 4 execs per node, 2 cores per exec, so 400 cores - "Scheduling speed -- large job on 50 node cluster", + "COMPARE A Scheduling speed -- large job on 50 node cluster", extraConfs = Seq( "spark.testing.nHosts" -> "50" ) @@ -124,7 +124,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM testScheduler( // 4 execs per node, 2 cores per exec, so 800 cores - "Scheduling speed -- large job on 100 node cluster", + "COMPARE A Scheduling speed -- large job on 100 node cluster", extraConfs = Seq( "spark.testing.nHosts" -> "100" ) From f4609da04f9bc3d50506c2e89210cfa5100b4c1d Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Mon, 23 May 2016 12:08:59 -0500 Subject: [PATCH 24/35] wip -- some instrumentation, easier repro of slowdown --- .../spark/scheduler/BlacklistStrategy.scala | 6 +++++ .../spark/scheduler/BlacklistTracker.scala | 10 +++++++- .../scheduler/SchedulerIntegrationSuite.scala | 13 ++++++---- .../scheduler/SchedulerPerformanceSuite.scala | 24 ++++++++----------- 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala index edaeb658d0822..7d19ff54e9a09 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala @@ -24,6 +24,8 @@ import org.apache.spark.util.Clock /** * The interface to determine executor blacklist and node blacklist. 
+ * + * TODO notes on thread-safety */ private[scheduler] trait BlacklistStrategy { /** Define a time interval to expire failure information of executors */ @@ -81,10 +83,12 @@ private[scheduler] trait BlacklistStrategy { */ private[scheduler] class SingleTaskStrategy( val expireTimeInMilliseconds: Long) extends BlacklistStrategy { + var executorBlacklistCallCount = 0 def getExecutorBlacklist( executorIdToFailureStatus: mutable.HashMap[String, FailureStatus], atomTask: StageAndPartition, clock: Clock): Set[String] = { + executorBlacklistCallCount += 1 executorIdToFailureStatus.filter{ case (_, failureStatus) => failureStatus.numFailuresPerTask.keySet.contains(atomTask) && clock.getTimeMillis() - failureStatus.updatedTime < expireTimeInMilliseconds @@ -104,10 +108,12 @@ private[scheduler] class SingleTaskStrategy( private[scheduler] class AdvancedSingleTaskStrategy( expireTimeInMilliseconds: Long) extends SingleTaskStrategy(expireTimeInMilliseconds) { + var nodeBlacklistCallCount = 0 override def getNodeBlacklistForStage( executorIdToFailureStatus: mutable.HashMap[String, FailureStatus], stageId: Int, clock: Clock): Set[String] = { + nodeBlacklistCallCount += 1 val nodes = executorIdToFailureStatus.filter{ case (_, failureStatus) => failureStatus.numFailuresPerTask.keySet.map(_.stageId).contains(stageId) && diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 54a71b017950c..d1a058495a921 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -65,6 +65,14 @@ private[spark] class BlacklistTracker( def stop(): Unit = { scheduler.shutdown() scheduler.awaitTermination(10, TimeUnit.SECONDS) + logDebug(s"Executor Blacklist callcount =" + + s" ${strategy.asInstanceOf[SingleTaskStrategy].executorBlacklistCallCount}") + strategy match { + case as: AdvancedSingleTaskStrategy => + logDebug(s"Node Blacklist callcount =" + + s" ${as.nodeBlacklistCallCount}") + case _ => // no op + } } // The actual implementation is delegated to strategy @@ -255,7 +263,7 @@ private[scheduler] trait BlacklistCache extends Logging { } protected def invalidateCache(): Unit = cacheLock.synchronized { - logInfo("invalidatinig blacklist cache") + logInfo("invalidating blacklist cache") _isBlacklistExecutorCacheValid = false _isBlacklistNodeCacheValid = false _isBlacklistNodeForStageCacheValid = false diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 02aa5caa731ff..3015dbe30379f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -131,8 +131,13 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa // When a job fails, we terminate before waiting for all the task end events to come in, // so there might still be a running task set. So we only check these conditions // when the job succeeds - assert(taskScheduler.runningTaskSets.isEmpty) - assert(!backend.hasTasks) + if (taskScheduler.runningTaskSets.nonEmpty) { + fail(s"taskScheduler still has running taskSets: ${taskScheduler.runningTaskSets}") + } + if (backend.hasTasks) { + fail(s"backend still has tasks. 
Waiting to run: ${backend.assignedTasksWaitingToRun}; " + + s"running : ${backend.runningTasks}") + } } assert(scheduler.activeJobs.isEmpty) } @@ -264,9 +269,9 @@ private[spark] abstract class MockBackend( } // protected by this - private val assignedTasksWaitingToRun = new ArrayBuffer[TaskDescription](10000) + val assignedTasksWaitingToRun = new ArrayBuffer[TaskDescription](10000) // protected by this - private val runningTasks = ArrayBuffer[TaskDescription]() + val runningTasks = ArrayBuffer[TaskDescription]() def hasTasks: Boolean = synchronized { assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 713ecc97b4e68..7294dec0ed239 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -57,7 +57,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM var itrs = 0 val totalMs = withBackend(backend) { val start = System.currentTimeMillis() - while (System.currentTimeMillis() - start < 10000 ) { + while (System.currentTimeMillis() - start < 30000 ) { withClue(s"failure in iteration = $itrs") { val jobFuture = submit(simpleWorkload(N), (0 until N).toArray) // Note: Do not call Await.ready(future) because that calls `scala.concurrent.blocking`, @@ -236,32 +236,28 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM // note this is *very* unlikely to succeed without blacklisting, even though its only // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess // which is the only executor which has a free slot? 
Bingo, the one it just failed on - testScheduler( - "bad execs with blacklist", - extraConfs = Seq( - "spark.scheduler.executorTaskBlacklistTime" -> "10000000" - ) - ) { - runBadExecJob(3000, badExecs, badHosts) - } - testScheduler( "COMPARE D bad execs with advanced blacklist", extraConfs = Seq( "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "true" + "spark.scheduler.blacklist.advancedStrategy" -> "true", + "spark.testing.nHosts" -> "2", + "spark.testing.nExecutorsPerHost" -> "2" ) ) { - runBadExecJob(50, badExecs, badHosts) + runBadExecJob(100, badExecs, badHosts) } testScheduler( "COMPARE D bad execs with simple blacklist", extraConfs = Seq( "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "false" + "spark.scheduler.blacklist.advancedStrategy" -> "false", + "spark.testing.nHosts" -> "2", + "spark.testing.nExecutorsPerHost" -> "2" ) ) { - runBadExecJob(50, badExecs, badHosts) + runBadExecJob(100, badExecs, badHosts) } + } From e852e0c41666e6bb126ecd9dc79f5a08cd2ca4f6 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Mon, 23 May 2016 17:11:09 -0500 Subject: [PATCH 25/35] notes mostly --- .../spark/scheduler/BlacklistStrategy.scala | 12 ++++++++++-- .../spark/scheduler/BlacklistTracker.scala | 6 ++++-- .../scheduler/SchedulerPerformanceSuite.scala | 18 +++++++++++++++--- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala index 7d19ff54e9a09..8c690b6e3223b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala @@ -83,7 +83,7 @@ private[scheduler] trait BlacklistStrategy { */ private[scheduler] class SingleTaskStrategy( val expireTimeInMilliseconds: Long) extends BlacklistStrategy { - var executorBlacklistCallCount = 0 + var executorBlacklistCallCount = 0L def getExecutorBlacklist( executorIdToFailureStatus: mutable.HashMap[String, FailureStatus], atomTask: StageAndPartition, @@ -108,18 +108,26 @@ private[scheduler] class SingleTaskStrategy( private[scheduler] class AdvancedSingleTaskStrategy( expireTimeInMilliseconds: Long) extends SingleTaskStrategy(expireTimeInMilliseconds) { - var nodeBlacklistCallCount = 0 + var nodeBlacklistCallCount = 0L override def getNodeBlacklistForStage( executorIdToFailureStatus: mutable.HashMap[String, FailureStatus], stageId: Int, clock: Clock): Set[String] = { nodeBlacklistCallCount += 1 + // when there is one bad node (or executor), this is really slow. We pile up a ton of + // task failures, and we've got to iterate through failure data for each task. Furthermore, + // since we don't actively blacklist the bad node / executor, we just keep assigning it more + // tasks that fail. And after each failure, we invalidate our cache, which means we need + // to call this again. + // This can be particularly painful when the failures are fast, since its likely the only + // executor with free slots is the one which just failed some tasks, which just keep going ... 
val nodes = executorIdToFailureStatus.filter{ case (_, failureStatus) => failureStatus.numFailuresPerTask.keySet.map(_.stageId).contains(stageId) && clock.getTimeMillis() - failureStatus.updatedTime < expireTimeInMilliseconds }.values.map(_.host) getDuplicateElem(nodes, 1) + super.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) } override def getNodeBlacklist( diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index d1a058495a921..792964cfc9d92 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -65,11 +65,11 @@ private[spark] class BlacklistTracker( def stop(): Unit = { scheduler.shutdown() scheduler.awaitTermination(10, TimeUnit.SECONDS) - logDebug(s"Executor Blacklist callcount =" + + logInfo(s"Executor Blacklist callcount =" + s" ${strategy.asInstanceOf[SingleTaskStrategy].executorBlacklistCallCount}") strategy match { case as: AdvancedSingleTaskStrategy => - logDebug(s"Node Blacklist callcount =" + + logInfo(s"Node Blacklist callcount =" + s" ${as.nodeBlacklistCallCount}") case _ => // no op } @@ -201,12 +201,14 @@ private[spark] class BlacklistTracker( clock: Clock): Set[String] = { val executors = executorsOnBlacklistedNode(sched, atomTask) ++ strategy.getExecutorBlacklist(executorIdToFailureStatus, atomTask, clock) + logInfo(s"Blacklisting executors ${executors} for task ${atomTask}") updateBlacklistExecutorCache(atomTask, executors) executors } private def reEvaluateNodeBlacklistForStageAndUpdateCache(stageId: Int): Set[String] = { val nodes = strategy.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) + logInfo(s"Blacklisting nodes ${nodes} for stage ${stageId}") updateBlacklistNodeCache(nodes) nodes } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 7294dec0ed239..85faf114e27a0 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -236,6 +236,18 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM // note this is *very* unlikely to succeed without blacklisting, even though its only // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess // which is the only executor which has a free slot? 
Bingo, the one it just failed on + testScheduler( + "COMPARE D bad execs with simple blacklist", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000", + "spark.scheduler.blacklist.advancedStrategy" -> "false", + "spark.testing.nHosts" -> "2", + "spark.testing.nExecutorsPerHost" -> "2" + ) + ) { + runBadExecJob(100, badExecs, badHosts) + } + testScheduler( "COMPARE D bad execs with advanced blacklist", extraConfs = Seq( @@ -249,15 +261,15 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } testScheduler( - "COMPARE D bad execs with simple blacklist", + "COMPARE D bad host with advanced blacklist", extraConfs = Seq( "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "false", + "spark.scheduler.blacklist.advancedStrategy" -> "true", "spark.testing.nHosts" -> "2", "spark.testing.nExecutorsPerHost" -> "2" ) ) { - runBadExecJob(100, badExecs, badHosts) + runBadExecJob(100, badExecs, Set("host-0")) } } From 8b78d3f83328e24b86af65ee63f446ef7ebf4047 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Mon, 23 May 2016 21:28:42 -0500 Subject: [PATCH 26/35] more notes --- .../scheduler/SchedulerPerformanceSuite.scala | 116 +++++++++++++----- 1 file changed, 82 insertions(+), 34 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 85faf114e27a0..4956cb0efb911 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -57,8 +57,10 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM var itrs = 0 val totalMs = withBackend(backend) { val start = System.currentTimeMillis() - while (System.currentTimeMillis() - start < 30000 ) { + while (System.currentTimeMillis() - start < 10000 ) { +// while (System.currentTimeMillis() - start < 10000 && itrs == 0) { withClue(s"failure in iteration = $itrs") { + val itrStart = System.currentTimeMillis() val jobFuture = submit(simpleWorkload(N), (0 until N).toArray) // Note: Do not call Await.ready(future) because that calls `scala.concurrent.blocking`, // which causes concurrent SQL executions to fail if a fork-join pool is used. Note that @@ -66,6 +68,10 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM // safe to pass in null here. For more detail, see SPARK-13747. val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait] jobFuture.ready(Duration.Inf)(awaitPermission) + // scalastyle:off println + println(s"Iteration $itrs finished in" + + s" ${Utils.msDurationToString(System.currentTimeMillis() - itrStart)}") + // scalastyle:on println assertDataStructuresEmpty(noFailure = true) itrs += 1 } @@ -231,45 +237,87 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } val badExecs = (0 until 2).map{_.toString}.toSet - val badHosts = Set[String]() // note this is *very* unlikely to succeed without blacklisting, even though its only // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess // which is the only executor which has a free slot? 
Bingo, the one it just failed on - testScheduler( - "COMPARE D bad execs with simple blacklist", - extraConfs = Seq( - "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "false", - "spark.testing.nHosts" -> "2", - "spark.testing.nExecutorsPerHost" -> "2" - ) - ) { - runBadExecJob(100, badExecs, badHosts) + Seq( + ("bad execs with simple blacklist", "false", Set[String]()), + ("bad execs with advanced blacklist", "true", Set[String]()), + ("bad hosts with advanced blacklist", "true", Set[String]("host-0")) + ).foreach { case (name, strategy, badHosts) => + testScheduler( + s"COMPARE D $name", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000", + "spark.scheduler.blacklist.advancedStrategy" -> strategy + ) + ) { + runBadExecJob(3000, badExecs, badHosts) + } } - testScheduler( - "COMPARE D bad execs with advanced blacklist", - extraConfs = Seq( - "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "true", - "spark.testing.nHosts" -> "2", - "spark.testing.nExecutorsPerHost" -> "2" - ) - ) { - runBadExecJob(100, badExecs, badHosts) - } - testScheduler( - "COMPARE D bad host with advanced blacklist", - extraConfs = Seq( - "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "true", - "spark.testing.nHosts" -> "2", - "spark.testing.nExecutorsPerHost" -> "2" - ) - ) { - runBadExecJob(100, badExecs, Set("host-0")) - } + /* + Here's how you can get into really slow scheduling, even with the simple blacklist. Say there + is just one bad executor. You've got a bunch of tasks to run, and you schedule all available + slots. Then one task fails on your bad executor. You don't re-schedule that task on the bad + executor, but you do think you've got one open slot, so you try to find the next task you can + schedule. Since you've got a massive backlog of tasks, you just take the next task and schedule + it on your bad executor. The task fails again. + + This repeats a while, and now you've gone through and failed a bunch of tasks on this one bad + executor. But each time, you clear the cache of invalid executors, so you do a bunch of work + to recompute the set of OK executors. This is *really* expensive, and doesn't help you at all + anyway. 
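  To make the cost concrete, here is a tiny standalone model of that loop. This is an
  illustration only, not Spark code -- the names are made up, and the real
  BlacklistTracker / TaskSchedulerImpl interaction has locking and offer shuffling on top
  of this -- but it shows why N fast failures end up costing roughly O(N^2) work when every
  failure clears the cache and every offer rebuilds it by scanning all failure records:

    object BlacklistThrashDemo {
      def main(args: Array[String]): Unit = {
        // (executorId, partition) failure records, newest first
        var failures = List.empty[(String, Int)]
        var cachedBlacklist: Option[Set[String]] = None
        var rebuilds = 0
        var recordsScanned = 0L

        // stand-in for reEvaluateExecutorBlacklistAndUpdateCache: full scan of all failures
        def executorBlacklist(): Set[String] = cachedBlacklist.getOrElse {
          rebuilds += 1
          recordsScanned += failures.size
          val bl = failures.map(_._1).toSet
          cachedBlacklist = Some(bl)
          bl
        }

        (0 until 1000).foreach { partition =>
          executorBlacklist()                             // blacklist consulted on each offer
          failures = ("bad-exec", partition) :: failures  // task fails fast on the one bad exec
          cachedBlacklist = None                          // invalidateCache() after every failure
        }
        println(s"$rebuilds rebuilds, $recordsScanned records scanned for 1000 failures")
      }
    }

  The log excerpt below shows the same pattern playing out in the actual suite: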
+ + + +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,38) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO TaskSetManager: Starting task 38.0 in stage 8.0 (TID 21056, host-2, partition 38, PROCESS_LOCAL, 5112 bytes) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,39) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO TaskSetManager: Starting task 39.0 in stage 8.0 (TID 21057, host-0, partition 39, PROCESS_LOCAL, 5112 bytes) +16/05/23 20:53:57.871 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.871 mock backend thread INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,40) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO DAGScheduler: ShuffleMapStage 5 (RDD at SchedulerIntegrationSuite.scala:360) finished in 1.731 s +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO DAGScheduler: looking for newly runnable stages +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO DAGScheduler: running: Set(ShuffleMapStage 8) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO DAGScheduler: waiting: Set(ResultStage 9, ShuffleMapStage 6) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO DAGScheduler: failed: Set() +16/05/23 20:53:57.872 mock backend thread INFO TaskSetManager: Starting task 40.0 in stage 8.0 (TID 21058, host-0, partition 40, PROCESS_LOCAL, 5112 bytes) +16/05/23 20:53:57.872 task-result-getter-2 WARN TaskSetManager: Lost task 39.0 in stage 8.0 (TID 21057, host-0): java.lang.RuntimeException: bad exec 1 + at org.apache.spark.scheduler.SchedulerPerformanceSuite.backendWithBadExecs(SchedulerPerformanceSuite.scala:218) + at org.apache.spark.scheduler.SchedulerPerformanceSuite$$anonfun$runBadExecJob$1.apply$mcV$sp(SchedulerPerformanceSuite.scala:236) + at org.apache.spark.scheduler.SchedulerIntegrationSuite$$anon$2.run(SchedulerIntegrationSuite.scala:194) + +16/05/23 20:53:57.872 task-result-getter-2 INFO BlacklistTracker: invalidating blacklist cache +16/05/23 20:53:57.872 dag-scheduler-event-loop INFO DAGScheduler: Submitting ShuffleMapStage 6 (MockRDD 5), which has no missing parents +16/05/23 20:53:57.872 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.872 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,39) +16/05/23 20:53:57.872 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.872 mock backend thread INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,41) +16/05/23 20:53:57.872 mock backend thread INFO TaskSetManager: Starting task 41.0 in stage 8.0 (TID 21059, host-0, partition 41, PROCESS_LOCAL, 5112 bytes) +16/05/23 20:53:57.872 task-result-getter-3 INFO TaskSetManager: Lost task 40.0 in stage 8.0 (TID 21058) on executor host-0: java.lang.RuntimeException (bad exec 1) [duplicate 1] +16/05/23 20:53:57.872 task-result-getter-3 INFO BlacklistTracker: invalidating blacklist cache +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,40) +16/05/23 20:53:57.873 mock backend thread 
INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,39) +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,42) +16/05/23 20:53:57.873 mock backend thread INFO TaskSetManager: Starting task 42.0 in stage 8.0 (TID 21060, host-0, partition 42, PROCESS_LOCAL, 5112 bytes) +16/05/23 20:53:57.873 task-result-getter-1 INFO TaskSetManager: Lost task 41.0 in stage 8.0 (TID 21059) on executor host-0: java.lang.RuntimeException (bad exec 1) [duplicate 2] +16/05/23 20:53:57.873 task-result-getter-1 INFO BlacklistTracker: invalidating blacklist cache +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,41) +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,40) +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,39) +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,43) + + */ } From 883bfd7dd219998bc7a806c6d8dc8a50cd1f6a1a Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 24 May 2016 15:56:26 -0500 Subject: [PATCH 27/35] fix race condition w/ runningTaskSets --- .../spark/scheduler/SchedulerIntegrationSuite.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 3015dbe30379f..718963da7dc17 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -418,15 +418,20 @@ private class MockExternalClusterManager extends ExternalClusterManager { /** TaskSchedulerImpl that just tracks a tiny bit more state to enable checks in tests. */ class TestTaskScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) { /** Set of TaskSets the DAGScheduler has requested executed. 
*/ + // protected by this val runningTaskSets = HashSet[TaskSet]() override def submitTasks(taskSet: TaskSet): Unit = { - runningTaskSets += taskSet + synchronized { + runningTaskSets += taskSet + } super.submitTasks(taskSet) } override def taskSetFinished(manager: TaskSetManager): Unit = { - runningTaskSets -= manager.taskSet + synchronized { + runningTaskSets -= manager.taskSet + } super.taskSetFinished(manager) } } From 4358b2fdb022ed0b84668bbe8f5b8aa56c8ce637 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 24 May 2016 16:56:12 -0500 Subject: [PATCH 28/35] updated logging --- .../org/apache/spark/scheduler/BlacklistTracker.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 792964cfc9d92..e51f2b4ebb274 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -199,16 +199,15 @@ private[spark] class BlacklistTracker( sched: TaskSchedulerImpl, atomTask: StageAndPartition, clock: Clock): Set[String] = { + // TODO some kind of logging when the blacklist is *updated* val executors = executorsOnBlacklistedNode(sched, atomTask) ++ strategy.getExecutorBlacklist(executorIdToFailureStatus, atomTask, clock) - logInfo(s"Blacklisting executors ${executors} for task ${atomTask}") updateBlacklistExecutorCache(atomTask, executors) executors } private def reEvaluateNodeBlacklistForStageAndUpdateCache(stageId: Int): Set[String] = { val nodes = strategy.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) - logInfo(s"Blacklisting nodes ${nodes} for stage ${stageId}") updateBlacklistNodeCache(nodes) nodes } @@ -257,6 +256,10 @@ private[scheduler] trait BlacklistCache extends Logging { protected def updateBlacklistNodeForStageCache( stageId: Int, blacklistNode: Set[String]): Unit = cacheLock.synchronized { + val wasBlacklisted = blacklistNodeForStageCache.getOrElse(stageId, Set.empty[String]) + if (wasBlacklisted != blacklistNode) { + logInfo(s"Updating node blacklist for Stage ${stageId} to ${blacklistNode}") + } if (!_isBlacklistNodeForStageCacheValid) { blacklistNodeForStageCache.clear() } From f850a300f9cbb0ab951ccc83e1b36767a252962f Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 24 May 2016 17:32:29 -0500 Subject: [PATCH 29/35] log executor in addition to host --- .../org/apache/spark/scheduler/TaskSetManager.scala | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index bd74eef10e485..1a716ff901d86 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -462,8 +462,8 @@ private[spark] class TaskSetManager( // a good proxy to task serialization time. 
// val timeTaken = clock.getTime() - startTime val taskName = s"task ${info.id} in stage ${taskSet.id}" - logInfo(s"Starting $taskName (TID $taskId, $host, partition ${task.partitionId}," + - s" $taskLocality, ${serializedTask.limit} bytes)") + logInfo(s"Starting $taskName (TID $taskId, $host, exec ${info.executorId}, " + + s"partition ${task.partitionId},$taskLocality, ${serializedTask.limit} bytes)") sched.dagScheduler.taskStarted(task, info) return Some(new TaskDescription(taskId = taskId, attemptNumber = attemptNum, execId, @@ -603,8 +603,9 @@ private[spark] class TaskSetManager( sched.dagScheduler.taskEnded(tasks(index), Success, result.value(), result.accumUpdates, info) if (!successful(index)) { tasksSuccessful += 1 - logInfo("Finished task %s in stage %s (TID %d) in %d ms on %s (%d/%d)".format( - info.id, taskSet.id, info.taskId, info.duration, info.host, tasksSuccessful, numTasks)) + logInfo("Finished task %s in stage %s (TID %d) in %d ms on %s / exec %s (%d/%d)".format( + info.id, taskSet.id, info.taskId, info.duration, info.host, info.executorId, + tasksSuccessful, numTasks)) // Mark successful and stop if all the tasks have succeeded. successful(index) = true if (tasksSuccessful == numTasks) { @@ -635,8 +636,8 @@ private[spark] class TaskSetManager( val index = info.index copiesRunning(index) -= 1 var accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty - val failureReason = s"Lost task ${info.id} in stage ${taskSet.id} (TID $tid, ${info.host}): " + - reason.asInstanceOf[TaskFailedReason].toErrorString + val failureReason = s"Lost task ${info.id} in stage ${taskSet.id} (TID $tid, ${info.host}," + + s" exec ${info.executorId}): ${reason.asInstanceOf[TaskFailedReason].toErrorString}" val failureException: Option[Throwable] = reason match { case fetchFailed: FetchFailed => logWarning(failureReason) From 4ac99c6f43349ab6d40e9906a4ad316d85387b27 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 15:17:50 -0500 Subject: [PATCH 30/35] wip, logging and some logic updates --- .../apache/spark/scheduler/BlacklistStrategy.scala | 1 - .../org/apache/spark/scheduler/BlacklistTracker.scala | 11 +++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala index 8c690b6e3223b..b16fb642e65f0 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala @@ -127,7 +127,6 @@ private[scheduler] class AdvancedSingleTaskStrategy( clock.getTimeMillis() - failureStatus.updatedTime < expireTimeInMilliseconds }.values.map(_.host) getDuplicateElem(nodes, 1) - super.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) } override def getNodeBlacklist( diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index e51f2b4ebb274..1cbd36b574243 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -96,7 +96,8 @@ private[spark] class BlacklistTracker( } else { // getExecutorBlacklistFromCache(atomTask).getOrElse(Set.empty[String]) getExecutorBlacklistFromCache(atomTask).getOrElse { - // TODO Why is this necessary? + // TODO Why is this necessary? 
(its because we clear the entire map on an invalidate, + // and lazily rebuild it) reEvaluateExecutorBlacklistAndUpdateCache(sched, atomTask, clock) } } @@ -191,8 +192,12 @@ private[spark] class BlacklistTracker( private def executorsOnBlacklistedNode( sched: TaskSchedulerImpl, atomTask: StageAndPartition): Set[String] = { - nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) + val nodeBl = nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) .getOrElse(Set.empty[String])) + if (nodeBl.nonEmpty) { + logInfo(s"${atomTask} is blacklisted on executors ${nodeBl} from node blacklist") + } + nodeBl } private def reEvaluateExecutorBlacklistAndUpdateCache( @@ -208,6 +213,7 @@ private[spark] class BlacklistTracker( private def reEvaluateNodeBlacklistForStageAndUpdateCache(stageId: Int): Set[String] = { val nodes = strategy.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) +// updateBlacklistNodeForStageCache(stageId, nodes) updateBlacklistNodeCache(nodes) nodes } @@ -256,6 +262,7 @@ private[scheduler] trait BlacklistCache extends Logging { protected def updateBlacklistNodeForStageCache( stageId: Int, blacklistNode: Set[String]): Unit = cacheLock.synchronized { + // TODO this needs to actually get called, and add unit test val wasBlacklisted = blacklistNodeForStageCache.getOrElse(stageId, Set.empty[String]) if (wasBlacklisted != blacklistNode) { logInfo(s"Updating node blacklist for Stage ${stageId} to ${blacklistNode}") From 6f02ded3730c85571713bb3b4c108a00850f7306 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 15:20:46 -0500 Subject: [PATCH 31/35] performance suite updates --- .../scheduler/SchedulerPerformanceSuite.scala | 154 +++++++++++++++--- 1 file changed, 130 insertions(+), 24 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 4956cb0efb911..7368bcac0e28a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -16,6 +16,8 @@ */ package org.apache.spark.scheduler +import java.util.concurrent.atomic.AtomicBoolean + import scala.concurrent.duration.Duration import org.apache.spark.util.Utils @@ -47,7 +49,11 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } } - def runJobWithBackend(N: Int, backend: () => Unit): Unit = { + def runJobWithBackend(N: Int, backendFunc: () => Unit): Unit = { + runJobWithCustomBackend(N, new SimpleWrappedBackend(backend, backendFunc)) + } + + def runJobWithCustomBackend(N: Int, backendWrapper: WrappedBackend): Unit = { // Try to run as many jobs as we can in 10 seconds, get the time per job. 
The idea here is to // balance: // 1) have a big enough job that we're not effected by delays just from waiting for job @@ -55,7 +61,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM // 2) run enough iterations to get some reliable data // 3) not wait toooooo long var itrs = 0 - val totalMs = withBackend(backend) { + val totalMs = backendWrapper.withBackend { val start = System.currentTimeMillis() while (System.currentTimeMillis() - start < 10000 ) { // while (System.currentTimeMillis() - start < 10000 && itrs == 0) { @@ -76,7 +82,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM itrs += 1 } } - System.currentTimeMillis() - start + (System.currentTimeMillis() - start) } val msPerItr = Utils.msDurationToString((totalMs.toDouble / itrs).toLong) @@ -209,31 +215,89 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM runSuccessfulJob(3000) } - def backendWithBadExecs(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { - val taskDescription = backend.beginTask() - val host = backend.executorIdToExecutor(taskDescription.executorId).host - val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet - val task = taskSet.tasks(taskDescription.index) - if (badExecs(taskDescription.executorId)) { - val exc = new RuntimeException(s"bad exec ${taskDescription.executorId}") - backend.taskFailed(taskDescription, exc) - } else if (badHosts(host)) { - val exc = new RuntimeException(s"bad host ${host}") - backend.taskFailed(taskDescription, exc) - } else { - // every 5th stage is a ResultStage -- the rest are ShuffleMapStages - (task.stageId, task.partitionId) match { - case (stage, _) if stage % 5 != 4 => - backend.taskSuccess(taskDescription, - DAGSchedulerSuite.makeMapStatus(host, N)) - case (_, _) => - backend.taskSuccess(taskDescription, 42) + def backendWithBadExecs( + continue: AtomicBoolean, + N: Int, + badExecs: Set[String], + badHosts: Set[String]): Unit = { + var tasksToFail = List[TaskDescription]() + var tasksToSucceed = List[TaskDescription]() + val FAILURES_TILL_SUCCESS = 100 // that is, we get a task failure 100 times as fast as success + val waitForSuccess = 100 + var failuresSinceLastSuccess = 0 + while (continue.get()) { + // don't *just* keep failing tasks on the same executor. While there are tasks to fail, + // we fail them more often, but we fail across all executors. Furthermore, after X failures, + // we do have a task success + + // first, queue up all the tasks needing to run + while (backend.hasTasksWaitingToRun) { + val taskDescription = backend.beginTask() + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + if (badExecs(taskDescription.executorId) || badHosts(host)) { + tasksToFail :+= taskDescription + } else { + tasksToSucceed :+= taskDescription + } + } + + // send a task result. 
Failure if there are any and we haven't had too many failures in a row + def failTask(): Unit = { + failuresSinceLastSuccess += 1 + val toFail = tasksToFail.head + tasksToFail = tasksToFail.tail + val host = backend.executorIdToExecutor(toFail.executorId).host + if (badExecs(toFail.executorId)) { + val exc = new RuntimeException(s"bad exec ${toFail.executorId}") + backend.taskFailed(toFail, exc) + } else if (badHosts(host)) { + val exc = new RuntimeException(s"bad host ${host}") + backend.taskFailed(toFail, exc) + } + } + if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { + failTask() + } else if (tasksToSucceed.nonEmpty) { + // we might get here just by some chance of thread-scheduling in this mock. Tasks fail, + // but the dag scheduler thread hasn't processed those before this thread tries to find + // another task to respond to. +// Thread.sleep(waitForSuccess) + if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { + failTask() + } else { + logInfo(s"tasksToFail.size = ${tasksToFail.size}; " + + s"tasksToSucceed.size = ${tasksToSucceed.size}; " + + s"failuresSinceLastSuccess = ${failuresSinceLastSuccess}") + failuresSinceLastSuccess = 0 + val taskDescription = tasksToSucceed.head + tasksToSucceed = tasksToSucceed.tail + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + // every 5th stage is a ResultStage -- the rest are ShuffleMapStages + (task.stageId, task.partitionId) match { + case (stage, _) if stage % 5 != 4 => + backend.taskSuccess(taskDescription, + DAGSchedulerSuite.makeMapStatus(host, N)) + case (_, _) => + backend.taskSuccess(taskDescription, 42) + } + } + } else { + Thread.sleep(10) // wait till we've got work to do } } } def runBadExecJob(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { - runJobWithBackend(N, () => backendWithBadExecs(N, badExecs, badHosts)) + val backendWrapper = new WrappedBackend(backend) { + override def runBackend(continue: AtomicBoolean): Unit = { + backendWithBadExecs(continue, N, badExecs, badHosts) + } + } + runJobWithCustomBackend(N, backendWrapper) } val badExecs = (0 until 2).map{_.toString}.toSet @@ -253,11 +317,20 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM "spark.scheduler.blacklist.advancedStrategy" -> strategy ) ) { + // scalastyle:off println + println(s"Bad execs = ${badExecs}") + // scalastyle:on println + + // because offers get shuffled, its a crapshoot whether or not the "bad" executor will finish + // tasks first. (A more complicated mock backend could make sure it fails the first executor + // it gets assigned) runBadExecJob(3000, badExecs, badHosts) } } + // scalastyle:off line.size.limit + /* Here's how you can get into really slow scheduling, even with the simple blacklist. Say there is just one bad executor. You've got a bunch of tasks to run, and you schedule all available @@ -272,7 +345,6 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM anyway. 
- 16/05/23 20:53:57.871 dag-scheduler-event-loop INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,38) 16/05/23 20:53:57.871 dag-scheduler-event-loop INFO TaskSetManager: Starting task 38.0 in stage 8.0 (TID 21056, host-2, partition 38, PROCESS_LOCAL, 5112 bytes) 16/05/23 20:53:57.871 dag-scheduler-event-loop INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 @@ -320,4 +392,38 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM */ + // scalastyle:on line.size.limit + + abstract class WrappedBackend(backend: MockBackend) { + val backendContinue = new AtomicBoolean(true) + def runBackend(continue: AtomicBoolean): Unit + val backendThread = new Thread("mock backend thread") { + override def run(): Unit = { + runBackend(backendContinue) + } + } + + def withBackend[T](testBody: => T): T = { + try { + backendThread.start() + testBody + } finally { + backendContinue.set(false) + backendThread.join() + } + } + } + + class SimpleWrappedBackend(backend: MockBackend, backendFunc: () => Unit) + extends WrappedBackend(backend) { + override def runBackend(continue: AtomicBoolean): Unit = { + while (continue.get()) { + if (backend.hasTasksWaitingToRun) { + backendFunc() + } else { + Thread.sleep(10) + } + } + } + } } From 71f1b477eafe47bcaee513987fbd2e8d4a4d5358 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 22:53:45 -0500 Subject: [PATCH 32/35] optimization -- skip blacklisted executors earlier in scheduling loop --- .../org/apache/spark/scheduler/TaskSchedulerImpl.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 371fb8602f785..3b3dfa206a4e5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -248,10 +248,16 @@ private[spark] class TaskSchedulerImpl( availableCpus: Array[Int], tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = { var launchedTask = false + // TODO unit test, and also add executor-stage filtering as well + // This is an optimization -- the taskSet might contain a very long list of pending tasks. + // Rather than wasting time checking the offer against each task, and then realizing the + // executor is blacklisted, just filter out the bad executor immediately. 
+ val nodeBlacklist = taskSet.blacklistTracker.map{_.nodeBlacklistForStage(taskSet.stageId)} + .getOrElse(Set()) for (i <- 0 until shuffledOffers.size) { val execId = shuffledOffers(i).executorId val host = shuffledOffers(i).host - if (availableCpus(i) >= CPUS_PER_TASK) { + if (!nodeBlacklist(host) && availableCpus(i) >= CPUS_PER_TASK) { try { for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { tasks(i) += task From ffd0f252f012c3f5e12d6f1f500667700f1a5f65 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 22:55:41 -0500 Subject: [PATCH 33/35] bug fix -- update the right cache in nodeBlacklistForStage --- .../scala/org/apache/spark/scheduler/BlacklistTracker.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 1cbd36b574243..617c6ce8f9b80 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -213,8 +213,8 @@ private[spark] class BlacklistTracker( private def reEvaluateNodeBlacklistForStageAndUpdateCache(stageId: Int): Set[String] = { val nodes = strategy.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) -// updateBlacklistNodeForStageCache(stageId, nodes) - updateBlacklistNodeCache(nodes) + updateBlacklistNodeForStageCache(stageId, nodes) +// updateBlacklistNodeCache(nodes) nodes } } From 3effef6c17cc5c5e4c4385103c7d96320b015672 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 22:56:11 -0500 Subject: [PATCH 34/35] cleanup, TODOs --- .../org/apache/spark/scheduler/BlacklistTracker.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 617c6ce8f9b80..4ca0713880a64 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -116,6 +116,10 @@ private[spark] class BlacklistTracker( // The actual implementation is delegated to strategy def nodeBlacklistForStage(stageId: Int): Set[String] = synchronized { + // TODO here and elsewhere -- we invalidate the cache way too often. In general, we should + // be able to do an in-place update of the caches. 
(a) this is slow and (b) it makes + // it really hard to track when the blacklist actually changes (would be *really* nice to + // log a msg about node level blacklisting at least) if (isBlacklistNodeForStageCacheValid) { getNodeBlacklistForStageFromCache(stageId).getOrElse( reEvaluateNodeBlacklistForStageAndUpdateCache(stageId)) @@ -192,12 +196,8 @@ private[spark] class BlacklistTracker( private def executorsOnBlacklistedNode( sched: TaskSchedulerImpl, atomTask: StageAndPartition): Set[String] = { - val nodeBl = nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) + nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) .getOrElse(Set.empty[String])) - if (nodeBl.nonEmpty) { - logInfo(s"${atomTask} is blacklisted on executors ${nodeBl} from node blacklist") - } - nodeBl } private def reEvaluateExecutorBlacklistAndUpdateCache( From 456f578121801257bb90b0cbfbd9fa37a117961e Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 22:57:01 -0500 Subject: [PATCH 35/35] process tasks in LIFO order for all performance tests, more cases, etc. --- .../scheduler/SchedulerPerformanceSuite.scala | 204 +++++++++--------- 1 file changed, 102 insertions(+), 102 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 7368bcac0e28a..515ce0a4d6e69 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -33,26 +33,6 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM join(N, b, c) } - def goodBackend(N: Int): Unit = { - val taskDescription = backend.beginTask() - val host = backend.executorIdToExecutor(taskDescription.executorId).host - val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet - val task = taskSet.tasks(taskDescription.index) - - // every 5th stage is a ResultStage -- the rest are ShuffleMapStages - (task.stageId, task.partitionId) match { - case (stage, _) if stage % 5 != 4 => - backend.taskSuccess(taskDescription, - DAGSchedulerSuite.makeMapStatus(host, N)) - case (_, _) => - backend.taskSuccess(taskDescription, 42) - } - } - - def runJobWithBackend(N: Int, backendFunc: () => Unit): Unit = { - runJobWithCustomBackend(N, new SimpleWrappedBackend(backend, backendFunc)) - } - def runJobWithCustomBackend(N: Int, backendWrapper: WrappedBackend): Unit = { // Try to run as many jobs as we can in 10 seconds, get the time per job. 
The idea here is to // balance: @@ -92,7 +72,17 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } def runSuccessfulJob(N: Int): Unit = { - runJobWithBackend(N, () => goodBackend(N)) + runJobWithCustomBackend(N, new QueuingWrappedBackend(backend) { + override def handleTask(taskDesc: TaskDescription, task: Task[_], host: String): Unit = { + // every 5th stage is a ResultStage -- the rest are ShuffleMapStages + (task.stageId, task.partitionId) match { + case (stage, _) if stage % 5 != 4 => + queueSuccess(taskDesc, DAGSchedulerSuite.makeMapStatus(host, N)) + case (_, _) => + queueSuccess(taskDesc, 42) + } + } + }) } testScheduler("Scheduling speed -- small job on a small cluster") { @@ -215,101 +205,44 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM runSuccessfulJob(3000) } - def backendWithBadExecs( - continue: AtomicBoolean, - N: Int, - badExecs: Set[String], - badHosts: Set[String]): Unit = { - var tasksToFail = List[TaskDescription]() - var tasksToSucceed = List[TaskDescription]() - val FAILURES_TILL_SUCCESS = 100 // that is, we get a task failure 100 times as fast as success - val waitForSuccess = 100 - var failuresSinceLastSuccess = 0 - while (continue.get()) { - // don't *just* keep failing tasks on the same executor. While there are tasks to fail, - // we fail them more often, but we fail across all executors. Furthermore, after X failures, - // we do have a task success - - // first, queue up all the tasks needing to run - while (backend.hasTasksWaitingToRun) { - val taskDescription = backend.beginTask() - val host = backend.executorIdToExecutor(taskDescription.executorId).host - val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet - val task = taskSet.tasks(taskDescription.index) - if (badExecs(taskDescription.executorId) || badHosts(host)) { - tasksToFail :+= taskDescription - } else { - tasksToSucceed :+= taskDescription - } - } - - // send a task result. Failure if there are any and we haven't had too many failures in a row - def failTask(): Unit = { - failuresSinceLastSuccess += 1 - val toFail = tasksToFail.head - tasksToFail = tasksToFail.tail - val host = backend.executorIdToExecutor(toFail.executorId).host - if (badExecs(toFail.executorId)) { - val exc = new RuntimeException(s"bad exec ${toFail.executorId}") - backend.taskFailed(toFail, exc) + def runBadExecJob(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { + val backendWrapper = new QueuingWrappedBackend(backend) { + override def handleTask(taskDesc: TaskDescription, task: Task[_], host: String): Unit = { + if (badExecs(taskDesc.executorId)) { + val exc = new RuntimeException(s"bad exec ${taskDesc.executorId}") + queueFailure(taskDesc, exc) } else if (badHosts(host)) { val exc = new RuntimeException(s"bad host ${host}") - backend.taskFailed(toFail, exc) - } - } - if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { - failTask() - } else if (tasksToSucceed.nonEmpty) { - // we might get here just by some chance of thread-scheduling in this mock. Tasks fail, - // but the dag scheduler thread hasn't processed those before this thread tries to find - // another task to respond to. 
-// Thread.sleep(waitForSuccess) - if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { - failTask() + queueFailure(taskDesc, exc) } else { - logInfo(s"tasksToFail.size = ${tasksToFail.size}; " + - s"tasksToSucceed.size = ${tasksToSucceed.size}; " + - s"failuresSinceLastSuccess = ${failuresSinceLastSuccess}") - failuresSinceLastSuccess = 0 - val taskDescription = tasksToSucceed.head - tasksToSucceed = tasksToSucceed.tail - val host = backend.executorIdToExecutor(taskDescription.executorId).host - val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet - val task = taskSet.tasks(taskDescription.index) // every 5th stage is a ResultStage -- the rest are ShuffleMapStages (task.stageId, task.partitionId) match { case (stage, _) if stage % 5 != 4 => - backend.taskSuccess(taskDescription, - DAGSchedulerSuite.makeMapStatus(host, N)) + queueSuccess(taskDesc, DAGSchedulerSuite.makeMapStatus(host, N)) case (_, _) => - backend.taskSuccess(taskDescription, 42) + queueSuccess(taskDesc, 42) } } - } else { - Thread.sleep(10) // wait till we've got work to do - } - } - } - - def runBadExecJob(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { - val backendWrapper = new WrappedBackend(backend) { - override def runBackend(continue: AtomicBoolean): Unit = { - backendWithBadExecs(continue, N, badExecs, badHosts) } } runJobWithCustomBackend(N, backendWrapper) } - val badExecs = (0 until 2).map{_.toString}.toSet + val oneBadExec = Set("0") + // intentionally on different nodes, so they don't trigger node blacklist + val twoBadExecs = Set("0", "15") + // note this is *very* unlikely to succeed without blacklisting, even though its only // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess // which is the only executor which has a free slot? Bingo, the one it just failed on Seq( - ("bad execs with simple blacklist", "false", Set[String]()), - ("bad execs with advanced blacklist", "true", Set[String]()), - ("bad hosts with advanced blacklist", "true", Set[String]("host-0")) - ).foreach { case (name, strategy, badHosts) => + ("bad exec with simple blacklist", "false", oneBadExec, Set[String]()), + ("two bad execs with simple blacklist", "false", twoBadExecs, Set[String]()), + ("bad exec with advanced blacklist", "true", oneBadExec, Set[String]()), + ("bad host with advanced blacklist", "true", Set[String](), Set[String]("host-0")), + ("bad exec and host with advanced blacklist", "true", oneBadExec, Set[String]("host-3")) + ).foreach { case (name, strategy, badExecs, badHosts) => testScheduler( s"COMPARE D $name", extraConfs = Seq( @@ -394,6 +327,18 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM // scalastyle:on line.size.limit + + /* + RESULTS + + On a happy cluster, speed is about the same in all modes, ~5s per iteration + + On a bad cluster, slow in all versions, about 2m per iteration (original code, and new code with + various strategies). the reason is that we waste soooooo long looping all tasks through + the bad nodes, and that has one n^2 penalty. 
+ + */ + abstract class WrappedBackend(backend: MockBackend) { val backendContinue = new AtomicBoolean(true) def runBackend(continue: AtomicBoolean): Unit @@ -412,16 +357,71 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM backendThread.join() } } + } - class SimpleWrappedBackend(backend: MockBackend, backendFunc: () => Unit) - extends WrappedBackend(backend) { + abstract class QueuingWrappedBackend(backend: MockBackend) extends WrappedBackend(backend) { + var tasksToFail = List[(TaskDescription, Exception)]() + var tasksToSucceed = List[(TaskDescription, Any)]() + val FAILURES_TILL_SUCCESS = 100 + // that is, we get a task failure 100 times as fast as success + val waitForSuccess = 100 + var failuresSinceLastSuccess = 0 + + def handleTask(taskDesc: TaskDescription, task: Task[_], host: String): Unit + + def queueSuccess(taskDesc: TaskDescription, result: Any): Unit = { + tasksToSucceed :+= taskDesc -> result + } + + def queueFailure(taskDesc: TaskDescription, exc: Exception): Unit = { + tasksToFail :+= taskDesc -> exc + } + override def runBackend(continue: AtomicBoolean): Unit = { while (continue.get()) { - if (backend.hasTasksWaitingToRun) { - backendFunc() + // don't *just* keep failing tasks on the same executor. While there are tasks to fail, + // we fail them more often, but we fail across all executors. Furthermore, after X failures + // we do have a task success + + // first, queue up all the tasks needing to run + while (backend.hasTasksWaitingToRun) { + val taskDescription = backend.beginTask() + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + handleTask(taskDescription, task, host) + } + + // send a task result. Prioritize failures, if we haven't had too many failures in a row + def failTask(): Unit = { + failuresSinceLastSuccess += 1 + val (toFail, exc) = tasksToFail.head + tasksToFail = tasksToFail.tail + backend.taskFailed(toFail, exc) + } + + if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { + failTask() + } else if (tasksToSucceed.nonEmpty) { + // we might get here just by some chance of thread-scheduling in this mock. Tasks fail, + // but the scheduler thread hasn't processed those before this thread tries to find + // another task to respond to. + // if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { + // failTask() + // } else { + logInfo(s"tasksToFail.size = ${tasksToFail.size}; " + + s"tasksToSucceed.size = ${tasksToSucceed.size}; " + + s"failuresSinceLastSuccess = ${failuresSinceLastSuccess}") + failuresSinceLastSuccess = 0 + val (taskDescription, result) = tasksToSucceed.head + tasksToSucceed = tasksToSucceed.tail + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + backend.taskSuccess(taskDescription, result) } else { - Thread.sleep(10) + Thread.sleep(10) // wait till we've got work to do } } }
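
One way a future test could plug another failure pattern into this harness is by overriding
handleTask. The following is a sketch only, not part of the patch series: the bad host name
"host-1" is made up, and the method is meant to live inside SchedulerPerformanceSuite so it can
reuse the queueSuccess / queueFailure / runJobWithCustomBackend hooks defined above.

  def runOneBadHostJob(N: Int): Unit = {
    val wrapper = new QueuingWrappedBackend(backend) {
      override def handleTask(taskDesc: TaskDescription, task: Task[_], host: String): Unit = {
        if (host == "host-1") {
          // every task that lands on the (hypothetical) bad host fails
          queueFailure(taskDesc, new RuntimeException(s"bad host $host"))
        } else if (task.stageId % 5 != 4) {
          // every 5th stage is a ResultStage -- the rest are ShuffleMapStages
          queueSuccess(taskDesc, DAGSchedulerSuite.makeMapStatus(host, N))
        } else {
          queueSuccess(taskDesc, 42)
        }
      }
    }
    runJobWithCustomBackend(N, wrapper)
  }

This is effectively runBadExecJob with badExecs empty and badHosts = Set("host-1"), but it shows
the minimal handleTask contract on its own.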