From a6e94d7dac23431462b576b059de701eac548643 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 10 May 2016 15:31:57 -0500 Subject: [PATCH 01/35] basic test framework for entire spark scheduler --- .../scala/org/apache/spark/SparkContext.scala | 9 + .../apache/spark/scheduler/DAGScheduler.scala | 19 +- .../spark/scheduler/TaskResultGetter.scala | 12 + .../org/apache/spark/util/EventLoop.scala | 9 +- .../spark/scheduler/DAGSchedulerSuite.scala | 16 +- .../scheduler/SchedulerIntegrationSuite.scala | 390 ++++++++++++++++++ 6 files changed, 446 insertions(+), 9 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e391599336074..c3f51923e73a7 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2420,6 +2420,14 @@ object SparkContext extends Logging { scheduler.initialize(backend) (backend, scheduler) + case MOCK_REGEX(backendClassName) => + val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) + val backendClass = Utils.classForName(backendClassName) + val ctor = backendClass.getConstructor(classOf[SparkConf], classOf[TaskSchedulerImpl]) + val backend = ctor.newInstance(sc.getConf, scheduler).asInstanceOf[SchedulerBackend] + scheduler.initialize(backend) + (backend, scheduler) + case LOCAL_N_REGEX(threads) => def localCpuCount: Int = Runtime.getRuntime.availableProcessors() // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads. @@ -2520,6 +2528,7 @@ object SparkContext extends Logging { * A collection of regexes for extracting information from the master string. */ private object SparkMasterRegex { + val MOCK_REGEX = """mock\[(.*)\]""".r // Regular expression used for local[N] and local[*] master formats val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r // Regular expression for local[N, maxRetries], used in tests with failing tasks diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 5291b663667ea..709514f4327c4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -183,6 +183,14 @@ class DAGScheduler( private val messageScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("dag-scheduler-message") + private val msgsScheduled = new AtomicInteger(0) + + /** + * Visible for testing, to know if the DAGScheduler is still "busy" + */ + private[scheduler] def msgSchedulerEmpty: Boolean = { + msgsScheduled.get() == 0 + } private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this) taskScheduler.setDAGScheduler(this) @@ -1283,8 +1291,15 @@ class DAGScheduler( // TODO: Cancel running tasks in the stage logInfo(s"Resubmitting $mapStage (${mapStage.name}) and " + s"$failedStage (${failedStage.name}) due to fetch failure") + // We might get lots of fetch failed for this stage, from lots of executors. + // Its better if we can resubmit for all the failed executors at one time, so lets + // just wait a *bit* before we resubmit. 
+ msgsScheduled.incrementAndGet() messageScheduler.schedule(new Runnable { - override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages) + override def run(): Unit = { + eventProcessLoop.post(ResubmitFailedStages) + msgsScheduled.decrementAndGet() + } }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS) } failedStages += failedStage @@ -1411,7 +1426,7 @@ class DAGScheduler( stage.clearFailures() } else { stage.latestInfo.stageFailed(errorMessage.get) - logInfo("%s (%s) failed in %s s".format(stage, stage.name, serviceTime)) + logInfo(s"$stage (${stage.name}) failed in $serviceTime s due to ${errorMessage.get}") } outputCommitCoordinator.stageEnd(stage.id) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index 685ef55c66876..bc7b32ad0b5c5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -19,6 +19,7 @@ package org.apache.spark.scheduler import java.nio.ByteBuffer import java.util.concurrent.{ExecutorService, RejectedExecutionException} +import java.util.concurrent.atomic.AtomicInteger import scala.language.existentials import scala.util.control.NonFatal @@ -37,6 +38,11 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul private val THREADS = sparkEnv.conf.getInt("spark.resultGetter.threads", 4) + private val nTasks = new AtomicInteger(0) + def isEmpty: Boolean = { + nTasks.get() == 0 + } + // Exposed for testing. protected val getTaskResultExecutor: ExecutorService = ThreadUtils.newDaemonFixedThreadPool(THREADS, "task-result-getter") @@ -52,6 +58,7 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul taskSetManager: TaskSetManager, tid: Long, serializedData: ByteBuffer): Unit = { + nTasks.incrementAndGet() getTaskResultExecutor.execute(new Runnable { override def run(): Unit = Utils.logUncaughtExceptions { try { @@ -111,6 +118,8 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul case NonFatal(ex) => logError("Exception while getting task result", ex) taskSetManager.abort("Exception while getting task result: %s".format(ex)) + } finally { + nTasks.decrementAndGet() } } }) @@ -119,6 +128,7 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul def enqueueFailedTask(taskSetManager: TaskSetManager, tid: Long, taskState: TaskState, serializedData: ByteBuffer) { var reason : TaskEndReason = UnknownReason + nTasks.incrementAndGet() try { getTaskResultExecutor.execute(new Runnable { override def run(): Unit = Utils.logUncaughtExceptions { @@ -142,6 +152,8 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul } catch { case e: RejectedExecutionException if sparkEnv.isStopped => // ignore it + } finally { + nTasks.decrementAndGet() } } diff --git a/core/src/main/scala/org/apache/spark/util/EventLoop.scala b/core/src/main/scala/org/apache/spark/util/EventLoop.scala index 3ea9139e11027..d5ece57b88c9e 100644 --- a/core/src/main/scala/org/apache/spark/util/EventLoop.scala +++ b/core/src/main/scala/org/apache/spark/util/EventLoop.scala @@ -18,7 +18,7 @@ package org.apache.spark.util import java.util.concurrent.{BlockingQueue, LinkedBlockingDeque} -import java.util.concurrent.atomic.AtomicBoolean +import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger} import scala.util.control.NonFatal @@ -36,6 +36,7 @@ 
private[spark] abstract class EventLoop[E](name: String) extends Logging { private val eventQueue: BlockingQueue[E] = new LinkedBlockingDeque[E]() private val stopped = new AtomicBoolean(false) + private val nMsgs = new AtomicInteger(0) private val eventThread = new Thread(name) { setDaemon(true) @@ -46,6 +47,7 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { val event = eventQueue.take() try { onReceive(event) + nMsgs.decrementAndGet() } catch { case NonFatal(e) => try { @@ -99,6 +101,7 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { * Put the event into the event queue. The event thread will process it later. */ def post(event: E): Unit = { + nMsgs.incrementAndGet() eventQueue.put(event) } @@ -107,6 +110,10 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { */ def isActive: Boolean = eventThread.isAlive + def isEmpty: Boolean = { + nMsgs.get() == 0 + } + /** * Invoked when `start()` is called but before the event thread starts. */ diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 088a476086217..9c004e0dd76dc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -98,6 +98,8 @@ class DAGSchedulerSuiteDummyException extends Exception class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeouts { + import DAGSchedulerSuite._ + val conf = new SparkConf /** Set of TaskSets the DAGScheduler has requested executed. */ val taskSets = scala.collection.mutable.Buffer[TaskSet]() @@ -2027,12 +2029,6 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou } } - private def makeMapStatus(host: String, reduces: Int, sizes: Byte = 2): MapStatus = - MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes)) - - private def makeBlockManagerId(host: String): BlockManagerId = - BlockManagerId("exec-" + host, host, 12345) - private def assertDataStructuresEmpty(): Unit = { assert(scheduler.activeJobs.isEmpty) assert(scheduler.failedStages.isEmpty) @@ -2072,5 +2068,13 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou } CompletionEvent(task, reason, result, accumUpdates ++ extraAccumUpdates, taskInfo) } +} + +object DAGSchedulerSuite { + def makeMapStatus(host: String, reduces: Int, sizes: Byte = 2): MapStatus = + MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes)) + + def makeBlockManagerId(host: String): BlockManagerId = + BlockManagerId("exec-" + host, host, 12345) } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala new file mode 100644 index 0000000000000..2c55d50aa0cd0 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler + +import java.util.concurrent.TimeoutException + +import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} + +import org.scalactic.TripleEquals +import org.scalatest.Assertions.AssertionsHelper +import org.scalatest.BeforeAndAfter + +import org.apache.spark._ +import org.apache.spark.TaskState._ +import org.apache.spark.executor.TaskMetrics +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.scheduler.DAGSchedulerSuite._ +import org.apache.spark.util.CallSite + +/** + * Tests for the entire scheduler code -- DAGScheduler, TaskSchedulerImpl, TaskSets, + * TaskSetManagers. + * + * Test cases are configured by providing a set of jobs to submit, and then simulating interaction + * with spark's executors via a mocked backend (eg., task completion, task failure, executors + * disconnecting, etc.). + */ +class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext { + val conf = new SparkConf + + /** Set of TaskSets the DAGScheduler has requested executed. */ + val runningTaskSets = HashSet[TaskSet]() + + var taskScheduler: TaskSchedulerImpl = null + var scheduler: DAGScheduler = null + var backend: SingleCoreMockBackend = null + + before { + runningTaskSets.clear() + results.clear() + sc = new SparkContext("mock[org.apache.spark.scheduler.SingleCoreMockBackend]", + "SchedulerIntegrationSuite") + backend = sc.schedulerBackend.asInstanceOf[SingleCoreMockBackend] + taskScheduler = new TaskSchedulerImpl(sc) { + override def submitTasks(taskSet: TaskSet): Unit = { + runningTaskSets += taskSet + super.submitTasks(taskSet) + } + override def taskSetFinished(manager: TaskSetManager): Unit = { + runningTaskSets -= manager.taskSet + super.taskSetFinished(manager) + } + } + taskScheduler.initialize(sc.schedulerBackend) + backend.taskScheduler = taskScheduler + scheduler = new DAGScheduler(sc, taskScheduler) + taskScheduler.setDAGScheduler(scheduler) + } + + after { + taskScheduler.stop() + backend.stop() + scheduler.stop() + } + + /** + * Process the supplied event as if it were the top of the DAGScheduler event queue, expecting + * the scheduler not to exit. + */ + private def runEvent(event: DAGSchedulerEvent) { + scheduler.eventProcessLoop.post(event) + } + + val results = new HashMap[Int, Any]() + var failure: Exception = _ + val jobListener = new JobListener() { + override def taskSucceeded(index: Int, result: Any) = results.put(index, result) + override def jobFailed(exception: Exception) = { failure = exception } + } + + /** + * When we submit dummy Jobs, this is the compute function we supply. + */ + private val jobComputeFunc: (TaskContext, scala.Iterator[_]) => Any = { + (context: TaskContext, it: Iterator[(_)]) => + throw new RuntimeException("jobComputeFunc shouldn't get called in this mock") + } + + /** Sends the rdd to the scheduler for scheduling and returns the job id. 
*/ + private def submit( + rdd: RDD[_], + partitions: Array[Int], + func: (TaskContext, Iterator[_]) => _ = jobComputeFunc, + listener: JobListener = jobListener): Int = { + val jobId = scheduler.nextJobId.getAndIncrement() + runEvent(JobSubmitted(jobId, rdd, func, partitions, CallSite("", ""), listener)) + jobId + } + + /** + * Return true if the backend has more work to do, false otherwise. It will block until it has + * a definitive answer either way -- eg., if the backend does not appear to have any work, but + * the dag scheduler has some events left to process, this will wait until the dag scheduler is + * done processing enough events to say for sure. + */ + private def backendHasWorkToDo: Boolean = { + // the ordering is somewhat important here -- avoid waiting if we can (both to speed up test, + // and also to test with more concurrency inside scheduler) + if (backend.runningTasks.nonEmpty) { + true + } else if (runningTaskSets.isEmpty && scheduler.msgSchedulerEmpty && + scheduler.eventProcessLoop.isEmpty && taskScheduler.taskResultGetter.isEmpty ) { + false + } else if (runningTaskSets.nonEmpty) { + // need to get all task results, as they might lead to finishing a taskSet + waitUntil(() => taskScheduler.taskResultGetter.isEmpty) + backendHasWorkToDo + } else { + waitUntil(() => taskScheduler.taskResultGetter.isEmpty) + waitUntil(() => scheduler.eventProcessLoop.isEmpty) + backendHasWorkToDo + } + } + + private def waitUntil(condition: () => Boolean): Unit = { + val timeoutMillis = 1000L + val finishTime = System.currentTimeMillis + timeoutMillis + while (!condition()) { + if (System.currentTimeMillis > finishTime) { + throw new TimeoutException( + s"Not ready after $timeoutMillis milliseconds") + } + /* Sleep rather than using wait/notify, because this is used only for testing and + * wait/notify add overhead in the general case. */ + Thread.sleep(10) + } + } + + private def assertDataStructuresEmpty(): Unit = { + assert(!backendHasWorkToDo) + assert(runningTaskSets.isEmpty) + assert(backend.runningTasks.isEmpty) + } + + /** + * Looks at all shuffleMapOutputs that are dependencies of the given RDD, and makes sure + * they are all registered + */ + private def assertMapOutputAvailable(targetRdd: MockRDD): Unit = { + val shuffleIds = targetRdd.shuffleDeps.map{_.shuffleId} + val nParts = targetRdd.numPartitions + for { + shuffleId <- shuffleIds + reduceIdx <- (0 until nParts) + } { + val statuses = taskScheduler.mapOutputTracker.getMapSizesByExecutorId(shuffleId, reduceIdx) + // really we should have already thrown an exception rather than fail either of these + // asserts, but just to be extra defensive let's double check the statuses are OK + assert(statuses != null) + assert(statuses.nonEmpty) + } + } + + + /** models a stage boundary with a single dependency, like a shuffle */ + def shuffle(nParts: Int, input: MockRDD): MockRDD = { + val partitioner = new HashPartitioner(nParts) + val shuffleDep = new ShuffleDependency(input, partitioner) + new MockRDD(sc, nParts, List(shuffleDep)) + } + + /** models a stage boundary with multiple dependencies, like a join */ + def join(nParts: Int, inputs: MockRDD*): MockRDD = { + val partitioner = new HashPartitioner(nParts) + val shuffleDeps = inputs.map { inputRDD => + new ShuffleDependency(inputRDD, partitioner) + } + new MockRDD(sc, nParts, shuffleDeps) + } + + /** + * Very simple one stage job. 
Backend successfully completes each task, one by one + */ + test("super simple job") { + submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) + while (backendHasWorkToDo) { + val task = backend.runningTasks.last + backend.taskSuccess(task, 42) + } + assert(results === (0 until 10).map { _ -> 42 }.toMap) + assertDataStructuresEmpty() + } + + /** + * 5 stage job, diamond dependencies. + * + * a ----> b ----> d --> result + * \--> c --/ + * + * Backend successfully completes each task + */ + test("multi-stage job") { + val a = new MockRDD(sc, 2, Nil) + val b = shuffle(10, a) + val c = shuffle(20, a) + val d = join(30, b, c) + submit(d, (0 until 30).toArray) + + def stageToOutputParts(stageId: Int): Int = { + stageId match { + case 0 => 10 + case 2 => 20 + case _ => 30 + } + } + + while (backendHasWorkToDo) { + assert(backend.runningTasks.nonEmpty) + val taskDescription = backend.runningTasks.last + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + + // make sure the required map output is available + task.stageId match { + case 1 => assertMapOutputAvailable(b) + case 3 => assertMapOutputAvailable(c) + case 4 => assertMapOutputAvailable(d) + case _ => // no shuffle map input, nothing to check + } + + (task.stageId, task.stageAttemptId, task.partitionId) match { + case (stage, 0, _) if stage < 4 => + backend.taskSuccess(taskDescription, makeMapStatus("hostA", stageToOutputParts(stage))) + case (4, 0, partition) => + backend.taskSuccess(taskDescription, 4321 + partition) + } + } + assert(results === (0 until 30).map { idx => idx -> (4321 + idx) }.toMap) + assertDataStructuresEmpty() + } + + /** + * 2 stage job, with a fetch failure. Make sure that: + * (a) map output is available whenever we run stage 1 + * (b) we get a second attempt for stage 0 & stage 1 + */ + test("job with fetch failure") { + val input = new MockRDD(sc, 2, Nil) + val shuffledRdd = shuffle(10, input) + val shuffleId = shuffledRdd.shuffleDeps.head.shuffleId + submit(shuffledRdd, (0 until 10).toArray) + + val stageToAttempts = new HashMap[Int, HashSet[Int]]() + + while (backendHasWorkToDo) { + val taskDescription = backend.runningTasks.last + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + stageToAttempts.getOrElseUpdate(task.stageId, new HashSet()) += task.stageAttemptId + + // make sure the required map output is available + task.stageId match { + case 1 => assertMapOutputAvailable(shuffledRdd) + case _ => // no shuffle map input, nothing to check + } + + (task.stageId, task.stageAttemptId, task.partitionId) match { + case (0, _, _) => + backend.taskSuccess(taskDescription, makeMapStatus("hostA", 10)) + case (1, 0, 0) => + val fetchFailed = FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored") + backend.failTask(taskDescription, TaskState.FAILED, fetchFailed) + case (1, _, partition) => + backend.taskSuccess(taskDescription, 42 + partition) + } + } + assert(results === (0 until 10).map { idx => idx -> (42 + idx) }.toMap) + assert(stageToAttempts === Map(0 -> Set(0, 1), 1 -> Set(0, 1))) + assertDataStructuresEmpty() + } +} + +/** + * A very simple mock backend that can just run one task at a time. 
+ */ +private[spark] class SingleCoreMockBackend( + conf: SparkConf, + var taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { + + val cores = 1 + + override def start(): Unit = {} + + override def stop(): Unit = {} + + override def defaultParallelism(): Int = conf.getInt("spark.default.parallelism", cores) + + var freeCores = cores + val localExecutorId = SparkContext.DRIVER_IDENTIFIER + val localExecutorHostname = "localhost" + val env = SparkEnv.get + + val runningTasks = ArrayBuffer[TaskDescription]() + + /** + * This is called by the scheduler whenever it has tasks it would like to schedule + */ + override def reviveOffers(): Unit = { + val offers = Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores)) + val newTasks = taskScheduler.resourceOffers(offers).flatten + synchronized { + freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK + runningTasks ++= newTasks + } + } + + def taskSuccess(task: TaskDescription, result: Any): Unit = { + val ser = env.serializer.newInstance() + val resultBytes = ser.serialize(result) + val metrics = new TaskMetrics + val directResult = new DirectTaskResult(resultBytes, Seq()) // no accumulator updates + val serializedDirectResult = ser.serialize(directResult) + taskScheduler.statusUpdate(task.taskId, TaskState.FINISHED, serializedDirectResult) + synchronized { + freeCores += taskScheduler.CPUS_PER_TASK + runningTasks -= task + } + reviveOffers() + } + + def failTask(task: TaskDescription, state: TaskState, result: Any): Unit = { + val ser = env.serializer.newInstance() + val resultBytes = ser.serialize(result) + taskScheduler.statusUpdate(task.taskId, state, resultBytes) + if (TaskState.isFinished(state)) { + synchronized { + freeCores += taskScheduler.CPUS_PER_TASK + runningTasks -= task + } + reviveOffers() + } + } + +} + +class MockRDD( + sc: SparkContext, + val numPartitions: Int, + val shuffleDeps: Seq[ShuffleDependency[Int, Int, Nothing]] +) extends RDD[(Int, Int)](sc, shuffleDeps) with Serializable { + + MockRDD.validate(numPartitions, shuffleDeps) + + override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = + throw new RuntimeException("should not be reached") + override def getPartitions: Array[Partition] = (0 until numPartitions).map(i => new Partition { + override def index: Int = i + }).toArray + override def getPreferredLocations(split: Partition): Seq[String] = Nil + override def toString: String = "MockRDD " + id +} + +object MockRDD extends AssertionsHelper with TripleEquals { + /** + * make sure all the shuffle dependencies have a consistent number of output partitions + * (mostly to make sure the test setup makes sense, not that Spark itself would get this wrong) + */ + def validate(numPartitions: Int, dependencies: Seq[ShuffleDependency[_, _, _]]): Unit = { + dependencies.foreach { dependency => + val partitioner = dependency.partitioner + assert(partitioner != null) + assert(partitioner.numPartitions === numPartitions) + } + } +} From 20fb3e98ea08cd3cb777227d0b08e35ea3408de3 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 10 May 2016 16:19:55 -0500 Subject: [PATCH 02/35] TaskResultGetter now expects there to always be non-null accum updates --- core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala index 80f2bf41224b5..77fda6fcff959 100644 --- 
a/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResult.scala @@ -59,7 +59,7 @@ private[spark] class DirectTaskResult[T]( val numUpdates = in.readInt if (numUpdates == 0) { - accumUpdates = null + accumUpdates = Seq() } else { val _accumUpdates = new ArrayBuffer[AccumulatorV2[_, _]] for (i <- 0 until numUpdates) { From 0ca981547832780de4405278fce8314f8be73a84 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 13 May 2016 13:13:59 -0500 Subject: [PATCH 03/35] switch to making backend run in another thread --- .../apache/spark/scheduler/DAGScheduler.scala | 10 - .../spark/scheduler/TaskResultGetter.scala | 12 - .../org/apache/spark/util/EventLoop.scala | 7 - .../scheduler/SchedulerIntegrationSuite.scala | 463 +++++++++++------- 4 files changed, 289 insertions(+), 203 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 709514f4327c4..f9e9be40ab8e1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -183,14 +183,6 @@ class DAGScheduler( private val messageScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("dag-scheduler-message") - private val msgsScheduled = new AtomicInteger(0) - - /** - * Visible for testing, to know if the DAGScheduler is still "busy" - */ - private[scheduler] def msgSchedulerEmpty: Boolean = { - msgsScheduled.get() == 0 - } private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this) taskScheduler.setDAGScheduler(this) @@ -1294,11 +1286,9 @@ class DAGScheduler( // We might get lots of fetch failed for this stage, from lots of executors. // Its better if we can resubmit for all the failed executors at one time, so lets // just wait a *bit* before we resubmit. - msgsScheduled.incrementAndGet() messageScheduler.schedule(new Runnable { override def run(): Unit = { eventProcessLoop.post(ResubmitFailedStages) - msgsScheduled.decrementAndGet() } }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index bc7b32ad0b5c5..685ef55c66876 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -19,7 +19,6 @@ package org.apache.spark.scheduler import java.nio.ByteBuffer import java.util.concurrent.{ExecutorService, RejectedExecutionException} -import java.util.concurrent.atomic.AtomicInteger import scala.language.existentials import scala.util.control.NonFatal @@ -38,11 +37,6 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul private val THREADS = sparkEnv.conf.getInt("spark.resultGetter.threads", 4) - private val nTasks = new AtomicInteger(0) - def isEmpty: Boolean = { - nTasks.get() == 0 - } - // Exposed for testing. 
protected val getTaskResultExecutor: ExecutorService = ThreadUtils.newDaemonFixedThreadPool(THREADS, "task-result-getter") @@ -58,7 +52,6 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul taskSetManager: TaskSetManager, tid: Long, serializedData: ByteBuffer): Unit = { - nTasks.incrementAndGet() getTaskResultExecutor.execute(new Runnable { override def run(): Unit = Utils.logUncaughtExceptions { try { @@ -118,8 +111,6 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul case NonFatal(ex) => logError("Exception while getting task result", ex) taskSetManager.abort("Exception while getting task result: %s".format(ex)) - } finally { - nTasks.decrementAndGet() } } }) @@ -128,7 +119,6 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul def enqueueFailedTask(taskSetManager: TaskSetManager, tid: Long, taskState: TaskState, serializedData: ByteBuffer) { var reason : TaskEndReason = UnknownReason - nTasks.incrementAndGet() try { getTaskResultExecutor.execute(new Runnable { override def run(): Unit = Utils.logUncaughtExceptions { @@ -152,8 +142,6 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul } catch { case e: RejectedExecutionException if sparkEnv.isStopped => // ignore it - } finally { - nTasks.decrementAndGet() } } diff --git a/core/src/main/scala/org/apache/spark/util/EventLoop.scala b/core/src/main/scala/org/apache/spark/util/EventLoop.scala index d5ece57b88c9e..eefe934c63883 100644 --- a/core/src/main/scala/org/apache/spark/util/EventLoop.scala +++ b/core/src/main/scala/org/apache/spark/util/EventLoop.scala @@ -36,7 +36,6 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { private val eventQueue: BlockingQueue[E] = new LinkedBlockingDeque[E]() private val stopped = new AtomicBoolean(false) - private val nMsgs = new AtomicInteger(0) private val eventThread = new Thread(name) { setDaemon(true) @@ -47,7 +46,6 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { val event = eventQueue.take() try { onReceive(event) - nMsgs.decrementAndGet() } catch { case NonFatal(e) => try { @@ -101,7 +99,6 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { * Put the event into the event queue. The event thread will process it later. */ def post(event: E): Unit = { - nMsgs.incrementAndGet() eventQueue.put(event) } @@ -110,10 +107,6 @@ private[spark] abstract class EventLoop[E](name: String) extends Logging { */ def isActive: Boolean = eventThread.isAlive - def isEmpty: Boolean = { - nMsgs.get() == 0 - } - /** * Invoked when `start()` is called but before the event thread starts. 
*/ diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 2c55d50aa0cd0..40a1bb4ce7c3b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -16,9 +16,13 @@ */ package org.apache.spark.scheduler -import java.util.concurrent.TimeoutException +import java.util.Properties +import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} +import scala.concurrent.{Await, Future} +import scala.concurrent.duration.{Duration, SECONDS} +import scala.reflect.ClassTag import org.scalactic.TripleEquals import org.scalatest.Assertions.AssertionsHelper @@ -40,7 +44,8 @@ import org.apache.spark.util.CallSite * with spark's executors via a mocked backend (eg., task completion, task failure, executors * disconnecting, etc.). */ -class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext { +abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends SparkFunSuite + with BeforeAndAfter with LocalSparkContext { val conf = new SparkConf /** Set of TaskSets the DAGScheduler has requested executed. */ @@ -48,19 +53,21 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L var taskScheduler: TaskSchedulerImpl = null var scheduler: DAGScheduler = null - var backend: SingleCoreMockBackend = null + var backend: T = _ before { runningTaskSets.clear() results.clear() - sc = new SparkContext("mock[org.apache.spark.scheduler.SingleCoreMockBackend]", - "SchedulerIntegrationSuite") - backend = sc.schedulerBackend.asInstanceOf[SingleCoreMockBackend] + failure = null + val backendClassName = implicitly[ClassTag[T]].runtimeClass.getName() + sc = new SparkContext(s"mock[${backendClassName}]", this.getClass().getSimpleName()) + backend = sc.schedulerBackend.asInstanceOf[T] taskScheduler = new TaskSchedulerImpl(sc) { override def submitTasks(taskSet: TaskSet): Unit = { runningTaskSets += taskSet super.submitTasks(taskSet) } + override def taskSetFinished(manager: TaskSetManager): Unit = { runningTaskSets -= manager.taskSet super.taskSetFinished(manager) @@ -78,20 +85,8 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L scheduler.stop() } - /** - * Process the supplied event as if it were the top of the DAGScheduler event queue, expecting - * the scheduler not to exit. - */ - private def runEvent(event: DAGSchedulerEvent) { - scheduler.eventProcessLoop.post(event) - } - val results = new HashMap[Int, Any]() - var failure: Exception = _ - val jobListener = new JobListener() { - override def taskSucceeded(index: Int, result: Any) = results.put(index, result) - override def jobFailed(exception: Exception) = { failure = exception } - } + var failure: Throwable = _ /** * When we submit dummy Jobs, this is the compute function we supply. @@ -101,67 +96,33 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L throw new RuntimeException("jobComputeFunc shouldn't get called in this mock") } - /** Sends the rdd to the scheduler for scheduling and returns the job id. */ - private def submit( + /** Submits a job to the scheduler, and returns a future which does a bit of error handling. 
*/ + protected def submit( rdd: RDD[_], partitions: Array[Int], - func: (TaskContext, Iterator[_]) => _ = jobComputeFunc, - listener: JobListener = jobListener): Int = { - val jobId = scheduler.nextJobId.getAndIncrement() - runEvent(JobSubmitted(jobId, rdd, func, partitions, CallSite("", ""), listener)) - jobId - } - - /** - * Return true if the backend has more work to do, false otherwise. It will block until it has - * a definitive answer either way -- eg., if the backend does not appear to have any work, but - * the dag scheduler has some events left to process, this will wait until the dag scheduler is - * done processing enough events to say for sure. - */ - private def backendHasWorkToDo: Boolean = { - // the ordering is somewhat important here -- avoid waiting if we can (both to speed up test, - // and also to test with more concurrency inside scheduler) - if (backend.runningTasks.nonEmpty) { - true - } else if (runningTaskSets.isEmpty && scheduler.msgSchedulerEmpty && - scheduler.eventProcessLoop.isEmpty && taskScheduler.taskResultGetter.isEmpty ) { - false - } else if (runningTaskSets.nonEmpty) { - // need to get all task results, as they might lead to finishing a taskSet - waitUntil(() => taskScheduler.taskResultGetter.isEmpty) - backendHasWorkToDo - } else { - waitUntil(() => taskScheduler.taskResultGetter.isEmpty) - waitUntil(() => scheduler.eventProcessLoop.isEmpty) - backendHasWorkToDo + func: (TaskContext, Iterator[_]) => _ = jobComputeFunc): Future[Any] = { + val waiter: JobWaiter[Any] = scheduler.submitJob(rdd, func, partitions.toSeq, CallSite("", ""), + (index, res) => results(index) = res, new Properties()) + import scala.concurrent.ExecutionContext.Implicits.global + waiter.completionFuture.recover { case ex => + failure = ex } } - private def waitUntil(condition: () => Boolean): Unit = { - val timeoutMillis = 1000L - val finishTime = System.currentTimeMillis + timeoutMillis - while (!condition()) { - if (System.currentTimeMillis > finishTime) { - throw new TimeoutException( - s"Not ready after $timeoutMillis milliseconds") - } - /* Sleep rather than using wait/notify, because this is used only for testing and - * wait/notify add overhead in the general case. 
*/ - Thread.sleep(10) + protected def assertDataStructuresEmpty(noFailure: Boolean = true): Unit = { + if (noFailure) { + assert(failure === null) } - } - - private def assertDataStructuresEmpty(): Unit = { - assert(!backendHasWorkToDo) + assert(scheduler.activeJobs.isEmpty) assert(runningTaskSets.isEmpty) - assert(backend.runningTasks.isEmpty) + assert(!backend.hasTasks) } /** * Looks at all shuffleMapOutputs that are dependencies of the given RDD, and makes sure * they are all registered */ - private def assertMapOutputAvailable(targetRdd: MockRDD): Unit = { + def assertMapOutputAvailable(targetRdd: MockRDD): Unit = { val shuffleIds = targetRdd.shuffleDeps.map{_.shuffleId} val nParts = targetRdd.numPartitions for { @@ -176,11 +137,10 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L } } - /** models a stage boundary with a single dependency, like a shuffle */ def shuffle(nParts: Int, input: MockRDD): MockRDD = { val partitioner = new HashPartitioner(nParts) - val shuffleDep = new ShuffleDependency(input, partitioner) + val shuffleDep = new ShuffleDependency[Int, Int, Nothing](input, partitioner) new MockRDD(sc, nParts, List(shuffleDep)) } @@ -188,20 +148,245 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L def join(nParts: Int, inputs: MockRDD*): MockRDD = { val partitioner = new HashPartitioner(nParts) val shuffleDeps = inputs.map { inputRDD => - new ShuffleDependency(inputRDD, partitioner) + new ShuffleDependency[Int, Int, Nothing](inputRDD, partitioner) } new MockRDD(sc, nParts, shuffleDeps) } + /** + * Helper which makes it a little easier to setup a test, which starts a mock backend in another + * thread, responding to tasks with your custom function. You also supply the "body" of your + * test, where you submit jobs to your backend, wait for them to complete, then check + * whatever conditions you want. 
+ */ + def withBackend(backendFunc: () => Unit)(testBody: => Unit): Unit = { + val backendContinue = new AtomicBoolean(true) + val backendThread = new Thread("mock backend thread") { + override def run(): Unit = { + while (backendContinue.get()) { + if (backend.hasTasksWaitingToRun) { + backendFunc() + } else { + Thread.sleep(10) + } + } + } + } + try { + backendThread.start() + testBody + } finally { + backendContinue.set(false) + backendThread.join() + } + } + +} + +private[spark] abstract class MockBackend( + conf: SparkConf, + var taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { + + private val assignedTasksWaitingToRun = ArrayBuffer[TaskDescription]() + private val runningTasks = ArrayBuffer[TaskDescription]() + + def assignTasks(tasks: Seq[TaskDescription]): Unit = assignedTasksWaitingToRun.synchronized { + assignedTasksWaitingToRun ++= tasks + } + + def endTask(task: TaskDescription): Unit = runningTasks.synchronized { + runningTasks -= task + } + + def beginTask(): TaskDescription = { + val toRun = assignedTasksWaitingToRun.synchronized { + assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) + } + runningTasks.synchronized { runningTasks += toRun } + toRun + } + + def hasTasks: Boolean = { + assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty + } + + def hasTasksWaitingToRun: Boolean = { + assignedTasksWaitingToRun.nonEmpty + } + + override def start(): Unit = {} + + override def stop(): Unit = {} + + var freeCores: Int = _ + val env = SparkEnv.get + + def executorIdToExecutor: Map[String, ExecutorTaskStatus] + + def generateOffers(): Seq[WorkerOffer] + + /** + * This is called by the scheduler whenever it has tasks it would like to schedule + */ + override def reviveOffers(): Unit = { + val offers: Seq[WorkerOffer] = generateOffers() + val newTasks = taskScheduler.resourceOffers(offers).flatten + synchronized { + newTasks.foreach { task => + executorIdToExecutor(task.executorId).freeCores -= taskScheduler.CPUS_PER_TASK + } + freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK + assignedTasksWaitingToRun ++= newTasks + } + } + + /** + * Tell the scheduler the task completed successfully, with the given result. Also + * updates some internal state for this mock. + */ + def taskSuccess(task: TaskDescription, result: Any): Unit = { + endTask(task) + val ser = env.serializer.newInstance() + val resultBytes = ser.serialize(result) + val metrics = new TaskMetrics + val directResult = new DirectTaskResult(resultBytes, Seq()) // no accumulator updates + val serializedDirectResult = ser.serialize(directResult) + taskScheduler.statusUpdate(task.taskId, TaskState.FINISHED, serializedDirectResult) + synchronized { + executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK + freeCores += taskScheduler.CPUS_PER_TASK + assignedTasksWaitingToRun -= task + } + reviveOffers() + } + + /** + * Tell the scheduler the task failed, with the given state and result (probably ExceptionFailure + * or FetchFailed). Also updates some internal state for this mock. 
+ */ + def failTask(task: TaskDescription, state: TaskState, result: Any): Unit = { + endTask(task) + val ser = env.serializer.newInstance() + val resultBytes = ser.serialize(result) + taskScheduler.statusUpdate(task.taskId, state, resultBytes) + if (TaskState.isFinished(state)) { + synchronized { + executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK + freeCores += taskScheduler.CPUS_PER_TASK + assignedTasksWaitingToRun -= task + } + reviveOffers() + } + } +} + +/** + * A very simple mock backend that can just run one task at a time. + */ +private[spark] class SingleCoreMockBackend( + conf: SparkConf, + taskScheduler: TaskSchedulerImpl) extends MockBackend(conf, taskScheduler) { + + val cores = 1 + + override def defaultParallelism(): Int = conf.getInt("spark.default.parallelism", cores) + + freeCores = cores + val localExecutorId = SparkContext.DRIVER_IDENTIFIER + val localExecutorHostname = "localhost" + + val executorIdToExecutor: Map[String, ExecutorTaskStatus] = Map( + localExecutorId -> new ExecutorTaskStatus(localExecutorHostname, localExecutorId, freeCores) + ) + + override def generateOffers(): Seq[WorkerOffer] = { + Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores)) + } +} + +class MultiExecutorBackend( + conf: SparkConf, + taskScheduler: TaskSchedulerImpl) extends MockBackend(conf, taskScheduler) { + + val nHosts = 10 + val nExecutorsPerHost = 4 + val nCoresPerExecutor = 2 + + val executorIdToExecutor: Map[String, ExecutorTaskStatus] = (0 until nHosts).flatMap{ hostIdx => + val hostName = s"host-$hostIdx" + (0 until nExecutorsPerHost).map { execIdx => + val executorId = (hostIdx * nExecutorsPerHost + execIdx).toString + executorId -> new ExecutorTaskStatus(hostName, executorId, nCoresPerExecutor) + } + }.toMap + + val totalCores = nHosts * nExecutorsPerHost * nCoresPerExecutor + freeCores = totalCores + + override def generateOffers(): Seq[WorkerOffer] = { + // always offer all cores available on all executors + executorIdToExecutor.values.filter { exec => + exec.freeCores > taskScheduler.CPUS_PER_TASK + }.map { exec => + new WorkerOffer(exec.executorId, exec.host, exec.freeCores) + }.toSeq + } + + override def defaultParallelism(): Int = conf.getInt("spark.default.parallelism", totalCores) +} + +class ExecutorTaskStatus(val host: String, val executorId: String, var freeCores: Int) + +class MockRDD( + sc: SparkContext, + val numPartitions: Int, + val shuffleDeps: Seq[ShuffleDependency[Int, Int, Nothing]] +) extends RDD[(Int, Int)](sc, shuffleDeps) with Serializable { + + MockRDD.validate(numPartitions, shuffleDeps) + + override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = + throw new RuntimeException("should not be reached") + override def getPartitions: Array[Partition] = (0 until numPartitions).map(i => new Partition { + override def index: Int = i + }).toArray + override def getPreferredLocations(split: Partition): Seq[String] = Nil + override def toString: String = "MockRDD " + id +} + +object MockRDD extends AssertionsHelper with TripleEquals { + /** + * make sure all the shuffle dependencies have a consistent number of output partitions + * (mostly to make sure the test setup makes sense, not that Spark itself would get this wrong) + */ + def validate(numPartitions: Int, dependencies: Seq[ShuffleDependency[_, _, _]]): Unit = { + dependencies.foreach { dependency => + val partitioner = dependency.partitioner + assert(partitioner != null) + assert(partitioner.numPartitions === numPartitions) + } + 
} +} + +/** + * Some very basic tests just to demonstrate the use of the test framework (and verify that it + * works). + */ +class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCoreMockBackend] { + /** * Very simple one stage job. Backend successfully completes each task, one by one */ test("super simple job") { - submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) - while (backendHasWorkToDo) { - val task = backend.runningTasks.last + def runBackend(): Unit = { + val task = backend.beginTask() backend.taskSuccess(task, 42) } + withBackend(runBackend _) { + val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + } assert(results === (0 until 10).map { _ -> 42 }.toMap) assertDataStructuresEmpty() } @@ -215,11 +400,6 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L * Backend successfully completes each task */ test("multi-stage job") { - val a = new MockRDD(sc, 2, Nil) - val b = shuffle(10, a) - val c = shuffle(20, a) - val d = join(30, b, c) - submit(d, (0 until 30).toArray) def stageToOutputParts(stageId: Int): Int = { stageId match { @@ -229,9 +409,13 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L } } - while (backendHasWorkToDo) { - assert(backend.runningTasks.nonEmpty) - val taskDescription = backend.runningTasks.last + val a = new MockRDD(sc, 2, Nil) + val b = shuffle(10, a) + val c = shuffle(20, a) + val d = join(30, b, c) + + def runBackend(): Unit = { + val taskDescription = backend.beginTask() val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet val task = taskSet.tasks(taskDescription.index) @@ -250,6 +434,11 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L backend.taskSuccess(taskDescription, 4321 + partition) } } + withBackend(runBackend _) { + val jobFuture = submit(d, (0 until 30).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + } assert(results === (0 until 30).map { idx => idx -> (4321 + idx) }.toMap) assertDataStructuresEmpty() } @@ -263,12 +452,11 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L val input = new MockRDD(sc, 2, Nil) val shuffledRdd = shuffle(10, input) val shuffleId = shuffledRdd.shuffleDeps.head.shuffleId - submit(shuffledRdd, (0 until 10).toArray) val stageToAttempts = new HashMap[Int, HashSet[Int]]() - while (backendHasWorkToDo) { - val taskDescription = backend.runningTasks.last + def runBackend(): Unit = { + val taskDescription = backend.beginTask() val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet val task = taskSet.tasks(taskDescription.index) stageToAttempts.getOrElseUpdate(task.stageId, new HashSet()) += task.stageAttemptId @@ -289,102 +477,29 @@ class SchedulerIntegrationSuite extends SparkFunSuite with BeforeAndAfter with L backend.taskSuccess(taskDescription, 42 + partition) } } + withBackend(runBackend _) { + val jobFuture = submit(shuffledRdd, (0 until 10).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + } assert(results === (0 until 10).map { idx => idx -> (42 + idx) }.toMap) assert(stageToAttempts === Map(0 -> Set(0, 1), 1 -> Set(0, 1))) assertDataStructuresEmpty() } -} - -/** - * A very simple mock backend that can just run one task at a time. 
- */ -private[spark] class SingleCoreMockBackend( - conf: SparkConf, - var taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { - - val cores = 1 - - override def start(): Unit = {} - - override def stop(): Unit = {} - - override def defaultParallelism(): Int = conf.getInt("spark.default.parallelism", cores) - - var freeCores = cores - val localExecutorId = SparkContext.DRIVER_IDENTIFIER - val localExecutorHostname = "localhost" - val env = SparkEnv.get - - val runningTasks = ArrayBuffer[TaskDescription]() - - /** - * This is called by the scheduler whenever it has tasks it would like to schedule - */ - override def reviveOffers(): Unit = { - val offers = Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores)) - val newTasks = taskScheduler.resourceOffers(offers).flatten - synchronized { - freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK - runningTasks ++= newTasks - } - } - - def taskSuccess(task: TaskDescription, result: Any): Unit = { - val ser = env.serializer.newInstance() - val resultBytes = ser.serialize(result) - val metrics = new TaskMetrics - val directResult = new DirectTaskResult(resultBytes, Seq()) // no accumulator updates - val serializedDirectResult = ser.serialize(directResult) - taskScheduler.statusUpdate(task.taskId, TaskState.FINISHED, serializedDirectResult) - synchronized { - freeCores += taskScheduler.CPUS_PER_TASK - runningTasks -= task - } - reviveOffers() - } - def failTask(task: TaskDescription, state: TaskState, result: Any): Unit = { - val ser = env.serializer.newInstance() - val resultBytes = ser.serialize(result) - taskScheduler.statusUpdate(task.taskId, state, resultBytes) - if (TaskState.isFinished(state)) { - synchronized { - freeCores += taskScheduler.CPUS_PER_TASK - runningTasks -= task - } - reviveOffers() + test("job failure after 4 attempts") { + def runBackend(): Unit = { + val task = backend.beginTask() + val failure = new ExceptionFailure(new RuntimeException("test task failure"), Seq()) + backend.failTask(task, TaskState.FAILED, failure) } - } - -} - -class MockRDD( - sc: SparkContext, - val numPartitions: Int, - val shuffleDeps: Seq[ShuffleDependency[Int, Int, Nothing]] -) extends RDD[(Int, Int)](sc, shuffleDeps) with Serializable { - - MockRDD.validate(numPartitions, shuffleDeps) - - override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = - throw new RuntimeException("should not be reached") - override def getPartitions: Array[Partition] = (0 until numPartitions).map(i => new Partition { - override def index: Int = i - }).toArray - override def getPreferredLocations(split: Partition): Seq[String] = Nil - override def toString: String = "MockRDD " + id -} - -object MockRDD extends AssertionsHelper with TripleEquals { - /** - * make sure all the shuffle dependencies have a consistent number of output partitions - * (mostly to make sure the test setup makes sense, not that Spark itself would get this wrong) - */ - def validate(numPartitions: Int, dependencies: Seq[ShuffleDependency[_, _, _]]): Unit = { - dependencies.foreach { dependency => - val partitioner = dependency.partitioner - assert(partitioner != null) - assert(partitioner.numPartitions === numPartitions) + withBackend(runBackend _) { + val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + failure.getMessage.contains("test task failure") } + assert(results.isEmpty) + assertDataStructuresEmpty(noFailure = false) } } From 
421c2a18c1a5799c75884f51c73205a7b92a6166 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 13 May 2016 17:13:07 -0500 Subject: [PATCH 04/35] remove MultiExecutorBackend for now --- .../scheduler/SchedulerIntegrationSuite.scala | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 40a1bb4ce7c3b..9b4bd2809dd24 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -304,37 +304,6 @@ private[spark] class SingleCoreMockBackend( } } -class MultiExecutorBackend( - conf: SparkConf, - taskScheduler: TaskSchedulerImpl) extends MockBackend(conf, taskScheduler) { - - val nHosts = 10 - val nExecutorsPerHost = 4 - val nCoresPerExecutor = 2 - - val executorIdToExecutor: Map[String, ExecutorTaskStatus] = (0 until nHosts).flatMap{ hostIdx => - val hostName = s"host-$hostIdx" - (0 until nExecutorsPerHost).map { execIdx => - val executorId = (hostIdx * nExecutorsPerHost + execIdx).toString - executorId -> new ExecutorTaskStatus(hostName, executorId, nCoresPerExecutor) - } - }.toMap - - val totalCores = nHosts * nExecutorsPerHost * nCoresPerExecutor - freeCores = totalCores - - override def generateOffers(): Seq[WorkerOffer] = { - // always offer all cores available on all executors - executorIdToExecutor.values.filter { exec => - exec.freeCores > taskScheduler.CPUS_PER_TASK - }.map { exec => - new WorkerOffer(exec.executorId, exec.host, exec.freeCores) - }.toSeq - } - - override def defaultParallelism(): Int = conf.getInt("spark.default.parallelism", totalCores) -} - class ExecutorTaskStatus(val host: String, val executorId: String, var freeCores: Int) class MockRDD( From c0911874783bdfadd7749a66a935cba2669f4ffa Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 17 May 2016 10:58:50 -0500 Subject: [PATCH 05/35] remove uncertain comment about messageScheduler --- .../scala/org/apache/spark/scheduler/DAGScheduler.scala | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index f9e9be40ab8e1..0c67becbc1b75 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1283,13 +1283,8 @@ class DAGScheduler( // TODO: Cancel running tasks in the stage logInfo(s"Resubmitting $mapStage (${mapStage.name}) and " + s"$failedStage (${failedStage.name}) due to fetch failure") - // We might get lots of fetch failed for this stage, from lots of executors. - // Its better if we can resubmit for all the failed executors at one time, so lets - // just wait a *bit* before we resubmit. 
messageScheduler.schedule(new Runnable { - override def run(): Unit = { - eventProcessLoop.post(ResubmitFailedStages) - } + override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages) }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS) } failedStages += failedStage From 3b67b2a950d75bc7f532c7a1151aaee864dc541f Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 17 May 2016 13:19:17 -0500 Subject: [PATCH 06/35] cleanup --- .../org/apache/spark/util/EventLoop.scala | 2 +- .../spark/scheduler/DAGSchedulerSuite.scala | 1 - .../scheduler/SchedulerIntegrationSuite.scala | 120 +++++++++--------- 3 files changed, 64 insertions(+), 59 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/EventLoop.scala b/core/src/main/scala/org/apache/spark/util/EventLoop.scala index eefe934c63883..3ea9139e11027 100644 --- a/core/src/main/scala/org/apache/spark/util/EventLoop.scala +++ b/core/src/main/scala/org/apache/spark/util/EventLoop.scala @@ -18,7 +18,7 @@ package org.apache.spark.util import java.util.concurrent.{BlockingQueue, LinkedBlockingDeque} -import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger} +import java.util.concurrent.atomic.AtomicBoolean import scala.util.control.NonFatal diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 9c004e0dd76dc..60051ef1f0d08 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -2070,7 +2070,6 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou } } - object DAGSchedulerSuite { def makeMapStatus(host: String, reduces: Int, sizes: Byte = 2): MapStatus = MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes)) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 9b4bd2809dd24..fb4c0578a4cfd 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -33,7 +33,6 @@ import org.apache.spark.TaskState._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.scheduler.DAGSchedulerSuite._ import org.apache.spark.util.CallSite /** @@ -157,7 +156,9 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa * Helper which makes it a little easier to setup a test, which starts a mock backend in another * thread, responding to tasks with your custom function. You also supply the "body" of your * test, where you submit jobs to your backend, wait for them to complete, then check - * whatever conditions you want. + * whatever conditions you want. Note that this is *not* safe to all bad backends -- + * in particular, your `backendFunc` has to return quickly, it can't throw errors, (instead + * it should send back the right TaskEndReason */ def withBackend(backendFunc: () => Unit)(testBody: => Unit): Unit = { val backendContinue = new AtomicBoolean(true) @@ -183,61 +184,23 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa } +/** + * Helper for running a backend in integration tests, does a bunch of the book-keeping + * so individual tests can focus on just responding to tasks. 
Individual tests will use + * [[beginTask]], [[taskSuccess]], and [[taskFailed]]. + */ private[spark] abstract class MockBackend( conf: SparkConf, var taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { - private val assignedTasksWaitingToRun = ArrayBuffer[TaskDescription]() - private val runningTasks = ArrayBuffer[TaskDescription]() - - def assignTasks(tasks: Seq[TaskDescription]): Unit = assignedTasksWaitingToRun.synchronized { - assignedTasksWaitingToRun ++= tasks - } - - def endTask(task: TaskDescription): Unit = runningTasks.synchronized { - runningTasks -= task - } - - def beginTask(): TaskDescription = { - val toRun = assignedTasksWaitingToRun.synchronized { - assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) - } - runningTasks.synchronized { runningTasks += toRun } - toRun - } - - def hasTasks: Boolean = { - assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty - } - - def hasTasksWaitingToRun: Boolean = { - assignedTasksWaitingToRun.nonEmpty - } - - override def start(): Unit = {} - - override def stop(): Unit = {} - - var freeCores: Int = _ - val env = SparkEnv.get - - def executorIdToExecutor: Map[String, ExecutorTaskStatus] - - def generateOffers(): Seq[WorkerOffer] - /** - * This is called by the scheduler whenever it has tasks it would like to schedule + * Test backends should call this to get a task that has been assigned to them by the scheduler. + * Each task should be responded to with either [[taskSuccess]] or [[taskFailed]]. */ - override def reviveOffers(): Unit = { - val offers: Seq[WorkerOffer] = generateOffers() - val newTasks = taskScheduler.resourceOffers(offers).flatten - synchronized { - newTasks.foreach { task => - executorIdToExecutor(task.executorId).freeCores -= taskScheduler.CPUS_PER_TASK - } - freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun ++= newTasks - } + def beginTask(): TaskDescription = synchronized { + val toRun = assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) + runningTasks += toRun + toRun } /** @@ -264,7 +227,7 @@ private[spark] abstract class MockBackend( * Tell the scheduler the task failed, with the given state and result (probably ExceptionFailure * or FetchFailed). Also updates some internal state for this mock. 
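 * As an illustrative sketch only (the shuffleId and "hostA" values are whatever the test
 * itself set up), a fetch failure can be simulated with:
 * {{{
 *   val fetchFailed = FetchFailed(
 *     DAGSchedulerSuite.makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored")
 *   backend.taskFailed(taskDescription, TaskState.FAILED, fetchFailed)
 * }}}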
*/ - def failTask(task: TaskDescription, state: TaskState, result: Any): Unit = { + def taskFailed(task: TaskDescription, state: TaskState, result: Any): Unit = { endTask(task) val ser = env.serializer.newInstance() val resultBytes = ser.serialize(result) @@ -278,6 +241,47 @@ private[spark] abstract class MockBackend( reviveOffers() } } + + private val assignedTasksWaitingToRun = ArrayBuffer[TaskDescription]() + private val runningTasks = ArrayBuffer[TaskDescription]() + + def endTask(task: TaskDescription): Unit = synchronized { + runningTasks -= task + } + + def hasTasks: Boolean = synchronized { + assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty + } + + def hasTasksWaitingToRun: Boolean = synchronized { + assignedTasksWaitingToRun.nonEmpty + } + + override def start(): Unit = {} + + override def stop(): Unit = {} + + var freeCores: Int = _ + val env = SparkEnv.get + + def executorIdToExecutor: Map[String, ExecutorTaskStatus] + + def generateOffers(): Seq[WorkerOffer] + + /** + * This is called by the scheduler whenever it has tasks it would like to schedule + */ + override def reviveOffers(): Unit = { + val offers: Seq[WorkerOffer] = generateOffers() + val newTasks = taskScheduler.resourceOffers(offers).flatten + synchronized { + newTasks.foreach { task => + executorIdToExecutor(task.executorId).freeCores -= taskScheduler.CPUS_PER_TASK + } + freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK + assignedTasksWaitingToRun ++= newTasks + } + } } /** @@ -398,7 +402,8 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor (task.stageId, task.stageAttemptId, task.partitionId) match { case (stage, 0, _) if stage < 4 => - backend.taskSuccess(taskDescription, makeMapStatus("hostA", stageToOutputParts(stage))) + backend.taskSuccess(taskDescription, + DAGSchedulerSuite.makeMapStatus("hostA", stageToOutputParts(stage))) case (4, 0, partition) => backend.taskSuccess(taskDescription, 4321 + partition) } @@ -438,10 +443,11 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor (task.stageId, task.stageAttemptId, task.partitionId) match { case (0, _, _) => - backend.taskSuccess(taskDescription, makeMapStatus("hostA", 10)) + backend.taskSuccess(taskDescription, DAGSchedulerSuite.makeMapStatus("hostA", 10)) case (1, 0, 0) => - val fetchFailed = FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored") - backend.failTask(taskDescription, TaskState.FAILED, fetchFailed) + val fetchFailed = FetchFailed( + DAGSchedulerSuite.makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored") + backend.taskFailed(taskDescription, TaskState.FAILED, fetchFailed) case (1, _, partition) => backend.taskSuccess(taskDescription, 42 + partition) } @@ -460,7 +466,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor def runBackend(): Unit = { val task = backend.beginTask() val failure = new ExceptionFailure(new RuntimeException("test task failure"), Seq()) - backend.failTask(task, TaskState.FAILED, failure) + backend.taskFailed(task, TaskState.FAILED, failure) } withBackend(runBackend _) { val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) From 79bc38416a914cebb33beaef3ec3179528848bca Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 09:10:46 -0500 Subject: [PATCH 07/35] add BlacklistIntegrationSuite and corresponding refactoring --- .../scheduler/BlacklistIntegrationSuite.scala | 145 ++++++++++++++++++ .../scheduler/SchedulerIntegrationSuite.scala | 67 ++++++-- 2 files changed, 
196 insertions(+), 16 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala new file mode 100644 index 0000000000000..3225866e317dd --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler + +import scala.concurrent.Await +import scala.concurrent.duration._ + +import org.apache.spark._ + +class BlacklistIntegrationSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend]{ + + val badHost = "host-0" + + /** + * This backend just always fails if the task is executed on a bad host, but otherwise succeeds + * all tasks. + */ + def badHostBackend(): Unit = { + val task = backend.beginTask() + val host = backend.executorIdToExecutor(task.executorId).host + if (host == badHost) { + val failure = new ExceptionFailure(new RuntimeException("I'm a bad host!"), Seq()) + backend.taskFailed(task, TaskState.FAILED, failure) + } else { + backend.taskSuccess(task, 42) + } + } + + // Test demonstrating the issue -- without a config change, the scheduler keeps scheduling + // according to locality preferences, and so the job fails + testScheduler("If preferred node is bad, without blacklist job will fail") { + val rdd = new MockRDDWithLocalityPrefs(sc, 10, Nil, badHost) + withBackend(badHostBackend _) { + val jobFuture = submit(rdd, (0 until 10).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + } + assert(results.isEmpty) + assertDataStructuresEmpty(noFailure = false) + } + + // even with the blacklist turned on, if maxTaskFailures is not more than the number + // of executors on the bad node, then locality preferences will lead to us cycling through + // the executors on the bad node, and still failing the job + testScheduler( + "With blacklist on, job will still fail if there are too many bad executors on bad host", + extraConfs = Seq( + // just set this to something much longer than the test duration + ("spark.scheduler.executorTaskBlacklistTime", "10000000") + ) + ) { + val rdd = new MockRDDWithLocalityPrefs(sc, 10, Nil, badHost) + withBackend(badHostBackend _) { + val jobFuture = submit(rdd, (0 until 10).toArray) + val duration = Duration(3, SECONDS) + Await.ready(jobFuture, duration) + } + assert(results.isEmpty) + assertDataStructuresEmpty(noFailure = false) + } + + // Here we run with the blacklist on, and maxTaskFailures high enough that we'll eventually + // schedule on a good node and succeed the job + testScheduler( + "Bad node with 
multiple executors, job will still succeed with the right confs", + extraConfs = Seq( + // just set this to something much longer than the test duration + ("spark.scheduler.executorTaskBlacklistTime", "10000000"), + // this has to be higher than the number of executors on the bad host + ("spark.task.maxFailures", "5"), + // just to avoid this test taking too long + ("spark.locality.wait", "10ms") + ) + ) { + val rdd = new MockRDDWithLocalityPrefs(sc, 10, Nil, badHost) + withBackend(badHostBackend _) { + val jobFuture = submit(rdd, (0 until 10).toArray) + val duration = Duration(1, SECONDS) + Await.ready(jobFuture, duration) + } + assert(results === (0 until 10).map { _ -> 42 }.toMap) + assertDataStructuresEmpty(noFailure = true) + } + +} + +class MultiExecutorMockBackend( + conf: SparkConf, + taskScheduler: TaskSchedulerImpl) extends MockBackend(conf, taskScheduler) { + + val nHosts = conf.getInt("spark.testing.nHosts", 5) + val nExecutorsPerHost = conf.getInt("spark.testing.nExecutorsPerHost", 4) + val nCoresPerExecutor = conf.getInt("spark.testing.nCoresPerExecutor", 2) + + override val executorIdToExecutor: Map[String, ExecutorTaskStatus] = { + (0 until nHosts).flatMap { hostIdx => + val hostName = "host-" + hostIdx + (0 until nExecutorsPerHost).map { subIdx => + val executorId = (hostIdx * nExecutorsPerHost + subIdx).toString + executorId -> + ExecutorTaskStatus(host = hostName, executorId = executorId, nCoresPerExecutor) + } + }.toMap + } + + override def generateOffers(): Seq[WorkerOffer] = { + executorIdToExecutor.values.map { exec => + WorkerOffer(executorId = exec.executorId, host = exec.host, + cores = exec.freeCores) + }.toSeq + } + + override def defaultParallelism(): Int = nHosts * nExecutorsPerHost * nCoresPerExecutor + + override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { + // Its OK for this to be a no-op, because even if a backend does implement killTask, + // it really can only be "best-effort" in any case, and the scheduler should be robust to that. + // And in fact its reasonably simulating a case where a real backend finishes tasks in between + // the time when the scheduler sends the msg to kill tasks, and the backend receives the msg. + } +} + +class MockRDDWithLocalityPrefs( + sc: SparkContext, + numPartitions: Int, + shuffleDeps: Seq[ShuffleDependency[Int, Int, Nothing]], + val preferredLoc: String) extends MockRDD(sc, numPartitions, shuffleDeps) { + override def getPreferredLocations(split: Partition): Seq[String] = { + Seq(preferredLoc) + } +} diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index fb4c0578a4cfd..63820979b309c 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.TaskState._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.util.CallSite +import org.apache.spark.util.{CallSite, Utils} /** * Tests for the entire scheduler code -- DAGScheduler, TaskSchedulerImpl, TaskSets, @@ -44,7 +44,7 @@ import org.apache.spark.util.CallSite * disconnecting, etc.). 
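 * A minimal test built on this harness looks roughly like the following sketch (the RDD
 * size and the result value 42 are arbitrary):
 * {{{
 *   def runBackend(): Unit = {
 *     val task = backend.beginTask()
 *     backend.taskSuccess(task, 42)
 *   }
 *   withBackend(runBackend _) {
 *     val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray)
 *     Await.ready(jobFuture, Duration(1, SECONDS))
 *   }
 *   assertDataStructuresEmpty()
 * }}}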
*/ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends SparkFunSuite - with BeforeAndAfter with LocalSparkContext { + with LocalSparkContext { val conf = new SparkConf /** Set of TaskSets the DAGScheduler has requested executed. */ @@ -54,12 +54,25 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa var scheduler: DAGScheduler = null var backend: T = _ - before { + override def beforeEach(): Unit = { runningTaskSets.clear() results.clear() failure = null + super.beforeEach() + } + + override def afterEach(): Unit = { + super.afterEach() + taskScheduler.stop() + backend.stop() + scheduler.stop() + } + + def setupScheduler(conf: SparkConf): Unit = { + conf.setAppName(this.getClass().getSimpleName()) val backendClassName = implicitly[ClassTag[T]].runtimeClass.getName() - sc = new SparkContext(s"mock[${backendClassName}]", this.getClass().getSimpleName()) + conf.setMaster(s"mock[${backendClassName}]") + sc = new SparkContext(conf) backend = sc.schedulerBackend.asInstanceOf[T] taskScheduler = new TaskSchedulerImpl(sc) { override def submitTasks(taskSet: TaskSet): Unit = { @@ -78,10 +91,17 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa taskScheduler.setDAGScheduler(scheduler) } - after { - taskScheduler.stop() - backend.stop() - scheduler.stop() + def testScheduler(name: String)(testBody: => Unit): Unit = { + testScheduler(name, Seq())(testBody) + } + + def testScheduler(name: String, extraConfs: Seq[(String, String)])(testBody: => Unit): Unit = { + test(name) { + val conf = new SparkConf() + extraConfs.foreach{ case (k, v) => conf.set(k, v)} + setupScheduler(conf) + testBody + } } val results = new HashMap[Int, Any]() @@ -110,11 +130,26 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa protected def assertDataStructuresEmpty(noFailure: Boolean = true): Unit = { if (noFailure) { - assert(failure === null) + // When a job fails, we terminate before waiting for all the task end events to come in, + // so there might still be a running task set + assert(runningTaskSets.isEmpty) + assert(!backend.hasTasks) + if (failure != null) { + // if there is a job failure, it can be a bit hard to tease the job failure msg apart + // from the test failure msg, so we do a little extra formatting + val msg = + raw""" + | There was a failed job. + | ----- Begin Job Failure Msg ----- + | ${Utils.exceptionString(failure)} + + | ----- End Job Failure Msg ---- + """. + stripMargin + fail(msg) + } } assert(scheduler.activeJobs.isEmpty) - assert(runningTaskSets.isEmpty) - assert(!backend.hasTasks) } /** @@ -308,7 +343,7 @@ private[spark] class SingleCoreMockBackend( } } -class ExecutorTaskStatus(val host: String, val executorId: String, var freeCores: Int) +case class ExecutorTaskStatus(host: String, executorId: String, var freeCores: Int) class MockRDD( sc: SparkContext, @@ -350,7 +385,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor /** * Very simple one stage job. 
Backend successfully completes each task, one by one */ - test("super simple job") { + testScheduler("super simple job") { def runBackend(): Unit = { val task = backend.beginTask() backend.taskSuccess(task, 42) @@ -372,7 +407,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor * * Backend successfully completes each task */ - test("multi-stage job") { + testScheduler("multi-stage job") { def stageToOutputParts(stageId: Int): Int = { stageId match { @@ -422,7 +457,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor * (a) map output is available whenever we run stage 1 * (b) we get a second attempt for stage 0 & stage 1 */ - test("job with fetch failure") { + testScheduler("job with fetch failure") { val input = new MockRDD(sc, 2, Nil) val shuffledRdd = shuffle(10, input) val shuffleId = shuffledRdd.shuffleDeps.head.shuffleId @@ -462,7 +497,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor assertDataStructuresEmpty() } - test("job failure after 4 attempts") { + testScheduler("job failure after 4 attempts") { def runBackend(): Unit = { val task = backend.beginTask() val failure = new ExceptionFailure(new RuntimeException("test task failure"), Seq()) From 8349b76ada807a9ce351cebbf4eddb88f67ca138 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 10:37:19 -0500 Subject: [PATCH 08/35] cleanup --- .../spark/scheduler/BlacklistIntegrationSuite.scala | 7 ------- .../spark/scheduler/SchedulerIntegrationSuite.scala | 11 ++++++----- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index 3225866e317dd..5283fcb1d8892 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -117,13 +117,6 @@ class MultiExecutorMockBackend( }.toMap } - override def generateOffers(): Seq[WorkerOffer] = { - executorIdToExecutor.values.map { exec => - WorkerOffer(executorId = exec.executorId, host = exec.host, - cores = exec.freeCores) - }.toSeq - } - override def defaultParallelism(): Int = nHosts * nExecutorsPerHost * nCoresPerExecutor override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 63820979b309c..77d308b16514b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -301,7 +301,12 @@ private[spark] abstract class MockBackend( def executorIdToExecutor: Map[String, ExecutorTaskStatus] - def generateOffers(): Seq[WorkerOffer] + def generateOffers(): Seq[WorkerOffer] = { + executorIdToExecutor.values.map { exec => + WorkerOffer(executorId = exec.executorId, host = exec.host, + cores = exec.freeCores) + }.toSeq + } /** * This is called by the scheduler whenever it has tasks it would like to schedule @@ -337,10 +342,6 @@ private[spark] class SingleCoreMockBackend( val executorIdToExecutor: Map[String, ExecutorTaskStatus] = Map( localExecutorId -> new ExecutorTaskStatus(localExecutorHostname, localExecutorId, freeCores) ) - - override def generateOffers(): Seq[WorkerOffer] = { - 
Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores)) - } } case class ExecutorTaskStatus(host: String, executorId: String, var freeCores: Int) From 7050b49b4dfd6e7b4e3e966e9e15e3558a343d67 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 10:51:13 -0500 Subject: [PATCH 09/35] comments --- core/src/main/scala/org/apache/spark/SparkContext.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index c3f51923e73a7..33a5cce8d37c8 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2421,6 +2421,8 @@ object SparkContext extends Logging { (backend, scheduler) case MOCK_REGEX(backendClassName) => + // This is a Scheduler integration test, so we setup a mock backend. Not a documented + // feature or meant to be publicly visible at all. val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) val backendClass = Utils.classForName(backendClassName) val ctor = backendClass.getConstructor(classOf[SparkConf], classOf[TaskSchedulerImpl]) @@ -2528,6 +2530,7 @@ object SparkContext extends Logging { * A collection of regexes for extracting information from the master string. */ private object SparkMasterRegex { + /** Used for Scheduler integration tests, to plug in a mock backend */ val MOCK_REGEX = """mock\[(.*)\]""".r // Regular expression used for local[N] and local[*] master formats val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r From 00953764f8d6a3807329ddff028c64a913cba662 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 10:58:04 -0500 Subject: [PATCH 10/35] move dummy killTask to MockBackend, otherwise occasional problems even in SingleCoreMockBackend when killTask is unsupported --- .../apache/spark/scheduler/BlacklistIntegrationSuite.scala | 7 ------- .../apache/spark/scheduler/SchedulerIntegrationSuite.scala | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index 5283fcb1d8892..3cb07a404d39b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -118,13 +118,6 @@ class MultiExecutorMockBackend( } override def defaultParallelism(): Int = nHosts * nExecutorsPerHost * nCoresPerExecutor - - override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { - // Its OK for this to be a no-op, because even if a backend does implement killTask, - // it really can only be "best-effort" in any case, and the scheduler should be robust to that. - // And in fact its reasonably simulating a case where a real backend finishes tasks in between - // the time when the scheduler sends the msg to kill tasks, and the backend receives the msg. 
- } } class MockRDDWithLocalityPrefs( diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 77d308b16514b..a37d2f06a4f0e 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -322,6 +322,13 @@ private[spark] abstract class MockBackend( assignedTasksWaitingToRun ++= newTasks } } + + override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { + // Its OK for this to be a no-op, because even if a backend does implement killTask, + // it really can only be "best-effort" in any case, and the scheduler should be robust to that. + // And in fact its reasonably simulating a case where a real backend finishes tasks in between + // the time when the scheduler sends the msg to kill tasks, and the backend receives the msg. + } } /** From cb5860ffe8a995ac7566e80a774cf57c498f6182 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 10:58:04 -0500 Subject: [PATCH 11/35] move dummy killTask to MockBackend, otherwise occasional problems even in SingleCoreMockBackend when killTask is unsupported --- .../apache/spark/scheduler/BlacklistIntegrationSuite.scala | 7 ------- .../apache/spark/scheduler/SchedulerIntegrationSuite.scala | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index 5283fcb1d8892..3cb07a404d39b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -118,13 +118,6 @@ class MultiExecutorMockBackend( } override def defaultParallelism(): Int = nHosts * nExecutorsPerHost * nCoresPerExecutor - - override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { - // Its OK for this to be a no-op, because even if a backend does implement killTask, - // it really can only be "best-effort" in any case, and the scheduler should be robust to that. - // And in fact its reasonably simulating a case where a real backend finishes tasks in between - // the time when the scheduler sends the msg to kill tasks, and the backend receives the msg. - } } class MockRDDWithLocalityPrefs( diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 77d308b16514b..a37d2f06a4f0e 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -322,6 +322,13 @@ private[spark] abstract class MockBackend( assignedTasksWaitingToRun ++= newTasks } } + + override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { + // Its OK for this to be a no-op, because even if a backend does implement killTask, + // it really can only be "best-effort" in any case, and the scheduler should be robust to that. + // And in fact its reasonably simulating a case where a real backend finishes tasks in between + // the time when the scheduler sends the msg to kill tasks, and the backend receives the msg. 
+ } } /** From 8034995249e16e80fa7db5e709c38d1444b98f08 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 12:30:09 -0500 Subject: [PATCH 12/35] take advantage of ExternalClusteManager extension --- .../scala/org/apache/spark/SparkContext.scala | 12 ---- ...che.spark.scheduler.ExternalClusterManager | 3 +- .../scheduler/SchedulerIntegrationSuite.scala | 69 ++++++++++++++----- 3 files changed, 53 insertions(+), 31 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 33a5cce8d37c8..e391599336074 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2420,16 +2420,6 @@ object SparkContext extends Logging { scheduler.initialize(backend) (backend, scheduler) - case MOCK_REGEX(backendClassName) => - // This is a Scheduler integration test, so we setup a mock backend. Not a documented - // feature or meant to be publicly visible at all. - val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) - val backendClass = Utils.classForName(backendClassName) - val ctor = backendClass.getConstructor(classOf[SparkConf], classOf[TaskSchedulerImpl]) - val backend = ctor.newInstance(sc.getConf, scheduler).asInstanceOf[SchedulerBackend] - scheduler.initialize(backend) - (backend, scheduler) - case LOCAL_N_REGEX(threads) => def localCpuCount: Int = Runtime.getRuntime.availableProcessors() // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads. @@ -2530,8 +2520,6 @@ object SparkContext extends Logging { * A collection of regexes for extracting information from the master string. */ private object SparkMasterRegex { - /** Used for Scheduler integration tests, to plug in a mock backend */ - val MOCK_REGEX = """mock\[(.*)\]""".r // Regular expression used for local[N] and local[*] master formats val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r // Regular expression for local[N, maxRetries], used in tests with failing tasks diff --git a/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager b/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager index 3c570ffd8f566..757c6d2296aff 100644 --- a/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager +++ b/core/src/test/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager @@ -1 +1,2 @@ -org.apache.spark.scheduler.DummyExternalClusterManager \ No newline at end of file +org.apache.spark.scheduler.DummyExternalClusterManager +org.apache.spark.scheduler.MockExternalClusterManager \ No newline at end of file diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index a37d2f06a4f0e..204a2bef610e5 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -26,7 +26,6 @@ import scala.reflect.ClassTag import org.scalactic.TripleEquals import org.scalatest.Assertions.AssertionsHelper -import org.scalatest.BeforeAndAfter import org.apache.spark._ import org.apache.spark.TaskState._ @@ -47,15 +46,14 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa with LocalSparkContext { val conf = new SparkConf - /** Set of TaskSets the 
DAGScheduler has requested executed. */ - val runningTaskSets = HashSet[TaskSet]() - - var taskScheduler: TaskSchedulerImpl = null + var taskScheduler: TestTaskScheduler = null var scheduler: DAGScheduler = null var backend: T = _ override def beforeEach(): Unit = { - runningTaskSets.clear() + if (taskScheduler != null) { + taskScheduler.runningTaskSets.clear() + } results.clear() failure = null super.beforeEach() @@ -74,17 +72,7 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa conf.setMaster(s"mock[${backendClassName}]") sc = new SparkContext(conf) backend = sc.schedulerBackend.asInstanceOf[T] - taskScheduler = new TaskSchedulerImpl(sc) { - override def submitTasks(taskSet: TaskSet): Unit = { - runningTaskSets += taskSet - super.submitTasks(taskSet) - } - - override def taskSetFinished(manager: TaskSetManager): Unit = { - runningTaskSets -= manager.taskSet - super.taskSetFinished(manager) - } - } + taskScheduler = sc.taskScheduler.asInstanceOf[TestTaskScheduler] taskScheduler.initialize(sc.schedulerBackend) backend.taskScheduler = taskScheduler scheduler = new DAGScheduler(sc, taskScheduler) @@ -132,7 +120,7 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa if (noFailure) { // When a job fails, we terminate before waiting for all the task end events to come in, // so there might still be a running task set - assert(runningTaskSets.isEmpty) + assert(taskScheduler.runningTaskSets.isEmpty) assert(!backend.hasTasks) if (failure != null) { // if there is a job failure, it can be a bit hard to tease the job failure msg apart @@ -384,6 +372,51 @@ object MockRDD extends AssertionsHelper with TripleEquals { } } +/** Simple cluster manager that wires up our mock backend. */ +private class MockExternalClusterManager extends ExternalClusterManager { + + val MOCK_REGEX = """mock\[(.*)\]""".r + def canCreate(masterURL: String): Boolean = MOCK_REGEX.findFirstIn(masterURL).isDefined + + def createTaskScheduler( + sc: SparkContext, + masterURL: String): TaskScheduler = { + new TestTaskScheduler(sc) + } + + def createSchedulerBackend( + sc: SparkContext, + masterURL: String, + scheduler: TaskScheduler): SchedulerBackend = { + masterURL match { + case MOCK_REGEX(backendClassName) => + val backendClass = Utils.classForName(backendClassName) + val ctor = backendClass.getConstructor(classOf[SparkConf], classOf[TaskSchedulerImpl]) + ctor.newInstance(sc.getConf, scheduler).asInstanceOf[SchedulerBackend] + } + } + + def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { + scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) + } +} + +/** TaskSchedulerImpl that just tracks a tiny bit more state to enable checks in tests. */ +class TestTaskScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) { + /** Set of TaskSets the DAGScheduler has requested executed. */ + val runningTaskSets = HashSet[TaskSet]() + + override def submitTasks(taskSet: TaskSet): Unit = { + runningTaskSets += taskSet + super.submitTasks(taskSet) + } + + override def taskSetFinished(manager: TaskSetManager): Unit = { + runningTaskSets -= manager.taskSet + super.taskSetFinished(manager) + } +} + /** * Some very basic tests just to demonstrate the use of the test framework (and verify that it * works). 
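With the ExternalClusterManager wiring above, any master URL of the form
mock[FullBackendClassName] is picked up by MockExternalClusterManager (registered through the
META-INF/services file), which creates a TestTaskScheduler and reflectively instantiates the
named backend. A rough sketch of how a concrete suite plugs in (the suite and test names below
are illustrative, not part of the patches):

    // The type parameter selects the backend; setupScheduler builds the master URL from its
    // runtime class name (mock[...SingleCoreMockBackend]), and the service-loaded
    // MockExternalClusterManager resolves that to a TestTaskScheduler plus that backend.
    class ExampleSchedulerSuite extends SchedulerIntegrationSuite[SingleCoreMockBackend] {
      testScheduler("illustrative test") {
        // drive `backend` via beginTask()/taskSuccess()/taskFailed() and submit jobs here
      }
    }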
From 360c7cdf7731fd173bbfaf5e66d413f743604e60 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 18 May 2016 12:39:52 -0500 Subject: [PATCH 13/35] cleanup --- .../org/apache/spark/scheduler/SchedulerIntegrationSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 204a2bef610e5..dc2749cfbe4c3 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -130,7 +130,6 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa | There was a failed job. | ----- Begin Job Failure Msg ----- | ${Utils.exceptionString(failure)} - | ----- End Job Failure Msg ---- """. stripMargin @@ -312,6 +311,7 @@ private[spark] abstract class MockBackend( } override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = { + // We have to implement this b/c of SPARK-15385. // Its OK for this to be a no-op, because even if a backend does implement killTask, // it really can only be "best-effort" in any case, and the scheduler should be robust to that. // And in fact its reasonably simulating a case where a real backend finishes tasks in between From c7a78b0df04e86ab942617f84106085c3e750e00 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Thu, 19 May 2016 12:58:01 -0500 Subject: [PATCH 14/35] performance updates to mock backend + some utils --- .../scheduler/SchedulerIntegrationSuite.scala | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index dc2749cfbe4c3..9f06d7902ef6a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -17,9 +17,11 @@ package org.apache.spark.scheduler import java.util.Properties +import java.util.concurrent.ArrayBlockingQueue import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} +import scala.collection.JavaConverters._ import scala.concurrent.{Await, Future} import scala.concurrent.duration.{Duration, SECONDS} import scala.reflect.ClassTag @@ -118,10 +120,6 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa protected def assertDataStructuresEmpty(noFailure: Boolean = true): Unit = { if (noFailure) { - // When a job fails, we terminate before waiting for all the task end events to come in, - // so there might still be a running task set - assert(taskScheduler.runningTaskSets.isEmpty) - assert(!backend.hasTasks) if (failure != null) { // if there is a job failure, it can be a bit hard to tease the job failure msg apart // from the test failure msg, so we do a little extra formatting @@ -135,6 +133,11 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa stripMargin fail(msg) } + // When a job fails, we terminate before waiting for all the task end events to come in, + // so there might still be a running task set. 
That is why we only check these conditions + // when the job succeeds + assert(taskScheduler.runningTaskSets.isEmpty) + assert(!backend.hasTasks) } assert(scheduler.activeJobs.isEmpty) } @@ -182,7 +185,7 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa * in particular, your `backendFunc` has to return quickly, it can't throw errors, (instead * it should send back the right TaskEndReason */ - def withBackend(backendFunc: () => Unit)(testBody: => Unit): Unit = { + def withBackend[T](backendFunc: () => Unit)(testBody: => T): T = { val backendContinue = new AtomicBoolean(true) val backendThread = new Thread("mock backend thread") { override def run(): Unit = { @@ -219,8 +222,8 @@ private[spark] abstract class MockBackend( * Test backends should call this to get a task that has been assigned to them by the scheduler. * Each task should be responded to with either [[taskSuccess]] or [[taskFailed]]. */ - def beginTask(): TaskDescription = synchronized { - val toRun = assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) + def beginTask(): TaskDescription = { + val toRun = assignedTasksWaitingToRun.take() runningTasks += toRun toRun } @@ -240,11 +243,14 @@ private[spark] abstract class MockBackend( synchronized { executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK freeCores += taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun -= task } reviveOffers() } + def taskFailedWithException(task: TaskDescription, state: TaskState, exc: Exception): Unit = { + taskFailed(task, state, new ExceptionFailure(exc, Seq())) + } + /** * Tell the scheduler the task failed, with the given state and result (probably ExceptionFailure * or FetchFailed). Also updates some internal state for this mock. 
@@ -258,13 +264,12 @@ private[spark] abstract class MockBackend( synchronized { executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK freeCores += taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun -= task } reviveOffers() } } - private val assignedTasksWaitingToRun = ArrayBuffer[TaskDescription]() + private val assignedTasksWaitingToRun = new ArrayBlockingQueue[TaskDescription](10000) private val runningTasks = ArrayBuffer[TaskDescription]() def endTask(task: TaskDescription): Unit = synchronized { @@ -272,11 +277,11 @@ private[spark] abstract class MockBackend( } def hasTasks: Boolean = synchronized { - assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty + !assignedTasksWaitingToRun.isEmpty() || runningTasks.nonEmpty } - def hasTasksWaitingToRun: Boolean = synchronized { - assignedTasksWaitingToRun.nonEmpty + def hasTasksWaitingToRun: Boolean = { + !assignedTasksWaitingToRun.isEmpty() } override def start(): Unit = {} @@ -289,7 +294,9 @@ private[spark] abstract class MockBackend( def executorIdToExecutor: Map[String, ExecutorTaskStatus] def generateOffers(): Seq[WorkerOffer] = { - executorIdToExecutor.values.map { exec => + executorIdToExecutor.values.filter { exec => + exec.freeCores > 0 + }.map { exec => WorkerOffer(executorId = exec.executorId, host = exec.host, cores = exec.freeCores) }.toSeq @@ -306,7 +313,7 @@ private[spark] abstract class MockBackend( executorIdToExecutor(task.executorId).freeCores -= taskScheduler.CPUS_PER_TASK } freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun ++= newTasks + assignedTasksWaitingToRun.addAll(newTasks.asJava) } } From ee59913ac216f44b9e0b46a2b662f42b26f48da8 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Thu, 19 May 2016 12:58:40 -0500 Subject: [PATCH 15/35] add performance tests --- .../scheduler/SchedulerPerformanceSuite.scala | 228 ++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala new file mode 100644 index 0000000000000..0489719b6c94e --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.scheduler + +import scala.concurrent.duration.Duration + +import org.apache.spark.TaskState +import org.apache.spark.util.Utils + +class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend] { + + def simpleWorkload(N: Int): MockRDD = { + val a = new MockRDD(sc, N, Nil) + val b = shuffle(N, a) + val c = shuffle(N, a) + join(N, b, c) + } + + def goodBackend(N: Int): Unit = { + val taskDescription = backend.beginTask() + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + + // every 5th stage is a ResultStage -- the rest are ShuffleMapStages + (task.stageId, task.partitionId) match { + case (stage, _) if stage % 5 != 4 => + backend.taskSuccess(taskDescription, + DAGSchedulerSuite.makeMapStatus(host, N)) + case (_, _) => + backend.taskSuccess(taskDescription, 42) + } + } + + def runJobWithBackend(N: Int, backend: () => Unit): Unit = { + // run as many jobs as we can in 10 seconds + var itrs = 0 + val totalMs = withBackend(backend) { + val start = System.currentTimeMillis() + while (System.currentTimeMillis() - start < 10000 ) { + withClue(s"failure in iteration = $itrs") { + val jobFuture = submit(simpleWorkload(N), new Array[Int](N)) + // Note: Do not call Await.ready(future) because that calls `scala.concurrent.blocking`, + // which causes concurrent SQL executions to fail if a fork-join pool is used. Note that + // due to idiosyncrasies in Scala, `awaitPermission` is not actually used anywhere so it's + // safe to pass in null here. For more detail, see SPARK-13747. + val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait] + jobFuture.ready(Duration.Inf)(awaitPermission) + assertDataStructuresEmpty(noFailure = true) + itrs += 1 + } + } + System.currentTimeMillis() - start + } + + val msPerItr = Utils.msDurationToString((totalMs.toDouble / itrs).toLong) + // scalastyle:off println + println(s"ran $itrs iterations in ${Utils.msDurationToString(totalMs)} ($msPerItr per itr)") + // scalastyle:on println + } + + def runSuccessfulJob(N: Int): Unit = { + runJobWithBackend(N, () => goodBackend(N)) + } + + testScheduler("Scheduling speed -- small job on a small cluster") { + runSuccessfulJob(40) + } + + testScheduler("Scheduling speed -- large job on a small cluster") { + runSuccessfulJob(3000) + } + + + testScheduler( + "Scheduling speed -- large job on a super node", + extraConfs = Seq( + "spark.testing.nHosts" -> "1", + "spark.testing.nExecutorsPerHost" -> "1", + "spark.testing.nCoresPerExecutor" -> "20000" + ) + ) { + runSuccessfulJob(3000) + } + + testScheduler( + // 4 execs per node, 2 cores per exec, so 400 cores + "Scheduling speed -- large job on 50 node cluster", + extraConfs = Seq( + "spark.testing.nHosts" -> "50" + ) + ) { + runSuccessfulJob(3000) + } + + testScheduler( + // 4 execs per node, 2 cores per exec, so 800 cores + "Scheduling speed -- large job on 100 node cluster", + extraConfs = Seq( + "spark.testing.nHosts" -> "100" + ) + ) { + runSuccessfulJob(3000) + } + + Seq(200, 300, 400, 450, 500, 550).foreach { nodes => + /* +ran 1 iterations in 12.9 s (12.9 s per itr) +[info] - COMPARE A: Scheduling speed -- large job on 200 node cluster (13 seconds, 861 milliseconds) +ran 1 iterations in 25.0 s (25.0 s per itr) +[info] - COMPARE A: Scheduling speed -- large job on 300 node cluster (25 seconds, 50 milliseconds) +ran 1 iterations in 34.6 s (34.6 s per 
itr) +[info] - COMPARE A: Scheduling speed -- large job on 400 node cluster (34 seconds, 668 milliseconds) +ran 1 iterations in 54.0 s (54.0 s per itr) +[info] - COMPARE A: Scheduling speed -- large job on 450 node cluster (53 seconds, 991 milliseconds) +ran 1 iterations in 1.8 m (1.8 m per itr) +[info] - COMPARE A: Scheduling speed -- large job on 500 node cluster (1 minute, 48 seconds) +ran 1 iterations in 2.3 m (2.3 m per itr) +[info] - COMPARE A: Scheduling speed -- large job on 550 node cluster (2 minutes, 19 seconds) + */ + testScheduler( + s"COMPARE A: Scheduling speed -- large job on ${nodes} node cluster", + extraConfs = Seq( + "spark.testing.nHosts" -> s"$nodes" + ) + ) { + runSuccessfulJob(3000) + } + } + + /* +nHosts = 400; nExecutorsPerHost = 1; nCores = 800 +ran 2 iterations in 11.7 s (5.9 s per itr) +[info] - COMPARE B: Lots of nodes (12 seconds, 679 milliseconds) +nHosts = 1; nExecutorsPerHost = 400; nCores = 800 +ran 3 iterations in 14.2 s (4.7 s per itr) +[info] - COMPARE B: Lots of executors, one node (14 seconds, 290 milliseconds) +nHosts = 1; nExecutorsPerHost = 1; nCores = 800 +ran 3 iterations in 11.0 s (3.7 s per itr) +[info] - COMPARE B: Super executor (11 seconds, 6 milliseconds) + */ + testScheduler( + s"COMPARE B: Lots of nodes", + extraConfs = Seq( + "spark.testing.nHosts" -> "400", + "spark.testing.nExecutorsPerHost" -> "1" + ) + ) { + runSuccessfulJob(3000) + } + + testScheduler( + s"COMPARE B: Lots of executors, one node", + extraConfs = Seq( + "spark.testing.nHosts" -> "1", + "spark.testing.nExecutorsPerHost" -> "400" + ) + ) { + runSuccessfulJob(3000) + } + + testScheduler( + s"COMPARE B: Super executor", + extraConfs = Seq( + "spark.testing.nHosts" -> "1", + "spark.testing.nExecutorsPerHost" -> "1", + "spark.testing.nCoresPerExecutor" -> "800" + ) + ) { + runSuccessfulJob(3000) + } + + def backendWithBadExecs(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { + val taskDescription = backend.beginTask() + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + if (badExecs(taskDescription.executorId)) { + val exc = new RuntimeException(s"bad exec ${taskDescription.executorId}") + backend.taskFailedWithException(taskDescription, TaskState.FAILED, exc) + } else if (badHosts(host)) { + val exc = new RuntimeException(s"bad host ${host}") + backend.taskFailedWithException(taskDescription, TaskState.FAILED, exc) + } else { + // every 5th stage is a ResultStage -- the rest are ShuffleMapStages + (task.stageId, task.partitionId) match { + case (stage, _) if stage % 5 != 4 => + backend.taskSuccess(taskDescription, + DAGSchedulerSuite.makeMapStatus(host, N)) + case (_, _) => + backend.taskSuccess(taskDescription, 42) + } + } + } + + def runBadExecJob(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { + runJobWithBackend(N, () => backendWithBadExecs(N, badExecs, badHosts)) + } + + val badExecs = (0 until 2).map{_.toString}.toSet + val badHosts = Set[String]() + + // note this is *very* unlikely to succeed without blacklisting, even though its only + // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess + // which is the only executor which has a free slot? 
Bingo, the one it just failed on + testScheduler( + "bad execs, no blacklist", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000" + ) + ) { + runBadExecJob(3000, badExecs, badHosts) + } +} From 4fcbc1da351d050d88357f47b98c85e0cbc0eefc Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Thu, 19 May 2016 13:29:38 -0500 Subject: [PATCH 16/35] bug fix in mock scheduler --- .../org/apache/spark/scheduler/SchedulerPerformanceSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 0489719b6c94e..5539962fe45ca 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -53,7 +53,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM val start = System.currentTimeMillis() while (System.currentTimeMillis() - start < 10000 ) { withClue(s"failure in iteration = $itrs") { - val jobFuture = submit(simpleWorkload(N), new Array[Int](N)) + val jobFuture = submit(simpleWorkload(N), (0 until N).toArray) // Note: Do not call Await.ready(future) because that calls `scala.concurrent.blocking`, // which causes concurrent SQL executions to fail if a fork-join pool is used. Note that // due to idiosyncrasies in Scala, `awaitPermission` is not actually used anywhere so it's From 6ed19aeec4cccfcca8de7848dd9ff8a0160d920a Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 11:45:43 -0500 Subject: [PATCH 17/35] style --- .../scala/org/apache/spark/scheduler/BlacklistTracker.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index f2c73710c000c..5fe77bb0c0ba8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -176,8 +176,8 @@ private[spark] class BlacklistTracker( private def executorsOnBlacklistedNode( sched: TaskSchedulerImpl, atomTask: StageAndPartition): Set[String] = { - nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) - .getOrElse(Set.empty[String])).toSet + nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) + .getOrElse(Set.empty[String])) } private def reEvaluateExecutorBlacklistAndUpdateCache( From 67acce9a56a8acc24c0f2cf2ca76378277ec24d3 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 12:19:25 -0500 Subject: [PATCH 18/35] simplification and comments --- .../scheduler/BlacklistIntegrationSuite.scala | 3 +- .../scheduler/SchedulerIntegrationSuite.scala | 96 ++++++++++--------- 2 files changed, 53 insertions(+), 46 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index 3cb07a404d39b..6c9d4fb6f3bcc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -33,8 +33,7 @@ class BlacklistIntegrationSuite extends SchedulerIntegrationSuite[MultiExecutorM val task = backend.beginTask() val host = 
backend.executorIdToExecutor(task.executorId).host if (host == badHost) { - val failure = new ExceptionFailure(new RuntimeException("I'm a bad host!"), Seq()) - backend.taskFailed(task, TaskState.FAILED, failure) + backend.taskFailed(task, new RuntimeException("I'm a bad host!")) } else { backend.taskSuccess(task, 42) } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index dc2749cfbe4c3..02aa5caa731ff 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -29,7 +29,6 @@ import org.scalatest.Assertions.AssertionsHelper import org.apache.spark._ import org.apache.spark.TaskState._ -import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.util.{CallSite, Utils} @@ -44,7 +43,6 @@ import org.apache.spark.util.{CallSite, Utils} */ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends SparkFunSuite with LocalSparkContext { - val conf = new SparkConf var taskScheduler: TestTaskScheduler = null var scheduler: DAGScheduler = null @@ -74,7 +72,6 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa backend = sc.schedulerBackend.asInstanceOf[T] taskScheduler = sc.taskScheduler.asInstanceOf[TestTaskScheduler] taskScheduler.initialize(sc.schedulerBackend) - backend.taskScheduler = taskScheduler scheduler = new DAGScheduler(sc, taskScheduler) taskScheduler.setDAGScheduler(scheduler) } @@ -118,10 +115,6 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa protected def assertDataStructuresEmpty(noFailure: Boolean = true): Unit = { if (noFailure) { - // When a job fails, we terminate before waiting for all the task end events to come in, - // so there might still be a running task set - assert(taskScheduler.runningTaskSets.isEmpty) - assert(!backend.hasTasks) if (failure != null) { // if there is a job failure, it can be a bit hard to tease the job failure msg apart // from the test failure msg, so we do a little extra formatting @@ -135,6 +128,11 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa stripMargin fail(msg) } + // When a job fails, we terminate before waiting for all the task end events to come in, + // so there might still be a running task set. So we only check these conditions + // when the job succeeds + assert(taskScheduler.runningTaskSets.isEmpty) + assert(!backend.hasTasks) } assert(scheduler.activeJobs.isEmpty) } @@ -180,9 +178,9 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa * test, where you submit jobs to your backend, wait for them to complete, then check * whatever conditions you want. 
Note that this is *not* safe to all bad backends -- * in particular, your `backendFunc` has to return quickly, it can't throw errors, (instead - * it should send back the right TaskEndReason + * it should send back the right TaskEndReason) */ - def withBackend(backendFunc: () => Unit)(testBody: => Unit): Unit = { + def withBackend[T](backendFunc: () => Unit)(testBody: => T): T = { val backendContinue = new AtomicBoolean(true) val backendThread = new Thread("mock backend thread") { override def run(): Unit = { @@ -213,16 +211,18 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa */ private[spark] abstract class MockBackend( conf: SparkConf, - var taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { + val taskScheduler: TaskSchedulerImpl) extends SchedulerBackend with Logging { /** * Test backends should call this to get a task that has been assigned to them by the scheduler. * Each task should be responded to with either [[taskSuccess]] or [[taskFailed]]. */ - def beginTask(): TaskDescription = synchronized { - val toRun = assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) - runningTasks += toRun - toRun + def beginTask(): TaskDescription = { + synchronized { + val toRun = assignedTasksWaitingToRun.remove(assignedTasksWaitingToRun.size - 1) + runningTasks += toRun + toRun + } } /** @@ -230,52 +230,49 @@ private[spark] abstract class MockBackend( * updates some internal state for this mock. */ def taskSuccess(task: TaskDescription, result: Any): Unit = { - endTask(task) val ser = env.serializer.newInstance() val resultBytes = ser.serialize(result) - val metrics = new TaskMetrics val directResult = new DirectTaskResult(resultBytes, Seq()) // no accumulator updates - val serializedDirectResult = ser.serialize(directResult) - taskScheduler.statusUpdate(task.taskId, TaskState.FINISHED, serializedDirectResult) - synchronized { - executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK - freeCores += taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun -= task - } - reviveOffers() + taskUpdate(task, TaskState.FINISHED, directResult) } /** * Tell the scheduler the task failed, with the given state and result (probably ExceptionFailure * or FetchFailed). Also updates some internal state for this mock. 
*/ - def taskFailed(task: TaskDescription, state: TaskState, result: Any): Unit = { - endTask(task) + def taskFailed(task: TaskDescription, exc: Exception): Unit = { + taskUpdate(task, TaskState.FAILED, new ExceptionFailure(exc, Seq())) + } + + def taskFailed(task: TaskDescription, reason: TaskFailedReason): Unit = { + taskUpdate(task, TaskState.FAILED, reason) + } + + def taskUpdate(task: TaskDescription, state: TaskState, result: Any): Unit = { val ser = env.serializer.newInstance() val resultBytes = ser.serialize(result) + // statusUpdate is safe to call from multiple threads, its protected inside taskScheduler taskScheduler.statusUpdate(task.taskId, state, resultBytes) if (TaskState.isFinished(state)) { synchronized { + runningTasks -= task executorIdToExecutor(task.executorId).freeCores += taskScheduler.CPUS_PER_TASK freeCores += taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun -= task } reviveOffers() } } - private val assignedTasksWaitingToRun = ArrayBuffer[TaskDescription]() + // protected by this + private val assignedTasksWaitingToRun = new ArrayBuffer[TaskDescription](10000) + // protected by this private val runningTasks = ArrayBuffer[TaskDescription]() - def endTask(task: TaskDescription): Unit = synchronized { - runningTasks -= task - } - def hasTasks: Boolean = synchronized { assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty } - def hasTasksWaitingToRun: Boolean = synchronized { + def hasTasksWaitingToRun: Boolean = { assignedTasksWaitingToRun.nonEmpty } @@ -283,20 +280,30 @@ private[spark] abstract class MockBackend( override def stop(): Unit = {} - var freeCores: Int = _ val env = SparkEnv.get + /** Accessed by both scheduling and backend thread, so should be protected by this. */ + var freeCores: Int = _ + + /** + * Accessed by both scheduling and backend thread, so should be protected by this. + * Most likely the only thing that needs to be protected are the inidividual ExecutorTaskStatus, + * but for simplicity in this mock just lock the whole backend. + */ def executorIdToExecutor: Map[String, ExecutorTaskStatus] - def generateOffers(): Seq[WorkerOffer] = { - executorIdToExecutor.values.map { exec => + private def generateOffers(): Seq[WorkerOffer] = { + executorIdToExecutor.values.filter { exec => + exec.freeCores > 0 + }.map { exec => WorkerOffer(executorId = exec.executorId, host = exec.host, cores = exec.freeCores) }.toSeq } /** - * This is called by the scheduler whenever it has tasks it would like to schedule + * This is called by the scheduler whenever it has tasks it would like to schedule. It gets + * called in the scheduling thread, not the backend thread. 
*/ override def reviveOffers(): Unit = { val offers: Seq[WorkerOffer] = generateOffers() @@ -334,7 +341,7 @@ private[spark] class SingleCoreMockBackend( val localExecutorId = SparkContext.DRIVER_IDENTIFIER val localExecutorHostname = "localhost" - val executorIdToExecutor: Map[String, ExecutorTaskStatus] = Map( + override val executorIdToExecutor: Map[String, ExecutorTaskStatus] = Map( localExecutorId -> new ExecutorTaskStatus(localExecutorHostname, localExecutorId, freeCores) ) } @@ -351,9 +358,11 @@ class MockRDD( override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = throw new RuntimeException("should not be reached") - override def getPartitions: Array[Partition] = (0 until numPartitions).map(i => new Partition { - override def index: Int = i - }).toArray + override def getPartitions: Array[Partition] = { + (0 until numPartitions).map(i => new Partition { + override def index: Int = i + }).toArray + } override def getPreferredLocations(split: Partition): Seq[String] = Nil override def toString: String = "MockRDD " + id } @@ -523,7 +532,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor case (1, 0, 0) => val fetchFailed = FetchFailed( DAGSchedulerSuite.makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored") - backend.taskFailed(taskDescription, TaskState.FAILED, fetchFailed) + backend.taskFailed(taskDescription, fetchFailed) case (1, _, partition) => backend.taskSuccess(taskDescription, 42 + partition) } @@ -541,8 +550,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor testScheduler("job failure after 4 attempts") { def runBackend(): Unit = { val task = backend.beginTask() - val failure = new ExceptionFailure(new RuntimeException("test task failure"), Seq()) - backend.taskFailed(task, TaskState.FAILED, failure) + backend.taskFailed(task, new RuntimeException("test task failure")) } withBackend(runBackend _) { val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) From 17fcc9ec89ef9f4f32905fd33dcbe8f84c05eed7 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 12:21:58 -0500 Subject: [PATCH 19/35] fix merge --- .../spark/scheduler/SchedulerIntegrationSuite.scala | 10 ++-------- .../spark/scheduler/SchedulerPerformanceSuite.scala | 5 ++--- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 4306de0e67756..02aa5caa731ff 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -17,11 +17,9 @@ package org.apache.spark.scheduler import java.util.Properties -import java.util.concurrent.ArrayBlockingQueue import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} -import scala.collection.JavaConverters._ import scala.concurrent.{Await, Future} import scala.concurrent.duration.{Duration, SECONDS} import scala.reflect.ClassTag @@ -238,10 +236,6 @@ private[spark] abstract class MockBackend( taskUpdate(task, TaskState.FINISHED, directResult) } - def taskFailedWithException(task: TaskDescription, state: TaskState, exc: Exception): Unit = { - taskFailed(task, state, new ExceptionFailure(exc, Seq())) - } - /** * Tell the scheduler the task failed, with the given state and result (probably ExceptionFailure * or FetchFailed). 
Also updates some internal state for this mock. @@ -275,7 +269,7 @@ private[spark] abstract class MockBackend( private val runningTasks = ArrayBuffer[TaskDescription]() def hasTasks: Boolean = synchronized { - !assignedTasksWaitingToRun.isEmpty() || runningTasks.nonEmpty + assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty } def hasTasksWaitingToRun: Boolean = { @@ -319,7 +313,7 @@ private[spark] abstract class MockBackend( executorIdToExecutor(task.executorId).freeCores -= taskScheduler.CPUS_PER_TASK } freeCores -= newTasks.size * taskScheduler.CPUS_PER_TASK - assignedTasksWaitingToRun.addAll(newTasks.asJava) + assignedTasksWaitingToRun ++= newTasks } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 0489719b6c94e..76d12eded3f94 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.scheduler import scala.concurrent.duration.Duration -import org.apache.spark.TaskState import org.apache.spark.util.Utils class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend] { @@ -191,10 +190,10 @@ ran 3 iterations in 11.0 s (3.7 s per itr) val task = taskSet.tasks(taskDescription.index) if (badExecs(taskDescription.executorId)) { val exc = new RuntimeException(s"bad exec ${taskDescription.executorId}") - backend.taskFailedWithException(taskDescription, TaskState.FAILED, exc) + backend.taskFailed(taskDescription, exc) } else if (badHosts(host)) { val exc = new RuntimeException(s"bad host ${host}") - backend.taskFailedWithException(taskDescription, TaskState.FAILED, exc) + backend.taskFailed(taskDescription, exc) } else { // every 5th stage is a ResultStage -- the rest are ShuffleMapStages (task.stageId, task.partitionId) match { From b12b563d4890766d0d3ea31ccdf14d3da8fc8f82 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 12:33:31 -0500 Subject: [PATCH 20/35] comments --- .../spark/scheduler/SchedulerPerformanceSuite.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 76d12eded3f94..2299ed2bbe3a0 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -23,6 +23,8 @@ import org.apache.spark.util.Utils class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend] { def simpleWorkload(N: Int): MockRDD = { + // relatively simple job with 5 stages, so scheduling includes some aspects of submitting stages + // in addition to tasks val a = new MockRDD(sc, N, Nil) val b = shuffle(N, a) val c = shuffle(N, a) @@ -46,7 +48,12 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } def runJobWithBackend(N: Int, backend: () => Unit): Unit = { - // run as many jobs as we can in 10 seconds + // Try to run as many jobs as we can in 10 seconds, get the time per job. 
The idea here is to + // balance: + // 1) have a big enough job that we're not effected by delays just from waiting for job + // completion to propagate to the user thread (probably minor) + // 2) run enough iterations to get some reliable data + // 3) not wait toooooo long var itrs = 0 val totalMs = withBackend(backend) { val start = System.currentTimeMillis() From 5d547f4840856667d3e22abb15756d76a7f2407d Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 16:37:13 -0500 Subject: [PATCH 21/35] more tests --- .../scheduler/SchedulerPerformanceSuite.scala | 79 +++++++++++++------ 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index fa8c4c27a7578..3b437c728ae70 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -87,10 +87,19 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM runSuccessfulJob(40) } - testScheduler("Scheduling speed -- large job on a small cluster") { + testScheduler("COMPARE C Scheduling speed -- large job on a small cluster") { runSuccessfulJob(3000) } + testScheduler( + "COMPARE C Scheduling speed -- large job on a small cluster with advanced blacklist", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000", + "spark.scheduler.blacklist.advancedStrategy" -> "true" + ) + ) { + runSuccessfulJob(3000) + } testScheduler( "Scheduling speed -- large job on a super node", @@ -125,18 +134,22 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM Seq(200, 300, 400, 450, 500, 550).foreach { nodes => /* -ran 1 iterations in 12.9 s (12.9 s per itr) -[info] - COMPARE A: Scheduling speed -- large job on 200 node cluster (13 seconds, 861 milliseconds) -ran 1 iterations in 25.0 s (25.0 s per itr) -[info] - COMPARE A: Scheduling speed -- large job on 300 node cluster (25 seconds, 50 milliseconds) -ran 1 iterations in 34.6 s (34.6 s per itr) -[info] - COMPARE A: Scheduling speed -- large job on 400 node cluster (34 seconds, 668 milliseconds) -ran 1 iterations in 54.0 s (54.0 s per itr) -[info] - COMPARE A: Scheduling speed -- large job on 450 node cluster (53 seconds, 991 milliseconds) -ran 1 iterations in 1.8 m (1.8 m per itr) -[info] - COMPARE A: Scheduling speed -- large job on 500 node cluster (1 minute, 48 seconds) -ran 1 iterations in 2.3 m (2.3 m per itr) -[info] - COMPARE A: Scheduling speed -- large job on 550 node cluster (2 minutes, 19 seconds) + ran 1 iterations in 12.9 s (12.9 s per itr) + [info] - COMPARE A: Scheduling speed -- large job on 200 node cluster (13 seconds, 861 + milliseconds) + ran 1 iterations in 25.0 s (25.0 s per itr) + [info] - COMPARE A: Scheduling speed -- large job on 300 node cluster (25 seconds, 50 + milliseconds) + ran 1 iterations in 34.6 s (34.6 s per itr) + [info] - COMPARE A: Scheduling speed -- large job on 400 node cluster (34 seconds, + 668 milliseconds) + ran 1 iterations in 54.0 s (54.0 s per itr) + [info] - COMPARE A: Scheduling speed -- large job on 450 node cluster (53 seconds, + 991 milliseconds) + ran 1 iterations in 1.8 m (1.8 m per itr) + [info] - COMPARE A: Scheduling speed -- large job on 500 node cluster (1 minute, 48 seconds) + ran 1 iterations in 2.3 m (2.3 m per itr) + [info] - COMPARE A: Scheduling speed -- large job on 550 node 
cluster (2 minutes, 19 seconds) */ testScheduler( s"COMPARE A: Scheduling speed -- large job on ${nodes} node cluster", @@ -149,15 +162,15 @@ ran 1 iterations in 2.3 m (2.3 m per itr) } /* -nHosts = 400; nExecutorsPerHost = 1; nCores = 800 -ran 2 iterations in 11.7 s (5.9 s per itr) -[info] - COMPARE B: Lots of nodes (12 seconds, 679 milliseconds) -nHosts = 1; nExecutorsPerHost = 400; nCores = 800 -ran 3 iterations in 14.2 s (4.7 s per itr) -[info] - COMPARE B: Lots of executors, one node (14 seconds, 290 milliseconds) -nHosts = 1; nExecutorsPerHost = 1; nCores = 800 -ran 3 iterations in 11.0 s (3.7 s per itr) -[info] - COMPARE B: Super executor (11 seconds, 6 milliseconds) + nHosts = 400; nExecutorsPerHost = 1; nCores = 800 + ran 2 iterations in 11.7 s (5.9 s per itr) + [info] - COMPARE B: Lots of nodes (12 seconds, 679 milliseconds) + nHosts = 1; nExecutorsPerHost = 400; nCores = 800 + ran 3 iterations in 14.2 s (4.7 s per itr) + [info] - COMPARE B: Lots of executors, one node (14 seconds, 290 milliseconds) + nHosts = 1; nExecutorsPerHost = 1; nCores = 800 + ran 3 iterations in 11.0 s (3.7 s per itr) + [info] - COMPARE B: Super executor (11 seconds, 6 milliseconds) */ testScheduler( s"COMPARE B: Lots of nodes", @@ -224,11 +237,31 @@ ran 3 iterations in 11.0 s (3.7 s per itr) // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess // which is the only executor which has a free slot? Bingo, the one it just failed on testScheduler( - "bad execs, no blacklist", + "bad execs with blacklist", extraConfs = Seq( "spark.scheduler.executorTaskBlacklistTime" -> "10000000" ) ) { runBadExecJob(3000, badExecs, badHosts) } + + testScheduler( + "COMPARE D bad execs with advanced blacklist", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000", + "spark.scheduler.blacklist.advancedStrategy" -> "true" + ) + ) { + runBadExecJob(3000, badExecs, badHosts) + } + + testScheduler( + "COMPARE D bad execs with simple blacklist", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000", + "spark.scheduler.blacklist.advancedStrategy" -> "false" + ) + ) { + runBadExecJob(3000, badExecs, badHosts) + } } From d46c65d90a8cb6dae0160029d7c8e4e035682d56 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Fri, 20 May 2016 16:45:32 -0500 Subject: [PATCH 22/35] smaller demo of performance difference --- .../apache/spark/scheduler/BlacklistTracker.scala | 15 +++++++++++---- .../scheduler/SchedulerPerformanceSuite.scala | 4 ++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 5fe77bb0c0ba8..54a71b017950c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -22,6 +22,7 @@ import java.util.concurrent.TimeUnit import scala.collection.mutable import org.apache.spark.SparkConf +import org.apache.spark.internal.Logging import org.apache.spark.util.Clock import org.apache.spark.util.SystemClock import org.apache.spark.util.ThreadUtils @@ -37,7 +38,7 @@ import org.apache.spark.util.Utils */ private[spark] class BlacklistTracker( sparkConf: SparkConf, - clock: Clock = new SystemClock()) extends BlacklistCache{ + clock: Clock = new SystemClock()) extends BlacklistCache with Logging { // maintain a ExecutorId --> FailureStatus HashMap private val executorIdToFailureStatus: 
mutable.HashMap[String, FailureStatus] = mutable.HashMap() @@ -69,6 +70,7 @@ private[spark] class BlacklistTracker( // The actual implementation is delegated to strategy private[scheduler] def expireExecutorsInBlackList(): Unit = synchronized { val updated = strategy.expireExecutorsInBlackList(executorIdToFailureStatus, clock) + logInfo(s"Checked for expired blacklist: ${updated}") if (updated) { invalidateCache() } @@ -76,12 +78,17 @@ private[spark] class BlacklistTracker( // The actual implementation is delegated to strategy def executorBlacklist( - sched: TaskSchedulerImpl, stageId: Int, partition: Int): Set[String] = synchronized { + sched: TaskSchedulerImpl, + stageId: Int, + partition: Int): Set[String] = synchronized { + // note that this is NOT only called from the dag scheduler event loop val atomTask = StageAndPartition(stageId, partition) if (!isBlacklistExecutorCacheValid) { reEvaluateExecutorBlacklistAndUpdateCache(sched, atomTask, clock) } else { +// getExecutorBlacklistFromCache(atomTask).getOrElse(Set.empty[String]) getExecutorBlacklistFromCache(atomTask).getOrElse { + // TODO Why is this necessary? reEvaluateExecutorBlacklistAndUpdateCache(sched, atomTask, clock) } } @@ -200,8 +207,7 @@ private[spark] class BlacklistTracker( /** * Hide cache details in this trait to make code clean and avoid operation mistake */ -private[scheduler] trait BlacklistCache { - +private[scheduler] trait BlacklistCache extends Logging { // local cache to minimize the the work when query blacklisted executor and node private val blacklistExecutorCache = mutable.HashMap.empty[StageAndPartition, Set[String]] private val blacklistNodeCache = mutable.Set.empty[String] @@ -249,6 +255,7 @@ private[scheduler] trait BlacklistCache { } protected def invalidateCache(): Unit = cacheLock.synchronized { + logInfo("invalidatinig blacklist cache") _isBlacklistExecutorCacheValid = false _isBlacklistNodeCacheValid = false _isBlacklistNodeForStageCacheValid = false diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 3b437c728ae70..ede52d09b39db 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -252,7 +252,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM "spark.scheduler.blacklist.advancedStrategy" -> "true" ) ) { - runBadExecJob(3000, badExecs, badHosts) + runBadExecJob(50, badExecs, badHosts) } testScheduler( @@ -262,6 +262,6 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM "spark.scheduler.blacklist.advancedStrategy" -> "false" ) ) { - runBadExecJob(3000, badExecs, badHosts) + runBadExecJob(50, badExecs, badHosts) } } From a394ab72d90207529a74debd4d9113eca60a5838 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Mon, 23 May 2016 09:10:02 -0500 Subject: [PATCH 23/35] labels --- .../apache/spark/scheduler/SchedulerPerformanceSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index ede52d09b39db..713ecc97b4e68 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -102,7 +102,7 @@ 
class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } testScheduler( - "Scheduling speed -- large job on a super node", + "COMPARE A Scheduling speed -- large job on a super node", extraConfs = Seq( "spark.testing.nHosts" -> "1", "spark.testing.nExecutorsPerHost" -> "1", @@ -114,7 +114,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM testScheduler( // 4 execs per node, 2 cores per exec, so 400 cores - "Scheduling speed -- large job on 50 node cluster", + "COMPARE A Scheduling speed -- large job on 50 node cluster", extraConfs = Seq( "spark.testing.nHosts" -> "50" ) @@ -124,7 +124,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM testScheduler( // 4 execs per node, 2 cores per exec, so 800 cores - "Scheduling speed -- large job on 100 node cluster", + "COMPARE A Scheduling speed -- large job on 100 node cluster", extraConfs = Seq( "spark.testing.nHosts" -> "100" ) From f4609da04f9bc3d50506c2e89210cfa5100b4c1d Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Mon, 23 May 2016 12:08:59 -0500 Subject: [PATCH 24/35] wip -- some instrumentation, easier repro of slowdown --- .../spark/scheduler/BlacklistStrategy.scala | 6 +++++ .../spark/scheduler/BlacklistTracker.scala | 10 +++++++- .../scheduler/SchedulerIntegrationSuite.scala | 13 ++++++---- .../scheduler/SchedulerPerformanceSuite.scala | 24 ++++++++----------- 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala index edaeb658d0822..7d19ff54e9a09 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala @@ -24,6 +24,8 @@ import org.apache.spark.util.Clock /** * The interface to determine executor blacklist and node blacklist. 
+ * + * TODO notes on thread-safety */ private[scheduler] trait BlacklistStrategy { /** Define a time interval to expire failure information of executors */ @@ -81,10 +83,12 @@ private[scheduler] trait BlacklistStrategy { */ private[scheduler] class SingleTaskStrategy( val expireTimeInMilliseconds: Long) extends BlacklistStrategy { + var executorBlacklistCallCount = 0 def getExecutorBlacklist( executorIdToFailureStatus: mutable.HashMap[String, FailureStatus], atomTask: StageAndPartition, clock: Clock): Set[String] = { + executorBlacklistCallCount += 1 executorIdToFailureStatus.filter{ case (_, failureStatus) => failureStatus.numFailuresPerTask.keySet.contains(atomTask) && clock.getTimeMillis() - failureStatus.updatedTime < expireTimeInMilliseconds @@ -104,10 +108,12 @@ private[scheduler] class SingleTaskStrategy( private[scheduler] class AdvancedSingleTaskStrategy( expireTimeInMilliseconds: Long) extends SingleTaskStrategy(expireTimeInMilliseconds) { + var nodeBlacklistCallCount = 0 override def getNodeBlacklistForStage( executorIdToFailureStatus: mutable.HashMap[String, FailureStatus], stageId: Int, clock: Clock): Set[String] = { + nodeBlacklistCallCount += 1 val nodes = executorIdToFailureStatus.filter{ case (_, failureStatus) => failureStatus.numFailuresPerTask.keySet.map(_.stageId).contains(stageId) && diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 54a71b017950c..d1a058495a921 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -65,6 +65,14 @@ private[spark] class BlacklistTracker( def stop(): Unit = { scheduler.shutdown() scheduler.awaitTermination(10, TimeUnit.SECONDS) + logDebug(s"Executor Blacklist callcount =" + + s" ${strategy.asInstanceOf[SingleTaskStrategy].executorBlacklistCallCount}") + strategy match { + case as: AdvancedSingleTaskStrategy => + logDebug(s"Node Blacklist callcount =" + + s" ${as.nodeBlacklistCallCount}") + case _ => // no op + } } // The actual implementation is delegated to strategy @@ -255,7 +263,7 @@ private[scheduler] trait BlacklistCache extends Logging { } protected def invalidateCache(): Unit = cacheLock.synchronized { - logInfo("invalidatinig blacklist cache") + logInfo("invalidating blacklist cache") _isBlacklistExecutorCacheValid = false _isBlacklistNodeCacheValid = false _isBlacklistNodeForStageCacheValid = false diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 02aa5caa731ff..3015dbe30379f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -131,8 +131,13 @@ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends Spa // When a job fails, we terminate before waiting for all the task end events to come in, // so there might still be a running task set. So we only check these conditions // when the job succeeds - assert(taskScheduler.runningTaskSets.isEmpty) - assert(!backend.hasTasks) + if (taskScheduler.runningTaskSets.nonEmpty) { + fail(s"taskScheduler still has running taskSets: ${taskScheduler.runningTaskSets}") + } + if (backend.hasTasks) { + fail(s"backend still has tasks. 
Waiting to run: ${backend.assignedTasksWaitingToRun}; " + + s"running : ${backend.runningTasks}") + } } assert(scheduler.activeJobs.isEmpty) } @@ -264,9 +269,9 @@ private[spark] abstract class MockBackend( } // protected by this - private val assignedTasksWaitingToRun = new ArrayBuffer[TaskDescription](10000) + val assignedTasksWaitingToRun = new ArrayBuffer[TaskDescription](10000) // protected by this - private val runningTasks = ArrayBuffer[TaskDescription]() + val runningTasks = ArrayBuffer[TaskDescription]() def hasTasks: Boolean = synchronized { assignedTasksWaitingToRun.nonEmpty || runningTasks.nonEmpty diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 713ecc97b4e68..7294dec0ed239 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -57,7 +57,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM var itrs = 0 val totalMs = withBackend(backend) { val start = System.currentTimeMillis() - while (System.currentTimeMillis() - start < 10000 ) { + while (System.currentTimeMillis() - start < 30000 ) { withClue(s"failure in iteration = $itrs") { val jobFuture = submit(simpleWorkload(N), (0 until N).toArray) // Note: Do not call Await.ready(future) because that calls `scala.concurrent.blocking`, @@ -236,32 +236,28 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM // note this is *very* unlikely to succeed without blacklisting, even though its only // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess // which is the only executor which has a free slot? 
Bingo, the one it just failed on - testScheduler( - "bad execs with blacklist", - extraConfs = Seq( - "spark.scheduler.executorTaskBlacklistTime" -> "10000000" - ) - ) { - runBadExecJob(3000, badExecs, badHosts) - } - testScheduler( "COMPARE D bad execs with advanced blacklist", extraConfs = Seq( "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "true" + "spark.scheduler.blacklist.advancedStrategy" -> "true", + "spark.testing.nHosts" -> "2", + "spark.testing.nExecutorsPerHost" -> "2" ) ) { - runBadExecJob(50, badExecs, badHosts) + runBadExecJob(100, badExecs, badHosts) } testScheduler( "COMPARE D bad execs with simple blacklist", extraConfs = Seq( "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "false" + "spark.scheduler.blacklist.advancedStrategy" -> "false", + "spark.testing.nHosts" -> "2", + "spark.testing.nExecutorsPerHost" -> "2" ) ) { - runBadExecJob(50, badExecs, badHosts) + runBadExecJob(100, badExecs, badHosts) } + } From e852e0c41666e6bb126ecd9dc79f5a08cd2ca4f6 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Mon, 23 May 2016 17:11:09 -0500 Subject: [PATCH 25/35] notes mostly --- .../spark/scheduler/BlacklistStrategy.scala | 12 ++++++++++-- .../spark/scheduler/BlacklistTracker.scala | 6 ++++-- .../scheduler/SchedulerPerformanceSuite.scala | 18 +++++++++++++++--- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala index 7d19ff54e9a09..8c690b6e3223b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala @@ -83,7 +83,7 @@ private[scheduler] trait BlacklistStrategy { */ private[scheduler] class SingleTaskStrategy( val expireTimeInMilliseconds: Long) extends BlacklistStrategy { - var executorBlacklistCallCount = 0 + var executorBlacklistCallCount = 0L def getExecutorBlacklist( executorIdToFailureStatus: mutable.HashMap[String, FailureStatus], atomTask: StageAndPartition, @@ -108,18 +108,26 @@ private[scheduler] class SingleTaskStrategy( private[scheduler] class AdvancedSingleTaskStrategy( expireTimeInMilliseconds: Long) extends SingleTaskStrategy(expireTimeInMilliseconds) { - var nodeBlacklistCallCount = 0 + var nodeBlacklistCallCount = 0L override def getNodeBlacklistForStage( executorIdToFailureStatus: mutable.HashMap[String, FailureStatus], stageId: Int, clock: Clock): Set[String] = { nodeBlacklistCallCount += 1 + // when there is one bad node (or executor), this is really slow. We pile up a ton of + // task failures, and we've got to iterate through failure data for each task. Furthermore, + // since we don't actively blacklist the bad node / executor, we just keep assigning it more + // tasks that fail. And after each failure, we invalidate our cache, which means we need + // to call this again. + // This can be particularly painful when the failures are fast, since its likely the only + // executor with free slots is the one which just failed some tasks, which just keep going ... 
val nodes = executorIdToFailureStatus.filter{ case (_, failureStatus) => failureStatus.numFailuresPerTask.keySet.map(_.stageId).contains(stageId) && clock.getTimeMillis() - failureStatus.updatedTime < expireTimeInMilliseconds }.values.map(_.host) getDuplicateElem(nodes, 1) + super.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) } override def getNodeBlacklist( diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index d1a058495a921..792964cfc9d92 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -65,11 +65,11 @@ private[spark] class BlacklistTracker( def stop(): Unit = { scheduler.shutdown() scheduler.awaitTermination(10, TimeUnit.SECONDS) - logDebug(s"Executor Blacklist callcount =" + + logInfo(s"Executor Blacklist callcount =" + s" ${strategy.asInstanceOf[SingleTaskStrategy].executorBlacklistCallCount}") strategy match { case as: AdvancedSingleTaskStrategy => - logDebug(s"Node Blacklist callcount =" + + logInfo(s"Node Blacklist callcount =" + s" ${as.nodeBlacklistCallCount}") case _ => // no op } @@ -201,12 +201,14 @@ private[spark] class BlacklistTracker( clock: Clock): Set[String] = { val executors = executorsOnBlacklistedNode(sched, atomTask) ++ strategy.getExecutorBlacklist(executorIdToFailureStatus, atomTask, clock) + logInfo(s"Blacklisting executors ${executors} for task ${atomTask}") updateBlacklistExecutorCache(atomTask, executors) executors } private def reEvaluateNodeBlacklistForStageAndUpdateCache(stageId: Int): Set[String] = { val nodes = strategy.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) + logInfo(s"Blacklisting nodes ${nodes} for stage ${stageId}") updateBlacklistNodeCache(nodes) nodes } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 7294dec0ed239..85faf114e27a0 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -236,6 +236,18 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM // note this is *very* unlikely to succeed without blacklisting, even though its only // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess // which is the only executor which has a free slot? 
Bingo, the one it just failed on + testScheduler( + "COMPARE D bad execs with simple blacklist", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000", + "spark.scheduler.blacklist.advancedStrategy" -> "false", + "spark.testing.nHosts" -> "2", + "spark.testing.nExecutorsPerHost" -> "2" + ) + ) { + runBadExecJob(100, badExecs, badHosts) + } + testScheduler( "COMPARE D bad execs with advanced blacklist", extraConfs = Seq( @@ -249,15 +261,15 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } testScheduler( - "COMPARE D bad execs with simple blacklist", + "COMPARE D bad host with advanced blacklist", extraConfs = Seq( "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "false", + "spark.scheduler.blacklist.advancedStrategy" -> "true", "spark.testing.nHosts" -> "2", "spark.testing.nExecutorsPerHost" -> "2" ) ) { - runBadExecJob(100, badExecs, badHosts) + runBadExecJob(100, badExecs, Set("host-0")) } } From 8b78d3f83328e24b86af65ee63f446ef7ebf4047 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Mon, 23 May 2016 21:28:42 -0500 Subject: [PATCH 26/35] more notes --- .../scheduler/SchedulerPerformanceSuite.scala | 116 +++++++++++++----- 1 file changed, 82 insertions(+), 34 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 85faf114e27a0..4956cb0efb911 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -57,8 +57,10 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM var itrs = 0 val totalMs = withBackend(backend) { val start = System.currentTimeMillis() - while (System.currentTimeMillis() - start < 30000 ) { + while (System.currentTimeMillis() - start < 10000 ) { +// while (System.currentTimeMillis() - start < 10000 && itrs == 0) { withClue(s"failure in iteration = $itrs") { + val itrStart = System.currentTimeMillis() val jobFuture = submit(simpleWorkload(N), (0 until N).toArray) // Note: Do not call Await.ready(future) because that calls `scala.concurrent.blocking`, // which causes concurrent SQL executions to fail if a fork-join pool is used. Note that @@ -66,6 +68,10 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM // safe to pass in null here. For more detail, see SPARK-13747. val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait] jobFuture.ready(Duration.Inf)(awaitPermission) + // scalastyle:off println + println(s"Iteration $itrs finished in" + + s" ${Utils.msDurationToString(System.currentTimeMillis() - itrStart)}") + // scalastyle:on println assertDataStructuresEmpty(noFailure = true) itrs += 1 } @@ -231,45 +237,87 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } val badExecs = (0 until 2).map{_.toString}.toSet - val badHosts = Set[String]() // note this is *very* unlikely to succeed without blacklisting, even though its only // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess // which is the only executor which has a free slot? 
Bingo, the one it just failed on - testScheduler( - "COMPARE D bad execs with simple blacklist", - extraConfs = Seq( - "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "false", - "spark.testing.nHosts" -> "2", - "spark.testing.nExecutorsPerHost" -> "2" - ) - ) { - runBadExecJob(100, badExecs, badHosts) + Seq( + ("bad execs with simple blacklist", "false", Set[String]()), + ("bad execs with advanced blacklist", "true", Set[String]()), + ("bad hosts with advanced blacklist", "true", Set[String]("host-0")) + ).foreach { case (name, strategy, badHosts) => + testScheduler( + s"COMPARE D $name", + extraConfs = Seq( + "spark.scheduler.executorTaskBlacklistTime" -> "10000000", + "spark.scheduler.blacklist.advancedStrategy" -> strategy + ) + ) { + runBadExecJob(3000, badExecs, badHosts) + } } - testScheduler( - "COMPARE D bad execs with advanced blacklist", - extraConfs = Seq( - "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "true", - "spark.testing.nHosts" -> "2", - "spark.testing.nExecutorsPerHost" -> "2" - ) - ) { - runBadExecJob(100, badExecs, badHosts) - } - testScheduler( - "COMPARE D bad host with advanced blacklist", - extraConfs = Seq( - "spark.scheduler.executorTaskBlacklistTime" -> "10000000", - "spark.scheduler.blacklist.advancedStrategy" -> "true", - "spark.testing.nHosts" -> "2", - "spark.testing.nExecutorsPerHost" -> "2" - ) - ) { - runBadExecJob(100, badExecs, Set("host-0")) - } + /* + Here's how you can get into really slow scheduling, even with the simple blacklist. Say there + is just one bad executor. You've got a bunch of tasks to run, and you schedule all available + slots. Then one task fails on your bad executor. You don't re-schedule that task on the bad + executor, but you do think you've got one open slot, so you try to find the next task you can + schedule. Since you've got a massive backlog of tasks, you just take the next task and schedule + it on your bad executor. The task fails again. + + This repeats a while, and now you've gone through and failed a bunch of tasks on this one bad + executor. But each time, you clear the cache of invalid executors, so you do a bunch of work + to recompute the set of OK executors. This is *really* expensive, and doesn't help you at all + anyway. 
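  To make the cost concrete, here is a tiny standalone model of that loop. This is an
  illustration only, not Spark code -- the names are made up, and the real
  BlacklistTracker / TaskSchedulerImpl interaction has locking and offer shuffling on top
  of this -- but it shows why N fast failures end up costing roughly O(N^2) work when every
  failure clears the cache and every offer rebuilds it by scanning all failure records:

    object BlacklistThrashDemo {
      def main(args: Array[String]): Unit = {
        // (executorId, partition) failure records, newest first
        var failures = List.empty[(String, Int)]
        var cachedBlacklist: Option[Set[String]] = None
        var rebuilds = 0
        var recordsScanned = 0L

        // stand-in for reEvaluateExecutorBlacklistAndUpdateCache: full scan of all failures
        def executorBlacklist(): Set[String] = cachedBlacklist.getOrElse {
          rebuilds += 1
          recordsScanned += failures.size
          val bl = failures.map(_._1).toSet
          cachedBlacklist = Some(bl)
          bl
        }

        (0 until 1000).foreach { partition =>
          executorBlacklist()                             // blacklist consulted on each offer
          failures = ("bad-exec", partition) :: failures  // task fails fast on the one bad exec
          cachedBlacklist = None                          // invalidateCache() after every failure
        }
        println(s"$rebuilds rebuilds, $recordsScanned records scanned for 1000 failures")
      }
    }

  The log excerpt below shows the same pattern playing out in the actual suite: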
+ + + +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,38) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO TaskSetManager: Starting task 38.0 in stage 8.0 (TID 21056, host-2, partition 38, PROCESS_LOCAL, 5112 bytes) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,39) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO TaskSetManager: Starting task 39.0 in stage 8.0 (TID 21057, host-0, partition 39, PROCESS_LOCAL, 5112 bytes) +16/05/23 20:53:57.871 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.871 mock backend thread INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,40) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO DAGScheduler: ShuffleMapStage 5 (RDD at SchedulerIntegrationSuite.scala:360) finished in 1.731 s +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO DAGScheduler: looking for newly runnable stages +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO DAGScheduler: running: Set(ShuffleMapStage 8) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO DAGScheduler: waiting: Set(ResultStage 9, ShuffleMapStage 6) +16/05/23 20:53:57.871 dag-scheduler-event-loop INFO DAGScheduler: failed: Set() +16/05/23 20:53:57.872 mock backend thread INFO TaskSetManager: Starting task 40.0 in stage 8.0 (TID 21058, host-0, partition 40, PROCESS_LOCAL, 5112 bytes) +16/05/23 20:53:57.872 task-result-getter-2 WARN TaskSetManager: Lost task 39.0 in stage 8.0 (TID 21057, host-0): java.lang.RuntimeException: bad exec 1 + at org.apache.spark.scheduler.SchedulerPerformanceSuite.backendWithBadExecs(SchedulerPerformanceSuite.scala:218) + at org.apache.spark.scheduler.SchedulerPerformanceSuite$$anonfun$runBadExecJob$1.apply$mcV$sp(SchedulerPerformanceSuite.scala:236) + at org.apache.spark.scheduler.SchedulerIntegrationSuite$$anon$2.run(SchedulerIntegrationSuite.scala:194) + +16/05/23 20:53:57.872 task-result-getter-2 INFO BlacklistTracker: invalidating blacklist cache +16/05/23 20:53:57.872 dag-scheduler-event-loop INFO DAGScheduler: Submitting ShuffleMapStage 6 (MockRDD 5), which has no missing parents +16/05/23 20:53:57.872 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.872 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,39) +16/05/23 20:53:57.872 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.872 mock backend thread INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,41) +16/05/23 20:53:57.872 mock backend thread INFO TaskSetManager: Starting task 41.0 in stage 8.0 (TID 21059, host-0, partition 41, PROCESS_LOCAL, 5112 bytes) +16/05/23 20:53:57.872 task-result-getter-3 INFO TaskSetManager: Lost task 40.0 in stage 8.0 (TID 21058) on executor host-0: java.lang.RuntimeException (bad exec 1) [duplicate 1] +16/05/23 20:53:57.872 task-result-getter-3 INFO BlacklistTracker: invalidating blacklist cache +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,40) +16/05/23 20:53:57.873 mock backend thread 
INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,39) +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,42) +16/05/23 20:53:57.873 mock backend thread INFO TaskSetManager: Starting task 42.0 in stage 8.0 (TID 21060, host-0, partition 42, PROCESS_LOCAL, 5112 bytes) +16/05/23 20:53:57.873 task-result-getter-1 INFO TaskSetManager: Lost task 41.0 in stage 8.0 (TID 21059) on executor host-0: java.lang.RuntimeException (bad exec 1) [duplicate 2] +16/05/23 20:53:57.873 task-result-getter-1 INFO BlacklistTracker: invalidating blacklist cache +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,41) +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,40) +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set(1) for task StageAndPartition(8,39) +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 +16/05/23 20:53:57.873 mock backend thread INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,43) + + */ } From 883bfd7dd219998bc7a806c6d8dc8a50cd1f6a1a Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 24 May 2016 15:56:26 -0500 Subject: [PATCH 27/35] fix race condition w/ runningTaskSets --- .../spark/scheduler/SchedulerIntegrationSuite.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 3015dbe30379f..718963da7dc17 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -418,15 +418,20 @@ private class MockExternalClusterManager extends ExternalClusterManager { /** TaskSchedulerImpl that just tracks a tiny bit more state to enable checks in tests. */ class TestTaskScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) { /** Set of TaskSets the DAGScheduler has requested executed. 
*/ + // protected by this val runningTaskSets = HashSet[TaskSet]() override def submitTasks(taskSet: TaskSet): Unit = { - runningTaskSets += taskSet + synchronized { + runningTaskSets += taskSet + } super.submitTasks(taskSet) } override def taskSetFinished(manager: TaskSetManager): Unit = { - runningTaskSets -= manager.taskSet + synchronized { + runningTaskSets -= manager.taskSet + } super.taskSetFinished(manager) } } From 4358b2fdb022ed0b84668bbe8f5b8aa56c8ce637 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 24 May 2016 16:56:12 -0500 Subject: [PATCH 28/35] updated logging --- .../org/apache/spark/scheduler/BlacklistTracker.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 792964cfc9d92..e51f2b4ebb274 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -199,16 +199,15 @@ private[spark] class BlacklistTracker( sched: TaskSchedulerImpl, atomTask: StageAndPartition, clock: Clock): Set[String] = { + // TODO some kind of logging when the blacklist is *updated* val executors = executorsOnBlacklistedNode(sched, atomTask) ++ strategy.getExecutorBlacklist(executorIdToFailureStatus, atomTask, clock) - logInfo(s"Blacklisting executors ${executors} for task ${atomTask}") updateBlacklistExecutorCache(atomTask, executors) executors } private def reEvaluateNodeBlacklistForStageAndUpdateCache(stageId: Int): Set[String] = { val nodes = strategy.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) - logInfo(s"Blacklisting nodes ${nodes} for stage ${stageId}") updateBlacklistNodeCache(nodes) nodes } @@ -257,6 +256,10 @@ private[scheduler] trait BlacklistCache extends Logging { protected def updateBlacklistNodeForStageCache( stageId: Int, blacklistNode: Set[String]): Unit = cacheLock.synchronized { + val wasBlacklisted = blacklistNodeForStageCache.getOrElse(stageId, Set.empty[String]) + if (wasBlacklisted != blacklistNode) { + logInfo(s"Updating node blacklist for Stage ${stageId} to ${blacklistNode}") + } if (!_isBlacklistNodeForStageCacheValid) { blacklistNodeForStageCache.clear() } From f850a300f9cbb0ab951ccc83e1b36767a252962f Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 24 May 2016 17:32:29 -0500 Subject: [PATCH 29/35] log executor in addition to host --- .../org/apache/spark/scheduler/TaskSetManager.scala | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index bd74eef10e485..1a716ff901d86 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -462,8 +462,8 @@ private[spark] class TaskSetManager( // a good proxy to task serialization time. 
// val timeTaken = clock.getTime() - startTime val taskName = s"task ${info.id} in stage ${taskSet.id}" - logInfo(s"Starting $taskName (TID $taskId, $host, partition ${task.partitionId}," + - s" $taskLocality, ${serializedTask.limit} bytes)") + logInfo(s"Starting $taskName (TID $taskId, $host, exec ${info.executorId}, " + + s"partition ${task.partitionId},$taskLocality, ${serializedTask.limit} bytes)") sched.dagScheduler.taskStarted(task, info) return Some(new TaskDescription(taskId = taskId, attemptNumber = attemptNum, execId, @@ -603,8 +603,9 @@ private[spark] class TaskSetManager( sched.dagScheduler.taskEnded(tasks(index), Success, result.value(), result.accumUpdates, info) if (!successful(index)) { tasksSuccessful += 1 - logInfo("Finished task %s in stage %s (TID %d) in %d ms on %s (%d/%d)".format( - info.id, taskSet.id, info.taskId, info.duration, info.host, tasksSuccessful, numTasks)) + logInfo("Finished task %s in stage %s (TID %d) in %d ms on %s / exec %s (%d/%d)".format( + info.id, taskSet.id, info.taskId, info.duration, info.host, info.executorId, + tasksSuccessful, numTasks)) // Mark successful and stop if all the tasks have succeeded. successful(index) = true if (tasksSuccessful == numTasks) { @@ -635,8 +636,8 @@ private[spark] class TaskSetManager( val index = info.index copiesRunning(index) -= 1 var accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty - val failureReason = s"Lost task ${info.id} in stage ${taskSet.id} (TID $tid, ${info.host}): " + - reason.asInstanceOf[TaskFailedReason].toErrorString + val failureReason = s"Lost task ${info.id} in stage ${taskSet.id} (TID $tid, ${info.host}," + + s" exec ${info.executorId}): ${reason.asInstanceOf[TaskFailedReason].toErrorString}" val failureException: Option[Throwable] = reason match { case fetchFailed: FetchFailed => logWarning(failureReason) From 4ac99c6f43349ab6d40e9906a4ad316d85387b27 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 15:17:50 -0500 Subject: [PATCH 30/35] wip, logging and some logic updates --- .../apache/spark/scheduler/BlacklistStrategy.scala | 1 - .../org/apache/spark/scheduler/BlacklistTracker.scala | 11 +++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala index 8c690b6e3223b..b16fb642e65f0 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistStrategy.scala @@ -127,7 +127,6 @@ private[scheduler] class AdvancedSingleTaskStrategy( clock.getTimeMillis() - failureStatus.updatedTime < expireTimeInMilliseconds }.values.map(_.host) getDuplicateElem(nodes, 1) - super.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) } override def getNodeBlacklist( diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index e51f2b4ebb274..1cbd36b574243 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -96,7 +96,8 @@ private[spark] class BlacklistTracker( } else { // getExecutorBlacklistFromCache(atomTask).getOrElse(Set.empty[String]) getExecutorBlacklistFromCache(atomTask).getOrElse { - // TODO Why is this necessary? + // TODO Why is this necessary? 
(its because we clear the entire map on an invalidate, + // and lazily rebuild it) reEvaluateExecutorBlacklistAndUpdateCache(sched, atomTask, clock) } } @@ -191,8 +192,12 @@ private[spark] class BlacklistTracker( private def executorsOnBlacklistedNode( sched: TaskSchedulerImpl, atomTask: StageAndPartition): Set[String] = { - nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) + val nodeBl = nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) .getOrElse(Set.empty[String])) + if (nodeBl.nonEmpty) { + logInfo(s"${atomTask} is blacklisted on executors ${nodeBl} from node blacklist") + } + nodeBl } private def reEvaluateExecutorBlacklistAndUpdateCache( @@ -208,6 +213,7 @@ private[spark] class BlacklistTracker( private def reEvaluateNodeBlacklistForStageAndUpdateCache(stageId: Int): Set[String] = { val nodes = strategy.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) +// updateBlacklistNodeForStageCache(stageId, nodes) updateBlacklistNodeCache(nodes) nodes } @@ -256,6 +262,7 @@ private[scheduler] trait BlacklistCache extends Logging { protected def updateBlacklistNodeForStageCache( stageId: Int, blacklistNode: Set[String]): Unit = cacheLock.synchronized { + // TODO this needs to actually get called, and add unit test val wasBlacklisted = blacklistNodeForStageCache.getOrElse(stageId, Set.empty[String]) if (wasBlacklisted != blacklistNode) { logInfo(s"Updating node blacklist for Stage ${stageId} to ${blacklistNode}") From 6f02ded3730c85571713bb3b4c108a00850f7306 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 15:20:46 -0500 Subject: [PATCH 31/35] performance suite updates --- .../scheduler/SchedulerPerformanceSuite.scala | 154 +++++++++++++++--- 1 file changed, 130 insertions(+), 24 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 4956cb0efb911..7368bcac0e28a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -16,6 +16,8 @@ */ package org.apache.spark.scheduler +import java.util.concurrent.atomic.AtomicBoolean + import scala.concurrent.duration.Duration import org.apache.spark.util.Utils @@ -47,7 +49,11 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } } - def runJobWithBackend(N: Int, backend: () => Unit): Unit = { + def runJobWithBackend(N: Int, backendFunc: () => Unit): Unit = { + runJobWithCustomBackend(N, new SimpleWrappedBackend(backend, backendFunc)) + } + + def runJobWithCustomBackend(N: Int, backendWrapper: WrappedBackend): Unit = { // Try to run as many jobs as we can in 10 seconds, get the time per job. 
The idea here is to // balance: // 1) have a big enough job that we're not effected by delays just from waiting for job @@ -55,7 +61,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM // 2) run enough iterations to get some reliable data // 3) not wait toooooo long var itrs = 0 - val totalMs = withBackend(backend) { + val totalMs = backendWrapper.withBackend { val start = System.currentTimeMillis() while (System.currentTimeMillis() - start < 10000 ) { // while (System.currentTimeMillis() - start < 10000 && itrs == 0) { @@ -76,7 +82,7 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM itrs += 1 } } - System.currentTimeMillis() - start + (System.currentTimeMillis() - start) } val msPerItr = Utils.msDurationToString((totalMs.toDouble / itrs).toLong) @@ -209,31 +215,89 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM runSuccessfulJob(3000) } - def backendWithBadExecs(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { - val taskDescription = backend.beginTask() - val host = backend.executorIdToExecutor(taskDescription.executorId).host - val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet - val task = taskSet.tasks(taskDescription.index) - if (badExecs(taskDescription.executorId)) { - val exc = new RuntimeException(s"bad exec ${taskDescription.executorId}") - backend.taskFailed(taskDescription, exc) - } else if (badHosts(host)) { - val exc = new RuntimeException(s"bad host ${host}") - backend.taskFailed(taskDescription, exc) - } else { - // every 5th stage is a ResultStage -- the rest are ShuffleMapStages - (task.stageId, task.partitionId) match { - case (stage, _) if stage % 5 != 4 => - backend.taskSuccess(taskDescription, - DAGSchedulerSuite.makeMapStatus(host, N)) - case (_, _) => - backend.taskSuccess(taskDescription, 42) + def backendWithBadExecs( + continue: AtomicBoolean, + N: Int, + badExecs: Set[String], + badHosts: Set[String]): Unit = { + var tasksToFail = List[TaskDescription]() + var tasksToSucceed = List[TaskDescription]() + val FAILURES_TILL_SUCCESS = 100 // that is, we get a task failure 100 times as fast as success + val waitForSuccess = 100 + var failuresSinceLastSuccess = 0 + while (continue.get()) { + // don't *just* keep failing tasks on the same executor. While there are tasks to fail, + // we fail them more often, but we fail across all executors. Furthermore, after X failures, + // we do have a task success + + // first, queue up all the tasks needing to run + while (backend.hasTasksWaitingToRun) { + val taskDescription = backend.beginTask() + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + if (badExecs(taskDescription.executorId) || badHosts(host)) { + tasksToFail :+= taskDescription + } else { + tasksToSucceed :+= taskDescription + } + } + + // send a task result. 
Failure if there are any and we haven't had too many failures in a row + def failTask(): Unit = { + failuresSinceLastSuccess += 1 + val toFail = tasksToFail.head + tasksToFail = tasksToFail.tail + val host = backend.executorIdToExecutor(toFail.executorId).host + if (badExecs(toFail.executorId)) { + val exc = new RuntimeException(s"bad exec ${toFail.executorId}") + backend.taskFailed(toFail, exc) + } else if (badHosts(host)) { + val exc = new RuntimeException(s"bad host ${host}") + backend.taskFailed(toFail, exc) + } + } + if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { + failTask() + } else if (tasksToSucceed.nonEmpty) { + // we might get here just by some chance of thread-scheduling in this mock. Tasks fail, + // but the dag scheduler thread hasn't processed those before this thread tries to find + // another task to respond to. +// Thread.sleep(waitForSuccess) + if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { + failTask() + } else { + logInfo(s"tasksToFail.size = ${tasksToFail.size}; " + + s"tasksToSucceed.size = ${tasksToSucceed.size}; " + + s"failuresSinceLastSuccess = ${failuresSinceLastSuccess}") + failuresSinceLastSuccess = 0 + val taskDescription = tasksToSucceed.head + tasksToSucceed = tasksToSucceed.tail + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + // every 5th stage is a ResultStage -- the rest are ShuffleMapStages + (task.stageId, task.partitionId) match { + case (stage, _) if stage % 5 != 4 => + backend.taskSuccess(taskDescription, + DAGSchedulerSuite.makeMapStatus(host, N)) + case (_, _) => + backend.taskSuccess(taskDescription, 42) + } + } + } else { + Thread.sleep(10) // wait till we've got work to do } } } def runBadExecJob(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { - runJobWithBackend(N, () => backendWithBadExecs(N, badExecs, badHosts)) + val backendWrapper = new WrappedBackend(backend) { + override def runBackend(continue: AtomicBoolean): Unit = { + backendWithBadExecs(continue, N, badExecs, badHosts) + } + } + runJobWithCustomBackend(N, backendWrapper) } val badExecs = (0 until 2).map{_.toString}.toSet @@ -253,11 +317,20 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM "spark.scheduler.blacklist.advancedStrategy" -> strategy ) ) { + // scalastyle:off println + println(s"Bad execs = ${badExecs}") + // scalastyle:on println + + // because offers get shuffled, its a crapshoot whether or not the "bad" executor will finish + // tasks first. (A more complicated mock backend could make sure it fails the first executor + // it gets assigned) runBadExecJob(3000, badExecs, badHosts) } } + // scalastyle:off line.size.limit + /* Here's how you can get into really slow scheduling, even with the simple blacklist. Say there is just one bad executor. You've got a bunch of tasks to run, and you schedule all available @@ -272,7 +345,6 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM anyway. 
- 16/05/23 20:53:57.871 dag-scheduler-event-loop INFO BlacklistTracker: Blacklisting executors Set() for task StageAndPartition(8,38) 16/05/23 20:53:57.871 dag-scheduler-event-loop INFO TaskSetManager: Starting task 38.0 in stage 8.0 (TID 21056, host-2, partition 38, PROCESS_LOCAL, 5112 bytes) 16/05/23 20:53:57.871 dag-scheduler-event-loop INFO BlacklistTracker: Blacklisting nodes Set() for stage 8 @@ -320,4 +392,38 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM */ + // scalastyle:on line.size.limit + + abstract class WrappedBackend(backend: MockBackend) { + val backendContinue = new AtomicBoolean(true) + def runBackend(continue: AtomicBoolean): Unit + val backendThread = new Thread("mock backend thread") { + override def run(): Unit = { + runBackend(backendContinue) + } + } + + def withBackend[T](testBody: => T): T = { + try { + backendThread.start() + testBody + } finally { + backendContinue.set(false) + backendThread.join() + } + } + } + + class SimpleWrappedBackend(backend: MockBackend, backendFunc: () => Unit) + extends WrappedBackend(backend) { + override def runBackend(continue: AtomicBoolean): Unit = { + while (continue.get()) { + if (backend.hasTasksWaitingToRun) { + backendFunc() + } else { + Thread.sleep(10) + } + } + } + } } From 71f1b477eafe47bcaee513987fbd2e8d4a4d5358 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 22:53:45 -0500 Subject: [PATCH 32/35] optimization -- skip blacklisted executors earlier in scheduling loop --- .../org/apache/spark/scheduler/TaskSchedulerImpl.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 371fb8602f785..3b3dfa206a4e5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -248,10 +248,16 @@ private[spark] class TaskSchedulerImpl( availableCpus: Array[Int], tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = { var launchedTask = false + // TODO unit test, and also add executor-stage filtering as well + // This is an optimization -- the taskSet might contain a very long list of pending tasks. + // Rather than wasting time checking the offer against each task, and then realizing the + // executor is blacklisted, just filter out the bad executor immediately. 
+ val nodeBlacklist = taskSet.blacklistTracker.map{_.nodeBlacklistForStage(taskSet.stageId)} + .getOrElse(Set()) for (i <- 0 until shuffledOffers.size) { val execId = shuffledOffers(i).executorId val host = shuffledOffers(i).host - if (availableCpus(i) >= CPUS_PER_TASK) { + if (!nodeBlacklist(host) && availableCpus(i) >= CPUS_PER_TASK) { try { for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { tasks(i) += task From ffd0f252f012c3f5e12d6f1f500667700f1a5f65 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 22:55:41 -0500 Subject: [PATCH 33/35] bug fix -- update the right cache in nodeBlacklistForStage --- .../scala/org/apache/spark/scheduler/BlacklistTracker.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 1cbd36b574243..617c6ce8f9b80 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -213,8 +213,8 @@ private[spark] class BlacklistTracker( private def reEvaluateNodeBlacklistForStageAndUpdateCache(stageId: Int): Set[String] = { val nodes = strategy.getNodeBlacklistForStage(executorIdToFailureStatus, stageId, clock) -// updateBlacklistNodeForStageCache(stageId, nodes) - updateBlacklistNodeCache(nodes) + updateBlacklistNodeForStageCache(stageId, nodes) +// updateBlacklistNodeCache(nodes) nodes } } From 3effef6c17cc5c5e4c4385103c7d96320b015672 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 22:56:11 -0500 Subject: [PATCH 34/35] cleanup, TODOs --- .../org/apache/spark/scheduler/BlacklistTracker.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala index 617c6ce8f9b80..4ca0713880a64 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala @@ -116,6 +116,10 @@ private[spark] class BlacklistTracker( // The actual implementation is delegated to strategy def nodeBlacklistForStage(stageId: Int): Set[String] = synchronized { + // TODO here and elsewhere -- we invalidate the cache way too often. In general, we should + // be able to do an in-place update of the caches. 
(a) this is slow and (b) it makes + // it really hard to track when the blacklist actually changes (would be *really* nice to + // log a msg about node level blacklisting at least) if (isBlacklistNodeForStageCacheValid) { getNodeBlacklistForStageFromCache(stageId).getOrElse( reEvaluateNodeBlacklistForStageAndUpdateCache(stageId)) @@ -192,12 +196,8 @@ private[spark] class BlacklistTracker( private def executorsOnBlacklistedNode( sched: TaskSchedulerImpl, atomTask: StageAndPartition): Set[String] = { - val nodeBl = nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) + nodeBlacklistForStage(atomTask.stageId).flatMap(sched.getExecutorsAliveOnHost(_) .getOrElse(Set.empty[String])) - if (nodeBl.nonEmpty) { - logInfo(s"${atomTask} is blacklisted on executors ${nodeBl} from node blacklist") - } - nodeBl } private def reEvaluateExecutorBlacklistAndUpdateCache( From 456f578121801257bb90b0cbfbd9fa37a117961e Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 25 May 2016 22:57:01 -0500 Subject: [PATCH 35/35] process tasks in LIFO order for all performance tests, more cases, etc. --- .../scheduler/SchedulerPerformanceSuite.scala | 204 +++++++++--------- 1 file changed, 102 insertions(+), 102 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala index 7368bcac0e28a..515ce0a4d6e69 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerPerformanceSuite.scala @@ -33,26 +33,6 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM join(N, b, c) } - def goodBackend(N: Int): Unit = { - val taskDescription = backend.beginTask() - val host = backend.executorIdToExecutor(taskDescription.executorId).host - val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet - val task = taskSet.tasks(taskDescription.index) - - // every 5th stage is a ResultStage -- the rest are ShuffleMapStages - (task.stageId, task.partitionId) match { - case (stage, _) if stage % 5 != 4 => - backend.taskSuccess(taskDescription, - DAGSchedulerSuite.makeMapStatus(host, N)) - case (_, _) => - backend.taskSuccess(taskDescription, 42) - } - } - - def runJobWithBackend(N: Int, backendFunc: () => Unit): Unit = { - runJobWithCustomBackend(N, new SimpleWrappedBackend(backend, backendFunc)) - } - def runJobWithCustomBackend(N: Int, backendWrapper: WrappedBackend): Unit = { // Try to run as many jobs as we can in 10 seconds, get the time per job. 
The idea here is to // balance: @@ -92,7 +72,17 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM } def runSuccessfulJob(N: Int): Unit = { - runJobWithBackend(N, () => goodBackend(N)) + runJobWithCustomBackend(N, new QueuingWrappedBackend(backend) { + override def handleTask(taskDesc: TaskDescription, task: Task[_], host: String): Unit = { + // every 5th stage is a ResultStage -- the rest are ShuffleMapStages + (task.stageId, task.partitionId) match { + case (stage, _) if stage % 5 != 4 => + queueSuccess(taskDesc, DAGSchedulerSuite.makeMapStatus(host, N)) + case (_, _) => + queueSuccess(taskDesc, 42) + } + } + }) } testScheduler("Scheduling speed -- small job on a small cluster") { @@ -215,101 +205,44 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM runSuccessfulJob(3000) } - def backendWithBadExecs( - continue: AtomicBoolean, - N: Int, - badExecs: Set[String], - badHosts: Set[String]): Unit = { - var tasksToFail = List[TaskDescription]() - var tasksToSucceed = List[TaskDescription]() - val FAILURES_TILL_SUCCESS = 100 // that is, we get a task failure 100 times as fast as success - val waitForSuccess = 100 - var failuresSinceLastSuccess = 0 - while (continue.get()) { - // don't *just* keep failing tasks on the same executor. While there are tasks to fail, - // we fail them more often, but we fail across all executors. Furthermore, after X failures, - // we do have a task success - - // first, queue up all the tasks needing to run - while (backend.hasTasksWaitingToRun) { - val taskDescription = backend.beginTask() - val host = backend.executorIdToExecutor(taskDescription.executorId).host - val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet - val task = taskSet.tasks(taskDescription.index) - if (badExecs(taskDescription.executorId) || badHosts(host)) { - tasksToFail :+= taskDescription - } else { - tasksToSucceed :+= taskDescription - } - } - - // send a task result. Failure if there are any and we haven't had too many failures in a row - def failTask(): Unit = { - failuresSinceLastSuccess += 1 - val toFail = tasksToFail.head - tasksToFail = tasksToFail.tail - val host = backend.executorIdToExecutor(toFail.executorId).host - if (badExecs(toFail.executorId)) { - val exc = new RuntimeException(s"bad exec ${toFail.executorId}") - backend.taskFailed(toFail, exc) + def runBadExecJob(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { + val backendWrapper = new QueuingWrappedBackend(backend) { + override def handleTask(taskDesc: TaskDescription, task: Task[_], host: String): Unit = { + if (badExecs(taskDesc.executorId)) { + val exc = new RuntimeException(s"bad exec ${taskDesc.executorId}") + queueFailure(taskDesc, exc) } else if (badHosts(host)) { val exc = new RuntimeException(s"bad host ${host}") - backend.taskFailed(toFail, exc) - } - } - if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { - failTask() - } else if (tasksToSucceed.nonEmpty) { - // we might get here just by some chance of thread-scheduling in this mock. Tasks fail, - // but the dag scheduler thread hasn't processed those before this thread tries to find - // another task to respond to. 
-// Thread.sleep(waitForSuccess) - if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { - failTask() + queueFailure(taskDesc, exc) } else { - logInfo(s"tasksToFail.size = ${tasksToFail.size}; " + - s"tasksToSucceed.size = ${tasksToSucceed.size}; " + - s"failuresSinceLastSuccess = ${failuresSinceLastSuccess}") - failuresSinceLastSuccess = 0 - val taskDescription = tasksToSucceed.head - tasksToSucceed = tasksToSucceed.tail - val host = backend.executorIdToExecutor(taskDescription.executorId).host - val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet - val task = taskSet.tasks(taskDescription.index) // every 5th stage is a ResultStage -- the rest are ShuffleMapStages (task.stageId, task.partitionId) match { case (stage, _) if stage % 5 != 4 => - backend.taskSuccess(taskDescription, - DAGSchedulerSuite.makeMapStatus(host, N)) + queueSuccess(taskDesc, DAGSchedulerSuite.makeMapStatus(host, N)) case (_, _) => - backend.taskSuccess(taskDescription, 42) + queueSuccess(taskDesc, 42) } } - } else { - Thread.sleep(10) // wait till we've got work to do - } - } - } - - def runBadExecJob(N: Int, badExecs: Set[String], badHosts: Set[String]): Unit = { - val backendWrapper = new WrappedBackend(backend) { - override def runBackend(continue: AtomicBoolean): Unit = { - backendWithBadExecs(continue, N, badExecs, badHosts) } } runJobWithCustomBackend(N, backendWrapper) } - val badExecs = (0 until 2).map{_.toString}.toSet + val oneBadExec = Set("0") + // intentionally on different nodes, so they don't trigger node blacklist + val twoBadExecs = Set("0", "15") + // note this is *very* unlikely to succeed without blacklisting, even though its only // one bad executor out of 20. When a task fails, it gets requeued immediately -- and guess // which is the only executor which has a free slot? Bingo, the one it just failed on Seq( - ("bad execs with simple blacklist", "false", Set[String]()), - ("bad execs with advanced blacklist", "true", Set[String]()), - ("bad hosts with advanced blacklist", "true", Set[String]("host-0")) - ).foreach { case (name, strategy, badHosts) => + ("bad exec with simple blacklist", "false", oneBadExec, Set[String]()), + ("two bad execs with simple blacklist", "false", twoBadExecs, Set[String]()), + ("bad exec with advanced blacklist", "true", oneBadExec, Set[String]()), + ("bad host with advanced blacklist", "true", Set[String](), Set[String]("host-0")), + ("bad exec and host with advanced blacklist", "true", oneBadExec, Set[String]("host-3")) + ).foreach { case (name, strategy, badExecs, badHosts) => testScheduler( s"COMPARE D $name", extraConfs = Seq( @@ -394,6 +327,18 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM // scalastyle:on line.size.limit + + /* + RESULTS + + On a happy cluster, speed is about the same in all modes, ~5s per iteration + + On a bad cluster, slow in all versions, about 2m per iteration (original code, and new code with + various strategies). the reason is that we waste soooooo long looping all tasks through + the bad nodes, and that has one n^2 penalty. 
+ + */ + abstract class WrappedBackend(backend: MockBackend) { val backendContinue = new AtomicBoolean(true) def runBackend(continue: AtomicBoolean): Unit @@ -412,16 +357,71 @@ class SchedulerPerformanceSuite extends SchedulerIntegrationSuite[MultiExecutorM backendThread.join() } } + } - class SimpleWrappedBackend(backend: MockBackend, backendFunc: () => Unit) - extends WrappedBackend(backend) { + abstract class QueuingWrappedBackend(backend: MockBackend) extends WrappedBackend(backend) { + var tasksToFail = List[(TaskDescription, Exception)]() + var tasksToSucceed = List[(TaskDescription, Any)]() + val FAILURES_TILL_SUCCESS = 100 + // that is, we get a task failure 100 times as fast as success + val waitForSuccess = 100 + var failuresSinceLastSuccess = 0 + + def handleTask(taskDesc: TaskDescription, task: Task[_], host: String): Unit + + def queueSuccess(taskDesc: TaskDescription, result: Any): Unit = { + tasksToSucceed :+= taskDesc -> result + } + + def queueFailure(taskDesc: TaskDescription, exc: Exception): Unit = { + tasksToFail :+= taskDesc -> exc + } + override def runBackend(continue: AtomicBoolean): Unit = { while (continue.get()) { - if (backend.hasTasksWaitingToRun) { - backendFunc() + // don't *just* keep failing tasks on the same executor. While there are tasks to fail, + // we fail them more often, but we fail across all executors. Furthermore, after X failures + // we do have a task success + + // first, queue up all the tasks needing to run + while (backend.hasTasksWaitingToRun) { + val taskDescription = backend.beginTask() + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + handleTask(taskDescription, task, host) + } + + // send a task result. Prioritize failures, if we haven't had too many failures in a row + def failTask(): Unit = { + failuresSinceLastSuccess += 1 + val (toFail, exc) = tasksToFail.head + tasksToFail = tasksToFail.tail + backend.taskFailed(toFail, exc) + } + + if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { + failTask() + } else if (tasksToSucceed.nonEmpty) { + // we might get here just by some chance of thread-scheduling in this mock. Tasks fail, + // but the scheduler thread hasn't processed those before this thread tries to find + // another task to respond to. + // if (tasksToFail.nonEmpty && failuresSinceLastSuccess < FAILURES_TILL_SUCCESS) { + // failTask() + // } else { + logInfo(s"tasksToFail.size = ${tasksToFail.size}; " + + s"tasksToSucceed.size = ${tasksToSucceed.size}; " + + s"failuresSinceLastSuccess = ${failuresSinceLastSuccess}") + failuresSinceLastSuccess = 0 + val (taskDescription, result) = tasksToSucceed.head + tasksToSucceed = tasksToSucceed.tail + val host = backend.executorIdToExecutor(taskDescription.executorId).host + val taskSet = taskScheduler.taskIdToTaskSetManager(taskDescription.taskId).taskSet + val task = taskSet.tasks(taskDescription.index) + backend.taskSuccess(taskDescription, result) } else { - Thread.sleep(10) + Thread.sleep(10) // wait till we've got work to do } } }
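
One way a future test could plug another failure pattern into this harness is by overriding
handleTask. The following is a sketch only, not part of the patch series: the bad host name
"host-1" is made up, and the method is meant to live inside SchedulerPerformanceSuite so it can
reuse the queueSuccess / queueFailure / runJobWithCustomBackend hooks defined above.

  def runOneBadHostJob(N: Int): Unit = {
    val wrapper = new QueuingWrappedBackend(backend) {
      override def handleTask(taskDesc: TaskDescription, task: Task[_], host: String): Unit = {
        if (host == "host-1") {
          // every task that lands on the (hypothetical) bad host fails
          queueFailure(taskDesc, new RuntimeException(s"bad host $host"))
        } else if (task.stageId % 5 != 4) {
          // every 5th stage is a ResultStage -- the rest are ShuffleMapStages
          queueSuccess(taskDesc, DAGSchedulerSuite.makeMapStatus(host, N))
        } else {
          queueSuccess(taskDesc, 42)
        }
      }
    }
    runJobWithCustomBackend(N, wrapper)
  }

This is effectively runBadExecJob with badExecs empty and badHosts = Set("host-1"), but it shows
the minimal handleTask contract on its own.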