apache · jinxing64 · Apr 4, 2017 · Apr 5, 2017 · Apr 5, 2017 · Apr 5, 2017
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -471,6 +471,34 @@ class DAGScheduler(
     missing.toList
   }
 
+  /**
+   * For partition `pId` of stage `stageId`'s RDD, walks narrow dependencies and returns, per
+   * shuffleId, the splits that read from that shuffle. None if `stageId` is not registered.
+   */
+  private[spark] def parentSplitsInShuffledRDD(stageId: Int, pId: Int): Option[Map[Int, Set[Int]]] =
+  {
+    stageIdToStage.get(stageId).map { stage =>
+      val splitsByShuffle = new HashMap[Int, HashSet[Int]]()
+      // Track visited (rdd, split) pairs so that diamond-shaped lineages are walked once
+      // instead of once per path, which can be exponential in the lineage depth.
+      val visited = new HashSet[(RDD[_], Int)]
+      val waitingForVisit = new Stack[(RDD[_], Int)]
+      waitingForVisit.push((stage.rdd, pId))
+      while (waitingForVisit.nonEmpty) {
+        val (rdd, split) = waitingForVisit.pop()
+        if (visited.add((rdd, split))) {
+          rdd.dependencies.foreach {
+            case dep: ShuffleDependency[_, _, _] =>
+              splitsByShuffle.getOrElseUpdate(dep.shuffleId, new HashSet[Int]()).add(split)
+            case dep: NarrowDependency[_] =>
+              dep.getParents(split).foreach(parent => waitingForVisit.push((dep.rdd, parent)))
+          }
+        }
+      }
+      splitsByShuffle.mapValues(_.toSet).toMap
+    }
+  }
+
   /**
    * Registers the given jobId among the jobs that need the given stage and
    * all of that stage's ancestors.

diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
@@ -20,6 +20,7 @@ package org.apache.spark.scheduler
 import java.io.NotSerializableException
 import java.nio.ByteBuffer
 import java.util.concurrent.ConcurrentLinkedQueue
+import java.util.concurrent.atomic.AtomicBoolean
 
 import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
 import scala.math.max
@@ -138,7 +139,7 @@ private[spark] class TaskSetManager(
   private[scheduler] var pendingTasksWithNoPrefs = new ArrayBuffer[Int]
 
   // Set containing all pending tasks (also used as a stack, as above).
-  private val allPendingTasks = new ArrayBuffer[Int]
+  private var allPendingTasks = new ArrayBuffer[Int]
 
   // Tasks that can be speculated. Since these will be a small fraction of total
   // tasks, we'll just hold them in a HashSet.
@@ -168,6 +169,10 @@ private[spark] class TaskSetManager(
     t.epoch = epoch
   }
 
+  private val sortedPendingTasks = new AtomicBoolean(false) // guards the one-time initial sort
+  // Per-task cache of the input size from ShuffledRDD, filled lazily or preset by tests.
+  val taskInputSizeFromShuffledRDD = HashMap[Task[_], Long]()
+
   // Add all our tasks to the pending lists. We do this in reverse order
   // of task index so that tasks with low indices get launched first.
   for (i <- (0 until numTasks).reverse) {
@@ -438,6 +443,10 @@ private[spark] class TaskSetManager(
         blacklist.isExecutorBlacklistedForTaskSet(execId)
     }
     if (!isZombie && !offerBlacklisted) {
+      if (sortedPendingTasks.compareAndSet(false, true)) {
+        sortPendingTasks()
+      }
+
       val curTime = clock.getTimeMillis()
 
       var allowedLocality = maxLocality
@@ -512,6 +521,51 @@ private[spark] class TaskSetManager(
     }
   }
 
+  /**
+   * Re-sorts all pending-task lists by ascending input size from ShuffledRDD. The lists are
+   * used as stacks, so the tasks with the largest shuffle input are dequeued first.
+   */
+  private[this] def sortPendingTasks(): Unit = {
+    def inputSize(index: Int): Long = getTaskInputSizeFromShuffledRDD(tasks(index))
+    if (tasks.nonEmpty) {
+      pendingTasksForExecutor.foreach {
+        case (k, v) => pendingTasksForExecutor(k) = v.sortBy(inputSize)
+      }
+      pendingTasksForHost.foreach {
+        case (k, v) => pendingTasksForHost(k) = v.sortBy(inputSize)
+      }
+      pendingTasksForRack.foreach {
+        case (k, v) => pendingTasksForRack(k) = v.sortBy(inputSize)
+      }
+      pendingTasksWithNoPrefs = pendingTasksWithNoPrefs.sortBy(inputSize)
+      allPendingTasks = allPendingTasks.sortBy(inputSize)
+    }
+  }
+
+  // Visible for testing: drops all cached sizes, then installs the entries of `inputSize`.
+  private[spark] def setTaskInputSizeFromShuffledRDD(inputSize: Map[Task[_], Long]) = {
+    taskInputSizeFromShuffledRDD.clear()
+    taskInputSizeFromShuffledRDD ++= inputSize
+  }
+
+  /**
+   * Total size `task` reads from its parent shuffles, computed on first use and then cached.
+   * Evaluates to 0 if the DAGScheduler has no stage registered for `task.stageId`.
+   */
+  private[this] def getTaskInputSizeFromShuffledRDD(task: Task[_]): Long = {
+    taskInputSizeFromShuffledRDD.getOrElseUpdate(task, {
+      val parentSplits =
+        sched.dagScheduler.parentSplitsInShuffledRDD(task.stageId, task.partitionId)
+      // Iterators, not Set.map: a Set[Long] of sizes would collapse splits of equal size.
+      parentSplits.fold(0L)(_.iterator.map { case (shuffleId, splits) =>
+        splits.iterator.map { split =>
+          sched.mapOutputTracker.getMapSizesByExecutorId(shuffleId, split)
+            .flatMap(_._2.map(_._2)).sum
+        }.sum
+      }.sum)
+    })
+  }
+
   private def maybeFinishTaskSet() {
     if (isZombie && runningTasks == 0) {
       sched.taskSetFinished(this)
@@ -833,6 +887,7 @@ private[spark] class TaskSetManager(
         s" has already succeeded).")
     } else {
       addPendingTask(index)
+      sortPendingTasks()
     }
 
     if (!isZombie && reason.countTowardsTaskFailures) {
@@ -904,6 +959,7 @@ private[spark] class TaskSetManager(
           copiesRunning(index) -= 1
           tasksSuccessful -= 1
           addPendingTask(index)
+          sortPendingTasks()
           // Tell the DAGScheduler that this task was resubmitted so that it doesn't think our
           // stage finishes when a total of tasks.size tasks finish.
           sched.dagScheduler.taskEnded(

diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
@@ -180,7 +180,6 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg
     }
   }
 
-
   test("TaskSet with no preferences") {
     sc = new SparkContext("local", "test")
     sched = new FakeTaskScheduler(sc, ("exec1", "host1"))
@@ -1139,6 +1138,19 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg
       .updateBlacklistForFailedTask(anyString(), anyString(), anyInt())
   }
 
+  test("Schedule tasks based on size of input from ShuffledRDD.") {
+    sc = new SparkContext("local", "test")
+    sched = new FakeTaskScheduler(sc)
+    val taskSet = FakeTask.createTaskSet(4)
+    val manager = new TaskSetManager(sched, taskSet, 1, clock = new ManualClock())
+    val inputSizes = Seq(1L, 100L, 10000L, 1000L)
+    manager.setTaskInputSizeFromShuffledRDD(taskSet.tasks.zip(inputSizes).toMap)
+    // Offers must hand out tasks from the largest preset input size to the smallest.
+    Seq(2, 3, 1, 0).foreach { expectedIndex =>
+      assert(manager.resourceOffer("exec", "host", ANY).get.index === expectedIndex)
+    }
+  }
+
   private def createTaskResult(
       id: Int,
       accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty): DirectTaskResult[Int] = {