
Commit 4567ffc

Merge branch 'master' of git://git.apache.org/spark into metrics-structure-improvement2
2 parents: 6a91b14 + f350cd3

12 files changed: +179 additions, −174 deletions


core/src/main/scala/org/apache/spark/FutureAction.scala

Lines changed: 2 additions & 5 deletions
@@ -208,7 +208,7 @@ class ComplexFutureAction[T] extends FutureAction[T] {
       processPartition: Iterator[T] => U,
       partitions: Seq[Int],
       resultHandler: (Int, U) => Unit,
-      resultFunc: => R): R = {
+      resultFunc: => R) {
     // If the action hasn't been cancelled yet, submit the job. The check and the submitJob
     // command need to be in an atomic block.
     val job = this.synchronized {
@@ -223,10 +223,7 @@ class ComplexFutureAction[T] extends FutureAction[T] {
     // cancel the job and stop the execution. This is not in a synchronized block because
     // Await.ready eventually waits on the monitor in FutureJob.jobWaiter.
     try {
-      Await.ready(job, Duration.Inf).value.get match {
-        case scala.util.Failure(e) => throw e
-        case scala.util.Success(v) => v
-      }
+      Await.ready(job, Duration.Inf)
     } catch {
       case e: InterruptedException =>
         job.cancel()
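
Note on the hunk above: runJob no longer extracts the job's value from the awaited future (its return type drops from R to Unit); it only blocks until the job finishes, and the final value comes from the by-name resultFunc argument, evaluated afterwards by the surrounding run { ... } block. A minimal sketch of the two await styles using plain scala.concurrent, with a generic future standing in for the submitted job:

import scala.concurrent.{Await, Future}
import scala.concurrent.duration.Duration

object AwaitStyles {
  // Old style: pull the value out of the future, rethrowing any failure.
  def awaitValue[T](job: Future[T]): T =
    Await.ready(job, Duration.Inf).value.get match {
      case scala.util.Failure(e) => throw e
      case scala.util.Success(v) => v
    }

  // New style: only block until the job is done; the result is produced elsewhere (resultFunc).
  def awaitCompletion[T](job: Future[T]): Unit = {
    Await.ready(job, Duration.Inf)
  }
}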

core/src/main/scala/org/apache/spark/Partitioner.scala

Lines changed: 6 additions & 23 deletions
@@ -29,10 +29,6 @@ import org.apache.spark.serializer.JavaSerializer
 import org.apache.spark.util.{CollectionsUtils, Utils}
 import org.apache.spark.util.random.{XORShiftRandom, SamplingUtils}
 
-import org.apache.spark.SparkContext.rddToAsyncRDDActions
-import scala.concurrent.Await
-import scala.concurrent.duration.Duration
-
 /**
  * An object that defines how the elements in a key-value pair RDD are partitioned by key.
  * Maps each key to a partition ID, from 0 to `numPartitions - 1`.
@@ -117,12 +113,8 @@ class RangePartitioner[K : Ordering : ClassTag, V](
   private var ordering = implicitly[Ordering[K]]
 
   // An array of upper bounds for the first (partitions - 1) partitions
-  @volatile private var valRB: Array[K] = null
-
-  private def rangeBounds: Array[K] = this.synchronized {
-    if (valRB != null) return valRB
-
-    valRB = if (partitions <= 1) {
+  private var rangeBounds: Array[K] = {
+    if (partitions <= 1) {
       Array.empty
     } else {
       // This is the sample size we need to have roughly balanced output partitions, capped at 1M.
@@ -160,8 +152,6 @@ class RangePartitioner[K : Ordering : ClassTag, V](
         RangePartitioner.determineBounds(candidates, partitions)
       }
     }
-
-    valRB
   }
 
   def numPartitions = rangeBounds.length + 1
@@ -232,8 +222,7 @@ class RangePartitioner[K : Ordering : ClassTag, V](
   }
 
   @throws(classOf[IOException])
-  private def readObject(in: ObjectInputStream): Unit = this.synchronized {
-    if (valRB != null) return
+  private def readObject(in: ObjectInputStream) {
     val sfactory = SparkEnv.get.serializer
     sfactory match {
       case js: JavaSerializer => in.defaultReadObject()
@@ -245,7 +234,7 @@ class RangePartitioner[K : Ordering : ClassTag, V](
         val ser = sfactory.newInstance()
         Utils.deserializeViaNestedStream(in, ser) { ds =>
           implicit val classTag = ds.readObject[ClassTag[Array[K]]]()
-          valRB = ds.readObject[Array[K]]()
+          rangeBounds = ds.readObject[Array[K]]()
         }
     }
   }
@@ -265,18 +254,12 @@ private[spark] object RangePartitioner {
       sampleSizePerPartition: Int): (Long, Array[(Int, Int, Array[K])]) = {
     val shift = rdd.id
     // val classTagK = classTag[K] // to avoid serializing the entire partitioner object
-    // use collectAsync here to run this job as a future, which is cancellable
-    val sketchFuture = rdd.mapPartitionsWithIndex { (idx, iter) =>
+    val sketched = rdd.mapPartitionsWithIndex { (idx, iter) =>
       val seed = byteswap32(idx ^ (shift << 16))
       val (sample, n) = SamplingUtils.reservoirSampleAndCount(
         iter, sampleSizePerPartition, seed)
       Iterator((idx, n, sample))
-    }.collectAsync()
-    // We do need the future's value to continue any further
-    val sketched = Await.ready(sketchFuture, Duration.Inf).value.get match {
-      case scala.util.Success(v) => v.toArray
-      case scala.util.Failure(e) => throw e
-    }
+    }.collect()
     val numItems = sketched.map(_._2.toLong).sum
     (numItems, sketched)
   }
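
Note on the last hunk: RangePartitioner.sketch now runs as an ordinary blocking job, gathering one (partitionId, count, sample) record per partition with a plain collect() instead of collectAsync() plus Await, and rangeBounds is computed eagerly in the constructor rather than lazily under a lock. A rough sketch of the same gather pattern on a user RDD; the names and the prefix-based "sampling" below are illustrative stand-ins, since SamplingUtils.reservoirSampleAndCount is private to Spark:

import org.apache.spark.{SparkConf, SparkContext}

object SketchExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sketch-example").setMaster("local[2]"))
    val rdd = sc.parallelize(1 to 100000, numSlices = 8)

    val sampleSizePerPartition = 20
    val sketched = rdd.mapPartitionsWithIndex { (idx, iter) =>
      // Stand-in for reservoir sampling: materialize the partition and keep a prefix as the sample.
      val items = iter.toArray
      Iterator((idx, items.length.toLong, items.take(sampleSizePerPartition)))
    }.collect()

    val numItems = sketched.map(_._2).sum
    println(s"sketched $numItems items across ${sketched.length} partitions")
    sc.stop()
  }
}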

core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala

Lines changed: 26 additions & 38 deletions
@@ -23,7 +23,6 @@ import scala.collection.mutable.ArrayBuffer
 import scala.concurrent.ExecutionContext.Implicits.global
 import scala.reflect.ClassTag
 
-import org.apache.spark.util.Utils
 import org.apache.spark.{ComplexFutureAction, FutureAction, Logging}
 import org.apache.spark.annotation.Experimental
 
@@ -39,30 +38,29 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi
    * Returns a future for counting the number of elements in the RDD.
    */
   def countAsync(): FutureAction[Long] = {
-    val f = new ComplexFutureAction[Long]
-    f.run {
-      val totalCount = new AtomicLong
-      f.runJob(self,
-        (iter: Iterator[T]) => Utils.getIteratorSize(iter),
-        Range(0, self.partitions.size),
-        (index: Int, data: Long) => totalCount.addAndGet(data),
-        totalCount.get())
-    }
+    val totalCount = new AtomicLong
+    self.context.submitJob(
+      self,
+      (iter: Iterator[T]) => {
+        var result = 0L
+        while (iter.hasNext) {
+          result += 1L
+          iter.next()
+        }
+        result
+      },
+      Range(0, self.partitions.size),
+      (index: Int, data: Long) => totalCount.addAndGet(data),
+      totalCount.get())
   }
 
   /**
    * Returns a future for retrieving all elements of this RDD.
    */
   def collectAsync(): FutureAction[Seq[T]] = {
-    val f = new ComplexFutureAction[Seq[T]]
-    f.run {
-      val results = new Array[Array[T]](self.partitions.size)
-      f.runJob(self,
-        (iter: Iterator[T]) => iter.toArray,
-        Range(0, self.partitions.size),
-        (index: Int, data: Array[T]) => results(index) = data,
-        results.flatten.toSeq)
-    }
+    val results = new Array[Array[T]](self.partitions.size)
+    self.context.submitJob[T, Array[T], Seq[T]](self, _.toArray, Range(0, self.partitions.size),
+      (index, data) => results(index) = data, results.flatten.toSeq)
   }
 
   /**
@@ -106,34 +104,24 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi
       }
       results.toSeq
     }
+
+    f
   }
 
   /**
    * Applies a function f to all elements of this RDD.
   */
-  def foreachAsync(expr: T => Unit): FutureAction[Unit] = {
-    val f = new ComplexFutureAction[Unit]
-    val exprClean = self.context.clean(expr)
-    f.run {
-      f.runJob(self,
-        (iter: Iterator[T]) => iter.foreach(exprClean),
-        Range(0, self.partitions.size),
-        (index: Int, data: Unit) => Unit,
-        Unit)
-    }
+  def foreachAsync(f: T => Unit): FutureAction[Unit] = {
+    val cleanF = self.context.clean(f)
+    self.context.submitJob[T, Unit, Unit](self, _.foreach(cleanF), Range(0, self.partitions.size),
+      (index, data) => Unit, Unit)
   }
 
   /**
    * Applies a function f to each partition of this RDD.
   */
-  def foreachPartitionAsync(expr: Iterator[T] => Unit): FutureAction[Unit] = {
-    val f = new ComplexFutureAction[Unit]
-    f.run {
-      f.runJob(self,
-        expr,
-        Range(0, self.partitions.size),
-        (index: Int, data: Unit) => Unit,
-        Unit)
-    }
+  def foreachPartitionAsync(f: Iterator[T] => Unit): FutureAction[Unit] = {
+    self.context.submitJob[T, Unit, Unit](self, f, Range(0, self.partitions.size),
+      (index, data) => Unit, Unit)
   }
 }
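
The async actions now call SparkContext.submitJob directly, whose parameters mirror the old ComplexFutureAction.runJob: an RDD, a per-partition function, the partitions to run on, a driver-side result handler, and a by-name resultFunc evaluated once the job completes. The same pattern works for user-defined async actions; a sketch with a made-up sumAsync helper (the submitJob call itself is the API shown in the diff):

import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.FutureAction
import org.apache.spark.rdd.RDD

object AsyncSum {
  // Hypothetical helper built the same way countAsync is built above.
  def sumAsync(rdd: RDD[Long]): FutureAction[Long] = {
    val total = new AtomicLong
    rdd.context.submitJob[Long, Long, Long](
      rdd,
      (iter: Iterator[Long]) => iter.sum,                       // per-partition work
      Range(0, rdd.partitions.size),                            // run on every partition
      (index: Int, partial: Long) => total.addAndGet(partial),  // merge results on the driver
      total.get())                                              // resultFunc: final value
  }
}

The returned FutureAction is a regular Scala Future, so callers can block on it with Await.result, register onComplete callbacks, or cancel() the underlying job.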

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 1 addition & 1 deletion
@@ -194,7 +194,7 @@ class HadoopRDD[K, V](
       val jobConf = getJobConf()
       val inputFormat = getInputFormat(jobConf)
       HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmm").format(createTime),
-        context.stageId, theSplit.index, context.attemptId.toInt, jobConf)
+        context.getStageId, theSplit.index, context.getAttemptId.toInt, jobConf)
       reader = inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL)
 
       // Register an on-task-completion callback to close the input stream.

core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala

Lines changed: 2 additions & 1 deletion
@@ -86,7 +86,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     }
     val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners)
     if (self.partitioner == Some(partitioner)) {
-      self.mapPartitionsWithContext((context, iter) => {
+      self.mapPartitions(iter => {
+        val context = TaskContext.get()
         new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
       }, preservesPartitioning = true)
     } else {
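
mapPartitionsWithContext handed the TaskContext in as a parameter; the replacement is a plain mapPartitions that reads the thread-local context via TaskContext.get() from inside the task. A minimal sketch of that access pattern, using the getters this commit introduces (TaskContextExample and tagWithTaskInfo are hypothetical names, not Spark API):

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD

object TaskContextExample {
  // Hypothetical helper: tag each element with the stage and task attempt that processed it.
  def tagWithTaskInfo(rdd: RDD[String]): RDD[(Int, Long, String)] =
    rdd.mapPartitions({ iter =>
      val context = TaskContext.get()   // thread-local; only valid inside a running task
      iter.map(s => (context.getStageId, context.getAttemptId, s))
    }, preservesPartitioning = true)
}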

core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala

Lines changed: 8 additions & 38 deletions
@@ -66,23 +66,19 @@ class ExternalAppendOnlyMap[K, V, C](
     mergeCombiners: (C, C) => C,
     serializer: Serializer = SparkEnv.get.serializer,
     blockManager: BlockManager = SparkEnv.get.blockManager)
-  extends Iterable[(K, C)] with Serializable with Logging {
+  extends Iterable[(K, C)]
+  with Serializable
+  with Logging
+  with Spillable[SizeTracker] {
 
   private var currentMap = new SizeTrackingAppendOnlyMap[K, C]
   private val spilledMaps = new ArrayBuffer[DiskMapIterator]
   private val sparkConf = SparkEnv.get.conf
   private val diskBlockManager = blockManager.diskBlockManager
-  private val shuffleMemoryManager = SparkEnv.get.shuffleMemoryManager
 
   // Number of pairs inserted since last spill; note that we count them even if a value is merged
   // with a previous key in case we're doing something like groupBy where the result grows
-  private var elementsRead = 0L
-
-  // Number of in-memory pairs inserted before tracking the map's shuffle memory usage
-  private val trackMemoryThreshold = 1000
-
-  // How much of the shared memory pool this collection has claimed
-  private var myMemoryThreshold = 0L
+  protected[this] var elementsRead = 0L
 
   /**
    * Size of object batches when reading/writing from serializers.
@@ -95,11 +91,7 @@ class ExternalAppendOnlyMap[K, V, C](
    */
   private val serializerBatchSize = sparkConf.getLong("spark.shuffle.spill.batchSize", 10000)
 
-  // How many times we have spilled so far
-  private var spillCount = 0
-
   // Number of bytes spilled in total
-  private var _memoryBytesSpilled = 0L
   private var _diskBytesSpilled = 0L
 
   private val fileBufferSize = sparkConf.getInt("spark.shuffle.file.buffer.kb", 32) * 1024
@@ -136,19 +128,8 @@ class ExternalAppendOnlyMap[K, V, C](
 
     while (entries.hasNext) {
       curEntry = entries.next()
-      if (elementsRead > trackMemoryThreshold && elementsRead % 32 == 0 &&
-          currentMap.estimateSize() >= myMemoryThreshold)
-      {
-        // Claim up to double our current memory from the shuffle memory pool
-        val currentMemory = currentMap.estimateSize()
-        val amountToRequest = 2 * currentMemory - myMemoryThreshold
-        val granted = shuffleMemoryManager.tryToAcquire(amountToRequest)
-        myMemoryThreshold += granted
-        if (myMemoryThreshold <= currentMemory) {
-          // We were granted too little memory to grow further (either tryToAcquire returned 0,
-          // or we already had more memory than myMemoryThreshold); spill the current collection
-          spill(currentMemory) // Will also release memory back to ShuffleMemoryManager
-        }
+      if (maybeSpill(currentMap, currentMap.estimateSize())) {
+        currentMap = new SizeTrackingAppendOnlyMap[K, C]
       }
       currentMap.changeValue(curEntry._1, update)
       elementsRead += 1
@@ -171,11 +152,7 @@ class ExternalAppendOnlyMap[K, V, C](
   /**
    * Sort the existing contents of the in-memory map and spill them to a temporary file on disk.
   */
-  private def spill(mapSize: Long): Unit = {
-    spillCount += 1
-    val threadId = Thread.currentThread().getId
-    logInfo("Thread %d spilling in-memory map of %d MB to disk (%d time%s so far)"
-      .format(threadId, mapSize / (1024 * 1024), spillCount, if (spillCount > 1) "s" else ""))
+  override protected[this] def spill(collection: SizeTracker): Unit = {
     val (blockId, file) = diskBlockManager.createTempBlock()
     curWriteMetrics = new ShuffleWriteMetrics()
     var writer = blockManager.getDiskWriter(blockId, file, serializer, fileBufferSize,
@@ -231,18 +208,11 @@ class ExternalAppendOnlyMap[K, V, C](
       }
     }
 
-    currentMap = new SizeTrackingAppendOnlyMap[K, C]
     spilledMaps.append(new DiskMapIterator(file, blockId, batchSizes))
 
-    // Release our memory back to the shuffle pool so that other threads can grab it
-    shuffleMemoryManager.release(myMemoryThreshold)
-    myMemoryThreshold = 0L
-
     elementsRead = 0
-    _memoryBytesSpilled += mapSize
   }
 
-  def memoryBytesSpilled: Long = _memoryBytesSpilled
   def diskBytesSpilled: Long = _diskBytesSpilled
 
   /**
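
The memory-tracking and spill bookkeeping that used to live inline in the insert loop now comes from the Spillable trait: the loop asks maybeSpill(collection, currentSize) and, if it spilled, swaps in a fresh map, while the subclass overrides spill(collection) to do the actual writing. A simplified sketch of that contract (not the real private[spark] trait, which also negotiates memory with the ShuffleMemoryManager; the threshold and check interval below are made up):

// Simplified sketch of the Spillable shape the diff above relies on.
trait SpillableSketch[C] {
  protected[this] var elementsRead = 0L
  private var memoryThreshold = 5L * 1024 * 1024   // assumed initial budget
  private var _memoryBytesSpilled = 0L

  /** Subclasses write the collection out to disk. */
  protected[this] def spill(collection: C): Unit

  /** Returns true if it spilled, so the caller can swap in a fresh collection. */
  protected[this] def maybeSpill(collection: C, currentMemory: Long): Boolean = {
    if (elementsRead % 32 == 0 && currentMemory >= memoryThreshold) {
      spill(collection)
      _memoryBytesSpilled += currentMemory
      true
    } else {
      false
    }
  }

  def memoryBytesSpilled: Long = _memoryBytesSpilled
}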
