Merged
58 commits
864d1cd  initial API (yifeih, Mar 20, 2019)
c88751c  wip (yifeih, Mar 21, 2019)
9af216f  wip (yifeih, Mar 22, 2019)
a35b826  initial implementation of reader (yifeih, Mar 26, 2019)
14c47ae  fix based on comments (yifeih, Mar 26, 2019)
5bb4c32  fix java lang import and delete unneeded class (yifeih, Mar 26, 2019)
584e6c8  address initial comments (yifeih, Mar 27, 2019)
0292fe2  fix unit tests (yifeih, Mar 27, 2019)
71c2cc7  java checkstyle (yifeih, Mar 27, 2019)
43c377c  fix tests (yifeih, Mar 27, 2019)
9fc6a60  address some comments (yifeih, Mar 28, 2019)
45172a5  blah (yifeih, Mar 28, 2019)
4e5652b  address more comments (yifeih, Mar 28, 2019)
a35d8fe  Use decorators to customize how the read metrics reporter is instanti… (mccheah, Apr 1, 2019)
1a09ebe  blah (yifeih, Apr 2, 2019)
c149d24  initial tests (yifeih, Apr 2, 2019)
672d473  Revert "initial tests" (yifeih, Apr 3, 2019)
e0a3289  initial impl (yifeih, Apr 3, 2019)
1e89b3f  get shuffle reader tests to pass (yifeih, Apr 3, 2019)
495c7bd  update (yifeih, Apr 3, 2019)
88a03cb  tests (yifeih, Apr 3, 2019)
741deed  style (yifeih, Apr 3, 2019)
76c0381  Merge branch 'spark-25299' into yh/reader-api (yifeih, Apr 3, 2019)
c7c52b0  hook up executor components (yifeih, Apr 4, 2019)
897c0bf  fix compile (yifeih, Apr 4, 2019)
34eaaf6  remove unnecessary fields (yifeih, Apr 4, 2019)
0548800  remove unused (yifeih, Apr 4, 2019)
0637e70  refactor retrying iterator (yifeih, Apr 4, 2019)
f069dc1  remove unused import (yifeih, Apr 4, 2019)
0bba677  fix some comments (yifeih, Apr 5, 2019)
a82a725  null check (yifeih, Apr 5, 2019)
ac392a1  refactor interface (yifeih, Apr 5, 2019)
53dd94b  refactor API (yifeih, Apr 5, 2019)
4c0c791  shuffle iterator style (yifeih, Apr 5, 2019)
84f7931  add some javadocs for interfaces (yifeih, Apr 5, 2019)
b59efb5  attach apache headers (yifeih, Apr 5, 2019)
aba8a94  remove unused imports (yifeih, Apr 5, 2019)
5ef59b6  remove another import (yifeih, Apr 5, 2019)
49a1901  fix reader (yifeih, Apr 5, 2019)
8c6c09c  fix imports (yifeih, Apr 5, 2019)
6370b41  add exception comment for retry API (yifeih, Apr 10, 2019)
c442b63  address some comments (yifeih, Apr 10, 2019)
2c1272a  address comments (yifeih, Apr 10, 2019)
2758a5c  Merge branch 'spark-25299' into yh/reader-api (yifeih, Apr 19, 2019)
bd349ca  resolve conflicts (yifeih, Apr 19, 2019)
653f67c  style (yifeih, Apr 19, 2019)
9f53839  address some comments (yifeih, Apr 19, 2019)
94275fd  style (yifeih, Apr 20, 2019)
26e97c1  refactor API (yifeih, Apr 20, 2019)
91db776  cleanup (yifeih, Apr 20, 2019)
f0fa7b8  fix tests and style (yifeih, Apr 22, 2019)
50c8fc3  style (yifeih, Apr 22, 2019)
4aa4b6e  reorder result for test? (yifeih, Apr 22, 2019)
7d23f47  wip (yifeih, Apr 26, 2019)
363d4ab  address comments (yifeih, Apr 29, 2019)
bb7fa4c  style (yifeih, Apr 29, 2019)
711109b  cleanup tests (yifeih, Apr 29, 2019)
04a135c  Remove unused class (mccheah, Apr 30, 2019)
org/apache/spark/api/shuffle/ShuffleBlockInfo.java (new file)
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.api.shuffle;

import org.apache.spark.api.java.Optional;

import java.util.Objects;

/**
* :: Experimental ::
* An object describing a shuffle block along with its length and optional location metadata.
* @since 3.0.0
*/
public class ShuffleBlockInfo {
private final int shuffleId;
private final int mapId;
private final int reduceId;
private final long length;
private final Optional<ShuffleLocation> shuffleLocation;

public ShuffleBlockInfo(int shuffleId, int mapId, int reduceId, long length,
Optional<ShuffleLocation> shuffleLocation) {
this.shuffleId = shuffleId;
this.mapId = mapId;
this.reduceId = reduceId;
this.length = length;
this.shuffleLocation = shuffleLocation;
}

public int getShuffleId() {
return shuffleId;
}

public int getMapId() {
return mapId;
}

public int getReduceId() {
return reduceId;
}

public long getLength() {
return length;
}

public Optional<ShuffleLocation> getShuffleLocation() {
return shuffleLocation;
}

@Override
public boolean equals(Object other) {
return other instanceof ShuffleBlockInfo
&& shuffleId == ((ShuffleBlockInfo) other).shuffleId
&& mapId == ((ShuffleBlockInfo) other).mapId
&& reduceId == ((ShuffleBlockInfo) other).reduceId
&& length == ((ShuffleBlockInfo) other).length
&& Objects.equals(shuffleLocation, ((ShuffleBlockInfo) other).shuffleLocation);
}

@Override
public int hashCode() {
return Objects.hash(shuffleId, mapId, reduceId, length, shuffleLocation);
}
}
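For reference, the sketch below shows how a caller might construct these descriptors. It is an illustration only and not part of this PR: the shuffle/map/reduce IDs and the length are made-up values, and the empty Optional corresponds to a block whose location is unknown.

// Hypothetical usage sketch (not part of this PR); all values below are placeholders.
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.Optional;
import org.apache.spark.api.shuffle.ShuffleBlockInfo;

public class ShuffleBlockInfoExample {
  public static void main(String[] args) {
    // A block with no known location, mirroring the case where mapShuffleLocations is null.
    ShuffleBlockInfo block = new ShuffleBlockInfo(
        /* shuffleId */ 0, /* mapId */ 3, /* reduceId */ 7, /* length */ 1024L,
        Optional.empty());

    // equals/hashCode compare all five fields, so an identical descriptor is equal.
    ShuffleBlockInfo sameBlock = new ShuffleBlockInfo(0, 3, 7, 1024L, Optional.empty());
    System.out.println(block.equals(sameBlock)); // expected: true

    // A reader implementation receives an Iterable of these descriptors to fetch.
    List<ShuffleBlockInfo> toFetch = Arrays.asList(block, sameBlock);
    System.out.println(toFetch.size() + " blocks, first length " + toFetch.get(0).getLength());
  }
}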
org/apache/spark/api/shuffle/ShuffleExecutorComponents.java
@@ -30,4 +30,6 @@ public interface ShuffleExecutorComponents {
void initializeExecutor(String appId, String execId);

ShuffleWriteSupport writes();

ShuffleReadSupport reads();
}
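To make the shape of this contract concrete, here is a hypothetical plugin skeleton, assuming the three methods visible in this hunk are the whole interface. Every name outside the Spark API (MyShuffleExecutorComponents, the toy read support) is invented, and the write path is deliberately left unimplemented because ShuffleWriteSupport's methods are not shown in this diff.

// Hypothetical plugin skeleton (illustration only).
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.api.shuffle.ShuffleBlockInfo;
import org.apache.spark.api.shuffle.ShuffleExecutorComponents;
import org.apache.spark.api.shuffle.ShuffleReadSupport;
import org.apache.spark.api.shuffle.ShuffleWriteSupport;

public class MyShuffleExecutorComponents implements ShuffleExecutorComponents {
  private String appId;
  private String execId;

  @Override
  public void initializeExecutor(String appId, String execId) {
    // Per-executor setup (remote clients, caches, ...) would happen here.
    this.appId = appId;
    this.execId = execId;
  }

  @Override
  public ShuffleWriteSupport writes() {
    // Write path is out of scope for this sketch.
    throw new UnsupportedOperationException("write path not sketched here");
  }

  @Override
  public ShuffleReadSupport reads() {
    // ShuffleReadSupport has a single method, so a lambda is enough for a toy reader
    // that hands back one empty stream per requested block.
    return blockMetadata -> {
      List<InputStream> streams = new ArrayList<>();
      for (ShuffleBlockInfo block : blockMetadata) {
        streams.add(new ByteArrayInputStream(new byte[0]));
      }
      return streams;
    };
  }
}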
org/apache/spark/api/shuffle/ShuffleLocation.java
@@ -21,5 +21,4 @@
* Marker interface representing a location of a shuffle block. Implementations of shuffle readers
* and writers are expected to cast this down to an implementation-specific representation.
*/
public interface ShuffleLocation {
}
public interface ShuffleLocation {}
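Because ShuffleLocation is only a marker, each shuffle implementation supplies its own concrete location type and casts back down to it when reading. Below is a hypothetical host/port example, invented purely for illustration; it is not the DefaultMapShuffleLocations class used by the built-in implementation elsewhere in this PR.

// Hypothetical ShuffleLocation implementation (illustration only).
import java.util.Objects;

import org.apache.spark.api.shuffle.ShuffleLocation;

public class HostPortShuffleLocation implements ShuffleLocation {
  private final String host;
  private final int port;

  public HostPortShuffleLocation(String host, int port) {
    this.host = host;
    this.port = port;
  }

  public String getHost() { return host; }

  public int getPort() { return port; }

  // Readers cast ShuffleLocation down to this type; value equality also keeps
  // ShuffleBlockInfo.equals/hashCode meaningful when locations are compared.
  @Override
  public boolean equals(Object other) {
    return other instanceof HostPortShuffleLocation
        && host.equals(((HostPortShuffleLocation) other).host)
        && port == ((HostPortShuffleLocation) other).port;
  }

  @Override
  public int hashCode() {
    return Objects.hash(host, port);
  }
}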
org/apache/spark/api/shuffle/ShuffleReadSupport.java (new file)
@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.api.shuffle;

import org.apache.spark.annotation.Experimental;

import java.io.IOException;
import java.io.InputStream;

/**
* :: Experimental ::
* An interface for reading shuffle records.
* @since 3.0.0
*/
@Experimental
public interface ShuffleReadSupport {
/**
* Returns an {@link Iterable} of {@link InputStream}s that iterates through the shuffle
* block data, given an iterable of the shuffle blocks to fetch.
*/
Iterable<InputStream> getPartitionReaders(Iterable<ShuffleBlockInfo> blockMetadata)
throws IOException;
}
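A minimal caller-side sketch, assuming a ShuffleReadSupport instance is already available. The block coordinates are placeholders; a real caller derives them from map output metadata, as the BlockStoreShuffleReader change later in this diff does.

// Hypothetical consumer of ShuffleReadSupport (illustration only).
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.Optional;
import org.apache.spark.api.shuffle.ShuffleBlockInfo;
import org.apache.spark.api.shuffle.ShuffleReadSupport;

public class ReadSupportExample {
  static long countBytes(ShuffleReadSupport readSupport) throws IOException {
    // Placeholder coordinates; real callers build these from the MapOutputTracker.
    List<ShuffleBlockInfo> blocks = Arrays.asList(
        new ShuffleBlockInfo(0, 0, 5, 128L, Optional.empty()),
        new ShuffleBlockInfo(0, 1, 5, 256L, Optional.empty()));

    long total = 0;
    // Drain every stream handed back for the requested blocks.
    for (InputStream stream : readSupport.getPartitionReaders(blocks)) {
      try (InputStream in = stream) {
        while (in.read() != -1) {
          total++;
        }
      }
    }
    return total;
  }
}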
org/apache/spark/shuffle/sort/io/DefaultShuffleExecutorComponents.java
@@ -17,18 +17,24 @@

package org.apache.spark.shuffle.sort.io;

import org.apache.spark.MapOutputTracker;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkEnv;
import org.apache.spark.api.shuffle.ShuffleExecutorComponents;
import org.apache.spark.api.shuffle.ShuffleReadSupport;
import org.apache.spark.api.shuffle.ShuffleWriteSupport;
import org.apache.spark.serializer.SerializerManager;
import org.apache.spark.shuffle.IndexShuffleBlockResolver;
import org.apache.spark.shuffle.io.DefaultShuffleReadSupport;
import org.apache.spark.storage.BlockManager;

public class DefaultShuffleExecutorComponents implements ShuffleExecutorComponents {

private final SparkConf sparkConf;
private BlockManager blockManager;
private IndexShuffleBlockResolver blockResolver;
private MapOutputTracker mapOutputTracker;
private SerializerManager serializerManager;

public DefaultShuffleExecutorComponents(SparkConf sparkConf) {
this.sparkConf = sparkConf;
@@ -37,15 +43,30 @@ public DefaultShuffleExecutorComponents(SparkConf sparkConf) {
@Override
public void initializeExecutor(String appId, String execId) {
blockManager = SparkEnv.get().blockManager();
mapOutputTracker = SparkEnv.get().mapOutputTracker();
serializerManager = SparkEnv.get().serializerManager();
blockResolver = new IndexShuffleBlockResolver(sparkConf, blockManager);
}

@Override
public ShuffleWriteSupport writes() {
checkInitialized();
return new DefaultShuffleWriteSupport(sparkConf, blockResolver, blockManager.shuffleServerId());
}

@Override
public ShuffleReadSupport reads() {
checkInitialized();
return new DefaultShuffleReadSupport(blockManager,
mapOutputTracker,
serializerManager,
sparkConf);
}

private void checkInitialized() {
if (blockResolver == null) {
throw new IllegalStateException(
"Executor components must be initialized before getting writers.");
"Executor components must be initialized before getting writers.");
}
return new DefaultShuffleWriteSupport(sparkConf, blockResolver, blockManager.shuffleServerId());
}
}
core/src/main/scala/org/apache/spark/MapOutputTracker.scala (13 additions & 8 deletions)
@@ -283,7 +283,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging

// For testing
def getMapSizesByShuffleLocation(shuffleId: Int, reduceId: Int)
: Iterator[(ShuffleLocation, Seq[(BlockId, Long)])] = {
: Iterator[(Option[ShuffleLocation], Seq[(BlockId, Long)])] = {
getMapSizesByShuffleLocation(shuffleId, reduceId, reduceId + 1)
}

@@ -297,7 +297,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
* describing the shuffle blocks that are stored at that block manager.
*/
def getMapSizesByShuffleLocation(shuffleId: Int, startPartition: Int, endPartition: Int)
: Iterator[(ShuffleLocation, Seq[(BlockId, Long)])]
: Iterator[(Option[ShuffleLocation], Seq[(BlockId, Long)])]

/**
* Deletes map output status information for the specified shuffle stage.
@@ -647,7 +647,7 @@ private[spark] class MapOutputTrackerMaster(
// Get blocks sizes by executor Id. Note that zero-sized blocks are excluded in the result.
// This method is only called in local-mode.
def getMapSizesByShuffleLocation(shuffleId: Int, startPartition: Int, endPartition: Int)
: Iterator[(ShuffleLocation, Seq[(BlockId, Long)])] = {
: Iterator[(Option[ShuffleLocation], Seq[(BlockId, Long)])] = {
logDebug(s"Fetching outputs for shuffle $shuffleId, partitions $startPartition-$endPartition")
shuffleStatuses.get(shuffleId) match {
case Some (shuffleStatus) =>
@@ -684,7 +684,7 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTracker

// Get blocks sizes by executor Id. Note that zero-sized blocks are excluded in the result.
override def getMapSizesByShuffleLocation(shuffleId: Int, startPartition: Int, endPartition: Int)
: Iterator[(ShuffleLocation, Seq[(BlockId, Long)])] = {
: Iterator[(Option[ShuffleLocation], Seq[(BlockId, Long)])] = {
logDebug(s"Fetching outputs for shuffle $shuffleId, partitions $startPartition-$endPartition")
val statuses = getStatuses(shuffleId)
try {
@@ -873,9 +873,9 @@ private[spark] object MapOutputTracker extends Logging {
shuffleId: Int,
startPartition: Int,
endPartition: Int,
statuses: Array[MapStatus]): Iterator[(ShuffleLocation, Seq[(BlockId, Long)])] = {
statuses: Array[MapStatus]): Iterator[(Option[ShuffleLocation], Seq[(BlockId, Long)])] = {
assert (statuses != null)
val splitsByAddress = new HashMap[ShuffleLocation, ListBuffer[(BlockId, Long)]]
val splitsByAddress = new HashMap[Option[ShuffleLocation], ListBuffer[(BlockId, Long)]]
for ((status, mapId) <- statuses.iterator.zipWithIndex) {
if (status == null) {
val errorMessage = s"Missing an output location for shuffle $shuffleId"
Expand All @@ -885,9 +885,14 @@ private[spark] object MapOutputTracker extends Logging {
for (part <- startPartition until endPartition) {
val size = status.getSizeForBlock(part)
if (size != 0) {
val shuffleLoc = status.mapShuffleLocations.getLocationForBlock(part)
splitsByAddress.getOrElseUpdate(shuffleLoc, ListBuffer()) +=
if (status.mapShuffleLocations == null) {
splitsByAddress.getOrElseUpdate(Option.empty, ListBuffer()) +=
((ShuffleBlockId(shuffleId, mapId, part), size))
} else {
val shuffleLoc = status.mapShuffleLocations.getLocationForBlock(part)
splitsByAddress.getOrElseUpdate(Option.apply(shuffleLoc), ListBuffer()) +=
((ShuffleBlockId(shuffleId, mapId, part), size))
}
}
}
}
core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala (10 additions & 2 deletions)
@@ -56,6 +56,8 @@ class TaskMetrics private[spark] () extends Serializable {
private val _diskBytesSpilled = new LongAccumulator
private val _peakExecutionMemory = new LongAccumulator
private val _updatedBlockStatuses = new CollectionAccumulator[(BlockId, BlockStatus)]
private var _decorFunc: TempShuffleReadMetrics => TempShuffleReadMetrics =
Predef.identity[TempShuffleReadMetrics]

/**
* Time taken on the executor to deserialize this task.
@@ -187,11 +189,17 @@ class TaskMetrics private[spark] () extends Serializable {
* be lost.
*/
private[spark] def createTempShuffleReadMetrics(): TempShuffleReadMetrics = synchronized {
val readMetrics = new TempShuffleReadMetrics
tempShuffleReadMetrics += readMetrics
val tempShuffleMetrics = new TempShuffleReadMetrics
val readMetrics = _decorFunc(tempShuffleMetrics)
tempShuffleReadMetrics += tempShuffleMetrics
readMetrics
}

private[spark] def decorateTempShuffleReadMetrics(
decorFunc: TempShuffleReadMetrics => TempShuffleReadMetrics): Unit = synchronized {
_decorFunc = decorFunc
}

/**
* Merge values across all temporary [[ShuffleReadMetrics]] into `_shuffleReadMetrics`.
* This is expected to be called on executor heartbeat and at the end of a task.
org/apache/spark/shuffle/BlockStoreShuffleReader.scala

@@ -17,11 +17,18 @@
package org.apache.spark.shuffle

import java.io.InputStream

import scala.collection.JavaConverters._

import org.apache.spark._
import org.apache.spark.api.java.Optional
import org.apache.spark.api.shuffle.{ShuffleBlockInfo, ShuffleReadSupport}
import org.apache.spark.internal.{config, Logging}
import org.apache.spark.io.CompressionCodec
import org.apache.spark.serializer.SerializerManager
import org.apache.spark.shuffle.sort.DefaultMapShuffleLocations
import org.apache.spark.storage.{BlockId, BlockManager, ShuffleBlockFetcherIterator}
import org.apache.spark.shuffle.io.DefaultShuffleReadSupport
import org.apache.spark.storage.{ShuffleBlockFetcherIterator, ShuffleBlockId}
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.ExternalSorter

@@ -35,40 +42,68 @@ private[spark] class BlockStoreShuffleReader[K, C](
endPartition: Int,
context: TaskContext,
readMetrics: ShuffleReadMetricsReporter,
shuffleReadSupport: ShuffleReadSupport,
serializerManager: SerializerManager = SparkEnv.get.serializerManager,
blockManager: BlockManager = SparkEnv.get.blockManager,
mapOutputTracker: MapOutputTracker = SparkEnv.get.mapOutputTracker)
mapOutputTracker: MapOutputTracker = SparkEnv.get.mapOutputTracker,
sparkConf: SparkConf = SparkEnv.get.conf)
extends ShuffleReader[K, C] with Logging {

private val dep = handle.dependency

private val compressionCodec = CompressionCodec.createCodec(sparkConf)

private val compressShuffle = sparkConf.get(config.SHUFFLE_COMPRESS)

/** Read the combined key-values for this reduce task */
override def read(): Iterator[Product2[K, C]] = {
val wrappedStreams = new ShuffleBlockFetcherIterator(
context,
blockManager.shuffleClient,
blockManager,
mapOutputTracker.getMapSizesByShuffleLocation(handle.shuffleId, startPartition, endPartition)
.map {
case (loc: DefaultMapShuffleLocations, blocks: Seq[(BlockId, Long)]) =>
(loc.getBlockManagerId, blocks)
case _ =>
throw new UnsupportedOperationException("Not allowed to using non-default map shuffle" +
" locations yet.")
},
serializerManager.wrapStream,
// Note: we use getSizeAsMb when no suffix is provided for backwards compatibility
SparkEnv.get.conf.get(config.REDUCER_MAX_SIZE_IN_FLIGHT) * 1024 * 1024,
SparkEnv.get.conf.get(config.REDUCER_MAX_REQS_IN_FLIGHT),
SparkEnv.get.conf.get(config.REDUCER_MAX_BLOCKS_IN_FLIGHT_PER_ADDRESS),
SparkEnv.get.conf.get(config.MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM),
SparkEnv.get.conf.get(config.SHUFFLE_DETECT_CORRUPT),
readMetrics).toCompletionIterator
val streamsIterator =
shuffleReadSupport.getPartitionReaders(new Iterable[ShuffleBlockInfo] {
override def iterator: Iterator[ShuffleBlockInfo] = {
mapOutputTracker
.getMapSizesByShuffleLocation(handle.shuffleId, startPartition, endPartition)
.flatMap { shuffleLocationInfo =>
shuffleLocationInfo._2.map { blockInfo =>
val block = blockInfo._1.asInstanceOf[ShuffleBlockId]
new ShuffleBlockInfo(
block.shuffleId,
block.mapId,
block.reduceId,
blockInfo._2,
Optional.ofNullable(shuffleLocationInfo._1.orNull))
}
}
}
}.asJava).iterator()

val serializerInstance = dep.serializer.newInstance()
val retryingWrappedStreams = new Iterator[InputStream] {
override def hasNext: Boolean = streamsIterator.hasNext

// Create a key/value iterator for each stream
val recordIter = wrappedStreams.flatMap { case (blockId, wrappedStream) =>
override def next(): InputStream = {
var returnStream: InputStream = null
while (streamsIterator.hasNext && returnStream == null) {
if (shuffleReadSupport.isInstanceOf[DefaultShuffleReadSupport]) {
// The default implementation checks for corrupt streams, so it will already have
// decompressed/decrypted the bytes
returnStream = streamsIterator.next()
} else {
val nextStream = streamsIterator.next()
returnStream = if (compressShuffle) {
compressionCodec.compressedInputStream(
serializerManager.wrapForEncryption(nextStream))
} else {
serializerManager.wrapForEncryption(nextStream)
}
}
}
if (returnStream == null) {
throw new IllegalStateException("Expected shuffle reader iterator to return a stream")
}
returnStream
[Inline review thread]

Reviewer: Defensive programming: if wrappedStreams didn't have a next value and returnStream is null, it means we ran out of elements despite the underlying iterator claiming that there were indeed more elements, which indicates that retrying didn't work properly. Let's check that returnStream is not null here and report an error message accordingly.

Author (yifeih): Hmm, retrying not working properly would result in either wrappedStreams.next() or wrappedStreams.retryLastBlock() throwing a FetchFailedException, but you're right, we aren't guaranteed that other implementations will do this correctly. Will add the check.

Reviewer: Ping here? Think the last push doesn't address this.

Reviewer: Never mind, I was looking at a stale diff.
}
}

val serializerInstance = dep.serializer.newInstance()
val recordIter = retryingWrappedStreams.flatMap { wrappedStream =>
// Note: the asKeyValueIterator below wraps a key/value iterator inside of a
// NextIterator. The NextIterator makes sure that close() is called on the
// underlying InputStream when all records have been read.