-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-9853][Core] Optimize shuffle fetch of contiguous partition IDs #19788
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e947bcb
53affd4
e437a26
12163f3
7aa805b
9cb1f0f
2799886
80c8da9
b410f9c
85a4f49
c7db28d
ff4bece
80bb53c
e18eb59
69cde07
978e10d
2a1a97c
d8691ee
f30f057
6c684bc
a9ab62e
87c9af6
7fa0fb9
ab4aa5f
e5e7c38
6ee294a
583666b
c133776
fc0fe77
7009a5e
4a31e9c
08d2ca1
5efa1af
3412bcb
63d9eb1
a0cd415
3da48d8
c650e65
752f90c
b91c3e9
9ffe59d
9354ed0
0f49142
535fe5f
087422f
2191f8d
2ca4092
6496abf
e91dbd5
6cb7110
05e4465
eb752aa
694ec2d
45d7096
4850d33
28355b1
7a65cc4
a9ffdee
96b0082
ace48fd
86aa1fb
fcf434b
a360bbf
2f28a7e
5bd6d73
aa6134b
c6ebe0e
be54f49
1c91d20
f810e28
8a1815f
076894f
b33b4dd
91aeff9
ea1795a
dd980dd
e9d8620
5933bf8
57fab14
c75e016
8398120
2424be0
bd9f70e
5e4430a
401bddb
3d4fc7e
039ae85
92c0ab6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -161,22 +161,69 @@ public void registerExecutor( | |
| executors.put(fullId, executorInfo); | ||
| } | ||
|
|
||
| // For testing | ||
| public ManagedBuffer getBlockData( | ||
| String appId, | ||
| String execId, | ||
| int shuffleId, | ||
| int mapId, | ||
| int reduceId) { | ||
| return getBlockData(appId, execId, shuffleId, mapId, reduceId, 1); | ||
| } | ||
|
|
||
| /** | ||
| * Obtains a FileSegmentManagedBuffer from (shuffleId, mapId, reduceId). We make assumptions | ||
| * about how the hash and sort based shuffles store their data. | ||
| * Obtains a FileSegmentManagedBuffer from (shuffleId, mapId, reduceId, numReducers). We make | ||
| * assumptions about how the hash and sort based shuffles store their data. | ||
| */ | ||
| public ManagedBuffer getBlockData( | ||
| String appId, | ||
| String execId, | ||
| int shuffleId, | ||
| int mapId, | ||
| int reduceId) { | ||
| int reduceId, | ||
| int numReducers) { | ||
| ExecutorShuffleInfo executor = executors.get(new AppExecId(appId, execId)); | ||
| if (executor == null) { | ||
| throw new RuntimeException( | ||
| String.format("Executor is not registered (appId=%s, execId=%s)", appId, execId)); | ||
| } | ||
| return getSortBasedShuffleBlockData(executor, shuffleId, mapId, reduceId); | ||
| return getSortBasedShuffleBlockData(executor, shuffleId, mapId, reduceId, numReducers); | ||
| } | ||
|
|
||
| public static boolean isShuffleBlock(String[] blockIdParts) { | ||
| return blockIdParts.length == 4 && blockIdParts[0].equals("shuffle"); | ||
| } | ||
|
|
||
| public static int[] getBlockIdParts(String blockId) { | ||
| String[] blockIdParts = blockId.split("_"); | ||
| if (!isShuffleBlock(blockIdParts)) { | ||
| throw new IllegalArgumentException("Unexpected shuffle block id format: " + blockId); | ||
| } | ||
| return new int[] { Integer.parseInt(blockIdParts[2]), Integer.parseInt(blockIdParts[3]) }; | ||
| } | ||
|
|
||
| // Currently, for all input blockIds, we can assume that block ids with the same mapper id | ||
| // are consecutive in the map output file. Logically they might not be consecutive because | ||
| // of zero-sized blocks, but those have already been filtered out on the client side. | ||
| public static ArrayList<ArrayList<int[]>> mergeContinuousShuffleBlockIds(String[] blockIds) { | ||
| ArrayList<int[]> shuffleBlockIds = new ArrayList<>(); | ||
| ArrayList<ArrayList<int[]>> arrayShuffleBlockIds = new ArrayList<>(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we only need to return
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Initially, I want to keep it the same as
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, seems like numBlocks is not enough, which includes possible zero size blocks. |
||
|
|
||
| for (String blockId: blockIds) { | ||
| int[] blockIdParts = getBlockIdParts(blockId); | ||
| if (shuffleBlockIds.size() == 0) { | ||
| shuffleBlockIds.add(blockIdParts); | ||
| } else { | ||
| if (blockIdParts[0] != shuffleBlockIds.get(0)[0]) { | ||
| arrayShuffleBlockIds.add(shuffleBlockIds); | ||
| shuffleBlockIds = new ArrayList<>(); | ||
| } | ||
| shuffleBlockIds.add(blockIdParts); | ||
| } | ||
| } | ||
| arrayShuffleBlockIds.add(shuffleBlockIds); | ||
|
|
||
| return arrayShuffleBlockIds; | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -280,13 +327,14 @@ public boolean accept(File dir, String name) { | |
| * and the block id format is from ShuffleDataBlockId and ShuffleIndexBlockId. | ||
| */ | ||
| private ManagedBuffer getSortBasedShuffleBlockData( | ||
| ExecutorShuffleInfo executor, int shuffleId, int mapId, int reduceId) { | ||
| ExecutorShuffleInfo executor, int shuffleId, int mapId, int reduceId, int numReducers) { | ||
| File indexFile = getFile(executor.localDirs, executor.subDirsPerLocalDir, | ||
| "shuffle_" + shuffleId + "_" + mapId + "_0.index"); | ||
|
|
||
| try { | ||
| ShuffleIndexInformation shuffleIndexInformation = shuffleIndexCache.get(indexFile); | ||
| ShuffleIndexRecord shuffleIndexRecord = shuffleIndexInformation.getIndex(reduceId); | ||
| ShuffleIndexRecord shuffleIndexRecord = | ||
| shuffleIndexInformation.getIndex(reduceId, numReducers); | ||
| return new FileSegmentManagedBuffer( | ||
| conf, | ||
| getFile(executor.localDirs, executor.subDirsPerLocalDir, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,7 @@ | |
| import java.io.IOException; | ||
| import java.nio.ByteBuffer; | ||
| import java.util.Arrays; | ||
| import java.util.ArrayList; | ||
|
|
||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
@@ -50,6 +51,10 @@ public class OneForOneBlockFetcher { | |
| private final TransportClient client; | ||
| private final OpenBlocks openMessage; | ||
| private final String[] blockIds; | ||
| // In adaptive execution, one returned chunk might contain data for several consecutive blockIds; | ||
| // blockIdIndices records the mapping between each chunk and its block ids: chunk i covers the | ||
| // half-open range blockIds[blockIdIndices[i]] through blockIds[blockIdIndices[i + 1] - 1]. | ||
| private int[] blockIdIndices = null; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a comment to explain the relationship between blocks and chunks. |
||
| private final BlockFetchingListener listener; | ||
| private final ChunkReceivedCallback chunkCallback; | ||
| private final TransportConf transportConf; | ||
|
|
@@ -64,7 +69,7 @@ public OneForOneBlockFetcher( | |
| String[] blockIds, | ||
| BlockFetchingListener listener, | ||
| TransportConf transportConf) { | ||
| this(client, appId, execId, blockIds, listener, transportConf, null); | ||
| this(client, appId, execId, blockIds, listener, transportConf, null, false); | ||
| } | ||
|
|
||
| public OneForOneBlockFetcher( | ||
|
|
@@ -74,9 +79,10 @@ public OneForOneBlockFetcher( | |
| String[] blockIds, | ||
| BlockFetchingListener listener, | ||
| TransportConf transportConf, | ||
| DownloadFileManager downloadFileManager) { | ||
| DownloadFileManager downloadFileManager, | ||
| boolean fetchContinuousShuffleBlocksInBatch) { | ||
| this.client = client; | ||
| this.openMessage = new OpenBlocks(appId, execId, blockIds); | ||
| this.openMessage = new OpenBlocks(appId, execId, blockIds, fetchContinuousShuffleBlocksInBatch); | ||
| this.blockIds = blockIds; | ||
| this.listener = listener; | ||
| this.chunkCallback = new ChunkCallback(); | ||
|
|
@@ -89,13 +95,15 @@ private class ChunkCallback implements ChunkReceivedCallback { | |
| @Override | ||
| public void onSuccess(int chunkIndex, ManagedBuffer buffer) { | ||
| // On receipt of a chunk, pass it upwards as a block. | ||
| listener.onBlockFetchSuccess(blockIds[chunkIndex], buffer); | ||
| listener.onBlockFetchSuccess(Arrays.copyOfRange(blockIds, blockIdIndices[chunkIndex], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a way to avoid copy? e.g. if we change the callback interface to not take |
||
| blockIdIndices[chunkIndex + 1]), buffer); | ||
| } | ||
|
|
||
| @Override | ||
| public void onFailure(int chunkIndex, Throwable e) { | ||
| // On receipt of a failure, fail every block from chunkIndex onwards. | ||
| String[] remainingBlockIds = Arrays.copyOfRange(blockIds, chunkIndex, blockIds.length); | ||
| String[] remainingBlockIds = Arrays.copyOfRange(blockIds, blockIdIndices[chunkIndex], | ||
| blockIds.length); | ||
| failRemainingBlocks(remainingBlockIds, e); | ||
| } | ||
| } | ||
|
|
@@ -117,6 +125,25 @@ public void onSuccess(ByteBuffer response) { | |
| streamHandle = (StreamHandle) BlockTransferMessage.Decoder.fromByteBuffer(response); | ||
| logger.trace("Successfully opened blocks {}, preparing to fetch chunks.", streamHandle); | ||
|
|
||
| // initialize blockIdIndices | ||
| if (streamHandle.numChunks == blockIds.length) { | ||
| blockIdIndices = new int[streamHandle.numChunks + 1]; | ||
| for (int i = 0; i < blockIdIndices.length; i++) { | ||
| blockIdIndices[i] = i; | ||
| } | ||
| } else { | ||
| // server fetches continuous shuffle blocks in batch | ||
| ArrayList<ArrayList<int[]>> arrayShuffleBlockIds = | ||
| ExternalShuffleBlockResolver.mergeContinuousShuffleBlockIds(blockIds); | ||
| assert(streamHandle.numChunks == arrayShuffleBlockIds.size()); | ||
| blockIdIndices = new int[arrayShuffleBlockIds.size() + 1]; | ||
| blockIdIndices[0] = 0; | ||
| for (int i = 1; i < blockIdIndices.length; i++) { | ||
| blockIdIndices[i] = blockIdIndices[i - 1] + arrayShuffleBlockIds.get(i - 1).size(); | ||
| } | ||
| } | ||
| assert blockIdIndices[blockIdIndices.length - 1] == blockIds.length; | ||
|
|
||
| // Immediately request all chunks -- we expect that the total size of the request is | ||
| // reasonable due to higher level chunking in [[ShuffleBlockFetcherIterator]]. | ||
| for (int i = 0; i < streamHandle.numChunks; i++) { | ||
|
|
@@ -143,12 +170,10 @@ public void onFailure(Throwable e) { | |
|
|
||
| /** Invokes the "onBlockFetchFailure" callback for every listed block id. */ | ||
| private void failRemainingBlocks(String[] failedBlockIds, Throwable e) { | ||
| for (String blockId : failedBlockIds) { | ||
| try { | ||
| listener.onBlockFetchFailure(blockId, e); | ||
| } catch (Exception e2) { | ||
| logger.error("Error in block fetch failure callback", e2); | ||
| } | ||
| try { | ||
| listener.onBlockFetchFailure(failedBlockIds, e); | ||
| } catch (Exception e2) { | ||
| logger.error("Error in block fetch failure callback", e2); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -173,7 +198,8 @@ public void onData(String streamId, ByteBuffer buf) throws IOException { | |
|
|
||
| @Override | ||
| public void onComplete(String streamId) throws IOException { | ||
| listener.onBlockFetchSuccess(blockIds[chunkIndex], channel.closeAndRead()); | ||
| listener.onBlockFetchSuccess(Arrays.copyOfRange(blockIds, blockIdIndices[chunkIndex], | ||
| blockIdIndices[chunkIndex + 1]), channel.closeAndRead()); | ||
| if (!downloadFileManager.registerTempFileToClean(targetFile)) { | ||
| targetFile.delete(); | ||
| } | ||
|
|
@@ -183,7 +209,8 @@ public void onComplete(String streamId) throws IOException { | |
| public void onFailure(String streamId, Throwable cause) throws IOException { | ||
| channel.close(); | ||
| // On receipt of a failure, fail every block from chunkIndex onwards. | ||
| String[] remainingBlockIds = Arrays.copyOfRange(blockIds, chunkIndex, blockIds.length); | ||
| String[] remainingBlockIds = | ||
| Arrays.copyOfRange(blockIds, blockIdIndices[chunkIndex], blockIds.length); | ||
| failRemainingBlocks(remainingBlockIds, cause); | ||
| targetFile.delete(); | ||
| } | ||
|
|
||
This comment was marked as resolved.
Sorry, something went wrong.
Uh oh!
There was an error while loading. Please reload this page.
This comment was marked as resolved.
Sorry, something went wrong.
Uh oh!
There was an error while loading. Please reload this page.