Closed

25 commits
f427ab5
Register map output metadata upon committing shuffle map tasks.
mccheah Mar 17, 2020
65d077d
Add a mock version for tests.
mccheah May 23, 2020
c2882ab
Update manifests
mccheah Jun 3, 2020
dc6f853
Add a test for specifically checking map output registration and backup
mccheah Jun 3, 2020
529122f
Return map output commit message from single spill writer
mccheah Jun 23, 2020
d20a0ee
Address comments and fix build
mccheah Jun 23, 2020
dc8d15c
Temporarily remove tests
mccheah Jun 23, 2020
e340bca
Address comments
mccheah Jul 30, 2020
edd5c05
Merge remote-tracking branch 'origin/master' into register-map-output…
mccheah Jul 30, 2020
7baa3d2
Fix Java linting errors
mccheah Jul 31, 2020
cce67ab
Fix infinite loop.
mccheah Jul 31, 2020
409bebb
Fix checkstyle
mccheah Jul 31, 2020
a6fabd2
Fix test
mccheah Aug 1, 2020
3c66353
Invoke super.afterEach in ShuffleDriverComponentsSuite
mccheah Aug 3, 2020
b88d724
Fix build
mccheah Aug 3, 2020
3505af8
Merge remote-tracking branch 'origin/master' into register-map-output…
mccheah Aug 3, 2020
89bb528
Address comments.
mccheah Aug 10, 2020
0da9e22
Address comments
mccheah Sep 10, 2020
2b5108f
Merge remote-tracking branch 'origin/master' into register-map-output…
mccheah Sep 10, 2020
1e10577
Merge branch 'master' into pr/28618
attilapiros Oct 15, 2020
cac0e9e
apply Attila's review comments
attilapiros Oct 16, 2020
861f089
remove unused import
attilapiros Oct 16, 2020
a6d974c
Merge pull request #15 from attilapiros/updated_28618
mccheah Oct 16, 2020
e210160
fix mima
attilapiros Oct 17, 2020
f69cba7
Merge pull request #16 from attilapiros/updated_28618
mccheah Oct 17, 2020
4 changes: 4 additions & 0 deletions core/pom.xml
@@ -229,6 +229,10 @@
       <groupId>org.scala-lang.modules</groupId>
       <artifactId>scala-xml_${scala.binary.version}</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.scala-lang.modules</groupId>
+      <artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.scala-lang</groupId>
       <artifactId>scala-library</artifactId>
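For context on why this dependency appears: the Java shuffle writers later in this diff convert the java.util.Optional returned by MapOutputCommitMessage#getMapOutputMetadata() into a Scala Option via scala-java8-compat's OptionConverters. A minimal, self-contained illustration of that conversion (the demo class and method names here are ours, not part of the patch):

  import java.util.Optional;

  import scala.Option;
  import scala.compat.java8.OptionConverters;

  final class OptionConvertersDemo {
    // Bridges a Java Optional into a Scala Option, the same conversion
    // BypassMergeSortShuffleWriter performs on the optional map output metadata.
    static Option<String> toScalaOption(Optional<String> javaOpt) {
      return OptionConverters.toScala(javaOpt);
    }
  }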
12 changes: 10 additions & 2 deletions core/src/main/java/org/apache/spark/shuffle/api/ShuffleDataIO.java
@@ -17,6 +17,8 @@
 
 package org.apache.spark.shuffle.api;
 
+import java.util.Map;
+
 import org.apache.spark.annotation.Private;
 
 /**
@@ -44,12 +46,18 @@ public interface ShuffleDataIO {
   /**
    * Called once on executor processes to bootstrap the shuffle data storage modules that
    * are only invoked on the executors.
+   *
+   * @param appId The Spark application id
+   * @param execId The unique identifier of the executor being initialized
+   * @param extraConfigs Extra configs that were returned by
+   *                     {@link ShuffleDriverComponents#getAddedExecutorSparkConf()}
    */
-  ShuffleExecutorComponents executor();
+  ShuffleExecutorComponents initializeShuffleExecutorComponents(
+      String appId, String execId, Map<String, String> extraConfigs);
 
   /**
    * Called once on driver process to bootstrap the shuffle metadata modules that
    * are maintained by the driver.
    */
-  ShuffleDriverComponents driver();
+  ShuffleDriverComponents initializeShuffleDriverComponents();
 }
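To make the renamed entry points concrete, here is a small sketch of the intended call order (the helper class is ours, not part of the patch): the driver components are created first, the extra conf they return is shipped to executors, and each executor passes it back into the executor-side bootstrap.

  import java.util.Map;

  import org.apache.spark.shuffle.api.ShuffleDataIO;
  import org.apache.spark.shuffle.api.ShuffleDriverComponents;
  import org.apache.spark.shuffle.api.ShuffleExecutorComponents;

  final class ShuffleDataIOBootstrapSketch {
    // Driver side, once per application: build the driver components and
    // collect the extra executor conf they want to ship.
    static Map<String, String> bootstrapDriver(ShuffleDataIO dataIO) {
      ShuffleDriverComponents driver = dataIO.initializeShuffleDriverComponents();
      return driver.getAddedExecutorSparkConf();
    }

    // Executor side, once per executor: the driver-provided configs come back
    // in through extraConfigs.
    static ShuffleExecutorComponents bootstrapExecutor(
        ShuffleDataIO dataIO, String appId, String execId, Map<String, String> extraConfigs) {
      return dataIO.initializeShuffleExecutorComponents(appId, execId, extraConfigs);
    }
  }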
core/src/main/java/org/apache/spark/shuffle/api/ShuffleDriverComponents.java
@@ -20,6 +20,8 @@
 import java.util.Map;
 
 import org.apache.spark.annotation.Private;
+import org.apache.spark.shuffle.api.metadata.NoOpShuffleOutputTracker;
+import org.apache.spark.shuffle.api.metadata.ShuffleOutputTracker;
 
 /**
  * :: Private ::
@@ -29,36 +31,21 @@
 public interface ShuffleDriverComponents {
 
   /**
-   * Called once in the driver to bootstrap this module that is specific to this application.
-   * This method is called before submitting executor requests to the cluster manager.
-   *
-   * This method should prepare the module with its shuffle components i.e. registering against
-   * an external file servers or shuffle services, or creating tables in a shuffle
-   * storage data database.
+   * Provides additional configuration for the executors when their plugin system is initialized
+   * via {@link ShuffleDataIO#initializeShuffleExecutorComponents(String, String, Map)}.
    *
    * @return additional SparkConf settings necessary for initializing the executor components.
    * This would include configurations that cannot be statically set on the application, like
    * the host:port of external services for shuffle storage.
    */
-  Map<String, String> initializeApplication();
+  Map<String, String> getAddedExecutorSparkConf();
 
   /**
    * Called once at the end of the Spark application to clean up any existing shuffle state.
    */
   void cleanupApplication();
 
-  /**
-   * Called once per shuffle id when the shuffle id is first generated for a shuffle stage.
-   *
-   * @param shuffleId The unique identifier for the shuffle stage.
-   */
-  default void registerShuffle(int shuffleId) {}
-
-  /**
-   * Removes shuffle data associated with the given shuffle.
-   *
-   * @param shuffleId The unique identifier for the shuffle stage.
-   * @param blocking Whether this call should block on the deletion of the data.
-   */
-  default void removeShuffle(int shuffleId, boolean blocking) {}
+  default ShuffleOutputTracker shuffleOutputTracker() {
+    return new NoOpShuffleOutputTracker();
+  }
 }
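A minimal driver-side implementation under the revised interface might look like the sketch below (the class name and config key are made up for illustration). Leaving shuffleOutputTracker() un-overridden picks up the NoOpShuffleOutputTracker default introduced by this patch.

  import java.util.Collections;
  import java.util.Map;

  import org.apache.spark.shuffle.api.ShuffleDriverComponents;

  public class ExampleDriverComponents implements ShuffleDriverComponents {
    @Override
    public Map<String, String> getAddedExecutorSparkConf() {
      // Values only known once the driver is up - e.g. the address of an
      // external shuffle storage service - travel to executors via this map.
      return Collections.singletonMap("spark.shuffle.example.storage.address", "host:1234");
    }

    @Override
    public void cleanupApplication() {
      // Tear down application-wide shuffle state (external tables, files, ...).
    }

    // shuffleOutputTracker() is intentionally not overridden, so the default
    // NoOpShuffleOutputTracker is used and no output metadata is tracked.
  }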
core/src/main/java/org/apache/spark/shuffle/api/ShuffleExecutorComponents.java
@@ -18,7 +18,6 @@
 package org.apache.spark.shuffle.api;
 
 import java.io.IOException;
-import java.util.Map;
 import java.util.Optional;
 
 import org.apache.spark.annotation.Private;
@@ -32,17 +31,6 @@
 @Private
 public interface ShuffleExecutorComponents {
 
-  /**
-   * Called once per executor to bootstrap this module with state that is specific to
-   * that executor, specifically the application ID and executor ID.
-   *
-   * @param appId The Spark application id
-   * @param execId The unique identifier of the executor being initialized
-   * @param extraConfigs Extra configs that were returned by
-   *                     {@link ShuffleDriverComponents#initializeApplication()}
-   */
-  void initializeExecutor(String appId, String execId, Map<String, String> extraConfigs);
-
   /**
    * Called once per map task to create a writer that will be responsible for persisting all the
    * partitioned bytes written by that map task.
core/src/main/java/org/apache/spark/shuffle/api/SingleSpillShuffleMapOutputWriter.java
@@ -21,6 +21,7 @@
 import java.io.IOException;
 
 import org.apache.spark.annotation.Private;
+import org.apache.spark.shuffle.api.metadata.MapOutputCommitMessage;
 
 /**
  * Optional extension for partition writing that is optimized for transferring a single
@@ -32,5 +33,6 @@ public interface SingleSpillShuffleMapOutputWriter {
   /**
    * Transfer a file that contains the bytes of all the partitions written by this map task.
    */
-  void transferMapSpillFile(File mapOutputFile, long[] partitionLengths) throws IOException;
+  MapOutputCommitMessage transferMapSpillFile(File mapOutputFile, long[] partitionLengths)
+      throws IOException;
 }
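The signature change means single-spill transfers now report a commit message instead of returning void. Below is a sketch of a local-disk style implementation; note that MapOutputCommitMessage's constructors are not shown in this diff, so MapOutputCommitMessage.of(partitionLengths) is our assumption about its factory method, and the class itself is illustrative only.

  import java.io.File;
  import java.io.IOException;
  import java.nio.file.Files;
  import java.nio.file.StandardCopyOption;

  import org.apache.spark.shuffle.api.SingleSpillShuffleMapOutputWriter;
  import org.apache.spark.shuffle.api.metadata.MapOutputCommitMessage;

  public class LocalSingleSpillWriter implements SingleSpillShuffleMapOutputWriter {
    private final File committedOutputFile;

    public LocalSingleSpillWriter(File committedOutputFile) {
      this.committedOutputFile = committedOutputFile;
    }

    @Override
    public MapOutputCommitMessage transferMapSpillFile(
        File mapOutputFile, long[] partitionLengths) throws IOException {
      // Move the single spill file to its final location, then report the
      // partition lengths back; this example attaches no extra metadata.
      Files.move(mapOutputFile.toPath(), committedOutputFile.toPath(),
          StandardCopyOption.REPLACE_EXISTING);
      return MapOutputCommitMessage.of(partitionLengths); // assumed factory method
    }
  }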
core/src/main/java/org/apache/spark/shuffle/api/metadata/NoOpShuffleOutputTracker.java (new file)
@@ -0,0 +1,37 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.api.metadata;

/**
* An implementation of shuffle output tracking that does not keep track of any shuffle state.
*/
public class NoOpShuffleOutputTracker implements ShuffleOutputTracker {

  @Override
  public void registerShuffle(int shuffleId) {}

  @Override
  public void unregisterShuffle(int shuffleId, boolean blocking) {}

  @Override
  public void registerMapOutput(
      int shuffleId, int mapIndex, long mapId, MapOutputMetadata mapOutputMetadata) {}

  @Override
  public void removeMapOutput(int shuffleId, int mapIndex, long mapId) {}
}
core/src/main/java/org/apache/spark/shuffle/api/metadata/ShuffleOutputTracker.java (new file)
@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.api.metadata;

/**
* :: Private ::
*
 * A plugin that can monitor the storage of shuffle data from map tasks, and can provide
 * metadata to shuffle readers to aid their reading of shuffle blocks in reduce tasks.
 * <p>
 * {@link MapOutputMetadata} instances provided from the plugin tree's implementation of
 * {@link org.apache.spark.shuffle.api.ShuffleMapOutputWriter} are sent to this module's map output
 * metadata registration method in {@link #registerMapOutput(int, int, long, MapOutputMetadata)}.
 * <p>
 * Implementations MUST be thread-safe. Spark will invoke methods in this module in parallel.
 * <p>
 * A singleton instance of this module is instantiated on the driver via
 * {@link ShuffleDriverComponents#shuffleOutputTracker()}.
 */

Contributor: assume this is on the driver, might be nice just to mention.

Contributor: Add a note about what locking semantics are when the methods are invoked?

Contributor Author (mccheah): I'm not sure what can be added. Can you give an example of guidance we could give to developers?

Contributor (mridulm, Sep 10, 2020): As an example, registerShuffle is invoked with the write lock held, while unregisterShuffle is invoked without any lock. As a public API we are exposing, the locking semantics need to be elaborated, so that users can evaluate what can be done safely/without contention and what needs additional protection (for example, updateMapOutput can be performed without contention from registerShuffle or registerMapOutput).

Additionally, it also informs users about how to design their implementations. For example, if an implementation makes RPC calls (or other blocking calls) from registerMapOutput, there could be nontrivial side effects in the driver.

(I just picked some API calls to illustrate; we need to elaborate for each.)

Contributor Author (mccheah): Is it sufficient to simply say that all APIs need to have exclusive access from each other, at least locking on the shuffle id level?

Contributor: unregisterShuffle currently does not lock?

Contributor Author (mccheah): Again, that's up to the implementation to handle, no?

Contributor: I was not referring to how implementations manage their state/locking (if they have additional state/coordination required, you are right, it has to be handled there - state managed across shuffles as an example for some custom impl).

I was referring to the guarantees/expectations that implementations have from Spark (MapOutputTracker currently) when these methods are invoked. We have to document this, so that implementations are aware of the MT-safety guarantees we provide (and at what granularity) and can make reasonable assumptions about what can/can't be done in an implementation.

For example, as mentioned above, registerShuffle is invoked only with the write lock held for a particular shuffle, while unregisterShuffle does not have that guarantee.

Contributor Author (mccheah): The whole point of just saying that implementations must be thread-safe is that implementations cannot make any guarantees about how the caller will invoke this API. Even if we can say that the current implementation of MapOutputTracker would guarantee locking per shuffle id, for example, that isn't an API guarantee we can realistically make, since MapOutputTracker is a module internal to Spark.

I think saying that there is no guarantee about concurrent access to all methods suffices. But then that basically means "the implementation should be thread-safe". Is that reasonable, or do we need to be more granular than that?
public interface ShuffleOutputTracker {

  /**
   * Called when a new shuffle stage is going to be run.
   *
   * @param shuffleId the unique identifier for the new shuffle stage
   */
  void registerShuffle(int shuffleId);

Contributor: A callback to signal completion (or failure) of shuffle for a shuffleId will help ShuffleDriverComponents coordinate with external services to commit (or clean up) shuffle metadata/files.

Contributor Author (mccheah): I'm hesitant to make significant API changes given how late this is in the review stage - let's add it as a follow-up if there are concrete use cases that require it down the road.

Contributor: Looking at the currently designed API, this is a missing gap.

Contributor Author (mccheah): Again - I don't want to add it here, since it would require further integration of such a new API call in the rest of the Spark codebase, which I am holding as out of scope for this patch.

Contributor (mridulm, Sep 12, 2020): This is a gap in the proposed API, which limits the effectiveness of leveraging it for other shuffle use cases. +CC @otterc PTAL if, without this, shuffle changes can be made for your patches. An alternative would be to do this in subsequent or other efforts to complete the API.

Contributor Author (mccheah): This API isn't complete as of this patch anyway, so I'd prefer that functionality to be deferred, since the complexity being added here is already pretty significant. Can we file a follow-up JIRA for it and go from there?

  /**
   * Called when the shuffle with the given id is unregistered because it will no longer
   * be used by Spark tasks.
   *
   * @param shuffleId the unique identifier for the shuffle stage to be unregistered
   * @param blocking  whether this call should block on the removal of the shuffle state
   */
  void unregisterShuffle(int shuffleId, boolean blocking);

  /**
   * Called when a map task completes, and the map output writer has provided metadata to be
   * persisted by this shuffle output tracker.
   *
   * @param shuffleId         the unique identifier for the shuffle stage that the map task
   *                          is a part of
   * @param mapIndex          the map index of the map task in its shuffle map stage - not
   *                          necessarily unique across multiple attempts of this task
   * @param mapId             the identifier for this map task, which is unique even across
   *                          multiple attempts at this task
   * @param mapOutputMetadata metadata about the map output data's storage returned by the
   *                          map task's writer
   */
  void registerMapOutput(
      int shuffleId, int mapIndex, long mapId, MapOutputMetadata mapOutputMetadata);

  /**
   * Called when the given map output is discarded and will no longer be used in future Spark
   * shuffles.
   *
   * @param shuffleId the unique identifier for the shuffle stage that the map task is a
   *                  part of
   * @param mapIndex  the map index of the map task whose output is being discarded - not
   *                  necessarily unique across multiple attempts of this task
   * @param mapId     the identifier for the map task whose output is being discarded, which
   *                  is unique even across multiple attempts at this task
   */
  void removeMapOutput(int shuffleId, int mapIndex, long mapId);
}
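Since the interface leaves concurrency guarantees to the caller (see the thread above), a defensive implementation should be safe under arbitrary interleavings. Below is a small in-memory sketch built on concurrent collections; the class is our own illustration, not part of the patch, and a real plugin would persist metadata to external storage rather than just record map IDs.

  import java.util.Map;
  import java.util.Set;
  import java.util.concurrent.ConcurrentHashMap;

  import org.apache.spark.shuffle.api.metadata.MapOutputMetadata;
  import org.apache.spark.shuffle.api.metadata.ShuffleOutputTracker;

  public class InMemoryShuffleOutputTracker implements ShuffleOutputTracker {
    // shuffleId -> mapIds with registered output; concurrent structures because
    // Spark may invoke these methods in parallel.
    private final Map<Integer, Set<Long>> outputsByShuffle = new ConcurrentHashMap<>();

    @Override
    public void registerShuffle(int shuffleId) {
      outputsByShuffle.putIfAbsent(shuffleId, ConcurrentHashMap.newKeySet());
    }

    @Override
    public void unregisterShuffle(int shuffleId, boolean blocking) {
      // Removal from an in-memory map is synchronous, so `blocking` has no
      // effect here; a tracker doing asynchronous cleanup would honor it.
      outputsByShuffle.remove(shuffleId);
    }

    @Override
    public void registerMapOutput(
        int shuffleId, int mapIndex, long mapId, MapOutputMetadata mapOutputMetadata) {
      Set<Long> mapIds = outputsByShuffle.get(shuffleId);
      if (mapIds != null) {
        mapIds.add(mapId);
      }
    }

    @Override
    public void removeMapOutput(int shuffleId, int mapIndex, long mapId) {
      Set<Long> mapIds = outputsByShuffle.get(shuffleId);
      if (mapIds != null) {
        mapIds.remove(mapId);
      }
    }
  }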
core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java
@@ -30,6 +30,7 @@
 import scala.Product2;
 import scala.Tuple2;
 import scala.collection.Iterator;
+import scala.compat.java8.OptionConverters;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.io.Closeables;
@@ -39,17 +40,18 @@
 import org.apache.spark.Partitioner;
 import org.apache.spark.ShuffleDependency;
 import org.apache.spark.SparkConf;
+import org.apache.spark.scheduler.MapTaskResult;
 import org.apache.spark.shuffle.api.ShuffleExecutorComponents;
 import org.apache.spark.shuffle.api.ShuffleMapOutputWriter;
 import org.apache.spark.shuffle.api.ShufflePartitionWriter;
 import org.apache.spark.shuffle.api.WritableByteChannelWrapper;
 import org.apache.spark.internal.config.package$;
 import org.apache.spark.scheduler.MapStatus;
 import org.apache.spark.scheduler.MapStatus$;
 import org.apache.spark.serializer.Serializer;
 import org.apache.spark.serializer.SerializerInstance;
 import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
 import org.apache.spark.shuffle.ShuffleWriter;
+import org.apache.spark.shuffle.api.metadata.MapOutputCommitMessage;
 import org.apache.spark.storage.*;
 import org.apache.spark.util.Utils;
@@ -92,8 +94,8 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> {
   /** Array of file writers, one for each partition */
   private DiskBlockObjectWriter[] partitionWriters;
   private FileSegment[] partitionWriterSegments;
-  @Nullable private MapStatus mapStatus;
-  private long[] partitionLengths;
+  @Nullable private MapTaskResult taskResult;
+  private MapOutputCommitMessage mapOutputCommitMessage;
 
   /**
    * Are we in the process of stopping? Because map tasks can call stop() with success = true
@@ -130,9 +132,13 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
         .createMapOutputWriter(shuffleId, mapId, numPartitions);
     try {
       if (!records.hasNext()) {
-        partitionLengths = mapOutputWriter.commitAllPartitions().getPartitionLengths();
-        mapStatus = MapStatus$.MODULE$.apply(
-          blockManager.shuffleServerId(), partitionLengths, mapId);
+        mapOutputCommitMessage = mapOutputWriter.commitAllPartitions();
+        taskResult = new MapTaskResult(
+            MapStatus$.MODULE$.apply(
+                blockManager.shuffleServerId(),
+                mapOutputCommitMessage.getPartitionLengths(),
+                mapId),
+            OptionConverters.toScala(mapOutputCommitMessage.getMapOutputMetadata()));
         return;
       }
       final SerializerInstance serInstance = serializer.newInstance();

Contributor: As these lines are repeating you could extract them into a new def, like:

  protected void setTaskResult(MapOutputCommitMessage mapOutputCommitMessage) {
    taskResult = new MapTaskResult(
        MapStatus$.MODULE$.apply(
            blockManager.shuffleServerId(),
            mapOutputCommitMessage.getPartitionLengths(),
            mapId),
        OptionConverters.toScala(mapOutputCommitMessage.getMapOutputMetadata()));
  }

With the help of this new def and Mockito's spy you can even get rid of storing the mapOutputCommitMessage for testing purposes only, but it has a price (this class cannot be final); for details you can check: attilapiros@f4578a3

Contributor Author (mccheah): Ack - didn't address this in my latest patch, but will get around to this.
@@ -164,9 +170,13 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
        }
      }
 
-      partitionLengths = writePartitionedData(mapOutputWriter);
-      mapStatus = MapStatus$.MODULE$.apply(
-        blockManager.shuffleServerId(), partitionLengths, mapId);
+      mapOutputCommitMessage = writePartitionedData(mapOutputWriter);
+      taskResult = new MapTaskResult(
+          MapStatus$.MODULE$.apply(
+              blockManager.shuffleServerId(),
+              mapOutputCommitMessage.getPartitionLengths(),
+              mapId),
+          OptionConverters.toScala(mapOutputCommitMessage.getMapOutputMetadata()));
     } catch (Exception e) {
       try {
         mapOutputWriter.abort(e);
@@ -179,16 +189,17 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
   }
 
   @VisibleForTesting
-  long[] getPartitionLengths() {
-    return partitionLengths;
+  MapOutputCommitMessage getMapOutputCommitMessage() {
+    return mapOutputCommitMessage;
   }
 
   /**
    * Concatenate all of the per-partition files into a single combined file.
    *
    * @return array of lengths, in bytes, of each partition of the file (used by map output tracker).
    */
-  private long[] writePartitionedData(ShuffleMapOutputWriter mapOutputWriter) throws IOException {
+  private MapOutputCommitMessage writePartitionedData(ShuffleMapOutputWriter mapOutputWriter)
+      throws IOException {
     // Track location of the partition starts in the output file
     if (partitionWriters != null) {
       final long writeStartTime = System.nanoTime();
@@ -219,7 +230,7 @@ private long[] writePartitionedData(ShuffleMapOutputWriter mapOutputWriter) throws IOException {
       }
       partitionWriters = null;
     }
-    return mapOutputWriter.commitAllPartitions().getPartitionLengths();
+    return mapOutputWriter.commitAllPartitions();
   }
 
   private void writePartitionedDataWithChannel(
@@ -259,16 +270,16 @@ private void writePartitionedDataWithStream(File file, ShufflePartitionWriter writer)
   }
 
   @Override
-  public Option<MapStatus> stop(boolean success) {
+  public Option<MapTaskResult> stop(boolean success) {
     if (stopping) {
       return None$.empty();
     } else {
       stopping = true;
       if (success) {
-        if (mapStatus == null) {
+        if (taskResult == null) {
           throw new IllegalStateException("Cannot call stop(true) without having called write()");
         }
-        return Option.apply(mapStatus);
+        return Option.apply(taskResult);
       } else {
         // The map task failed, so delete our output data.
         if (partitionWriters != null) {