Commit c0e93b3

otterc, Min Shen, and zhouyejoe authored and committed
[SPARK-32917][SHUFFLE][CORE] Adds support for executors to push shuffle blocks after successful map task completion
### What changes were proposed in this pull request?

This is the shuffle writer side change where executors can push data to remote shuffle services. This is needed for push-based shuffle; see the SPIP in [SPARK-30602](https://issues.apache.org/jira/browse/SPARK-30602).

Summary of changes:
- Adds support for executors to push shuffle blocks after map tasks complete writing shuffle data.
- Introduces a timeout specifically for creating connections to remote shuffle services.

### Why are the changes needed?

These changes are needed for push-based shuffle. Refer to the SPIP in [SPARK-30602](https://issues.apache.org/jira/browse/SPARK-30602).

The main reason to create a separate connection creation timeout is that the existing `connectionTimeoutMs` is overloaded: it serves as both the connection creation timeout and the connection idle timeout. The connection creation timeout should be much lower than the idle timeout. The default for `connectionTimeoutMs` is 120s, which is quite high for merely establishing connections. If a shuffle server node is bad, connection creation will fail within a few seconds; an overloaded shuffle server, on the other hand, may take much longer to respond to a request, and its channel can legitimately stay idle far longer. Another reason is that with push-based shuffle, an executor may be fetching shuffle data and pushing shuffle data (for the next stage) simultaneously. Both tasks share the same connections with the shuffle service. If there is a bad shuffle server node and the connection creation timeout is very high, both tasks end up waiting a long time, eventually impacting performance.

### Does this PR introduce any user-facing change?

Yes. This PR introduces client-side configs for push-based shuffle. If push-based shuffle is turned off, users will not see any change.

### How was this patch tested?

Added unit tests. The reference PR with the consolidated changes covering the complete implementation is also provided in [SPARK-30602](https://issues.apache.org/jira/browse/SPARK-30602). We have already verified the functionality and the improved performance as documented in the SPIP doc.

Lead-authored-by: Min Shen <mshen<at>linkedin.com>
Co-authored-by: Chandni Singh <chsingh<at>linkedin.com>
Co-authored-by: Ye Zhou <yezhou<at>linkedin.com>

Closes #30312 from otterc/SPARK-32917.

Lead-authored-by: Chandni Singh <[email protected]>
Co-authored-by: Chandni Singh <[email protected]>
Co-authored-by: Min Shen <[email protected]>
Co-authored-by: Ye Zhou <[email protected]>
Signed-off-by: Mridul Muralidharan <mridul<at>gmail.com>
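To make the new knobs concrete, here is a minimal sketch of how a job might set them. This is illustrative only: the `spark.shuffle.` prefix on the creation timeout is inferred from how `TransportConf#getConfKey` scopes keys per module, and enabling push-based shuffle end to end involves additional configs tracked in SPARK-30602, not just these.

import org.apache.spark.SparkConf

// Sketch of the client-side knobs this commit introduces.
val conf = new SparkConf()
  // Fail fast when establishing connections, independent of the idle timeout.
  // The full key name here is an assumption based on TransportConf#getConfKey.
  .set("spark.shuffle.io.connectionCreationTimeout", "30s")
  // Size of the block pusher pool; defaults to the executor's core count.
  .set("spark.shuffle.push.numPushThreads", "8")
  // Blocks above this size are not pushed; they are fetched the original way.
  .set("spark.shuffle.push.maxBlockSizeToPush", "1m")
  // Upper bound on a batch of blocks grouped into one push request.
  .set("spark.shuffle.push.maxBlockBatchSize", "3m")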
1 parent 8ea1367 commit c0e93b3

File tree

12 files changed (+896, -12 lines)


common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java

Lines changed: 4 additions & 3 deletions
@@ -254,7 +254,7 @@ TransportClient createClient(InetSocketAddress address)
       // Disable Nagle's Algorithm since we don't want packets to wait
       .option(ChannelOption.TCP_NODELAY, true)
       .option(ChannelOption.SO_KEEPALIVE, true)
-      .option(ChannelOption.CONNECT_TIMEOUT_MILLIS, conf.connectionTimeoutMs())
+      .option(ChannelOption.CONNECT_TIMEOUT_MILLIS, conf.connectionCreationTimeoutMs())
       .option(ChannelOption.ALLOCATOR, pooledAllocator);
 
     if (conf.receiveBuf() > 0) {
@@ -280,9 +280,10 @@ public void initChannel(SocketChannel ch) {
     // Connect to the remote server
     long preConnect = System.nanoTime();
     ChannelFuture cf = bootstrap.connect(address);
-    if (!cf.await(conf.connectionTimeoutMs())) {
+    if (!cf.await(conf.connectionCreationTimeoutMs())) {
       throw new IOException(
-        String.format("Connecting to %s timed out (%s ms)", address, conf.connectionTimeoutMs()));
+        String.format("Connecting to %s timed out (%s ms)",
+          address, conf.connectionCreationTimeoutMs()));
     } else if (cf.cause() != null) {
       throw new IOException(String.format("Failed to connect to %s", address), cf.cause());
     }
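For readers unfamiliar with the two timeouts in play, the following sketch (not `TransportClientFactory`'s actual API) shows the behavior this hunk changes: connection establishment is now bounded by the smaller, dedicated creation timeout, while the existing 120s `connectionTimeoutMs` continues to govern how long an established channel may sit idle.

import java.io.IOException
import java.net.InetSocketAddress
import io.netty.bootstrap.Bootstrap
import io.netty.channel.Channel

// Minimal sketch of connecting with a dedicated creation timeout.
def connectWithCreationTimeout(
    bootstrap: Bootstrap,
    address: InetSocketAddress,
    creationTimeoutMs: Long): Channel = {
  val cf = bootstrap.connect(address)
  // Fail fast on an unreachable node instead of waiting out the idle timeout.
  if (!cf.await(creationTimeoutMs)) {
    throw new IOException(s"Connecting to $address timed out ($creationTimeoutMs ms)")
  }
  if (cf.cause() != null) {
    throw new IOException(s"Failed to connect to $address", cf.cause())
  }
  cf.channel()
}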

common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java

Lines changed: 12 additions & 1 deletion
@@ -19,6 +19,7 @@
 
 import java.util.Locale;
 import java.util.Properties;
+import java.util.concurrent.TimeUnit;
 
 import com.google.common.primitives.Ints;
 import io.netty.util.NettyRuntime;
@@ -31,6 +32,7 @@ public class TransportConf {
   private final String SPARK_NETWORK_IO_MODE_KEY;
   private final String SPARK_NETWORK_IO_PREFERDIRECTBUFS_KEY;
   private final String SPARK_NETWORK_IO_CONNECTIONTIMEOUT_KEY;
+  private final String SPARK_NETWORK_IO_CONNECTIONCREATIONTIMEOUT_KEY;
   private final String SPARK_NETWORK_IO_BACKLOG_KEY;
   private final String SPARK_NETWORK_IO_NUMCONNECTIONSPERPEER_KEY;
   private final String SPARK_NETWORK_IO_ACCEPTORTHREADS_KEY;
@@ -59,6 +61,7 @@ public TransportConf(String module, ConfigProvider conf) {
     SPARK_NETWORK_IO_MODE_KEY = getConfKey("io.mode");
     SPARK_NETWORK_IO_PREFERDIRECTBUFS_KEY = getConfKey("io.preferDirectBufs");
     SPARK_NETWORK_IO_CONNECTIONTIMEOUT_KEY = getConfKey("io.connectionTimeout");
+    SPARK_NETWORK_IO_CONNECTIONCREATIONTIMEOUT_KEY = getConfKey("io.connectionCreationTimeout");
     SPARK_NETWORK_IO_BACKLOG_KEY = getConfKey("io.backLog");
     SPARK_NETWORK_IO_NUMCONNECTIONSPERPEER_KEY = getConfKey("io.numConnectionsPerPeer");
     SPARK_NETWORK_IO_ACCEPTORTHREADS_KEY = getConfKey("io.acceptorThreads");
@@ -108,7 +111,7 @@ public boolean preferDirectBufs() {
     return conf.getBoolean(SPARK_NETWORK_IO_PREFERDIRECTBUFS_KEY, true);
   }
 
-  /** Connect timeout in milliseconds. Default 120 secs. */
+  /** Connection idle timeout in milliseconds. Default 120 secs. */
   public int connectionTimeoutMs() {
     long defaultNetworkTimeoutS = JavaUtils.timeStringAsSec(
       conf.get("spark.network.timeout", "120s"));
@@ -139,6 +142,14 @@ public int streamReadTimeoutMs() {
     return (int) defaultTimeoutMs;
   }
 
+  /** Connect creation timeout in milliseconds. Default 30 secs. */
+  public int connectionCreationTimeoutMs() {
+    long connectionTimeoutS = TimeUnit.MILLISECONDS.toSeconds(connectionTimeoutMs());
+    long defaultTimeoutMs = JavaUtils.timeStringAsSec(
+      conf.get(SPARK_NETWORK_IO_CONNECTIONCREATIONTIMEOUT_KEY, connectionTimeoutS + "s")) * 1000;
+    return (int) defaultTimeoutMs;
+  }
+
   /** Number of concurrent connections between two nodes for fetching data. */
   public int numConnectionsPerPeer() {
     return conf.getInt(SPARK_NETWORK_IO_NUMCONNECTIONSPERPEER_KEY, 1);
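One subtlety in the hunk above: the new javadoc states a 30s default, but as committed the code falls back to the full `connectionTimeoutMs` value when `io.connectionCreationTimeout` is unset. A minimal sketch of that defaulting logic, with `parseTimeSecs` as a simplified stand-in for `JavaUtils.timeStringAsSec`:

// Simplified stand-in for JavaUtils.timeStringAsSec; handles only "Ns" strings.
def parseTimeSecs(s: String): Long = s.stripSuffix("s").trim.toLong

// When the key is unset, the creation timeout inherits the idle timeout, so
// existing deployments keep their current behavior until they opt in.
def connectionCreationTimeoutMs(
    configured: Option[String],   // value of io.connectionCreationTimeout, if any
    connectionTimeoutMs: Int): Int = {
  val fallbackSecs = connectionTimeoutMs / 1000L
  val secs = configured.map(parseTimeSecs).getOrElse(fallbackSecs)
  (secs * 1000).toInt
}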

core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java

Lines changed: 2 additions & 3 deletions
@@ -31,7 +31,6 @@
 import scala.Tuple2;
 import scala.collection.Iterator;
 
-import com.google.common.annotations.VisibleForTesting;
 import com.google.common.io.Closeables;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -178,8 +177,8 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
     }
   }
 
-  @VisibleForTesting
-  long[] getPartitionLengths() {
+  @Override
+  public long[] getPartitionLengths() {
     return partitionLengths;
   }
 

core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java

Lines changed: 6 additions & 1 deletion
@@ -87,6 +87,7 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
 
   @Nullable private MapStatus mapStatus;
   @Nullable private ShuffleExternalSorter sorter;
+  @Nullable private long[] partitionLengths;
   private long peakMemoryUsedBytes = 0;
 
   /** Subclass of ByteArrayOutputStream that exposes `buf` directly. */
@@ -218,7 +219,6 @@ void closeAndWriteOutput() throws IOException {
     serOutputStream = null;
     final SpillInfo[] spills = sorter.closeAndGetSpills();
     sorter = null;
-    final long[] partitionLengths;
     try {
       partitionLengths = mergeSpills(spills);
     } finally {
@@ -528,4 +528,9 @@ public void close() throws IOException {
       channel.close();
     }
   }
+
+  @Override
+  public long[] getPartitionLengths() {
+    return partitionLengths;
+  }
 }
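Both hunks above, together with the `BypassMergeSortShuffleWriter` change, serve one contract: after a successful map task, the pusher needs each writer's per-partition lengths to decide what to push. A sketch of that contract follows; the exact signature of Spark's `ShuffleWriter` abstract class is assumed here, since the class itself is not shown in this diff excerpt.

// Hypothetical rendering of the writer contract; names mirror the diff, but
// this class definition is an assumption, not part of the excerpt above.
abstract class SketchShuffleWriter[K, V] {
  def write(records: Iterator[Product2[K, V]]): Unit
  // Added for push-based shuffle: byte lengths indexed by reduce partition id.
  def getPartitionLengths(): Array[Long]
}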

core/src/main/scala/org/apache/spark/executor/Executor.scala

Lines changed: 2 additions & 1 deletion
@@ -46,7 +46,7 @@ import org.apache.spark.metrics.source.JVMCPUSource
 import org.apache.spark.resource.ResourceInformation
 import org.apache.spark.rpc.RpcTimeout
 import org.apache.spark.scheduler._
-import org.apache.spark.shuffle.FetchFailedException
+import org.apache.spark.shuffle.{FetchFailedException, ShuffleBlockPusher}
 import org.apache.spark.storage.{StorageLevel, TaskResultBlockId}
 import org.apache.spark.util._
 import org.apache.spark.util.io.ChunkedByteBuffer
@@ -307,6 +307,7 @@ private[spark] class Executor(
       case NonFatal(e) =>
         logWarning("Unable to stop heartbeater", e)
     }
+    ShuffleBlockPusher.stop()
     threadPool.shutdown()
 
     // Notify plugins that executor is shutting down so they can terminate cleanly
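The `stop()` call added above implies a process-wide pusher resource that outlives individual tasks and must be shut down with the executor. A hedged sketch of that lifecycle pattern; `ShuffleBlockPusher`'s real internals are not part of this diff and are assumed here.

import java.util.concurrent.{ExecutorService, Executors}

// Assumed pattern: a companion object owning a shared pusher pool that the
// executor shuts down during its own stop sequence.
object SketchBlockPusher {
  private val pool: ExecutorService = Executors.newCachedThreadPool()
  def stop(): Unit = pool.shutdown()
}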

core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 29 additions & 0 deletions
@@ -2134,4 +2134,33 @@ package object config {
       .version("3.1.0")
       .doubleConf
       .createWithDefault(5)
+
+  private[spark] val SHUFFLE_NUM_PUSH_THREADS =
+    ConfigBuilder("spark.shuffle.push.numPushThreads")
+      .doc("Specify the number of threads in the block pusher pool. These threads assist " +
+        "in creating connections and pushing blocks to remote shuffle services. By default, the " +
+        "threadpool size is equal to the number of spark executor cores.")
+      .version("3.2.0")
+      .intConf
+      .createOptional
+
+  private[spark] val SHUFFLE_MAX_BLOCK_SIZE_TO_PUSH =
+    ConfigBuilder("spark.shuffle.push.maxBlockSizeToPush")
+      .doc("The max size of an individual block to push to the remote shuffle services. Blocks " +
+        "larger than this threshold are not pushed to be merged remotely. These shuffle blocks " +
+        "will be fetched by the executors in the original manner.")
+      .version("3.2.0")
+      .bytesConf(ByteUnit.BYTE)
+      .createWithDefaultString("1m")
+
+  private[spark] val SHUFFLE_MAX_BLOCK_BATCH_SIZE_FOR_PUSH =
+    ConfigBuilder("spark.shuffle.push.maxBlockBatchSize")
+      .doc("The max size of a batch of shuffle blocks to be grouped into a single push request.")
+      .version("3.2.0")
+      .bytesConf(ByteUnit.BYTE)
+      // Default is 3m because it is greater than 2m which is the default value for
+      // TransportConf#memoryMapBytes. If this defaults to 2m as well it is very likely that each
+      // batch of block will be loaded in memory with memory mapping, which has higher overhead
+      // with small MB sized chunk of data.
+      .createWithDefaultString("3m")
 }
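As a rough illustration of how `maxBlockSizeToPush` interacts with the partition lengths the writers now expose (a simplification, not `ShuffleBlockPusher`'s actual logic):

// Illustrative filter only. Empty partitions have nothing to push; blocks
// above the threshold are skipped and fetched by reducers the original way.
val maxBlockSizeToPush: Long = 1024 * 1024  // the "1m" default
val partitionLengths: Array[Long] = Array(0L, 512 * 1024, 4L * 1024 * 1024)
val pushableReduceIds = partitionLengths.zipWithIndex.collect {
  case (len, reduceId) if len > 0 && len <= maxBlockSizeToPush => reduceId
}
// pushableReduceIds contains only reduce id 1 (the 512 KiB block).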
