[SPARK-3797] Run external shuffle service in Yarn NM #3082
Changes from 8 commits
**ExecutorAllocationManager.scala**

```diff
@@ -66,7 +66,6 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
   // Lower and upper bounds on the number of executors. These are required.
   private val minNumExecutors = conf.getInt("spark.dynamicAllocation.minExecutors", -1)
   private val maxNumExecutors = conf.getInt("spark.dynamicAllocation.maxExecutors", -1)
-  verifyBounds()

   // How long there must be backlogged tasks for before an addition is triggered
   private val schedulerBacklogTimeout = conf.getLong(
@@ -77,9 +76,11 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
     "spark.dynamicAllocation.sustainedSchedulerBacklogTimeout", schedulerBacklogTimeout)

   // How long an executor must be idle for before it is removed
-  private val removeThresholdSeconds = conf.getLong(
+  private val executorIdleTimeout = conf.getLong(
     "spark.dynamicAllocation.executorIdleTimeout", 600)

+  validateSettings()
+
   // Number of executors to add in the next round
   private var numExecutorsToAdd = 1
@@ -110,10 +111,11 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
   private var clock: Clock = new RealClock

   /**
-   * Verify that the lower and upper bounds on the number of executors are valid.
+   * Verify that the settings specified through the config are valid.
    * If not, throw an appropriate exception.
    */
-  private def verifyBounds(): Unit = {
+  private def validateSettings(): Unit = {
+    // Verify that bounds are valid
     if (minNumExecutors < 0 || maxNumExecutors < 0) {
       throw new SparkException("spark.dynamicAllocation.{min/max}Executors must be set!")
     }
@@ -124,6 +126,22 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
       throw new SparkException(s"spark.dynamicAllocation.minExecutors ($minNumExecutors) must " +
         s"be less than or equal to spark.dynamicAllocation.maxExecutors ($maxNumExecutors)!")
     }
+    // Verify that timeouts are positive
+    if (schedulerBacklogTimeout <= 0) {
+      throw new SparkException(s"spark.dynamicAllocation.schedulerBacklogTimeout must be > 0!")
+    }
+    if (sustainedSchedulerBacklogTimeout <= 0) {
+      throw new SparkException(
+        s"spark.dynamicAllocation.sustainedSchedulerBacklogTimeout must be > 0!")
+    }
+    if (executorIdleTimeout <= 0) {
+      throw new SparkException(s"spark.dynamicAllocation.executorIdleTimeout must be > 0!")
+    }
+    // Verify that external shuffle service is enabled
+    if (!conf.getBoolean("spark.shuffle.service.enabled", false)) {
+      throw new SparkException(s"Dynamic allocation of executors requires the external " +
```

**Contributor** (on the line above): nit: interpolation not necessary (also on the next line).
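For illustration, the suggested cleanup would simply drop the `s` prefix, since these strings interpolate nothing — a sketch of the nit, not part of the diff:

```scala
throw new SparkException("Dynamic allocation of executors requires the external " +
  "shuffle service. You may enable this through spark.shuffle.service.enabled.")
```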
```diff
+        s"shuffle service. You may enable this through spark.shuffle.service.enabled.")
```

**Contributor:** At this point, is there any sense in having two separate settings that need to be set in tandem? Couldn't we just base everything on whether dynamic executor allocation is enabled or not?

**Contributor:** (I guess you might want to allow people to still use the external shuffle service even without dynamic allocation. But still we could be nice and automatically use the external shuffle service when dynamic allocation is enabled, without requiring the user to set both configs.)

**Contributor (Author):** This is tricky, because the user may not have set up an external shuffle service when running Spark with dynamic allocation. If we automatically enable this service for them, they will be confused if they see "connection refused" messages when executors try to fetch files from a service that doesn't exist. It's not intuitive to me why enabling a dynamic scaling feature would cause my executors to die because of this. However, if the user explicitly sets …
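For context, a minimal sketch of the configuration the validation above expects (values illustrative):

```scala
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.dynamicAllocation.minExecutors", "1")
  .set("spark.dynamicAllocation.maxExecutors", "4")
  // Without this, validateSettings() above throws a SparkException:
  .set("spark.shuffle.service.enabled", "true")
```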
```diff
+    }
   }

   /**
@@ -254,7 +272,7 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
     val removeRequestAcknowledged = testing || sc.killExecutor(executorId)
     if (removeRequestAcknowledged) {
       logInfo(s"Removing executor $executorId because it has been idle for " +
-        s"$removeThresholdSeconds seconds (new desired total will be ${numExistingExecutors - 1})")
+        s"$executorIdleTimeout seconds (new desired total will be ${numExistingExecutors - 1})")
       executorsPendingToRemove.add(executorId)
       true
     } else {
@@ -329,8 +347,8 @@ private[spark] class ExecutorAllocationManager(sc: SparkContext) extends Logging
   private def onExecutorIdle(executorId: String): Unit = synchronized {
     if (!removeTimes.contains(executorId) && !executorsPendingToRemove.contains(executorId)) {
       logDebug(s"Starting idle timer for $executorId because there are no more tasks " +
-        s"scheduled to run on the executor (to expire in $removeThresholdSeconds seconds)")
-      removeTimes(executorId) = clock.getTimeMillis + removeThresholdSeconds * 1000
+        s"scheduled to run on the executor (to expire in $executorIdleTimeout seconds)")
+      removeTimes(executorId) = clock.getTimeMillis + executorIdleTimeout * 1000
     }
   }
```
**BlockManager.scala**

```diff
@@ -30,6 +30,7 @@ import akka.actor.{ActorSystem, Props}
 import sun.nio.ch.DirectBuffer

 import org.apache.spark._
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.executor._
 import org.apache.spark.io.CompressionCodec
 import org.apache.spark.network._
@@ -92,7 +93,19 @@ private[spark] class BlockManager(
   private[spark]
   val externalShuffleServiceEnabled = conf.getBoolean("spark.shuffle.service.enabled", false)
-  private val externalShuffleServicePort = conf.getInt("spark.shuffle.service.port", 7337)
+
+  // In Yarn, the shuffle service port may be set through the Hadoop config
+  private val shuffleServicePortKey = "spark.shuffle.service.port"
+  private val externalShuffleServicePort = {
```

**Contributor:** I would rather not put YARN-related logic directly in the BlockManager itself. Can we extract this type of logic out to a utility function somewhere like `SparkHadoopUtil`? It doesn't save many characters in here, but at least it hides the checking for YARN and constructing a Hadoop Configuration.
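One possible shape for such a helper — a sketch only; `getShuffleServicePort` is a hypothetical name, not something this PR adds:

```scala
// Hypothetical addition to SparkHadoopUtil: resolve the external shuffle
// service port, preferring a value from the Hadoop configuration on YARN.
def getShuffleServicePort(conf: SparkConf): Int = {
  val key = "spark.shuffle.service.port"
  val sparkPort = conf.getInt(key, 7337)
  if (isYarnMode) {
    // Hadoop's Configuration has typed getters, so the Option dance is avoidable
    newConfiguration(conf).getInt(key, sparkPort)
  } else {
    sparkPort
  }
}
```

The BlockManager would then just call `SparkHadoopUtil.get.getShuffleServicePort(conf)`.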
```diff
+    val sparkPort = conf.getInt(shuffleServicePortKey, 7337)
+    if (SparkHadoopUtil.get.isYarnMode) {
+      val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
+      Option(hadoopConf.get(shuffleServicePortKey)).map(_.toInt).getOrElse(sparkPort)
```

**Contributor (Author)** (replying to a suggestion on the line above): oh cool, I didn't realize Hadoop conf also has that.

```diff
+    } else {
+      sparkPort
+    }
+  }

   // Check that we're not using external shuffle service with consolidated shuffle files.
   if (externalShuffleServiceEnabled
     && conf.getBoolean("spark.shuffle.consolidateFiles", false)
```
**make-distribution.sh**

```diff
@@ -181,6 +181,7 @@ echo "Spark $VERSION$GITREVSTRING built for Hadoop $SPARK_HADOOP_VERSION" > "$DI
 # Copy jars
 cp "$FWDIR"/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
 cp "$FWDIR"/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/"
+cp "$FWDIR"/network/yarn/target/scala*/spark-network-yarn*.jar "$DISTDIR/lib/"
```

**Contributor:** I wonder if it would be nicer to use maven-shade-plugin to create a single jar for the NM aux service. That might make it easier for people to install it.

**Contributor (Author):** Yes, I plan to do that, though in a separate PR. See my comment in andrewor14@f39daa6

**Contributor:** @andrewor14 …

**Contributor (Author):** Yes, I will fix this in a separate PR.

**Contributor (Author):** Hey @witgo, I just pushed a hot fix. I didn't realize …
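For reference, a shaded aux-service jar could be produced along these lines — a hypothetical sketch of the reviewer's suggestion, not this PR's build setup:

```xml
<!-- Hypothetical: bundle the shuffle-service classes into one jar -->
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-shade-plugin</artifactId>
  <configuration>
    <shadedArtifactAttached>false</shadedArtifactAttached>
    <artifactSet>
      <includes>
        <include>org.apache.spark:spark-network-shuffle_2.10</include>
        <include>org.apache.spark:spark-network-common_2.10</include>
      </includes>
    </artifactSet>
  </configuration>
  <executions>
    <execution>
      <phase>package</phase>
      <goals>
        <goal>shade</goal>
      </goals>
    </execution>
  </executions>
</plugin>
```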
```diff

 # Copy example sources (needed for python and SQL)
 mkdir -p "$DISTDIR/examples/src/main"
```
**network/yarn/pom.xml** (new file)

```xml
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~    http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent</artifactId>
    <version>1.2.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>

  <groupId>org.apache.spark</groupId>
  <artifactId>spark-network-yarn_2.10</artifactId>
  <packaging>jar</packaging>
  <name>Spark Project Yarn Shuffle Service Code</name>
  <url>http://spark.apache.org/</url>
  <properties>
    <sbt.project.name>network-yarn</sbt.project.name>
  </properties>

  <dependencies>
    <!-- Core dependencies -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-network-shuffle_2.10</artifactId>
      <version>${project.version}</version>
    </dependency>

    <!-- Provided dependencies -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-api</artifactId>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-common</artifactId>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-server-web-proxy</artifactId>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-client</artifactId>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <scope>provided</scope>
    </dependency>
  </dependencies>

  <build>
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
  </build>
</project>
```
**network/yarn — YarnShuffleService.java** (new file)

```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.network.yarn;

import java.lang.Override;
import java.nio.ByteBuffer;

import org.apache.spark.network.TransportContext;
import org.apache.spark.network.server.RpcHandler;
import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler;
import org.apache.spark.network.util.TransportConf;
import org.apache.spark.network.util.SystemPropertyConfigProvider;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.server.api.AuxiliaryService;
import org.apache.hadoop.yarn.server.api.ApplicationInitializationContext;
import org.apache.hadoop.yarn.server.api.ApplicationTerminationContext;
import org.apache.hadoop.yarn.server.api.ContainerInitializationContext;
import org.apache.hadoop.yarn.server.api.ContainerTerminationContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
```

**Contributor** (on the `org.apache.spark` imports): nit: I think our import order calls for spark to go below hadoop (unless that's different in YARN code).

**Contributor (Author):** Yeah, I thought I was in Hadoop land for a second. Will fix.

```java
/**
 * External shuffle service used by Spark on Yarn.
 */
public class YarnShuffleService extends AuxiliaryService {
  private final Logger logger = LoggerFactory.getLogger(YarnShuffleService.class);
  private static final JobTokenSecretManager secretManager = new JobTokenSecretManager();

  private static final String SPARK_SHUFFLE_SERVICE_PORT_KEY = "spark.shuffle.service.port";
  private static final int DEFAULT_SPARK_SHUFFLE_SERVICE_PORT = 7337;

  public YarnShuffleService() {
    super("spark_shuffle");
    logger.info("Initializing Yarn shuffle service for Spark");
  }

  /**
   * Start the shuffle server with the given configuration.
   */
  @Override
  protected void serviceInit(Configuration conf) {
    try {
      int port = conf.getInt(
        SPARK_SHUFFLE_SERVICE_PORT_KEY, DEFAULT_SPARK_SHUFFLE_SERVICE_PORT);
      TransportConf transportConf = new TransportConf(new SystemPropertyConfigProvider());
```

**Contributor:** Is the SystemPropertyConfigProvider correct? Perhaps it should be one of these:

```java
private static class HadoopConfigProvider extends ConfigProvider {
  private final Configuration conf;

  public HadoopConfigProvider(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public String get(String name) {
    String value = conf.get(name);
    if (value != null) {
      return value;
    } else {
      throw new NoSuchElementException(name);
    }
  }
}
```

**Contributor (Author):** Oop, you're right.
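The service would then presumably be constructed with `new TransportConf(new HadoopConfigProvider(conf))`, passing in the Hadoop `Configuration` received by `serviceInit` — a sketch of the reviewer's suggestion, not code from this PR.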
```java
      RpcHandler rpcHandler = new ExternalShuffleBlockHandler();
      TransportContext transportContext = new TransportContext(transportConf, rpcHandler);
      transportContext.createServer(port);
```

**Contributor:** To try to play nice, wouldn't it be better to implement …

**Contributor (Author):** Yeah, I'm planning on adding that too.
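Assuming the suggestion concerns the service's stop lifecycle, a minimal sketch might keep a handle to the server and close it in `serviceStop` — the `shuffleServer` field is hypothetical, and `TransportServer` is assumed to be closeable:

```java
// Hypothetical cleanup hook; not part of this PR.
private TransportServer shuffleServer = null;

@Override
protected void serviceStop() {
  try {
    if (shuffleServer != null) {
      // Shut down the transport server started in serviceInit
      shuffleServer.close();
    }
  } catch (Exception e) {
    logger.error("Exception in stopping Yarn shuffle service for Spark", e);
  }
}
```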
```java
      logger.info("Started Yarn shuffle service for Spark on port " + port);
    } catch (Exception e) {
      logger.error("Exception in starting Yarn shuffle service for Spark", e);
    }
  }

  @Override
  public void initializeApplication(ApplicationInitializationContext context) {
    ApplicationId appId = context.getApplicationId();
    logger.debug("Initializing application " + appId + "!");
  }

  @Override
  public void stopApplication(ApplicationTerminationContext context) {
    ApplicationId appId = context.getApplicationId();
```

**Contributor:** Entirely unknowledgeable about the shuffle service's inner workings, but is there no state to clean up once an application completes? E.g. if an app is terminated suddenly, how do its shuffle blocks get cleaned up?

**Contributor (Author):** We might need to add some timeout to clean up the files.

**Contributor:** This is a feature that still needs to be added to the external shuffle service. I made SPARK-4236 to track this.

**Contributor:** Ah cool.

```java
    logger.debug("Stopping application " + appId + "!");
  }

  @Override
  public ByteBuffer getMetaData() {
    logger.debug("Getting meta data");
    return ByteBuffer.allocate(0);
  }

  @Override
  public void initializeContainer(ContainerInitializationContext context) {
    ContainerId containerId = context.getContainerId();
    logger.debug("Initializing container " + containerId + "!");
  }

  @Override
  public void stopContainer(ContainerTerminationContext context) {
    ContainerId containerId = context.getContainerId();
    logger.debug("Stopping container " + containerId + "!");
  }
}
```
**pom.xml** (root)

```diff
@@ -93,6 +93,7 @@
     <module>tools</module>
     <module>network/common</module>
     <module>network/shuffle</module>
+    <module>network/yarn</module>
```

**Contributor:** BTW, this should probably be gated on some profile (e.g. "-Pyarn"). I don't think this would compile with Hadoop 1.0.4, for example...

**Contributor (Author):** Yeah... good point

```diff
     <module>streaming</module>
     <module>sql/catalyst</module>
     <module>sql/core</module>
```
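One way to gate it, sketched against the root pom's existing `yarn` profile — assuming that profile already lists the `yarn` module; placement is illustrative only:

```xml
<profile>
  <id>yarn</id>
  <modules>
    <module>yarn</module>
    <!-- build the NM aux-service module only when -Pyarn is active -->
    <module>network/yarn</module>
  </modules>
</profile>
```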
**ExecutorRunnable.scala**

```diff
@@ -89,6 +89,12 @@ class ExecutorRunnable(
     ctx.setApplicationACLs(YarnSparkHadoopUtil.getApplicationAclsForYarn(securityMgr))

+    // If external shuffle service is enabled, register with the
```

**Contributor:** It might be good to say "register and transfer keys" rather than just register.

```diff
+    // Yarn shuffle service already started on the node manager
+    if (sparkConf.getBoolean("spark.shuffle.service.enabled", false)) {
+      ctx.setServiceData(Map[String, ByteBuffer]("spark_shuffle" -> ByteBuffer.allocate(0)))
+    }

     // Send the start request to the ContainerManager
     nmClient.startContainer(container, ctx)
   }
```

**Contributor:** nit: comment is redundant given the code.