[SPARK-22404][YARN] Provide an option to use unmanaged AM in yarn-client mode #19616
**ApplicationMaster.scala** (`org.apache.spark.deploy.yarn`)

```diff
@@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.api._
 import org.apache.hadoop.yarn.api.records._
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException
+import org.apache.hadoop.yarn.server.webproxy.ProxyUriUtils
 import org.apache.hadoop.yarn.util.{ConverterUtils, Records}
 
 import org.apache.spark._
```
```diff
@@ -51,32 +52,27 @@ import org.apache.spark.util._
 /**
  * Common application master functionality for Spark on Yarn.
  */
-private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends Logging {
+private[spark] class ApplicationMaster(
+    val args: ApplicationMasterArguments,
+    val sparkConf: SparkConf,
+    val yarnConf: YarnConfiguration)
+  extends Logging {
+
+  def this(sparkConf: SparkConf,
+      yarnConf: YarnConfiguration,
+      clientRpcEnv: RpcEnv) {
+    this(new ApplicationMasterArguments(Array.empty), sparkConf, yarnConf)
+    this.clientRpcEnv = clientRpcEnv
+  }
+
+  private var clientRpcEnv: RpcEnv = null
 
   // TODO: Currently, task to container is computed once (TaskSetManager) - which need not be
   // optimal as more containers are available. Might need to handle this better.
 
   private val isClusterMode = args.userClass != null
 
-  private val sparkConf = new SparkConf()
-  if (args.propertiesFile != null) {
-    Utils.getPropertiesFromFile(args.propertiesFile).foreach { case (k, v) =>
-      sparkConf.set(k, v)
-    }
-  }
-
   private val securityMgr = new SecurityManager(sparkConf)
 
-  // Set system properties for each config entry. This covers two use cases:
-  // - The default configuration stored by the SparkHadoopUtil class
-  // - The user application creating a new SparkConf in cluster mode
-  //
-  // Both cases create a new SparkConf object which reads these configs from system properties.
-  sparkConf.getAll.foreach { case (k, v) =>
-    sys.props(k) = v
-  }
-
-  private val yarnConf = new YarnConfiguration(SparkHadoopUtil.newConfiguration(sparkConf))
-
   private val userClassLoader = {
     val classpath = Client.getUserClasspath(sparkConf)
```
```diff
@@ -232,8 +228,8 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends
     resources.toMap
   }
 
-  def getAttemptId(): ApplicationAttemptId = {
-    client.getAttemptId()
+  def getAttemptId(sparkConf: SparkConf): ApplicationAttemptId = {
+    client.getAttemptId(sparkConf)
   }
 
   final def run(): Int = {
```
```diff
@@ -245,7 +241,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends
   private def runImpl(): Unit = {
     try {
-      val appAttemptId = client.getAttemptId()
+      val appAttemptId = client.getAttemptId(sparkConf)
 
       var attemptID: Option[String] = None
```
```diff
@@ -275,7 +271,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends
       val priority = ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1
       ShutdownHookManager.addShutdownHook(priority) { () =>
         val maxAppAttempts = client.getMaxRegAttempts(sparkConf, yarnConf)
-        val isLastAttempt = client.getAttemptId().getAttemptId() >= maxAppAttempts
+        val isLastAttempt = client.getAttemptId(sparkConf).getAttemptId() >= maxAppAttempts
 
         if (!finished) {
           // The default state of ApplicationMaster is failed if it is invoked by shut down hook.
```
```diff
@@ -393,8 +389,9 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends
       port: Int,
       _sparkConf: SparkConf,
       uiAddress: Option[String]): Unit = {
-    val appId = client.getAttemptId().getApplicationId().toString()
-    val attemptId = client.getAttemptId().getAttemptId().toString()
+    val appAttempt = client.getAttemptId(_sparkConf)
+    val appId = appAttempt.getApplicationId().toString()
+    val attemptId = appAttempt.getAttemptId().toString()
     val historyAddress = ApplicationMaster
       .getHistoryServerAddress(_sparkConf, yarnConf, appId, attemptId)
```
```diff
@@ -403,7 +400,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends
   }
 
   private def createAllocator(driverRef: RpcEndpointRef, _sparkConf: SparkConf): Unit = {
-    val appId = client.getAttemptId().getApplicationId().toString()
+    val appId = client.getAttemptId(_sparkConf).getApplicationId().toString()
     val driverUrl = RpcEndpointAddress(driverRef.address.host, driverRef.address.port,
       CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString
```
```diff
@@ -481,20 +478,29 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends
   }
 
   private def runExecutorLauncher(): Unit = {
-    val hostname = Utils.localHostName
-    val amCores = sparkConf.get(AM_CORES)
-    rpcEnv = RpcEnv.create("sparkYarnAM", hostname, hostname, -1, sparkConf, securityMgr,
-      amCores, true)
-
-    // The client-mode AM doesn't listen for incoming connections, so report an invalid port.
-    registerAM(hostname, -1, sparkConf, sparkConf.getOption("spark.driver.appUIAddress"))
-
-    // The driver should be up and listening, so unlike cluster mode, just try to connect to it
-    // with no waiting or retrying.
-    val (driverHost, driverPort) = Utils.parseHostPort(args.userArgs(0))
-    val driverRef = rpcEnv.setupEndpointRef(
-      RpcAddress(driverHost, driverPort),
-      YarnSchedulerBackend.ENDPOINT_NAME)
+    var driverRef: RpcEndpointRef = null
+    if (sparkConf.get(YARN_UNMANAGED_AM)) {
+      rpcEnv = clientRpcEnv
+      driverRef = rpcEnv.setupEndpointRef(
+        RpcAddress(sparkConf.get("spark.driver.host"),
+          sparkConf.get("spark.driver.port").toInt),
+        YarnSchedulerBackend.ENDPOINT_NAME)
+    } else {
+      val hostname = Utils.localHostName
+      val amCores = sparkConf.get(AM_CORES)
+      rpcEnv = RpcEnv.create("sparkYarnAM", hostname, hostname, -1, sparkConf, securityMgr,
+        amCores, true)
+
+      // The client-mode AM doesn't listen for incoming connections, so report an invalid port.
+      registerAM(hostname, -1, sparkConf, sparkConf.getOption("spark.driver.appUIAddress"))
+
+      // The driver should be up and listening, so unlike cluster mode, just try to connect to it
+      // with no waiting or retrying.
+      val (driverHost, driverPort) = Utils.parseHostPort(args.userArgs(0))
+      driverRef = rpcEnv.setupEndpointRef(
+        RpcAddress(driverHost, driverPort),
+        YarnSchedulerBackend.ENDPOINT_NAME)
+    }
     addAmIpFilter(Some(driverRef))
     createAllocator(driverRef, sparkConf)
```
```diff
@@ -600,7 +606,14 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends
     try {
       val preserveFiles = sparkConf.get(PRESERVE_STAGING_FILES)
       if (!preserveFiles) {
-        stagingDirPath = new Path(System.getenv("SPARK_YARN_STAGING_DIR"))
+        var stagingDir = System.getenv("SPARK_YARN_STAGING_DIR")
+        if (stagingDir == null) {
+          val appStagingBaseDir = sparkConf.get(STAGING_DIR).map { new Path(_) }
+            .getOrElse(FileSystem.get(yarnConf).getHomeDirectory())
+          stagingDir = appStagingBaseDir.toString + Path.SEPARATOR +
+            getAttemptId(sparkConf).getApplicationId.toString
+        }
+        stagingDirPath = new Path(stagingDir)
         logInfo("Deleting staging directory " + stagingDirPath)
         val fs = stagingDirPath.getFileSystem(yarnConf)
         fs.delete(stagingDirPath, true)
```
```diff
@@ -613,7 +626,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends
 
   /** Add the Yarn IP filter that is required for properly securing the UI. */
   private def addAmIpFilter(driver: Option[RpcEndpointRef]) = {
-    val proxyBase = System.getenv(ApplicationConstants.APPLICATION_WEB_PROXY_BASE_ENV)
+    val proxyBase = getProxyBase
     val amFilter = "org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter"
     val params = client.getAmIpFilterParams(yarnConf, proxyBase)
     driver match {
```
```diff
@@ -626,6 +639,14 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends
     }
   }
 
+  private def getProxyBase: String = {
+    var proxyBase = System.getenv(ApplicationConstants.APPLICATION_WEB_PROXY_BASE_ENV)
+    if (proxyBase == null) {
+      proxyBase = ProxyUriUtils.getPath(getAttemptId(sparkConf).getApplicationId)
+    }
+    proxyBase
+  }
+
   /**
    * Start the user class, which contains the spark driver, in a separate Thread.
    * If the main routine exits cleanly or exits with System.exit(N) for any N
```
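A managed AM reads the web proxy base from the `APPLICATION_WEB_PROXY_BASE_ENV` environment variable that YARN sets inside the AM container; an unmanaged AM runs outside any container, so `getProxyBase` derives the path from the application id instead. A small illustration, assuming Hadoop's `ProxyUriUtils.getPath` yields the standard `/proxy/<appId>` path:

```scala
import org.apache.hadoop.yarn.server.webproxy.ProxyUriUtils
import org.apache.hadoop.yarn.util.ConverterUtils

// Hypothetical application id, for illustration only.
val appId = ConverterUtils.toApplicationId("application_1509000000000_0001")
val proxyBase = ProxyUriUtils.getPath(appId)  // expected: "/proxy/application_1509000000000_0001"
```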
```diff
@@ -774,16 +795,32 @@ object ApplicationMaster extends Logging {
   def main(args: Array[String]): Unit = {
     SignalUtils.registerLogger(log)
     val amArgs = new ApplicationMasterArguments(args)
-    master = new ApplicationMaster(amArgs)
+    val sparkConf = new SparkConf()
+    if (amArgs.propertiesFile != null) {
+      Utils.getPropertiesFromFile(amArgs.propertiesFile).foreach { case (k, v) =>
+        sparkConf.set(k, v)
+      }
+    }
+    // Set system properties for each config entry. This covers two use cases:
+    // - The default configuration stored by the SparkHadoopUtil class
+    // - The user application creating a new SparkConf in cluster mode
+    //
+    // Both cases create a new SparkConf object which reads these configs from system properties.
+    sparkConf.getAll.foreach { case (k, v) =>
+      sys.props(k) = v
+    }
+
+    val yarnConf = new YarnConfiguration(SparkHadoopUtil.newConfiguration(sparkConf))
+    master = new ApplicationMaster(amArgs, sparkConf, yarnConf)
     System.exit(master.run())
   }
 
   private[spark] def sparkContextInitialized(sc: SparkContext): Unit = {
     master.sparkContextInitialized(sc)
   }
 
-  private[spark] def getAttemptId(): ApplicationAttemptId = {
-    master.getAttemptId
+  private[spark] def getAttemptId(sparkConf: SparkConf): ApplicationAttemptId = {
+    master.getAttemptId(sparkConf)
   }
 
   private[spark] def getHistoryServerAddress(
```
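With this refactor the AM class has two entry points: a YARN-launched (managed) AM still comes in through `main()`, which now builds the `SparkConf` and `YarnConfiguration` before constructing the master, while an unmanaged AM is constructed directly inside the client process through the auxiliary constructor, reusing the client's `RpcEnv`. A one-line sketch of that second path (the actual call site is `Client.startApplicationMasterService`, shown later in this diff):

```scala
// Unmanaged client-mode path: build the AM in-process and run it,
// sharing the client's SparkConf, YarnConfiguration, and RpcEnv.
new ApplicationMaster(sparkConf, yarnConf, clientRpcEnv).run()
```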
**Client.scala** (`org.apache.spark.deploy.yarn`)

```diff
@@ -34,7 +34,7 @@ import com.google.common.io.Files
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.apache.hadoop.fs.permission.FsPermission
-import org.apache.hadoop.io.DataOutputBuffer
+import org.apache.hadoop.io.{DataOutputBuffer, Text}
 import org.apache.hadoop.mapreduce.MRJobConfig
 import org.apache.hadoop.security.{Credentials, UserGroupInformation}
 import org.apache.hadoop.util.StringUtils
@@ -45,6 +45,7 @@ import org.apache.hadoop.yarn.api.records._
 import org.apache.hadoop.yarn.client.api.{YarnClient, YarnClientApplication}
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException
+import org.apache.hadoop.yarn.security.AMRMTokenIdentifier
 import org.apache.hadoop.yarn.util.Records
 
 import org.apache.spark.{SecurityManager, SparkConf, SparkException}
```
```diff
@@ -54,11 +55,13 @@ import org.apache.spark.deploy.yarn.security.YARNHadoopDelegationTokenManager
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
 import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle, YarnCommandBuilderUtils}
+import org.apache.spark.rpc.RpcEnv
 import org.apache.spark.util.{CallerContext, Utils}
 
 private[spark] class Client(
     val args: ClientArguments,
-    val sparkConf: SparkConf)
+    val sparkConf: SparkConf,
+    val rpcEnv: RpcEnv)
   extends Logging {
 
   import Client._
@@ -69,6 +72,9 @@ private[spark] class Client(
 
   private val isClusterMode = sparkConf.get("spark.submit.deployMode", "client") == "cluster"
 
+  private val isClientUnmanagedAMEnabled = sparkConf.get(YARN_UNMANAGED_AM) && !isClusterMode
+  private var amServiceStarted = false
+
   // AM related configurations
   private val amMemory = if (isClusterMode) {
     sparkConf.get(DRIVER_MEMORY).toInt
```
```diff
@@ -286,7 +292,10 @@ private[spark] class Client(
         "does not support it", e)
       }
     }
+    if (isClientUnmanagedAMEnabled) {
+      // Set Unmanaged AM to true in Application Submission Context
+      appContext.setUnmanagedAM(true)
+    }
     appContext
   }
```
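For background, `setUnmanagedAM(true)` is stock YARN behavior rather than anything Spark-specific: the ResourceManager accepts the application but never allocates or launches an AM container, leaving the submitter to run the AM process itself. A minimal sketch of the underlying YARN client calls (illustration only; not code from this patch):

```scala
import org.apache.hadoop.yarn.client.api.YarnClient
import org.apache.hadoop.yarn.conf.YarnConfiguration

val yarnClient = YarnClient.createYarnClient()
yarnClient.init(new YarnConfiguration())
yarnClient.start()

val app = yarnClient.createApplication()
val context = app.getApplicationSubmissionContext
context.setUnmanagedAM(true)  // RM will not launch an AM container for this app
yarnClient.submitApplication(context)
// The submitter is now responsible for starting the AM itself, authenticating
// it with the AMRMToken from the application report (see monitorApplication below).
```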
```diff
@@ -648,7 +657,9 @@ private[spark] class Client(
     // Clear the cache-related entries from the configuration to avoid them polluting the
     // UI's environment page. This works for client mode; for cluster mode, this is handled
     // by the AM.
-    CACHE_CONFIGS.foreach(sparkConf.remove)
+    if (!isClientUnmanagedAMEnabled) {
+      CACHE_CONFIGS.foreach(sparkConf.remove)
+    }
 
     localResources
   }
```
```diff
@@ -1084,14 +1095,38 @@ private[spark] class Client(
       if (returnOnRunning && state == YarnApplicationState.RUNNING) {
         return createAppReport(report)
       }
+
+      if (state == YarnApplicationState.ACCEPTED && isClientUnmanagedAMEnabled
+          && !amServiceStarted && report.getAMRMToken != null) {
+        amServiceStarted = true
+        startApplicationMasterService(report)
+      }
       lastState = state
     }
 
     // Never reached, but keeps compiler happy
     throw new SparkException("While loop is depleted! This should never happen...")
   }
 
+  private def startApplicationMasterService(report: ApplicationReport) = {
+    // Add AMRMToken to establish connection between RM and AM
+    val token = report.getAMRMToken
+    val amRMToken: org.apache.hadoop.security.token.Token[AMRMTokenIdentifier] =
+      new org.apache.hadoop.security.token.Token[AMRMTokenIdentifier](token
+        .getIdentifier().array(), token.getPassword().array, new Text(
+        token.getKind()), new Text(token.getService()))
+    val currentUGI = UserGroupInformation.getCurrentUser
+    currentUGI.addToken(amRMToken)
+
+    sparkConf.set("spark.yarn.containerId",
+      ContainerId.newContainerId(report.getCurrentApplicationAttemptId, 1).toString)
+    // Start Application Service in a separate thread and continue with application monitoring
+    val amService = new Thread() {
+      override def run(): Unit = new ApplicationMaster(sparkConf, hadoopConf, rpcEnv).run()
+    }
+    amService.setDaemon(true)
+    amService.start()
+  }
+
   private def formatReportDetails(report: ApplicationReport): String = {
     val details = Seq[(String, String)](
       ("client token", getClientToken(report)),
```

Review thread on the token conversion above:

**Contributor:** Why do you need to make this copy? Isn't the

**Author:** `report.getAMRMToken` returns an `org.apache.hadoop.yarn.api.records.Token` instance, but `currentUGI.addToken` expects an `org.apache.hadoop.security.token.Token` instance.
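As the exchange above notes, the copy bridges two unrelated `Token` classes. For reference, Hadoop also ships a helper that performs the same field-by-field conversion; a minimal sketch, assuming `ConverterUtils.convertFromYarn` with this signature is available in the target Hadoop version (passing a null service address leaves the token's own service field untouched, matching the manual copy):

```scala
import java.net.InetSocketAddress

import org.apache.hadoop.security.UserGroupInformation
import org.apache.hadoop.yarn.api.records.ApplicationReport
import org.apache.hadoop.yarn.util.ConverterUtils

def addAmRmToken(report: ApplicationReport): Unit = {
  // convertFromYarn copies identifier, password, kind, and service into an
  // org.apache.hadoop.security.token.Token, as the manual code above does.
  val hadoopToken =
    ConverterUtils.convertFromYarn(report.getAMRMToken, null.asInstanceOf[InetSocketAddress])
  UserGroupInformation.getCurrentUser.addToken(hadoopToken)
}
```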
```diff
@@ -1513,7 +1548,7 @@ private[spark] class YarnClusterApplication extends SparkApplication {
     conf.remove("spark.jars")
     conf.remove("spark.files")
 
-    new Client(new ClientArguments(args), conf).run()
+    new Client(new ClientArguments(args), conf, null).run()
   }
 
 }
```
**YarnSparkHadoopUtil.scala** (`org.apache.spark.deploy.yarn`)

```diff
@@ -183,8 +183,13 @@ object YarnSparkHadoopUtil {
     )
   }
 
-  def getContainerId: ContainerId = {
-    val containerIdString = System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name())
+  def getContainerId(sparkConf: SparkConf): ContainerId = {
+    val containerIdString =
+      if (System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name()) != null) {
+        System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name())
+      } else {
+        sparkConf.get("spark.yarn.containerId")
+      }
     ConverterUtils.toContainerId(containerIdString)
   }
```
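Inside a YARN container the `CONTAINER_ID` environment variable is always present, so that branch serves managed AMs and executors; the `spark.yarn.containerId` fallback serves the unmanaged AM, which runs in the client JVM where YARN never set the variable (the client stores that key in `startApplicationMasterService` above). The same lookup, sketched with `Option` for illustration:

```scala
import org.apache.hadoop.yarn.api.ApplicationConstants
import org.apache.hadoop.yarn.api.records.ContainerId
import org.apache.hadoop.yarn.util.ConverterUtils
import org.apache.spark.SparkConf

def getContainerId(sparkConf: SparkConf): ContainerId = {
  // Prefer the env var YARN exports inside containers; otherwise fall back to
  // the value the unmanaged-AM client put into the SparkConf.
  val idString = Option(System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name()))
    .getOrElse(sparkConf.get("spark.yarn.containerId"))
  ConverterUtils.toContainerId(idString)
}
```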
**config.scala** (`org.apache.spark.deploy.yarn`)

```diff
@@ -229,6 +229,14 @@ package object config {
     .stringConf
     .createOptional
 
+  /* Unmanaged AM configuration. */
+
+  private[spark] val YARN_UNMANAGED_AM = ConfigBuilder("spark.yarn.unmanagedAM")
+    .doc("In client mode, whether to launch the Application Master service as part of the " +
+      "client using an unmanaged AM.")
+    .booleanConf
+    .createWithDefault(false)
+
   /* Security configuration. */
 
   private[spark] val NAMENODES_TO_ACCESS = ConfigBuilder("spark.yarn.access.namenodes")
```
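Taken together, a client-mode application opts in through this single flag; a hypothetical end-to-end configuration (only `spark.yarn.unmanagedAM` comes from this patch, everything else is standard):

```scala
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("UnmanagedAMExample")  // hypothetical application name
  .setMaster("yarn")
  .set("spark.submit.deployMode", "client")
  .set("spark.yarn.unmanagedAM", "true")  // run the AM inside the client JVM

// The Client submits the app with setUnmanagedAM(true) and, once the RM
// reports ACCEPTED with an AMRMToken, starts the AM in a daemon thread.
val sc = new SparkContext(conf)
```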
Review thread on the auxiliary `def this` constructor:

**Contributor:** See above constructor for multi-line args style.

**Author:** Removed this constructor as part of the refactor from the comment below.