[SPARK-4751] Dynamic allocation in standalone mode #7532
Changes from 6 commits
**core/src/main/scala/org/apache/spark/SparkContext.scala**
```diff
@@ -531,8 +531,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     val dynamicAllocationEnabled = _conf.getBoolean("spark.dynamicAllocation.enabled", false)
     _executorAllocationManager =
       if (dynamicAllocationEnabled) {
-        assert(supportDynamicAllocation,
-          "Dynamic allocation of executors is currently only supported in YARN and Mesos mode")
         Some(new ExecutorAllocationManager(this, listenerBus, _conf))
       } else {
         None
```
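With the YARN/Mesos assertion gone, a standalone application can opt in to dynamic allocation purely through configuration. A minimal sketch of such a setup (the master URL and executor bounds here are illustrative placeholders, not values from this PR):

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical standalone setup; spark://host:7077 is a placeholder master URL.
val conf = new SparkConf()
  .setMaster("spark://host:7077")
  .setAppName("dynamic-allocation-demo")
  .set("spark.dynamicAllocation.enabled", "true")
  // Dynamic allocation relies on the external shuffle service so that
  // shuffle files survive executor removal.
  .set("spark.shuffle.service.enabled", "true")
  .set("spark.dynamicAllocation.minExecutors", "1")
  .set("spark.dynamicAllocation.maxExecutors", "8")

val sc = new SparkContext(conf)
```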
```diff
@@ -1361,17 +1359,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     postEnvironmentUpdate()
   }

-  /**
-   * Return whether dynamically adjusting the amount of resources allocated to
-   * this application is supported. This is currently only available for YARN
-   * and Mesos coarse-grained mode.
-   */
-  private[spark] def supportDynamicAllocation: Boolean = {
-    (master.contains("yarn")
-      || master.contains("mesos")
-      || _conf.getBoolean("spark.dynamicAllocation.testing", false))
-  }
-
   /**
    * :: DeveloperApi ::
    * Register a listener to receive up-calls from events that happen during execution.
```
```diff
@@ -1387,8 +1374,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * This is currently only supported in YARN mode. Return whether the request is received.
    */
   private[spark] override def requestTotalExecutors(numExecutors: Int): Boolean = {
-    assert(supportDynamicAllocation,
-      "Requesting executors is currently only supported in YARN and Mesos modes")
     schedulerBackend match {
       case b: CoarseGrainedSchedulerBackend =>
         b.requestTotalExecutors(numExecutors)
```
```diff
@@ -1405,8 +1390,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    */
   @DeveloperApi
   override def requestExecutors(numAdditionalExecutors: Int): Boolean = {
-    assert(supportDynamicAllocation,
-      "Requesting executors is currently only supported in YARN and Mesos modes")
     schedulerBackend match {
       case b: CoarseGrainedSchedulerBackend =>
         b.requestExecutors(numAdditionalExecutors)
```

**Contributor:** outdated comments for this function? https://github.com/apache/spark/pull/7532/files#diff-364713d7776956cb8b0a771e9b62f82dR1389
```diff
@@ -1429,8 +1412,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    */
   @DeveloperApi
   override def killExecutors(executorIds: Seq[String]): Boolean = {
-    assert(supportDynamicAllocation,
-      "Killing executors is currently only supported in YARN and Mesos modes")
     schedulerBackend match {
       case b: CoarseGrainedSchedulerBackend =>
         b.killExecutors(executorIds)
```

**Contributor:** outdated comments... BTW, how do you comment on a line without changes?
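Both of the entry points above are public `@DeveloperApi` methods on `SparkContext`, so once the assertions are gone a standalone application can call them directly. A hedged sketch (`sc` and the executor IDs are assumptions, not code from this PR):

```scala
import org.apache.spark.SparkContext

// Sketch: request two extra executors, then kill two by ID. Executor IDs
// are illustrative; real IDs come from the running cluster.
def rebalance(sc: SparkContext): Unit = {
  val granted = sc.requestExecutors(numAdditionalExecutors = 2)
  val killed = sc.killExecutors(Seq("1", "2"))
  println(s"request acknowledged: $granted, kill acknowledged: $killed")
}
```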
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
```diff
@@ -197,6 +197,22 @@ private[spark] class AppClient(
         sendToMaster(UnregisterApplication(appId))
         context.reply(true)
         stop()
+
+      case r: RequestExecutors =>
+        master match {
+          case Some(m) => context.reply(m.askWithRetry[Boolean](r))
+          case None =>
+            logWarning("Attempted to request executors before registering with Master.")
+            context.reply(false)
+        }
+
+      case k: KillExecutors =>
+        master match {
+          case Some(m) => context.reply(m.askWithRetry[Boolean](k))
+          case None =>
+            logWarning("Attempted to kill executors before registering with Master.")
+            context.reply(false)
+        }
     }

   override def onDisconnected(address: RpcAddress): Unit = {
```
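The two messages handled above are plain deploy-protocol case classes; their shape can be read off the call sites later in this file (`RequestExecutors(appId, requestedTotal)` and `KillExecutors(appId, executorIds)`). A sketch of what the message definitions would look like (field names inferred from those call sites, not copied from the actual DeployMessages source):

```scala
// Inferred shape of the new client-to-master messages in this PR.
case class RequestExecutors(appId: String, requestedTotal: Int)
case class KillExecutors(appId: String, executorIds: Seq[String])
```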
```diff
@@ -256,4 +272,33 @@ private[spark] class AppClient(
       endpoint = null
     }
   }
```
```diff
+
+  /**
+   * Request executors from the Master by specifying the total number desired,
+   * including existing pending and running executors.
+   *
+   * @return whether the request is acknowledged.
+   */
+  def requestTotalExecutors(requestedTotal: Int): Boolean = {
+    if (endpoint != null && appId != null) {
+      endpoint.askWithRetry[Boolean](RequestExecutors(appId, requestedTotal))
+    } else {
+      logWarning("Attempted to request executors before driver fully initialized.")
+      false
+    }
+  }
```

**Contributor:** is it necessary to validate the value of `requestedTotal`?

**Contributor (Author):** this is already done in …
```diff
+
+  /**
+   * Kill the given list of executors through the Master.
+   * @return whether the kill request is acknowledged.
+   */
+  def killExecutors(executorIds: Seq[String]): Boolean = {
+    if (endpoint != null && appId != null) {
+      endpoint.askWithRetry[Boolean](KillExecutors(appId, executorIds))
+    } else {
+      logWarning("Attempted to kill executors before driver fully initialized.")
+      false
+    }
+  }
 }
```
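For context, here is a hedged sketch of how a caller such as the standalone scheduler backend might drive these two helpers; the wrapper class and its method names are assumptions for illustration:

```scala
import org.apache.spark.deploy.client.AppClient

// Hypothetical caller holding the AppClient; mirrors how a scheduler
// backend would forward allocation requests to the Master.
class StandaloneBackendSketch(client: AppClient) {
  // Ask the Master to converge on `total` executors overall.
  def scaleTo(total: Int): Boolean = {
    require(total >= 0, s"total executor count must be non-negative: $total")
    client.requestTotalExecutors(total)
  }

  // Ask the Master to kill the given executors; no-op on an empty list.
  def removeExecutors(ids: Seq[String]): Boolean =
    ids.nonEmpty && client.killExecutors(ids)
}
```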
**core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala**
```diff
@@ -22,7 +22,6 @@ import java.util.Date
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer

-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.deploy.ApplicationDescription
 import org.apache.spark.rpc.RpcEndpointRef
 import org.apache.spark.util.Utils
```
```diff
@@ -43,6 +42,18 @@ private[spark] class ApplicationInfo(
   @transient var endTime: Long = _
   @transient var appSource: ApplicationSource = _

+  // A cap on the number of executors this application can have at any given time.
+  // By default, this is infinite. Only after the first allocation request is issued
+  // by the application will this be set to a finite value.
+  @transient var executorLimit: Int = _
```

**Contributor:** You could initialize it here to `Integer.MAX_VALUE` directly?

**Contributor (Author):** Because this is transient, we need to reinitialize it when we deserialize it (see `init()`).

**Contributor (Author):** (see my comment below)

**Contributor:** Oh, I didn't see the `init()` call.
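The pattern the author describes — a `@transient` field restored by `init()` after Java deserialization — looks roughly like this. This is a generic sketch of the technique, not the file's actual code:

```scala
import java.io.{IOException, ObjectInputStream}

// @transient fields come back as 0/null after Java deserialization, so the
// class reinitializes them in init(), called both at construction time and
// from the readObject deserialization hook.
class TransientLimitSketch extends Serializable {
  @transient var executorLimit: Int = _

  init()

  private def init(): Unit = {
    executorLimit = Integer.MAX_VALUE
  }

  @throws(classOf[IOException])
  private def readObject(in: ObjectInputStream): Unit = {
    in.defaultReadObject()
    init()
  }
}
```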
```diff
+
+  // A set of workers on which this application cannot launch executors.
+  // This is used to handle kill requests when `spark.executor.cores` is NOT set. In this mode,
+  // at most one executor from this application can be run on each worker. When an executor is
+  // killed, its worker is added to the blacklist to avoid having the master immediately schedule
+  // a new executor on the worker.
+  @transient private var blacklistedWorkers: mutable.HashSet[String] = _
```

**Contributor:** I still think it doesn't need to be both a … My 2c.

```diff
+
   @transient private var nextExecutorId: Int = _

   init()
```
```diff
@@ -60,6 +71,8 @@ private[spark] class ApplicationInfo(
     appSource = new ApplicationSource(this)
     nextExecutorId = 0
     removedExecutors = new ArrayBuffer[ExecutorDesc]
+    executorLimit = Integer.MAX_VALUE
+    blacklistedWorkers = new mutable.HashSet[String]
   }

   private def newExecutorId(useID: Option[Int] = None): Int = {
```
```diff
@@ -96,6 +109,47 @@ private[spark] class ApplicationInfo(

   private[master] def coresLeft: Int = requestedCores - coresGranted

+  /**
+   * Return the number of executors waiting to be scheduled once space frees up.
+   *
+   * This is only defined if the application explicitly set the executor limit. For instance,
+   * if an application asks for 8 executors but there is only space for 5, then there will be
+   * 3 waiting executors.
+   */
+  private[master] def numWaitingExecutors: Int = {
+    if (executorLimit != Integer.MAX_VALUE) {
+      math.max(0, executorLimit - executors.size)
+    } else {
+      0
+    }
+  }
```
```diff
+
+  /**
+   * Add a worker to the blacklist, called when the executor running on the worker is killed.
+   * This is used only if cores per executor is not set.
+   */
+  private[master] def blacklistWorker(workerId: String): Unit = {
+    blacklistedWorkers += workerId
+  }
+
+  /**
+   * Remove workers from the blacklist, called when the application requests new executors.
+   * This is used only if cores per executor is not set.
+   */
+  private[master] def removeFromBlacklist(numWorkers: Int): Unit = {
+    blacklistedWorkers.take(numWorkers).foreach { workerId =>
+      blacklistedWorkers.remove(workerId)
+    }
+  }
+
+  /**
+   * Return whether the specified worker is blacklisted.
+   * This is used only if cores per executor is not set.
+   */
+  private[master] def isBlacklisted(workerId: String): Boolean = {
+    blacklistedWorkers.contains(workerId)
+  }
+
   private var _retryCount = 0

   private[master] def retryCount = _retryCount
```
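Taken together, these methods implement a small kill/re-request cycle on a `mutable.HashSet`. A self-contained sketch of that flow (worker IDs are placeholders; `ApplicationInfo`'s constructor is elided, so a bare set stands in for `blacklistedWorkers`):

```scala
import scala.collection.mutable

// Stand-in for the blacklist behavior added to ApplicationInfo above.
object BlacklistFlowSketch extends App {
  val blacklisted = new mutable.HashSet[String]

  // An executor on worker-1 is killed: blacklist its worker so the master
  // does not immediately reschedule a new executor there.
  blacklisted += "worker-1"
  assert(blacklisted.contains("worker-1"))

  // The application requests more executors: un-blacklist up to that many
  // workers so they become schedulable again (mirrors removeFromBlacklist).
  val numRequested = 1
  blacklisted.take(numRequested).toSeq.foreach(blacklisted.remove)
  assert(!blacklisted.contains("worker-1"))
}
```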
**Contributor:** outdated comments?

**Contributor (Author):** yup, thanks