[SPARK-12864][YARN] initialize executorIdCounter after ApplicationMaster killed for max n… #10794
Changes to CoarseGrainedSchedulerBackend.scala (org.apache.spark.scheduler.cluster):
```diff
@@ -78,6 +78,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: RpcEnv)
   // Executors that have been lost, but for which we don't yet know the real exit reason.
   protected val executorsPendingLossReason = new HashSet[String]
 
+  // The num of current max ExecutorId used to re-register appMaster
+  private var currentExecutorIdCounter = 0
+
   class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
     extends ThreadSafeRpcEndpoint with Logging {
```
|
|
```diff
@@ -155,6 +158,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: RpcEnv)
           // in this block are read when requesting executors
           CoarseGrainedSchedulerBackend.this.synchronized {
             executorDataMap.put(executorId, data)
+            if (currentExecutorIdCounter < Integer.parseInt(executorId)) {
+              currentExecutorIdCounter = Integer.parseInt(executorId)
+            }
             if (numPendingExecutors > 0) {
               numPendingExecutors -= 1
               logDebug(s"Decremented number of pending executors ($numPendingExecutors left)")
```

andrewor14 (Contributor), on the added lines:
this is kind of awkward. You don't need to keep track of another variable; just compute the max executor ID when the AM asks for it. You already have all the information you need in `executorDataMap`.

PR author (Contributor):
@andrewor14 Thanks for reviewing. As I understand it, we can't get the max executor ID from `executorDataMap`, because when the AM fails all the executors have already been disconnected and removed by that time, as the code in the method …

vanzin (Contributor):
yes, with dynamic allocation, for example, the executor with the max known id may be gone already. minor: …

PR author (Contributor):
@vanzin Thanks for your comments. I will optimize it.
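To make the trade-off in the thread above concrete, here is a small self-contained sketch (hypothetical names, not code from the PR): taking the max over the currently registered executors loses information once those executors have been removed, for example after an AM failure with dynamic allocation, whereas a running counter keeps the largest ID ever seen.

```scala
// Hypothetical sketch, not from the PR: running counter vs. max over live executors.
import scala.collection.mutable

object ExecutorIdTrackingSketch {
  private val registered = mutable.HashSet[String]() // stand-in for executorDataMap's keys
  private var counter = 0                            // largest executor ID ever seen

  def register(executorId: String): Unit = {
    registered += executorId
    // Mirrors the patch: the counter only grows, so it survives executor removal.
    if (counter < Integer.parseInt(executorId)) {
      counter = Integer.parseInt(executorId)
    }
  }

  def remove(executorId: String): Unit = registered -= executorId

  // Reviewer's alternative: compute the max on demand from live executors.
  def maxFromLiveExecutors: Int =
    if (registered.isEmpty) 0 else registered.map(_.toInt).max

  def main(args: Array[String]): Unit = {
    Seq("1", "2", "3").foreach(register)
    Seq("1", "2", "3").foreach(remove)  // e.g. all executors lost when the AM dies
    println(maxFromLiveExecutors)       // 0 -- the information is gone
    println(counter)                    // 3 -- what a restarted AM actually needs
  }
}
```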
|
|
```diff
@@ -184,6 +190,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: RpcEnv)
 
       case RetrieveSparkProps =>
         context.reply(sparkProperties)
+
+      case RetrieveCurrentExecutorIdCounter =>
+        context.reply(currentExecutorIdCounter)
     }
 
     // Make fake resource offers on all executors
```
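The handler above pattern-matches on `RetrieveCurrentExecutorIdCounter`, whose declaration is not shown in this section. Judging from the import added later in the diff (`CoarseGrainedClusterMessages.RetrieveCurrentExecutorIdCounter`), it is presumably a new case object alongside the existing driver messages. A self-contained sketch of that shape (the enclosing object is simplified, not Spark's actual file):

```scala
// Sketch only: in Spark the message would live inside the sealed hierarchy of
// CoarseGrainedClusterMessages; this standalone version just shows the pattern.
object ClusterMessagesSketch {
  sealed trait CoarseGrainedClusterMessage extends Serializable

  // Existing message referenced in the hunk above.
  case object RetrieveSparkProps extends CoarseGrainedClusterMessage

  // New message: the driver replies with the largest executor ID it has seen,
  // so a restarted ApplicationMaster can continue numbering without collisions.
  case object RetrieveCurrentExecutorIdCounter extends CoarseGrainedClusterMessage
}
```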
Changes to YarnAllocator.scala (org.apache.spark.deploy.yarn):
|
|
```diff
@@ -33,11 +33,14 @@ import org.apache.hadoop.yarn.util.RackResolver
 import org.apache.log4j.{Level, Logger}
 
 import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
-import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef}
+import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv}
 import org.apache.spark.scheduler.{ExecutorExited, ExecutorLossReason}
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor
+import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RetrieveCurrentExecutorIdCounter
 import org.apache.spark.util.ThreadUtils
+import org.apache.spark.util.Utils
 
 /**
  * YarnAllocator is charged with requesting containers from the YARN ResourceManager and deciding
```
|
```diff
@@ -168,6 +171,24 @@ private[yarn] class YarnAllocator(
       .toSeq
   }
 
+  /**
+   * Init `executorIdCounter`
+   */
+  def initExecutorIdCounter(): Unit = {
+    val port = sparkConf.getInt("spark.yarn.am.port", 0)
+    SparkHadoopUtil.get.runAsSparkUser { () =>
+      val init = RpcEnv.create(
+        "executorIdCounterInit",
+        Utils.localHostName,
+        port,
+        sparkConf,
+        new SecurityManager(sparkConf))
+      val driver = init.setupEndpointRefByURI(driverUrl)
+
+      executorIdCounter = driver.askWithRetry[Integer](RetrieveCurrentExecutorIdCounter)
+      init.shutdown()
+    }
+  }
+
   /**
    * Request as many executors from the ResourceManager as needed to reach the desired total. If
    * the requested total is smaller than the current number of running executors, no executors will
```
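The diff does not show where `initExecutorIdCounter` is called from; presumably a (re)started ApplicationMaster would invoke it once after creating the allocator and before requesting new containers. A hedged sketch of that ordering follows (`Allocator` is a stand-in trait for illustration, not Spark's `YarnAllocator`):

```scala
// Hypothetical call-site sketch; only the ordering is the point.
object AmStartupSketch {
  trait Allocator {
    def initExecutorIdCounter(): Unit // ask the driver for its current max executor ID
    def allocateResources(): Unit     // then request/launch new containers
  }

  def onAmRegistered(allocator: Allocator): Unit = {
    // Sync first, so IDs minted for new containers continue after the largest
    // ID the driver handed out before the previous AM died.
    allocator.initExecutorIdCounter()
    allocator.allocateResources()
  }
}
```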
andrewor14 (Contributor), commenting on `RetrieveCurrentExecutorIdCounter`:
I would call this `RetrieveMaxExecutorId`. Without context the reader has no idea what `executorIdCounter` is.

PR author (Contributor):
@andrewor14 Yeah, you are right. I will fix it soon.
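A sketch of what the suggested rename would touch (the follow-up commit itself is not part of this section; names mirror the reviewer's suggestion):

```scala
// Hedged sketch of the reviewer's suggested rename; not code from the PR.
object RenameSketch {
  sealed trait CoarseGrainedClusterMessage extends Serializable

  // RetrieveCurrentExecutorIdCounter -> RetrieveMaxExecutorId: the name now says
  // what the caller gets back, without requiring knowledge of the AM-side
  // `executorIdCounter` field. The driver would still reply with the same value.
  case object RetrieveMaxExecutorId extends CoarseGrainedClusterMessage
}
```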