-
Notifications
You must be signed in to change notification settings - Fork 29.3k
[SPARK-31486] [CORE] spark.submit.waitAppCompletion flag to control spark-submit exit in Standalone Cluster Mode #28258
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
13ea149
68d76d0
34c7d26
a93ce76
d5eded1
8eef373
0918106
e225495
20f1bd6
9050a08
45c9817
743d93d
fe142a8
27a81c9
0e152f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -61,6 +61,11 @@ private class ClientEndpoint( | |||||||
|
|
||||||||
| private val lostMasters = new HashSet[RpcAddress] | ||||||||
| private var activeMasterEndpoint: RpcEndpointRef = null | ||||||||
| private val waitAppCompletion = conf.getBoolean("spark.standalone.submit.waitAppCompletion", | ||||||||
| false) | ||||||||
| private val REPORT_DRIVER_STATUS_INTERVAL = 1000 | ||||||||
| private var submittedDriverID = "" | ||||||||
|
|
||||||||
|
|
||||||||
| private def getProperty(key: String, conf: SparkConf): Option[String] = { | ||||||||
| sys.props.get(key).orElse(conf.getOption(key)) | ||||||||
|
|
@@ -124,44 +129,53 @@ private class ClientEndpoint( | |||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| /* Find out driver status then exit the JVM */ | ||||||||
| /** | ||||||||
| * Find out driver status then exit the JVM. If the waitAppCompletion is set to true, monitors | ||||||||
| * the application until it finishes, fails or is killed. | ||||||||
| */ | ||||||||
| def pollAndReportStatus(driverId: String): Unit = { | ||||||||
| // Since ClientEndpoint is the only RpcEndpoint in the process, blocking the event loop thread | ||||||||
| // is fine. | ||||||||
| logInfo("... waiting before polling master for driver state") | ||||||||
| Thread.sleep(5000) | ||||||||
| logInfo("... polling master for driver state") | ||||||||
| val statusResponse = | ||||||||
| activeMasterEndpoint.askSync[DriverStatusResponse](RequestDriverStatus(driverId)) | ||||||||
| if (statusResponse.found) { | ||||||||
| logInfo(s"State of $driverId is ${statusResponse.state.get}") | ||||||||
| // Worker node, if present | ||||||||
| (statusResponse.workerId, statusResponse.workerHostPort, statusResponse.state) match { | ||||||||
| case (Some(id), Some(hostPort), Some(DriverState.RUNNING)) => | ||||||||
| logInfo(s"Driver running on $hostPort ($id)") | ||||||||
| case _ => | ||||||||
| } | ||||||||
| // Exception, if present | ||||||||
| statusResponse.exception match { | ||||||||
| case Some(e) => | ||||||||
| logError(s"Exception from cluster was: $e") | ||||||||
| e.printStackTrace() | ||||||||
| System.exit(-1) | ||||||||
| case _ => | ||||||||
| System.exit(0) | ||||||||
| val statusResponse = | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: indents.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, updated the indentation in the latest commit. |
||||||||
| activeMasterEndpoint.askSync[DriverStatusResponse](RequestDriverStatus(driverId)) | ||||||||
| if (statusResponse.found) { | ||||||||
| logInfo(s"State of $driverId is ${statusResponse.state.get}") | ||||||||
| // Worker node, if present | ||||||||
| (statusResponse.workerId, statusResponse.workerHostPort, statusResponse.state) match { | ||||||||
| case (Some(id), Some(hostPort), Some(DriverState.RUNNING)) => | ||||||||
| logInfo(s"Driver running on $hostPort ($id)") | ||||||||
| case _ => | ||||||||
| } | ||||||||
| // Exception, if present | ||||||||
| statusResponse.exception match { | ||||||||
| case Some(e) => | ||||||||
| logError(s"Exception from cluster was: $e") | ||||||||
| e.printStackTrace() | ||||||||
| System.exit(-1) | ||||||||
| case _ => | ||||||||
| if (!waitAppCompletion) { | ||||||||
| logInfo(s"spark-submit not configured to wait for completion, " + | ||||||||
| s"exiting spark-submit JVM.") | ||||||||
| System.exit(0) | ||||||||
| } else { | ||||||||
| asyncSendToMasterAndForwardReply[DriverStatusResponse](RequestDriverStatus(driverId)) | ||||||||
| } | ||||||||
| } | ||||||||
| } else { | ||||||||
| logError(s"ERROR: Cluster master did not recognize $driverId") | ||||||||
| System.exit(-1) | ||||||||
| } | ||||||||
| } else { | ||||||||
| logError(s"ERROR: Cluster master did not recognize $driverId") | ||||||||
| System.exit(-1) | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| override def receive: PartialFunction[Any, Unit] = { | ||||||||
|
|
||||||||
| case SubmitDriverResponse(master, success, driverId, message) => | ||||||||
| logInfo(message) | ||||||||
| if (success) { | ||||||||
| activeMasterEndpoint = master | ||||||||
| submittedDriverID = driverId.get | ||||||||
| pollAndReportStatus(driverId.get) | ||||||||
| } else if (!Utils.responseFromBackup(message)) { | ||||||||
| System.exit(-1) | ||||||||
|
|
@@ -176,6 +190,25 @@ private class ClientEndpoint( | |||||||
| } else if (!Utils.responseFromBackup(message)) { | ||||||||
| System.exit(-1) | ||||||||
| } | ||||||||
|
|
||||||||
| case DriverStatusResponse(found, state, _, _, _) => | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be better if we could do some refactor on
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Ngone51 Thanks for your feedback.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
we use this: (but the initial delay need to change) in this way, submitting or killing drivers will still use it only for one time when
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Scheduling to monitor driver status is done only in case of submit and not in kill as of now. So we may need to explicitly send a message to monitor driver status after 5 seconds delay in case of kill.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's why I said we need to change the delay (e.g. to 5s) instead of 0 for both submitting and killing.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that we can change the delay to 5 seconds to keep it consistent with current logic. My question is that should we add the following block in
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we don't need to add
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's in the case "launch" as of now. I will move it to a global place and refactor the code. Thanks for your suggestions.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh, sorry miss that. yea, thank you!
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Ngone51 I have refactored the code as suggested. Kindly review it again. Thanks. |
||||||||
| if (found) { | ||||||||
| state.get match { | ||||||||
| case DriverState.FINISHED | DriverState.FAILED | | ||||||||
| DriverState.ERROR | DriverState.KILLED => | ||||||||
| logInfo(s"State of $submittedDriverID is ${state.get}, " + | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: s"State of driver $submittedDriverID ..."
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, updated in the latest commit. |
||||||||
| s"exiting spark-submit JVM.") | ||||||||
| System.exit(0) | ||||||||
| case _ => | ||||||||
| Thread.sleep(REPORT_DRIVER_STATUS_INTERVAL) | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
spark/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala Lines 137 to 139 in 9faad07
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Ngone51 Thanks for reviewing. I have updated to use the task scheduler to do the same. Could you kindly review it again and please let me know your comments? |
||||||||
| logInfo(s"State of $submittedDriverID is ${state.get}, " + | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: s"State of driver $submittedDriverID ..."
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since status polling will happen every second, I'm afraid logs can be too verbose. We can log it after a constant polling times, e.g. log every 60 times.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This would produce too many logs; please consider using 'logDebug'.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, I have changed it to use |
||||||||
| s"continue monitoring driver status.") | ||||||||
| asyncSendToMasterAndForwardReply[DriverStatusResponse]( | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When you submit too many applications at the same time, this could lead to a heavy communication burden on the driver. I would suggest checking less frequently (e.g. increasing the interval to 10s).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jiangxb1987 Thanks for reviewing. I have changed it to 10 seconds and took care of your other comments. Kindly review the PR again. |
||||||||
| RequestDriverStatus(submittedDriverID)) | ||||||||
| } | ||||||||
| } else { | ||||||||
| System.exit(-1) | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| override def onDisconnected(remoteAddress: RpcAddress): Unit = { | ||||||||
|
|
||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -374,6 +374,25 @@ To run an interactive Spark shell against the cluster, run the following command | |
|
|
||
| You can also pass an option `--total-executor-cores <numCores>` to control the number of cores that spark-shell uses on the cluster. | ||
|
|
||
| # Client Properties | ||
|
|
||
| Spark applications support the following configuration properties specific to standalone mode: | ||
|
|
||
| <table class="table"> | ||
|
akshatb1 marked this conversation as resolved.
|
||
| <tr><th style="width:21%">Property Name</th><th>Default Value</th><th>Meaning</th><th>Since Version</th></tr> | ||
| <tr> | ||
| <td><code>spark.standalone.submit.waitAppCompletion</code></td> | ||
| <td><code>false</code></td> | ||
| <td> | ||
| In standalone cluster mode, controls whether the client waits to exit until the application completes. | ||
| If set to <code>true</code>, the client process will stay alive polling the application's status. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, updated to |
||
| Otherwise, the client process will exit after submission. | ||
| </td> | ||
| <td>3.1.0</td> | ||
| </tr> | ||
| </table> | ||
|
|
||
|
|
||
| # Launching Spark Applications | ||
|
|
||
| The [`spark-submit` script](submitting-applications.html) provides the most straightforward way to | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.