-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21146] [CORE] Master/Worker should handle and shutdown when any thread gets UncaughtException #18357
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-21146] [CORE] Master/Worker should handle and shutdown when any thread gets UncaughtException #18357
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,7 +38,7 @@ import org.apache.spark.deploy.worker.ui.WorkerWebUI | |
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.metrics.MetricsSystem | ||
| import org.apache.spark.rpc._ | ||
| import org.apache.spark.util.{ThreadUtils, Utils} | ||
| import org.apache.spark.util.{SparkUncaughtExceptionHandler, ThreadUtils, Utils} | ||
|
|
||
| private[deploy] class Worker( | ||
| override val rpcEnv: RpcEnv, | ||
|
|
@@ -737,6 +737,7 @@ private[deploy] object Worker extends Logging { | |
| val ENDPOINT_NAME = "Worker" | ||
|
|
||
| def main(argStrings: Array[String]) { | ||
| Thread.setDefaultUncaughtExceptionHandler(new SparkUncaughtExceptionHandler(false)) | ||
|
||
| Utils.initDaemon(log) | ||
| val conf = new SparkConf | ||
| val args = new WorkerArguments(argStrings, conf) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,29 +20,29 @@ package org.apache.spark.util | |
| import org.apache.spark.internal.Logging | ||
|
|
||
| /** | ||
| * The default uncaught exception handler for Executors terminates the whole process, to avoid | ||
| * getting into a bad state indefinitely. Since Executors are relatively lightweight, it's better | ||
| * to fail fast when things go wrong. | ||
| * The default uncaught exception handler for Spark daemons. It terminates the whole process for | ||
| * any Errors, and also terminates the process for Exceptions when the exitOnException flag is true. | ||
| */ | ||
| private[spark] object SparkUncaughtExceptionHandler | ||
| private[spark] class SparkUncaughtExceptionHandler(val exitOnException: Boolean = true) | ||
|
||
| extends Thread.UncaughtExceptionHandler with Logging { | ||
|
|
||
| override def uncaughtException(thread: Thread, exception: Throwable) { | ||
| try { | ||
| // Make it explicit that uncaught exceptions are thrown when container is shutting down. | ||
| // Make it explicit that uncaught exceptions are thrown when process is shutting down. | ||
| // It will help users when they analyze the executor logs | ||
| val inShutdownMsg = if (ShutdownHookManager.inShutdown()) "[Container in shutdown] " else "" | ||
| val errMsg = "Uncaught exception in thread " | ||
| logError(inShutdownMsg + errMsg + thread, exception) | ||
|
|
||
| // We may have been called from a shutdown hook. If so, we must not call System.exit(). | ||
| // (If we do, we will deadlock.) | ||
| if (!ShutdownHookManager.inShutdown()) { | ||
| val errMsg = "Uncaught exception in thread " + thread | ||
| if (ShutdownHookManager.inShutdown()) { | ||
| logError("[Process in shutdown] " + errMsg, exception) | ||
| } else if (exception.isInstanceOf[Error] || | ||
| (!exception.isInstanceOf[Error] && exitOnException)) { | ||
| logError(errMsg + ". Shutting down now..", exception) | ||
| if (exception.isInstanceOf[OutOfMemoryError]) { | ||
| System.exit(SparkExitCode.OOM) | ||
| } else { | ||
| System.exit(SparkExitCode.UNCAUGHT_EXCEPTION) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The current changes are too much. Could you rename |
||
| } | ||
| } else { | ||
| logError(errMsg, exception) | ||
| } | ||
| } catch { | ||
| case oom: OutOfMemoryError => Runtime.getRuntime.halt(SparkExitCode.OOM) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you keep this code? It is unlikely to happen, but since the code is already there, it's better not to change it.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @zsxwing, this code is still there, but I moved the try/catch to the block where we invoke System.exit. Do you mean moving the whole body of uncaughtException() into the try block and keeping the catch block?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yes.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @zsxwing Thanks for the clarification. |
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: `false` => `exitOnException = false` (use a named argument for the boolean literal)