-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Watchdog: use abort action as a default if killing is enabled. #13523
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
3b8841c
931b0e7
8eeca92
ab416ac
8a27924
11b0e04
0e69454
64e7a62
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| This contains watchdog actions that are part of core Envoy, and therefore cannot | ||
| be in the extensions directory. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| syntax = "proto3"; | ||
|
|
||
| package envoy.watchdog.v3alpha; | ||
|
|
||
| import "google/protobuf/duration.proto"; | ||
|
|
||
| import "udpa/annotations/status.proto"; | ||
| import "udpa/annotations/versioning.proto"; | ||
| import "validate/validate.proto"; | ||
|
|
||
| option java_package = "io.envoyproxy.envoy.watchdog.v3alpha"; | ||
| option java_outer_classname = "AbortActionProto"; | ||
| option java_multiple_files = true; | ||
| option (udpa.annotations.file_status).work_in_progress = true; | ||
| option (udpa.annotations.file_status).package_version_status = ACTIVE; | ||
|
|
||
| // [#protodoc-title: Watchdog Action that kills a stuck thread to kill the process.] | ||
|
|
||
| // A GuardDogAction that terminates the process by killing the stuck | ||
| // thread. Killing the stuck thread gives easier access to its call stack, | ||
| // since the signal handlers run on that thread. By default this action is | ||
| // registered to run as the last watchdog action on KILL and MULTIKILL | ||
| // events, if those events are enabled. | ||
| message AbortActionConfig { | ||
| // How long to wait for the thread to respond to the thread kill function | ||
| // before killing the process from this action. This is a blocking action. | ||
| // Defaults to 5 seconds when unset. | ||
| google.protobuf.Duration wait_duration = 1; | ||
| } | ||
This file was deleted.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| load( | ||
| "//bazel:envoy_build_system.bzl", | ||
| "envoy_cc_library", | ||
| "envoy_package", | ||
| ) | ||
|
|
||
| licenses(["notice"]) # Apache 2 | ||
|
|
||
| envoy_package() | ||
|
|
||
| envoy_cc_library( | ||
| name = "terminate_thread_lib", | ||
| srcs = ["terminate_thread.cc"], | ||
| hdrs = ["terminate_thread.h"], | ||
| deps = [ | ||
| "//include/envoy/thread:thread_interface", | ||
| "//source/common/common:minimal_logger_lib", | ||
| ], | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| #include "common/thread/terminate_thread.h" | ||
|
|
||
| #include <sys/types.h> | ||
|
|
||
| #include <csignal> | ||
|
|
||
| #include "common/common/logger.h" | ||
|
|
||
| namespace Envoy { | ||
| namespace Thread { | ||
| namespace { | ||
| #if defined(__linux__) | ||
| // Linux build: convert the generic 64-bit thread id to pid_t. | ||
| pid_t toPlatformTid(int64_t tid) { | ||
| return static_cast<pid_t>(tid); | ||
| } | ||
| #elif defined(__APPLE__) | ||
| // Apple build: convert the generic 64-bit thread id to an unsigned 64-bit value. | ||
| uint64_t toPlatformTid(int64_t tid) { | ||
| return static_cast<uint64_t>(tid); | ||
| } | ||
| #endif | ||
| } // namespace | ||
|
|
||
| bool terminateThread(const ThreadId& tid) { | ||
| #ifdef WIN32 | ||
| // Terminating a thread is not implemented for Windows: log the problem and | ||
| // report failure to the caller. | ||
| ENVOY_LOG_MISC(error, "Windows is currently unsupported for terminateThread."); | ||
| return false; | ||
| #else | ||
| // Every non-Windows target is assumed to be POSIX-compatible. Send SIGABRT to | ||
| // the platform thread id; the call succeeded only if kill() returned 0. | ||
| const auto platform_tid = toPlatformTid(tid.getId()); | ||
| return kill(platform_tid, SIGABRT) == 0; | ||
| #endif | ||
| } | ||
|
|
||
| } // namespace Thread | ||
| } // namespace Envoy |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| #pragma once | ||
|
|
||
| #include "envoy/thread/thread.h" | ||
|
|
||
| namespace Envoy { | ||
| namespace Thread { | ||
| /** | ||
| * Tries to terminate the process by killing the thread specified by | ||
| * the ThreadId. The implementation is platform dependent and currently | ||
| * only works on platforms that support SIGABRT. | ||
| * | ||
| * Returns true if the platform specific function to terminate the thread | ||
| * succeeded (i.e. kill() == 0). If the platform is currently unsupported, this will return false. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The function does not return the result of the platform-specific function; it returns true if the platform-specific function succeeded. See the implementation, where the return is: kill() == 0;
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good catch. I've changed it to: "Returns true if the platform specific function to terminate the thread succeeded (i.e. kill() == 0). If the platform is currently unsupported, this will return false." |
||
| */ | ||
| bool terminateThread(const ThreadId& tid); | ||
|
mattklein123 marked this conversation as resolved.
|
||
|
|
||
| } // namespace Thread | ||
| } // namespace Envoy | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| This contains watchdog actions that are part of core Envoy, and therefore cannot | ||
| be in the extensions directory. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| #include "common/watchdog/abort_action.h" | ||
|
|
||
| #include "envoy/thread/thread.h" | ||
|
|
||
| #include "common/common/assert.h" | ||
| #include "common/common/fmt.h" | ||
| #include "common/common/logger.h" | ||
| #include "common/protobuf/utility.h" | ||
| #include "common/thread/terminate_thread.h" | ||
|
|
||
| namespace Envoy { | ||
| namespace Watchdog { | ||
| namespace { | ||
| constexpr uint64_t DefaultWaitDurationMs = 5000; | ||
| } // end namespace | ||
|
|
||
| // Builds the action from its proto config. The wait duration is read from | ||
| // config.wait_duration in milliseconds, with DefaultWaitDurationMs supplied as | ||
| // the fallback value. The factory context argument is not used. | ||
| AbortAction::AbortAction(envoy::watchdog::v3alpha::AbortActionConfig& config, | ||
| Server::Configuration::GuardDogActionFactoryContext& /*context*/) | ||
| : wait_duration_(absl::Milliseconds( | ||
| PROTOBUF_GET_MS_OR_DEFAULT(config, wait_duration, DefaultWaitDurationMs))) {} | ||
|
|
||
| void AbortAction::run( | ||
| envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent /*event*/, | ||
| const std::vector<std::pair<Thread::ThreadId, MonotonicTime>>& thread_last_checkin_pairs, | ||
| MonotonicTime /*now*/) { | ||
| // Without at least one reported thread there is nothing to signal: warn and | ||
| // leave without aborting. | ||
| if (thread_last_checkin_pairs.empty()) { | ||
| ENVOY_LOG_MISC(warn, "Watchdog AbortAction called without any thread."); | ||
| return; | ||
| } | ||
|
|
||
||
| // Code coverage tools will not count anything below as covered, because it | ||
| // only executes inside DEATH tests. | ||
| // Only the first reported thread is targeted. | ||
| const Thread::ThreadId& stuck_thread = thread_last_checkin_pairs.front().first; | ||
| const std::string tid_text = stuck_thread.debugString(); | ||
| ENVOY_LOG_MISC(error, "Watchdog AbortAction terminating thread with tid {}.", tid_text); | ||
|
|
||
||
| const bool signal_sent = Thread::terminateThread(stuck_thread); | ||
| if (!signal_sent) { | ||
| ENVOY_LOG_MISC(error, "Failed to terminate tid {}", tid_text); | ||
| } else { | ||
| // The signal was sent successfully; block for wait_duration_ so the | ||
| // signaled thread has time to bring the process down. | ||
| absl::SleepFor(wait_duration_); | ||
| } | ||
|
|
||
||
| // Still running, so the signaled thread has not crashed the process yet. | ||
| // Panicking from the action itself means no external code is needed to | ||
| // kill the process when the signal fails. | ||
| PANIC(fmt::format( | ||
| "Failed to terminate thread with id {}, aborting from Watchdog AbortAction instead.", | ||
| tid_text)); | ||
| } | ||
|
|
||
| } // namespace Watchdog | ||
| } // namespace Envoy |
Uh oh!
There was an error while loading. Please reload this page.