diff --git a/CODEOWNERS b/CODEOWNERS index b3c9af89a7a8f..9696b370c4c75 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -142,7 +142,6 @@ extensions/filters/common/original_src @snowp @klarose /*/extensions/filters/http/decompressor @rojkov @dio # Watchdog Extensions /*/extensions/watchdog/profile_action @kbaichoo @antoniovicente -/*/extensions/watchdog/abort_action @kbaichoo @antoniovicente # Core upstream code extensions/upstreams/http @alyssawilk @snowp @mattklein123 extensions/upstreams/http/http @alyssawilk @snowp @mattklein123 diff --git a/api/BUILD b/api/BUILD index ed8743b793e39..345732128a0d2 100644 --- a/api/BUILD +++ b/api/BUILD @@ -248,7 +248,6 @@ proto_library( "//envoy/extensions/upstreams/http/http/v3:pkg", "//envoy/extensions/upstreams/http/tcp/v3:pkg", "//envoy/extensions/wasm/v3:pkg", - "//envoy/extensions/watchdog/abort_action/v3alpha:pkg", "//envoy/extensions/watchdog/profile_action/v3alpha:pkg", "//envoy/service/accesslog/v3:pkg", "//envoy/service/auth/v3:pkg", @@ -272,6 +271,7 @@ proto_library( "//envoy/type/metadata/v3:pkg", "//envoy/type/tracing/v3:pkg", "//envoy/type/v3:pkg", + "//envoy/watchdog/v3alpha:pkg", ], ) diff --git a/api/envoy/extensions/watchdog/abort_action/v3alpha/abort_action.proto b/api/envoy/extensions/watchdog/abort_action/v3alpha/abort_action.proto deleted file mode 100644 index 7d793be82012b..0000000000000 --- a/api/envoy/extensions/watchdog/abort_action/v3alpha/abort_action.proto +++ /dev/null @@ -1,32 +0,0 @@ -syntax = "proto3"; - -package envoy.extensions.watchdog.abort_action.v3alpha; - -import "google/protobuf/duration.proto"; - -import "udpa/annotations/status.proto"; -import "udpa/annotations/versioning.proto"; -import "validate/validate.proto"; - -option java_package = "io.envoyproxy.envoy.extensions.watchdog.abort_action.v3alpha"; -option java_outer_classname = "AbortActionProto"; -option java_multiple_files = true; -option (udpa.annotations.file_status).work_in_progress = true; -option (udpa.annotations.file_status).package_version_status = ACTIVE; - -// [#protodoc-title: Watchdog Action that sends a SIGABRT to kill the process.] -// [#extension: envoy.watchdog.abort_action] - -// A GuardDogAction that will terminate the process by sending SIGABRT to the -// stuck thread. This would allow easier access to the call stack of the stuck -// thread since we would run signal handlers on that thread. This would be -// more useful than the default watchdog kill behaviors since those PANIC -// from the watchdog's thread. - -// This is currently only implemented for systems that support kill to send -// signals. -message AbortActionConfig { - // How long to wait for the thread to respond to the SIGABRT before killing the - // process from this action. This is a blocking action. - google.protobuf.Duration wait_duration = 1; -} diff --git a/api/envoy/extensions/watchdog/abort_action/v3alpha/BUILD b/api/envoy/watchdog/v3alpha/BUILD similarity index 100% rename from api/envoy/extensions/watchdog/abort_action/v3alpha/BUILD rename to api/envoy/watchdog/v3alpha/BUILD diff --git a/api/envoy/watchdog/v3alpha/README.md b/api/envoy/watchdog/v3alpha/README.md new file mode 100644 index 0000000000000..c8433b9c05b52 --- /dev/null +++ b/api/envoy/watchdog/v3alpha/README.md @@ -0,0 +1,2 @@ +This contains watchdog actions that are part of core Envoy, and therefore cannot +be in the extensions directory. diff --git a/api/envoy/watchdog/v3alpha/abort_action.proto b/api/envoy/watchdog/v3alpha/abort_action.proto new file mode 100644 index 0000000000000..3f47fddaa77ef --- /dev/null +++ b/api/envoy/watchdog/v3alpha/abort_action.proto @@ -0,0 +1,29 @@ +syntax = "proto3"; + +package envoy.watchdog.v3alpha; + +import "google/protobuf/duration.proto"; + +import "udpa/annotations/status.proto"; +import "udpa/annotations/versioning.proto"; +import "validate/validate.proto"; + +option java_package = "io.envoyproxy.envoy.watchdog.v3alpha"; +option java_outer_classname = "AbortActionProto"; +option java_multiple_files = true; +option (udpa.annotations.file_status).work_in_progress = true; +option (udpa.annotations.file_status).package_version_status = ACTIVE; + +// [#protodoc-title: Watchdog Action that kills a stuck thread to kill the process.] + +// A GuardDogAction that will terminate the process by killing the +// stuck thread. This would allow easier access to the call stack of the stuck +// thread since we would run signal handlers on that thread. By default +// this will be registered to run as the last watchdog action on KILL and +// MULTIKILL events if those are enabled. +message AbortActionConfig { + // How long to wait for the thread to respond to the thread kill function + // before killing the process from this action. This is a blocking action. + // By default this is 5 seconds. + google.protobuf.Duration wait_duration = 1; +} diff --git a/api/versioning/BUILD b/api/versioning/BUILD index d44a54640ca4a..2e0a1cd4997d7 100644 --- a/api/versioning/BUILD +++ b/api/versioning/BUILD @@ -131,7 +131,6 @@ proto_library( "//envoy/extensions/upstreams/http/http/v3:pkg", "//envoy/extensions/upstreams/http/tcp/v3:pkg", "//envoy/extensions/wasm/v3:pkg", - "//envoy/extensions/watchdog/abort_action/v3alpha:pkg", "//envoy/extensions/watchdog/profile_action/v3alpha:pkg", "//envoy/service/accesslog/v3:pkg", "//envoy/service/auth/v3:pkg", @@ -155,6 +154,7 @@ proto_library( "//envoy/type/metadata/v3:pkg", "//envoy/type/tracing/v3:pkg", "//envoy/type/v3:pkg", + "//envoy/watchdog/v3alpha:pkg", ], ) diff --git a/bazel/repositories.bzl b/bazel/repositories.bzl index 1ba42eaa6ae29..5047a52141f0d 100644 --- a/bazel/repositories.bzl +++ b/bazel/repositories.bzl @@ -13,7 +13,6 @@ WINDOWS_SKIP_TARGETS = [ "envoy.tracers.lightstep", "envoy.tracers.datadog", "envoy.tracers.opencensus", - "envoy.watchdog.abort_action", ] # Make all contents of an external repository accessible under a filegroup. Used for external HTTP diff --git a/docs/root/api-v3/config/watchdog/watchdog.rst b/docs/root/api-v3/config/watchdog/watchdog.rst index f3fd56c327ec1..f5906b3390d35 100644 --- a/docs/root/api-v3/config/watchdog/watchdog.rst +++ b/docs/root/api-v3/config/watchdog/watchdog.rst @@ -6,4 +6,4 @@ Watchdog :maxdepth: 2 ../../extensions/watchdog/profile_action/v3alpha/* - ../../extensions/watchdog/abort_action/v3alpha/* + ../../watchdog/v3alpha/* diff --git a/docs/root/version_history/current.rst b/docs/root/version_history/current.rst index 74060d4e5c5fb..892f2376f3638 100644 --- a/docs/root/version_history/current.rst +++ b/docs/root/version_history/current.rst @@ -11,6 +11,7 @@ Minor Behavior Changes * build: the Alpine based debug images are no longer built in CI, use Ubuntu based images instead. * ext_authz filter: the deprecated field :ref:`use_alpha ` is no longer supported and cannot be set anymore. +* watchdog: the watchdog action :ref:`abort_action ` is now the default action to terminate the process if watchdog kill / multikill is enabled. Bug Fixes --------- diff --git a/docs/root/version_history/v1.16.0.rst b/docs/root/version_history/v1.16.0.rst index 9e6dff0c83e44..43e123b6359e9 100644 --- a/docs/root/version_history/v1.16.0.rst +++ b/docs/root/version_history/v1.16.0.rst @@ -159,7 +159,7 @@ New Features * watchdog: support randomizing the watchdog's kill timeout to prevent synchronized kills via a maximium jitter parameter :ref:`max_kill_timeout_jitter`. * watchdog: supports an extension point where actions can be registered to fire on watchdog events such as miss, megamiss, kill and multikill. See :ref:`watchdog actions`. * watchdog: watchdog action extension that does cpu profiling. See :ref:`Profile Action `. -* watchdog: watchdog action extension that sends SIGABRT to the stuck thread to terminate the process. See :ref:`Abort Action `. +* watchdog: watchdog action extension that sends SIGABRT to the stuck thread to terminate the process. See :ref:`Abort Action `. * xds: added :ref:`extension config discovery` support for HTTP filters. * xds: added support for mixed v2/v3 discovery response, which enable type url downgrade and upgrade. This feature is disabled by default and is controlled by runtime guard `envoy.reloadable_features.enable_type_url_downgrade_and_upgrade`. * zlib: added option to use `zlib-ng `_ as zlib library. diff --git a/generated_api_shadow/envoy/extensions/watchdog/abort_action/v3alpha/abort_action.proto b/generated_api_shadow/envoy/extensions/watchdog/abort_action/v3alpha/abort_action.proto deleted file mode 100644 index 7d793be82012b..0000000000000 --- a/generated_api_shadow/envoy/extensions/watchdog/abort_action/v3alpha/abort_action.proto +++ /dev/null @@ -1,32 +0,0 @@ -syntax = "proto3"; - -package envoy.extensions.watchdog.abort_action.v3alpha; - -import "google/protobuf/duration.proto"; - -import "udpa/annotations/status.proto"; -import "udpa/annotations/versioning.proto"; -import "validate/validate.proto"; - -option java_package = "io.envoyproxy.envoy.extensions.watchdog.abort_action.v3alpha"; -option java_outer_classname = "AbortActionProto"; -option java_multiple_files = true; -option (udpa.annotations.file_status).work_in_progress = true; -option (udpa.annotations.file_status).package_version_status = ACTIVE; - -// [#protodoc-title: Watchdog Action that sends a SIGABRT to kill the process.] -// [#extension: envoy.watchdog.abort_action] - -// A GuardDogAction that will terminate the process by sending SIGABRT to the -// stuck thread. This would allow easier access to the call stack of the stuck -// thread since we would run signal handlers on that thread. This would be -// more useful than the default watchdog kill behaviors since those PANIC -// from the watchdog's thread. - -// This is currently only implemented for systems that support kill to send -// signals. -message AbortActionConfig { - // How long to wait for the thread to respond to the SIGABRT before killing the - // process from this action. This is a blocking action. - google.protobuf.Duration wait_duration = 1; -} diff --git a/generated_api_shadow/envoy/extensions/watchdog/abort_action/v3alpha/BUILD b/generated_api_shadow/envoy/watchdog/v3alpha/BUILD similarity index 100% rename from generated_api_shadow/envoy/extensions/watchdog/abort_action/v3alpha/BUILD rename to generated_api_shadow/envoy/watchdog/v3alpha/BUILD diff --git a/generated_api_shadow/envoy/watchdog/v3alpha/abort_action.proto b/generated_api_shadow/envoy/watchdog/v3alpha/abort_action.proto new file mode 100644 index 0000000000000..3f47fddaa77ef --- /dev/null +++ b/generated_api_shadow/envoy/watchdog/v3alpha/abort_action.proto @@ -0,0 +1,29 @@ +syntax = "proto3"; + +package envoy.watchdog.v3alpha; + +import "google/protobuf/duration.proto"; + +import "udpa/annotations/status.proto"; +import "udpa/annotations/versioning.proto"; +import "validate/validate.proto"; + +option java_package = "io.envoyproxy.envoy.watchdog.v3alpha"; +option java_outer_classname = "AbortActionProto"; +option java_multiple_files = true; +option (udpa.annotations.file_status).work_in_progress = true; +option (udpa.annotations.file_status).package_version_status = ACTIVE; + +// [#protodoc-title: Watchdog Action that kills a stuck thread to kill the process.] + +// A GuardDogAction that will terminate the process by killing the +// stuck thread. This would allow easier access to the call stack of the stuck +// thread since we would run signal handlers on that thread. By default +// this will be registered to run as the last watchdog action on KILL and +// MULTIKILL events if those are enabled. +message AbortActionConfig { + // How long to wait for the thread to respond to the thread kill function + // before killing the process from this action. This is a blocking action. + // By default this is 5 seconds. + google.protobuf.Duration wait_duration = 1; +} diff --git a/source/common/thread/BUILD b/source/common/thread/BUILD new file mode 100644 index 0000000000000..6f79301e8676b --- /dev/null +++ b/source/common/thread/BUILD @@ -0,0 +1,19 @@ +load( + "//bazel:envoy_build_system.bzl", + "envoy_cc_library", + "envoy_package", +) + +licenses(["notice"]) # Apache 2 + +envoy_package() + +envoy_cc_library( + name = "terminate_thread_lib", + srcs = ["terminate_thread.cc"], + hdrs = ["terminate_thread.h"], + deps = [ + "//include/envoy/thread:thread_interface", + "//source/common/common:minimal_logger_lib", + ], +) diff --git a/source/common/thread/terminate_thread.cc b/source/common/thread/terminate_thread.cc new file mode 100644 index 0000000000000..435d704e6d249 --- /dev/null +++ b/source/common/thread/terminate_thread.cc @@ -0,0 +1,31 @@ +#include "common/thread/terminate_thread.h" + +#include + +#include + +#include "common/common/logger.h" + +namespace Envoy { +namespace Thread { +namespace { +#ifdef __linux__ +pid_t toPlatformTid(int64_t tid) { return static_cast(tid); } +#elif defined(__APPLE__) +uint64_t toPlatformTid(int64_t tid) { return static_cast(tid); } +#endif +} // namespace + +bool terminateThread(const ThreadId& tid) { +#ifndef WIN32 + // Assume POSIX-compatible system and signal to the thread. + return kill(toPlatformTid(tid.getId()), SIGABRT) == 0; +#else + // Windows, currently unsupported termination of thread. + ENVOY_LOG_MISC(error, "Windows is currently unsupported for terminateThread."); + return false; +#endif +} + +} // namespace Thread +} // namespace Envoy diff --git a/source/common/thread/terminate_thread.h b/source/common/thread/terminate_thread.h new file mode 100644 index 0000000000000..a9a20b1903cfa --- /dev/null +++ b/source/common/thread/terminate_thread.h @@ -0,0 +1,19 @@ +#pragma once + +#include "envoy/thread/thread.h" + +namespace Envoy { +namespace Thread { +/** + * Tries to terminates the process by killing the thread specified by + * the ThreadId. The implementation is platform dependent and currently + * only works on platforms that support SIGABRT. + * + * Returns true if the platform specific function to terminate the thread + * succeeded (i.e. kill() == 0). If the platform is currently unsupported, this + * will return false. + */ +bool terminateThread(const ThreadId& tid); + +} // namespace Thread +} // namespace Envoy diff --git a/source/extensions/watchdog/abort_action/BUILD b/source/common/watchdog/BUILD similarity index 63% rename from source/extensions/watchdog/abort_action/BUILD rename to source/common/watchdog/BUILD index 67305bc86e33e..db6b6162ba75d 100644 --- a/source/extensions/watchdog/abort_action/BUILD +++ b/source/common/watchdog/BUILD @@ -1,13 +1,12 @@ load( "//bazel:envoy_build_system.bzl", - "envoy_cc_extension", "envoy_cc_library", - "envoy_extension_package", + "envoy_package", ) licenses(["notice"]) # Apache 2 -envoy_extension_package() +envoy_package() envoy_cc_library( name = "abort_action_lib", @@ -19,22 +18,21 @@ envoy_cc_library( "//include/envoy/thread:thread_interface", "//source/common/common:assert_lib", "//source/common/protobuf:utility_lib", - "@envoy_api//envoy/extensions/watchdog/abort_action/v3alpha:pkg_cc_proto", + "//source/common/thread:terminate_thread_lib", + "@envoy_api//envoy/watchdog/v3alpha:pkg_cc_proto", ], ) -envoy_cc_extension( - name = "config", - srcs = ["config.cc"], - hdrs = ["config.h"], - security_posture = "data_plane_agnostic", - status = "alpha", +envoy_cc_library( + name = "abort_action_config", + srcs = ["abort_action_config.cc"], + hdrs = ["abort_action_config.h"], deps = [ ":abort_action_lib", "//include/envoy/registry", "//source/common/config:utility_lib", "//source/common/protobuf", "//source/common/protobuf:message_validator_lib", - "@envoy_api//envoy/extensions/watchdog/abort_action/v3alpha:pkg_cc_proto", + "@envoy_api//envoy/watchdog/v3alpha:pkg_cc_proto", ], ) diff --git a/source/common/watchdog/README.md b/source/common/watchdog/README.md new file mode 100644 index 0000000000000..c8433b9c05b52 --- /dev/null +++ b/source/common/watchdog/README.md @@ -0,0 +1,2 @@ +This contains watchdog actions that are part of core Envoy, and therefore cannot +be in the extensions directory. diff --git a/source/common/watchdog/abort_action.cc b/source/common/watchdog/abort_action.cc new file mode 100644 index 0000000000000..d629b0bce70ea --- /dev/null +++ b/source/common/watchdog/abort_action.cc @@ -0,0 +1,54 @@ +#include "common/watchdog/abort_action.h" + +#include "envoy/thread/thread.h" + +#include "common/common/assert.h" +#include "common/common/fmt.h" +#include "common/common/logger.h" +#include "common/protobuf/utility.h" +#include "common/thread/terminate_thread.h" + +namespace Envoy { +namespace Watchdog { +namespace { +constexpr uint64_t DefaultWaitDurationMs = 5000; +} // end namespace + +AbortAction::AbortAction(envoy::watchdog::v3alpha::AbortActionConfig& config, + Server::Configuration::GuardDogActionFactoryContext& /*context*/) + : wait_duration_(absl::Milliseconds( + PROTOBUF_GET_MS_OR_DEFAULT(config, wait_duration, DefaultWaitDurationMs))) {} + +void AbortAction::run( + envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent /*event*/, + const std::vector>& thread_last_checkin_pairs, + MonotonicTime /*now*/) { + + if (thread_last_checkin_pairs.empty()) { + ENVOY_LOG_MISC(warn, "Watchdog AbortAction called without any thread."); + return; + } + + // The following lines of code won't be considered covered by code coverage + // tools since they would run in DEATH tests. + const auto& thread_id = thread_last_checkin_pairs[0].first; + const std::string tid_string = thread_id.debugString(); + ENVOY_LOG_MISC(error, "Watchdog AbortAction terminating thread with tid {}.", tid_string); + + if (Thread::terminateThread(thread_id)) { + // Successfully signaled to thread to terminate, sleep for wait_duration. + absl::SleepFor(wait_duration_); + } else { + ENVOY_LOG_MISC(error, "Failed to terminate tid {}", tid_string); + } + + // Abort from the action since the signaled thread hasn't yet crashed the process. + // Panicing in the action gives flexibility since it doesn't depend on + // external code to kill the process if the signal fails. + PANIC(fmt::format( + "Failed to terminate thread with id {}, aborting from Watchdog AbortAction instead.", + tid_string)); +} + +} // namespace Watchdog +} // namespace Envoy diff --git a/source/extensions/watchdog/abort_action/abort_action.h b/source/common/watchdog/abort_action.h similarity index 53% rename from source/extensions/watchdog/abort_action/abort_action.h rename to source/common/watchdog/abort_action.h index 90b64393080bf..5170c8bbea000 100644 --- a/source/extensions/watchdog/abort_action/abort_action.h +++ b/source/common/watchdog/abort_action.h @@ -1,24 +1,18 @@ #pragma once -#include - -#include "envoy/extensions/watchdog/abort_action/v3alpha/abort_action.pb.h" #include "envoy/server/guarddog_config.h" #include "envoy/thread/thread.h" +#include "envoy/watchdog/v3alpha/abort_action.pb.h" namespace Envoy { -namespace Extensions { namespace Watchdog { -namespace AbortAction { - /** - * A GuardDogAction that will terminate the process by sending SIGABRT to the - * stuck thread. This is currently only implemented for systems that - * support kill to send signals. + * A GuardDogAction that will terminate the process by killing the + * stuck thread. */ class AbortAction : public Server::Configuration::GuardDogAction { public: - AbortAction(envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig& config, + AbortAction(envoy::watchdog::v3alpha::AbortActionConfig& config, Server::Configuration::GuardDogActionFactoryContext& context); void run(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent event, @@ -26,12 +20,10 @@ class AbortAction : public Server::Configuration::GuardDogAction { MonotonicTime now) override; private: - const envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig config_; + const absl::Duration wait_duration_; }; using AbortActionPtr = std::unique_ptr; -} // namespace AbortAction } // namespace Watchdog -} // namespace Extensions } // namespace Envoy diff --git a/source/extensions/watchdog/abort_action/config.cc b/source/common/watchdog/abort_action_config.cc similarity index 59% rename from source/extensions/watchdog/abort_action/config.cc rename to source/common/watchdog/abort_action_config.cc index f59c62c94cf47..916864386ace2 100644 --- a/source/extensions/watchdog/abort_action/config.cc +++ b/source/common/watchdog/abort_action_config.cc @@ -1,32 +1,27 @@ -#include "extensions/watchdog/abort_action/config.h" +#include "common/watchdog/abort_action_config.h" #include "envoy/registry/registry.h" #include "common/config/utility.h" #include "common/protobuf/message_validator_impl.h" - -#include "extensions/watchdog/abort_action/abort_action.h" +#include "common/watchdog/abort_action.h" namespace Envoy { -namespace Extensions { namespace Watchdog { -namespace AbortAction { Server::Configuration::GuardDogActionPtr AbortActionFactory::createGuardDogActionFromProto( const envoy::config::bootstrap::v3::Watchdog::WatchdogAction& config, Server::Configuration::GuardDogActionFactoryContext& context) { - auto message = createEmptyConfigProto(); + AbortActionConfig message; Config::Utility::translateOpaqueConfig(config.config().typed_config(), ProtobufWkt::Struct(), - ProtobufMessage::getStrictValidationVisitor(), *message); - return std::make_unique(dynamic_cast(*message), context); + ProtobufMessage::getStrictValidationVisitor(), message); + return std::make_unique(message, context); } /** - * Static registration for the fixed heap resource monitor factory. @see RegistryFactory. + * Static registration for the Abort Action factory. @see RegisterFactory. */ REGISTER_FACTORY(AbortActionFactory, Server::Configuration::GuardDogActionFactory); -} // namespace AbortAction } // namespace Watchdog -} // namespace Extensions } // namespace Envoy diff --git a/source/extensions/watchdog/abort_action/config.h b/source/common/watchdog/abort_action_config.h similarity index 72% rename from source/extensions/watchdog/abort_action/config.h rename to source/common/watchdog/abort_action_config.h index d9f11c562b712..27f65ea16b609 100644 --- a/source/extensions/watchdog/abort_action/config.h +++ b/source/common/watchdog/abort_action_config.h @@ -1,14 +1,12 @@ #pragma once -#include "envoy/extensions/watchdog/abort_action/v3alpha/abort_action.pb.h" #include "envoy/server/guarddog_config.h" +#include "envoy/watchdog/v3alpha/abort_action.pb.h" #include "common/protobuf/protobuf.h" namespace Envoy { -namespace Extensions { namespace Watchdog { -namespace AbortAction { class AbortActionFactory : public Server::Configuration::GuardDogActionFactory { public: @@ -24,10 +22,8 @@ class AbortActionFactory : public Server::Configuration::GuardDogActionFactory { std::string name() const override { return "envoy.watchdog.abort_action"; } - using AbortActionConfig = envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig; + using AbortActionConfig = envoy::watchdog::v3alpha::AbortActionConfig; }; -} // namespace AbortAction } // namespace Watchdog -} // namespace Extensions } // namespace Envoy diff --git a/source/extensions/extensions_build_config.bzl b/source/extensions/extensions_build_config.bzl index ddc3dc9a0d508..e3ec724d93399 100644 --- a/source/extensions/extensions_build_config.bzl +++ b/source/extensions/extensions_build_config.bzl @@ -214,7 +214,6 @@ EXTENSIONS = { # Watchdog actions # "envoy.watchdog.profile_action": "//source/extensions/watchdog/profile_action:config", - "envoy.watchdog.abort_action": "//source/extensions/watchdog/abort_action:config", } diff --git a/source/extensions/watchdog/abort_action/abort_action.cc b/source/extensions/watchdog/abort_action/abort_action.cc deleted file mode 100644 index 3a4c3ade615e8..0000000000000 --- a/source/extensions/watchdog/abort_action/abort_action.cc +++ /dev/null @@ -1,67 +0,0 @@ -#include "extensions/watchdog/abort_action/abort_action.h" - -#include - -#include - -#include "envoy/thread/thread.h" - -#include "common/common/assert.h" -#include "common/common/fmt.h" -#include "common/common/logger.h" -#include "common/protobuf/utility.h" - -namespace Envoy { -namespace Extensions { -namespace Watchdog { -namespace AbortAction { -namespace { -#ifdef __linux__ -pid_t toPlatformTid(int64_t tid) { return static_cast(tid); } -#elif defined(__APPLE__) -uint64_t toPlatformTid(int64_t tid) { return static_cast(tid); } -#endif -} // namespace - -AbortAction::AbortAction( - envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig& config, - Server::Configuration::GuardDogActionFactoryContext& /*context*/) - : config_(config){}; - -void AbortAction::run( - envoy::config::bootstrap::v3::Watchdog::WatchdogAction::WatchdogEvent /*event*/, - const std::vector>& thread_last_checkin_pairs, - MonotonicTime /*now*/) { - - if (thread_last_checkin_pairs.empty()) { - ENVOY_LOG_MISC(warn, "Watchdog AbortAction called without any thread."); - return; - } - - // The following lines of code won't be considered covered by code coverage - // tools since they would run in DEATH tests. - int64_t raw_tid = thread_last_checkin_pairs[0].first.getId(); - - // Assume POSIX-compatible system and signal to the thread. - ENVOY_LOG_MISC(error, "Watchdog AbortAction sending abort signal to thread with tid {}.", - raw_tid); - - if (kill(toPlatformTid(raw_tid), SIGABRT) == 0) { - // Successfully sent signal, sleep for wait_duration. - absl::SleepFor(absl::Milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(config_, wait_duration, 0))); - } else { - // Failed to send the signal, abort? - ENVOY_LOG_MISC(error, "Failed to send signal to tid {}", raw_tid); - } - - // Abort from the action since the signaled thread hasn't yet crashed the process. - // panicing in the action gives flexibility since it doesn't depend on - // external code to kill the process if the signal fails. - PANIC(fmt::format("Failed to kill thread with id {}, aborting from Watchdog AbortAction instead.", - raw_tid)); -} - -} // namespace AbortAction -} // namespace Watchdog -} // namespace Extensions -} // namespace Envoy diff --git a/source/server/BUILD b/source/server/BUILD index 6c587d9d95d7b..852cc2bda6045 100644 --- a/source/server/BUILD +++ b/source/server/BUILD @@ -127,7 +127,9 @@ envoy_cc_library( "//source/common/event:libevent_lib", "//source/common/protobuf:utility_lib", "//source/common/stats:symbol_table_lib", + "//source/common/watchdog:abort_action_config", "@envoy_api//envoy/config/bootstrap/v3:pkg_cc_proto", + "@envoy_api//envoy/watchdog/v3alpha:pkg_cc_proto", ], ) diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index 7c512d20310e4..64b04228c1b80 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -13,6 +13,7 @@ #include "envoy/server/guarddog.h" #include "envoy/server/guarddog_config.h" #include "envoy/stats/scope.h" +#include "envoy/watchdog/v3alpha/abort_action.pb.h" #include "common/common/assert.h" #include "common/common/fmt.h" @@ -64,7 +65,23 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio Configuration::GuardDogActionFactoryContext context = {api, *dispatcher_, stats_scope, name}; - const auto& actions = config.actions(); + auto actions = config.actions(); + + // Add default abort_action if kill and/or multi-kill is enabled. + if (config.killTimeout().count() > 0) { + envoy::watchdog::v3alpha::AbortActionConfig abort_config; + WatchDogAction* abort_action_config = actions.Add(); + abort_action_config->set_event(WatchDogAction::KILL); + abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config); + } + + if (config.multiKillTimeout().count() > 0) { + envoy::watchdog::v3alpha::AbortActionConfig abort_config; + WatchDogAction* abort_action_config = actions.Add(); + abort_action_config->set_event(WatchDogAction::MULTIKILL); + abort_action_config->mutable_config()->mutable_typed_config()->PackFrom(abort_config); + } + for (const auto& action : actions) { // Get factory and add the created cb auto& factory = Config::Utility::getAndCheckFactory( @@ -133,19 +150,12 @@ void GuardDogImpl::step() { } if (killEnabled() && delta > kill_timeout_) { invokeGuardDogActions(WatchDogAction::KILL, {{tid, last_checkin}}, now); - - PANIC(fmt::format("GuardDog: one thread ({}) stuck for more than watchdog_kill_timeout", - watched_dog->dog_->threadId().debugString())); } if (multikillEnabled() && delta > multi_kill_timeout_) { multi_kill_threads.emplace_back(tid, last_checkin); if (multi_kill_threads.size() >= required_for_multi_kill) { invokeGuardDogActions(WatchDogAction::MULTIKILL, multi_kill_threads, now); - - PANIC(fmt::format("GuardDog: At least {} threads ({},...) stuck for more than " - "watchdog_multikill_timeout", - multi_kill_threads.size(), tid.debugString())); } } } diff --git a/test/extensions/watchdog/abort_action/BUILD b/test/common/watchdog/BUILD similarity index 51% rename from test/extensions/watchdog/abort_action/BUILD rename to test/common/watchdog/BUILD index b79b1205e86e3..85edbe118222e 100644 --- a/test/extensions/watchdog/abort_action/BUILD +++ b/test/common/watchdog/BUILD @@ -1,50 +1,43 @@ load( "//bazel:envoy_build_system.bzl", + "envoy_cc_test", "envoy_package", ) -load( - "//test/extensions:extensions_build_system.bzl", - "envoy_extension_cc_test", -) licenses(["notice"]) # Apache 2 envoy_package() -envoy_extension_cc_test( +envoy_cc_test( name = "abort_action_test", srcs = ["abort_action_test.cc"], - extension_name = "envoy.watchdog.abort_action", external_deps = [ "abseil_synchronization", ], - tags = ["skip_on_windows"], deps = [ "//include/envoy/common:time_interface", "//include/envoy/registry", "//include/envoy/server:guarddog_config_interface", - "//source/extensions/watchdog/abort_action:abort_action_lib", - "//source/extensions/watchdog/abort_action:config", + "//source/common/watchdog:abort_action_config", + "//source/common/watchdog:abort_action_lib", "//test/common/stats:stat_test_utility_lib", "//test/test_common:utility_lib", "@envoy_api//envoy/config/bootstrap/v3:pkg_cc_proto", - "@envoy_api//envoy/extensions/watchdog/abort_action/v3alpha:pkg_cc_proto", + "@envoy_api//envoy/watchdog/v3alpha:pkg_cc_proto", ], ) -envoy_extension_cc_test( - name = "config_test", - srcs = ["config_test.cc"], - extension_name = "envoy.watchdog.abort_action", - tags = ["skip_on_windows"], +envoy_cc_test( + name = "abort_action_config_test", + srcs = ["abort_action_config_test.cc"], deps = [ "//include/envoy/registry", "//include/envoy/server:guarddog_config_interface", - "//source/extensions/watchdog/abort_action:abort_action_lib", - "//source/extensions/watchdog/abort_action:config", + "//source/common/watchdog:abort_action_config", + "//source/common/watchdog:abort_action_lib", "//test/common/stats:stat_test_utility_lib", "//test/mocks/event:event_mocks", "//test/test_common:utility_lib", - "@envoy_api//envoy/extensions/watchdog/abort_action/v3alpha:pkg_cc_proto", + "@envoy_api//envoy/watchdog/v3alpha:pkg_cc_proto", ], ) diff --git a/test/extensions/watchdog/abort_action/config_test.cc b/test/common/watchdog/abort_action_config_test.cc similarity index 79% rename from test/extensions/watchdog/abort_action/config_test.cc rename to test/common/watchdog/abort_action_config_test.cc index 7e2b716ac8f33..6c3f729fe1a4b 100644 --- a/test/extensions/watchdog/abort_action/config_test.cc +++ b/test/common/watchdog/abort_action_config_test.cc @@ -1,8 +1,8 @@ -#include "envoy/extensions/watchdog/abort_action/v3alpha/abort_action.pb.h" #include "envoy/registry/registry.h" #include "envoy/server/guarddog_config.h" +#include "envoy/watchdog/v3alpha/abort_action.pb.h" -#include "extensions/watchdog/abort_action/config.h" +#include "common/watchdog/abort_action_config.h" #include "test/common/stats/stat_test_utility.h" #include "test/mocks/event/mocks.h" @@ -11,9 +11,7 @@ #include "gtest/gtest.h" namespace Envoy { -namespace Extensions { namespace Watchdog { -namespace AbortAction { namespace { TEST(AbortActionFactoryTest, CanCreateAction) { @@ -31,7 +29,7 @@ TEST(AbortActionFactoryTest, CanCreateAction) { "name": "envoy.watchdog.abort_action", "typed_config": { "@type": "type.googleapis.com/udpa.type.v1.TypedStruct", - "type_url": "type.googleapis.com/envoy.extensions.watchdog.abort_action.v3alpha.AbortActionConfig", + "type_url": "type.googleapis.com/envoy.watchdog.abort_action.v3alpha.AbortActionConfig", "value": { "wait_duration": "2s", } @@ -50,7 +48,5 @@ TEST(AbortActionFactoryTest, CanCreateAction) { } } // namespace -} // namespace AbortAction } // namespace Watchdog -} // namespace Extensions } // namespace Envoy diff --git a/test/extensions/watchdog/abort_action/abort_action_test.cc b/test/common/watchdog/abort_action_test.cc similarity index 90% rename from test/extensions/watchdog/abort_action/abort_action_test.cc rename to test/common/watchdog/abort_action_test.cc index e648fbe66c603..7f2f1bf4606e1 100644 --- a/test/extensions/watchdog/abort_action/abort_action_test.cc +++ b/test/common/watchdog/abort_action_test.cc @@ -4,12 +4,12 @@ #include "envoy/common/time.h" #include "envoy/config/bootstrap/v3/bootstrap.pb.h" #include "envoy/event/dispatcher.h" -#include "envoy/extensions/watchdog/abort_action/v3alpha/abort_action.pb.h" #include "envoy/server/guarddog_config.h" #include "envoy/thread/thread.h" +#include "envoy/watchdog/v3alpha/abort_action.pb.h" -#include "extensions/watchdog/abort_action/abort_action.h" -#include "extensions/watchdog/abort_action/config.h" +#include "common/watchdog/abort_action.h" +#include "common/watchdog/abort_action_config.h" #include "test/common/stats/stat_test_utility.h" #include "test/test_common/utility.h" @@ -18,12 +18,10 @@ #include "gtest/gtest.h" namespace Envoy { -namespace Extensions { namespace Watchdog { -namespace AbortAction { namespace { -using AbortActionConfig = envoy::extensions::watchdog::abort_action::v3alpha::AbortActionConfig; +using AbortActionConfig = envoy::watchdog::v3alpha::AbortActionConfig; class AbortActionTest : public testing::Test { protected: @@ -54,8 +52,7 @@ TEST_F(AbortActionTest, ShouldNotAbortIfNoTids) { action_->run(envoy::config::bootstrap::v3::Watchdog::WatchdogAction::KILL, tid_ltt_pairs, now); } -// insufficient signal support on Windows. -TEST_F(AbortActionTest, CanKillThread) { +TEST_F(AbortActionTest, ShouldKillTheProcess) { AbortActionConfig config; config.mutable_wait_duration()->set_seconds(1); action_ = std::make_unique(config, context_); @@ -83,6 +80,8 @@ TEST_F(AbortActionTest, CanKillThread) { EXPECT_DEATH(die_function(), ""); } +#ifndef WIN32 +// insufficient signal support on Windows. void handler(int sig, siginfo_t* /*siginfo*/, void* /*context*/) { std::cout << "Eating signal :" << std::to_string(sig) << ". will ignore it." << std::endl; signal(SIGABRT, SIG_IGN); @@ -122,9 +121,8 @@ TEST_F(AbortActionTest, PanicsIfThreadDoesNotDie) { EXPECT_DEATH(die_function(), "aborting from Watchdog AbortAction instead"); } +#endif } // namespace -} // namespace AbortAction } // namespace Watchdog -} // namespace Extensions } // namespace Envoy diff --git a/test/per_file_coverage.sh b/test/per_file_coverage.sh index 853724035d179..2ba764721609c 100755 --- a/test/per_file_coverage.sh +++ b/test/per_file_coverage.sh @@ -15,11 +15,13 @@ declare -a KNOWN_LOW_COVERAGE=( "source/common/json:90.6" "source/common/filesystem:96.1" "source/common/filesystem/posix:94.5" +"source/common/thread:0.0" # Death tests don't report LCOV "source/common/thread_local:95.7" "source/common/crypto:0.0" "source/common/common:96.1" "source/common/common/posix:94.1" "source/common/signal:90.4" +"source/common/watchdog:42.9" # Death tests don't report LCOV "source/exe:93.7" "source/extensions:96.3" "source/extensions/common:94.4" @@ -67,7 +69,6 @@ declare -a KNOWN_LOW_COVERAGE=( "source/extensions/transport_sockets/tls/private_key:76.9" "source/extensions/watchdog:69.6" # Death tests within extensions "source/extensions/watchdog/profile_action:84.9" -"source/extensions/watchdog/abort_action:42.9" # Death tests don't report LCOV "source/server:94.6" "source/server/config_validation:76.6" "source/server/admin:95.3" diff --git a/test/server/configuration_impl_test.cc b/test/server/configuration_impl_test.cc index fdb1a0ba2341e..ea6da93626b9a 100644 --- a/test/server/configuration_impl_test.cc +++ b/test/server/configuration_impl_test.cc @@ -754,8 +754,8 @@ TEST_F(ConfigurationImplTest, KillTimeoutWithoutSkew) { MainImpl config; config.initialize(bootstrap, server_, cluster_manager_factory_); - EXPECT_EQ(std::chrono::milliseconds(1000), config.workerWatchdogConfig().killTimeout()); - EXPECT_EQ(std::chrono::milliseconds(1000), config.mainThreadWatchdogConfig().killTimeout()); + EXPECT_EQ(config.workerWatchdogConfig().killTimeout(), std::chrono::milliseconds(1000)); + EXPECT_EQ(config.mainThreadWatchdogConfig().killTimeout(), std::chrono::milliseconds(1000)); } TEST_F(ConfigurationImplTest, CanSkewsKillTimeout) { @@ -793,8 +793,8 @@ TEST_F(ConfigurationImplTest, DoesNotSkewIfKillTimeoutDisabled) { MainImpl config; config.initialize(bootstrap, server_, cluster_manager_factory_); - EXPECT_EQ(std::chrono::milliseconds(0), config.mainThreadWatchdogConfig().killTimeout()); - EXPECT_EQ(std::chrono::milliseconds(0), config.workerWatchdogConfig().killTimeout()); + EXPECT_EQ(config.mainThreadWatchdogConfig().killTimeout(), std::chrono::milliseconds(0)); + EXPECT_EQ(config.workerWatchdogConfig().killTimeout(), std::chrono::milliseconds(0)); } TEST_F(ConfigurationImplTest, ShouldErrorIfBothWatchdogsAndWatchdogSet) { diff --git a/test/server/test_data/server/watchdogs_bootstrap_with_deprecated_field.yaml b/test/server/test_data/server/watchdogs_bootstrap_with_deprecated_field.yaml deleted file mode 100644 index f6e28cfcde91f..0000000000000 --- a/test/server/test_data/server/watchdogs_bootstrap_with_deprecated_field.yaml +++ /dev/null @@ -1,7 +0,0 @@ -watchdogs: - main_thread_watchdog: - miss_timeout: 1s - worker_watchdog: - miss_timeout: 0.5s -watchdog: - miss_timeout: 1s