diff --git a/api/envoy/config/core/v3/health_check.proto b/api/envoy/config/core/v3/health_check.proto index c5441c30fd3bc..ccd4739698468 100644 --- a/api/envoy/config/core/v3/health_check.proto +++ b/api/envoy/config/core/v3/health_check.proto @@ -54,7 +54,7 @@ enum HealthStatus { DEGRADED = 5; } -// [#next-free-field: 24] +// [#next-free-field: 25] message HealthCheck { option (udpa.annotations.versioning).previous_message_type = "envoy.api.v2.core.HealthCheck"; @@ -284,6 +284,21 @@ message HealthCheck { // The default value for "no traffic interval" is 60 seconds. google.protobuf.Duration no_traffic_interval = 12 [(validate.rules).duration = {gt {}}]; + // The "no traffic healthy interval" is a special health check interval that + // is used for hosts that are currently passing active health checking + // (including new hosts) when the cluster has received no traffic. + // + // This is useful for when we want to send frequent health checks with + // `no_traffic_interval` but then revert to lower frequency `no_traffic_healthy_interval` once + // a host in the cluster is marked as healthy. + // + // Once a cluster has been used for traffic routing, Envoy will shift back to using the + // standard health check interval that is defined. + // + // If no_traffic_healthy_interval is not set, it will default to the + // no traffic interval, which is then used regardless of health state. + google.protobuf.Duration no_traffic_healthy_interval = 24 [(validate.rules).duration = {gt {}}]; + // The "unhealthy interval" is a health check interval that is used for hosts that are marked as // unhealthy. As soon as the host is marked as healthy, Envoy will shift back to using the // standard health check interval that is defined. 
diff --git a/api/envoy/config/core/v4alpha/health_check.proto b/api/envoy/config/core/v4alpha/health_check.proto index 8d54d81af00bb..2761b856a3d7e 100644 --- a/api/envoy/config/core/v4alpha/health_check.proto +++ b/api/envoy/config/core/v4alpha/health_check.proto @@ -54,7 +54,7 @@ enum HealthStatus { DEGRADED = 5; } -// [#next-free-field: 24] +// [#next-free-field: 25] message HealthCheck { option (udpa.annotations.versioning).previous_message_type = "envoy.config.core.v3.HealthCheck"; @@ -284,6 +284,21 @@ message HealthCheck { // The default value for "no traffic interval" is 60 seconds. google.protobuf.Duration no_traffic_interval = 12 [(validate.rules).duration = {gt {}}]; + // The "no traffic healthy interval" is a special health check interval that + // is used for hosts that are currently passing active health checking + // (including new hosts) when the cluster has received no traffic. + // + // This is useful for when we want to send frequent health checks with + // `no_traffic_interval` but then revert to lower frequency `no_traffic_healthy_interval` once + // a host in the cluster is marked as healthy. + // + // Once a cluster has been used for traffic routing, Envoy will shift back to using the + // standard health check interval that is defined. + // + // If no_traffic_healthy_interval is not set, it will default to the + // no traffic interval, which is then used regardless of health state. + google.protobuf.Duration no_traffic_healthy_interval = 24 [(validate.rules).duration = {gt {}}]; + // The "unhealthy interval" is a health check interval that is used for hosts that are marked as // unhealthy. As soon as the host is marked as healthy, Envoy will shift back to using the // standard health check interval that is defined. 
diff --git a/docs/root/version_history/current.rst b/docs/root/version_history/current.rst index ce3938e22a6da..e66ee1b0622d0 100644 --- a/docs/root/version_history/current.rst +++ b/docs/root/version_history/current.rst @@ -20,6 +20,7 @@ Removed Config or Runtime New Features ------------ * grpc: implemented header value syntax support when defining :ref:`initial metadata ` for gRPC-based `ext_authz` :ref:`HTTP ` and :ref:`network ` filters, and :ref:`ratelimit ` filters. +* health_check: added option to use :ref:`no_traffic_healthy_interval <envoy_v3_api_field_config.core.v3.HealthCheck.no_traffic_healthy_interval>` which allows a different no traffic interval when the host is healthy. Deprecated ---------- diff --git a/generated_api_shadow/envoy/config/core/v3/health_check.proto b/generated_api_shadow/envoy/config/core/v3/health_check.proto index 55adaa55aef45..5f8fd325aa959 100644 --- a/generated_api_shadow/envoy/config/core/v3/health_check.proto +++ b/generated_api_shadow/envoy/config/core/v3/health_check.proto @@ -54,7 +54,7 @@ enum HealthStatus { DEGRADED = 5; } -// [#next-free-field: 24] +// [#next-free-field: 25] message HealthCheck { option (udpa.annotations.versioning).previous_message_type = "envoy.api.v2.core.HealthCheck"; @@ -283,6 +283,21 @@ message HealthCheck { // The default value for "no traffic interval" is 60 seconds. google.protobuf.Duration no_traffic_interval = 12 [(validate.rules).duration = {gt {}}]; + // The "no traffic healthy interval" is a special health check interval that + // is used for hosts that are currently passing active health checking + // (including new hosts) when the cluster has received no traffic. + // + // This is useful for when we want to send frequent health checks with + // `no_traffic_interval` but then revert to lower frequency `no_traffic_healthy_interval` once + // a host in the cluster is marked as healthy. + // + // Once a cluster has been used for traffic routing, Envoy will shift back to using the + // standard health check interval that is defined. 
+ // + // If no_traffic_healthy_interval is not set, it will default to the + // no traffic interval, which is then used regardless of health state. + google.protobuf.Duration no_traffic_healthy_interval = 24 [(validate.rules).duration = {gt {}}]; + // The "unhealthy interval" is a health check interval that is used for hosts that are marked as // unhealthy. As soon as the host is marked as healthy, Envoy will shift back to using the // standard health check interval that is defined. diff --git a/generated_api_shadow/envoy/config/core/v4alpha/health_check.proto b/generated_api_shadow/envoy/config/core/v4alpha/health_check.proto index 8d54d81af00bb..2761b856a3d7e 100644 --- a/generated_api_shadow/envoy/config/core/v4alpha/health_check.proto +++ b/generated_api_shadow/envoy/config/core/v4alpha/health_check.proto @@ -54,7 +54,7 @@ enum HealthStatus { DEGRADED = 5; } -// [#next-free-field: 24] +// [#next-free-field: 25] message HealthCheck { option (udpa.annotations.versioning).previous_message_type = "envoy.config.core.v3.HealthCheck"; @@ -284,6 +284,21 @@ message HealthCheck { // The default value for "no traffic interval" is 60 seconds. google.protobuf.Duration no_traffic_interval = 12 [(validate.rules).duration = {gt {}}]; + // The "no traffic healthy interval" is a special health check interval that + // is used for hosts that are currently passing active health checking + // (including new hosts) when the cluster has received no traffic. + // + // This is useful for when we want to send frequent health checks with + // `no_traffic_interval` but then revert to lower frequency `no_traffic_healthy_interval` once + // a host in the cluster is marked as healthy. + // + // Once a cluster has been used for traffic routing, Envoy will shift back to using the + // standard health check interval that is defined. + // + // If no_traffic_healthy_interval is not set, it will default to the + // no traffic interval, which is then used regardless of health state. 
+ google.protobuf.Duration no_traffic_healthy_interval = 24 [(validate.rules).duration = {gt {}}]; + // The "unhealthy interval" is a health check interval that is used for hosts that are marked as // unhealthy. As soon as the host is marked as healthy, Envoy will shift back to using the // standard health check interval that is defined. diff --git a/source/common/upstream/health_checker_base_impl.cc b/source/common/upstream/health_checker_base_impl.cc index 47a67c3f29a79..f6357559eec8f 100644 --- a/source/common/upstream/health_checker_base_impl.cc +++ b/source/common/upstream/health_checker_base_impl.cc @@ -26,6 +26,8 @@ HealthCheckerImplBase::HealthCheckerImplBase(const Cluster& cluster, reuse_connection_(PROTOBUF_GET_WRAPPED_OR_DEFAULT(config, reuse_connection, true)), event_logger_(std::move(event_logger)), interval_(PROTOBUF_GET_MS_REQUIRED(config, interval)), no_traffic_interval_(PROTOBUF_GET_MS_OR_DEFAULT(config, no_traffic_interval, 60000)), + no_traffic_healthy_interval_(PROTOBUF_GET_MS_OR_DEFAULT(config, no_traffic_healthy_interval, + no_traffic_interval_.count())), initial_jitter_(PROTOBUF_GET_MS_OR_DEFAULT(config, initial_jitter, 0)), interval_jitter_(PROTOBUF_GET_MS_OR_DEFAULT(config, interval_jitter, 0)), interval_jitter_percent_(config.interval_jitter_percent()), @@ -123,7 +125,10 @@ std::chrono::milliseconds HealthCheckerImplBase::interval(HealthState state, break; } } else { - base_time_ms = no_traffic_interval_.count(); + base_time_ms = + (state == HealthState::Healthy && changed_state != HealthTransition::ChangePending) + ? 
no_traffic_healthy_interval_.count() + : no_traffic_interval_.count(); } return intervalWithJitter(base_time_ms, interval_jitter_); } diff --git a/source/common/upstream/health_checker_base_impl.h b/source/common/upstream/health_checker_base_impl.h index ff2f62101f577..b773ced03a376 100644 --- a/source/common/upstream/health_checker_base_impl.h +++ b/source/common/upstream/health_checker_base_impl.h @@ -148,6 +148,7 @@ class HealthCheckerImplBase : public HealthChecker, std::list callbacks_; const std::chrono::milliseconds interval_; const std::chrono::milliseconds no_traffic_interval_; + const std::chrono::milliseconds no_traffic_healthy_interval_; const std::chrono::milliseconds initial_jitter_; const std::chrono::milliseconds interval_jitter_; const uint32_t interval_jitter_percent_; diff --git a/test/common/upstream/health_checker_impl_test.cc b/test/common/upstream/health_checker_impl_test.cc index 321eeaa269d42..f1db76c44100e 100644 --- a/test/common/upstream/health_checker_impl_test.cc +++ b/test/common/upstream/health_checker_impl_test.cc @@ -187,6 +187,25 @@ class HttpHealthCheckerImplTest : public testing::Test, public HttpHealthChecker addCompletionCallback(); } + void setupNoTrafficHealthyValidationHC() { + const std::string yaml = R"EOF( + timeout: 1s + interval: 1s + no_traffic_interval: 5s + no_traffic_healthy_interval: 10s + interval_jitter: 1s + unhealthy_threshold: 1 + healthy_threshold: 1 + http_health_check: + service_name_matcher: + prefix: locations + path: /healthcheck + )EOF"; + + allocHealthChecker(yaml); + addCompletionCallback(); + } + void setupNoServiceValidationHCOneUnhealthy() { const std::string yaml = R"EOF( timeout: 1s @@ -1487,6 +1506,29 @@ TEST_F(HttpHealthCheckerImplTest, SuccessNoTraffic) { EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); } +// Start with an unhealthy host that, after passing a health check, moves to +// no_traffic_healthy_interval. 
+TEST_F(HttpHealthCheckerImplTest, UnhealthyTransitionNoTrafficHealthy) { + setupNoTrafficHealthyValidationHC(); + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")}; + cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->healthFlagSet( + Host::HealthFlag::FAILED_ACTIVE_HC); + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + health_checker_->start(); + + // Successful health check should now trigger the no_traffic_healthy_interval 10000ms. + EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Changed)); + EXPECT_CALL(event_logger_, logAddHealthy(_, _, _)); + EXPECT_CALL(*test_sessions_[0]->interval_timer_, + enableTimer(std::chrono::milliseconds(10000), _)); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer()); + respond(0, "200", false, false, false, false); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); +} + TEST_F(HttpHealthCheckerImplTest, SuccessStartFailedSuccessFirst) { setupNoServiceValidationHC(); cluster_->prioritySet().getMockHostSet(0)->hosts_ = {