diff --git a/api/envoy/api/v2/core/health_check.proto b/api/envoy/api/v2/core/health_check.proto index 92c811e90bc09..3d21e7d413a1c 100644 --- a/api/envoy/api/v2/core/health_check.proto +++ b/api/envoy/api/v2/core/health_check.proto @@ -33,9 +33,17 @@ message HealthCheck { ]; // An optional jitter amount in millseconds. If specified, during every - // internal Envoy will add 0 to interval_jitter to the wait time. + // interval Envoy will add 0 to interval_jitter to the wait time. google.protobuf.Duration interval_jitter = 3; + // An optional jitter amount as a percentage of interval. If specified, + // during every interval Envoy will add a random amount between 0 and + // interval * interval_jitter_percent / 100 to the wait time. + // + // If interval_jitter and interval_jitter_percent are both set, both of + // them will be used to increase the wait time. + uint32 interval_jitter_percent = 18; + // The number of unhealthy health checks required before a host is marked // unhealthy. Note that for *http* health checking if a host responds with 503 // this threshold is ignored and the host is considered unhealthy immediately. diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 46ed4670a078a..41fa51622c09d 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -8,6 +8,7 @@ Version history * admin: added :http:get:`/hystrix_event_stream` as an endpoint for monitoring envoy's statistics through `Hystrix dashboard `_. * health check: added support for :ref:`custom health check `. +* health check: added support for :ref:`specifying jitter as a percentage `. * health_check: added support for :ref:`health check event logging `. * http: response filters not applied to early error paths such as http_parser generated 400s. * lua: added :ref:`connection() ` wrapper and *ssl()* API. 
diff --git a/source/common/upstream/health_checker_base_impl.cc b/source/common/upstream/health_checker_base_impl.cc index b843ea88fd56b..a87f8579bc7df 100644 --- a/source/common/upstream/health_checker_base_impl.cc +++ b/source/common/upstream/health_checker_base_impl.cc @@ -22,6 +22,7 @@ HealthCheckerImplBase::HealthCheckerImplBase(const Cluster& cluster, event_logger_(std::move(event_logger)), interval_(PROTOBUF_GET_MS_REQUIRED(config, interval)), no_traffic_interval_(PROTOBUF_GET_MS_OR_DEFAULT(config, no_traffic_interval, 60000)), interval_jitter_(PROTOBUF_GET_MS_OR_DEFAULT(config, interval_jitter, 0)), + interval_jitter_percent_(config.interval_jitter_percent()), unhealthy_interval_( PROTOBUF_GET_MS_OR_DEFAULT(config, unhealthy_interval, interval_.count())), unhealthy_edge_interval_( @@ -85,6 +86,14 @@ std::chrono::milliseconds HealthCheckerImplBase::interval(HealthState state, base_time_ms = no_traffic_interval_.count(); } + if (interval_jitter_percent_ > 0) { + // Skip the jitter when the bound truncates to zero; a modulus by zero is undefined behavior. + const auto jitter_percent_mod = interval_jitter_percent_ * base_time_ms / 100; + if (jitter_percent_mod > 0) { + base_time_ms += random_.random() % jitter_percent_mod; + } + } + if (interval_jitter_.count() > 0) { base_time_ms += (random_.random() % interval_jitter_.count()); } diff --git a/source/common/upstream/health_checker_base_impl.h b/source/common/upstream/health_checker_base_impl.h index c80650294bad7..644c1d174c579 100644 --- a/source/common/upstream/health_checker_base_impl.h +++ b/source/common/upstream/health_checker_base_impl.h @@ -121,6 +121,7 @@ class HealthCheckerImplBase : public HealthChecker, const std::chrono::milliseconds interval_; const std::chrono::milliseconds no_traffic_interval_; const std::chrono::milliseconds interval_jitter_; + const uint32_t interval_jitter_percent_; const std::chrono::milliseconds unhealthy_interval_; const std::chrono::milliseconds unhealthy_edge_interval_; const std::chrono::milliseconds healthy_edge_interval_; diff --git a/test/common/upstream/health_checker_impl_test.cc b/test/common/upstream/health_checker_impl_test.cc index 
c270ba832aa80..04d9766fa7fcd 100644 --- a/test/common/upstream/health_checker_impl_test.cc +++ b/test/common/upstream/health_checker_impl_test.cc @@ -143,6 +143,28 @@ class HttpHealthCheckerImplTest : public testing::Test { }); } + void setupIntervalJitterPercent() { + const std::string yaml = R"EOF( + timeout: 1s + interval: 1s + no_traffic_interval: 5s + interval_jitter_percent: 40 + unhealthy_threshold: 2 + healthy_threshold: 2 + http_health_check: + service_name: locations + path: /healthcheck + )EOF"; + + health_checker_.reset(new TestHttpHealthCheckerImpl(*cluster_, parseHealthCheckFromV2Yaml(yaml), + dispatcher_, runtime_, random_, + HealthCheckEventLoggerPtr(event_logger_))); + health_checker_->addHostCheckCompleteCb( + [this](HostSharedPtr host, HealthTransition changed_state) -> void { + onHostStatus(host, changed_state); + }); + } + void setupNoServiceValidationHC() { const std::string yaml = R"EOF( timeout: 1s @@ -428,6 +450,94 @@ TEST_F(HttpHealthCheckerImplTest, Success) { EXPECT_TRUE(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->healthy()); } +TEST_F(HttpHealthCheckerImplTest, SuccessIntervalJitter) { + setupNoServiceValidationHC(); + EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Unchanged)).Times(testing::AnyNumber()); + + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")}; + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_)); + health_checker_->start(); + + EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_)); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer()); + respond(0, "200", false, true, true); + EXPECT_TRUE(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->healthy()); + + for (int i = 0; i < 50000; i += 239) { + EXPECT_CALL(random_, random()).WillOnce(Return(i)); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_)); + expectStreamCreate(0); + 
test_sessions_[0]->interval_timer_->callback_(); + // The jitter is 1000ms here, so the expected delay is 5000ms plus (random % 1000)ms. + EXPECT_CALL(*test_sessions_[0]->interval_timer_, + enableTimer(std::chrono::milliseconds(5000 + i % 1000))); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer()); + respond(0, "200", false, true, true); + } +} + +TEST_F(HttpHealthCheckerImplTest, SuccessIntervalJitterPercentNoTraffic) { + setupIntervalJitterPercent(); + EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Unchanged)).Times(testing::AnyNumber()); + + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")}; + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_)); + health_checker_->start(); + + EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_)); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer()); + respond(0, "200", false, true, true); + EXPECT_TRUE(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->healthy()); + + for (int i = 0; i < 50000; i += 239) { + EXPECT_CALL(random_, random()).WillOnce(Return(i)); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_)); + expectStreamCreate(0); + test_sessions_[0]->interval_timer_->callback_(); + // Base is the 5s no_traffic_interval; 40% of 5000ms gives a jitter modulus of 2000. + EXPECT_CALL(*test_sessions_[0]->interval_timer_, + enableTimer(std::chrono::milliseconds(5000 + i % 2000))); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer()); + respond(0, "200", false, true, true); + } +} + +TEST_F(HttpHealthCheckerImplTest, SuccessIntervalJitterPercent) { + setupIntervalJitterPercent(); + EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Unchanged)).Times(testing::AnyNumber()); + + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")}; + cluster_->info_->stats().upstream_cx_total_.inc(); + expectSessionCreate(); + expectStreamCreate(0); + 
EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_)); + health_checker_->start(); + + EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_)); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer()); + respond(0, "200", false, true, true); + EXPECT_TRUE(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->healthy()); + + for (int i = 0; i < 50000; i += 239) { + EXPECT_CALL(random_, random()).WillOnce(Return(i)); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_)); + expectStreamCreate(0); + test_sessions_[0]->interval_timer_->callback_(); + // Base is the 1s interval; 40% of 1000ms gives a jitter modulus of 400. + EXPECT_CALL(*test_sessions_[0]->interval_timer_, + enableTimer(std::chrono::milliseconds(1000 + i % 400))); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer()); + respond(0, "200", false, true, true); + } +} + TEST_F(HttpHealthCheckerImplTest, SuccessWithSpurious100Continue) { setupNoServiceValidationHC(); EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Unchanged)).Times(1);