Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion api/envoy/api/v2/core/health_check.proto
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,17 @@ message HealthCheck {
];

// An optional jitter amount in milliseconds. If specified, during every
// internal Envoy will add 0 to interval_jitter to the wait time.
// interval Envoy will add 0 to interval_jitter to the wait time.
google.protobuf.Duration interval_jitter = 3;

// An optional jitter amount as a percentage of interval. If specified,
// during every interval Envoy will add 0 to interval *
// interval_jitter_percent / 100 to the wait time.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please document what happens if both this and interval_jitter are set. Or only allow one to be set and document that (and validate that somewhere).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done!

//
// If interval_jitter and interval_jitter_percent are both set, both of
// them will be used to increase the wait time.
uint32 interval_jitter_percent = 18;

// The number of unhealthy health checks required before a host is marked
// unhealthy. Note that for *http* health checking if a host responds with 503
// this threshold is ignored and the host is considered unhealthy immediately.
Expand Down
1 change: 1 addition & 0 deletions docs/root/intro/version_history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Version history
* admin: added :http:get:`/hystrix_event_stream` as an endpoint for monitoring envoy's statistics
through `Hystrix dashboard <https://github.com/Netflix-Skunkworks/hystrix-dashboard/wiki>`_.
* health check: added support for :ref:`custom health check <envoy_api_field_core.HealthCheck.custom_health_check>`.
* health check: added support for :ref:`specifying jitter as a percentage <envoy_api_field_core.HealthCheck.interval_jitter_percent>`.
* health_check: added support for :ref:`health check event logging <arch_overview_health_check_logging>`.
* http: response filters not applied to early error paths such as http_parser generated 400s.
* lua: added :ref:`connection() <config_http_filters_lua_connection_wrapper>` wrapper and *ssl()* API.
Expand Down
5 changes: 5 additions & 0 deletions source/common/upstream/health_checker_base_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ HealthCheckerImplBase::HealthCheckerImplBase(const Cluster& cluster,
event_logger_(std::move(event_logger)), interval_(PROTOBUF_GET_MS_REQUIRED(config, interval)),
no_traffic_interval_(PROTOBUF_GET_MS_OR_DEFAULT(config, no_traffic_interval, 60000)),
interval_jitter_(PROTOBUF_GET_MS_OR_DEFAULT(config, interval_jitter, 0)),
interval_jitter_percent_(config.interval_jitter_percent()),
unhealthy_interval_(
PROTOBUF_GET_MS_OR_DEFAULT(config, unhealthy_interval, interval_.count())),
unhealthy_edge_interval_(
Expand Down Expand Up @@ -85,6 +86,10 @@ std::chrono::milliseconds HealthCheckerImplBase::interval(HealthState state,
base_time_ms = no_traffic_interval_.count();
}

if (interval_jitter_percent_ > 0) {
base_time_ms += random_.random() % (interval_jitter_percent_ * base_time_ms / 100);
}

if (interval_jitter_.count() > 0) {
base_time_ms += (random_.random() % interval_jitter_.count());
}
Expand Down
1 change: 1 addition & 0 deletions source/common/upstream/health_checker_base_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ class HealthCheckerImplBase : public HealthChecker,
const std::chrono::milliseconds interval_;
const std::chrono::milliseconds no_traffic_interval_;
const std::chrono::milliseconds interval_jitter_;
const uint32_t interval_jitter_percent_;
const std::chrono::milliseconds unhealthy_interval_;
const std::chrono::milliseconds unhealthy_edge_interval_;
const std::chrono::milliseconds healthy_edge_interval_;
Expand Down
110 changes: 110 additions & 0 deletions test/common/upstream/health_checker_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,28 @@ class HttpHealthCheckerImplTest : public testing::Test {
});
}

void setupIntervalJitterPercent() {
const std::string yaml = R"EOF(
timeout: 1s
interval: 1s
no_traffic_interval: 5s
interval_jitter_percent: 40
unhealthy_threshold: 2
healthy_threshold: 2
http_health_check:
service_name: locations
path: /healthcheck
)EOF";

health_checker_.reset(new TestHttpHealthCheckerImpl(*cluster_, parseHealthCheckFromV2Yaml(yaml),
dispatcher_, runtime_, random_,
HealthCheckEventLoggerPtr(event_logger_)));
health_checker_->addHostCheckCompleteCb(
[this](HostSharedPtr host, HealthTransition changed_state) -> void {
onHostStatus(host, changed_state);
});
}

void setupNoServiceValidationHC() {
const std::string yaml = R"EOF(
timeout: 1s
Expand Down Expand Up @@ -428,6 +450,94 @@ TEST_F(HttpHealthCheckerImplTest, Success) {
EXPECT_TRUE(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->healthy());
}

// Checks that the absolute interval jitter is applied to the wait time: for a
// range of fake random values, the interval timer must be re-armed with
// 5000ms plus the random value modulo 1000.
TEST_F(HttpHealthCheckerImplTest, SuccessIntervalJitter) {
  setupNoServiceValidationHC();
  EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Unchanged)).Times(testing::AnyNumber());

  // One host, one session; starting the checker arms the timeout timer.
  cluster_->prioritySet().getMockHostSet(0)->hosts_ = {
      makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")};
  expectSessionCreate();
  expectStreamCreate(0);
  EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_));
  health_checker_->start();

  // The first successful response leaves the host healthy.
  EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_));
  EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer());
  respond(0, "200", false, true, true);
  EXPECT_TRUE(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->healthy());

  for (int fake_random = 0; fake_random < 50000; fake_random += 239) {
    EXPECT_CALL(random_, random()).WillOnce(Return(fake_random));
    EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_));
    expectStreamCreate(0);
    test_sessions_[0]->interval_timer_->callback_();

    // The jitter is 1000ms here (presumably configured by
    // setupNoServiceValidationHC(), which is not shown in this view).
    const std::chrono::milliseconds expected_wait(5000 + fake_random % 1000);
    EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(expected_wait));
    EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer());
    respond(0, "200", false, true, true);
  }
}

// Checks percentage jitter against the 5s no_traffic_interval: for a range of
// fake random values, the interval timer must be re-armed with 5000ms plus the
// random value modulo 2000 (40% of 5000).
TEST_F(HttpHealthCheckerImplTest, SuccessIntervalJitterPercentNoTraffic) {
  setupIntervalJitterPercent();
  EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Unchanged)).Times(testing::AnyNumber());

  // One host, one session; starting the checker arms the timeout timer.
  cluster_->prioritySet().getMockHostSet(0)->hosts_ = {
      makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")};
  expectSessionCreate();
  expectStreamCreate(0);
  EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_));
  health_checker_->start();

  // The first successful response leaves the host healthy.
  EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_));
  EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer());
  respond(0, "200", false, true, true);
  EXPECT_TRUE(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->healthy());

  for (int fake_random = 0; fake_random < 50000; fake_random += 239) {
    EXPECT_CALL(random_, random()).WillOnce(Return(fake_random));
    EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_));
    expectStreamCreate(0);
    test_sessions_[0]->interval_timer_->callback_();

    // The jitter is 40% of 5000, so the modulus should be 2000.
    const std::chrono::milliseconds expected_wait(5000 + fake_random % 2000);
    EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(expected_wait));
    EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer());
    respond(0, "200", false, true, true);
  }
}

// Checks percentage jitter against the regular 1s interval: for a range of
// fake random values, the interval timer must be re-armed with 1000ms plus the
// random value modulo 400 (40% of 1000).
TEST_F(HttpHealthCheckerImplTest, SuccessIntervalJitterPercent) {
  setupIntervalJitterPercent();
  EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Unchanged)).Times(testing::AnyNumber());

  cluster_->prioritySet().getMockHostSet(0)->hosts_ = {
      makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")};
  // Bump the upstream connection counter; presumably this makes the checker
  // use the regular interval rather than the no-traffic one (the deciding
  // condition is not shown in this view).
  cluster_->info_->stats().upstream_cx_total_.inc();
  expectSessionCreate();
  expectStreamCreate(0);
  EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_));
  health_checker_->start();

  // The first successful response leaves the host healthy.
  EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_));
  EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer());
  respond(0, "200", false, true, true);
  EXPECT_TRUE(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->healthy());

  for (int fake_random = 0; fake_random < 50000; fake_random += 239) {
    EXPECT_CALL(random_, random()).WillOnce(Return(fake_random));
    EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_));
    expectStreamCreate(0);
    test_sessions_[0]->interval_timer_->callback_();

    // The jitter is 40% of 1000, so the modulus should be 400.
    const std::chrono::milliseconds expected_wait(1000 + fake_random % 400);
    EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(expected_wait));
    EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer());
    respond(0, "200", false, true, true);
  }
}

TEST_F(HttpHealthCheckerImplTest, SuccessWithSpurious100Continue) {
setupNoServiceValidationHC();
EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Unchanged)).Times(1);
Expand Down