diff --git a/api/envoy/api/v2/route/route.proto b/api/envoy/api/v2/route/route.proto index f0bda5fd8ace6..87232a78eadce 100644 --- a/api/envoy/api/v2/route/route.proto +++ b/api/envoy/api/v2/route/route.proto @@ -796,6 +796,7 @@ message RouteAction { } // HTTP retry :ref:`architecture overview `. +// [#comment:next free field: 9] message RetryPolicy { // Specifies the conditions under which retry takes place. These are the same // conditions documented for :ref:`config_http_filters_router_x-envoy-retry-on` and @@ -856,6 +857,34 @@ message RetryPolicy { // HTTP status codes that should trigger a retry in addition to those specified by retry_on. repeated uint32 retriable_status_codes = 7; + + message RetryBackOff { + // Specifies the base interval between retries. This parameter is required and must be greater + // than zero. Values less than 1 ms are rounded up to 1 ms. + // See :ref:`config_http_filters_router_x-envoy-max-retries` for a discussion of Envoy's + // back-off algorithm. + google.protobuf.Duration base_interval = 1 [ + (validate.rules).duration = { + required: true, + gt: {seconds: 0} + }, + (gogoproto.stdduration) = true + ]; + + // Specifies the maximum interval between retries. This parameter is optional, but must be + // greater than or equal to the `base_interval` if set. The default is 10 times the + // `base_interval`. See :ref:`config_http_filters_router_x-envoy-max-retries` for a discussion + // of Envoy's back-off algorithm. + google.protobuf.Duration max_interval = 2 + [(validate.rules).duration.gt = {seconds: 0}, (gogoproto.stdduration) = true]; + } + + // Specifies parameters that control retry back off. This parameter is optional, in which case the + // default base interval is 25 milliseconds or, if set, the current value of the + // `upstream.base_retry_backoff_ms` runtime parameter. The default maximum interval is 10 times + // the base interval. The documentation for :ref:`config_http_filters_router_x-envoy-max-retries` + // describes Envoy's back-off algorithm. + RetryBackOff retry_back_off = 8; } // HTTP request hedging TODO(mpuncel) docs diff --git a/docs/root/configuration/http_filters/router_filter.rst b/docs/root/configuration/http_filters/router_filter.rst index cb5ab6a5941f6..d33a974eaad51 100644 --- a/docs/root/configuration/http_filters/router_filter.rst +++ b/docs/root/configuration/http_filters/router_filter.rst @@ -38,11 +38,17 @@ A few notes on how Envoy does retries: * The route timeout (set via :ref:`config_http_filters_router_x-envoy-upstream-rq-timeout-ms` or the :ref:`route configuration `) **includes** all retries. Thus if the request timeout is set to 3s, and the first request attempt takes 2.7s, the - retry (including backoff) has .3s to complete. This is by design to avoid an exponential + retry (including back-off) has .3s to complete. This is by design to avoid an exponential retry/timeout explosion. -* Envoy uses a fully jittered exponential backoff algorithm for retries with a base time of 25ms. - The first retry will be delayed randomly between 0-24ms, the 2nd between 0-74ms, the 3rd between - 0-174ms and so on. +* Envoy uses a fully jittered exponential back-off algorithm for retries with a default base + interval of 25ms. Given a base interval B and retry number N, the back-off for the retry is in + the range :math:`\big[0, (2^N-1)B\big)`. For example, given the default interval, the first retry + will be delayed randomly by 0-24ms, the 2nd by 0-74ms, the 3rd by 0-174ms, and so on. The + interval is capped at a maximum interval, which defaults to 10 times the base interval (250ms). + The default base interval (and therefore the maximum interval) can be manipulated by setting the + upstream.base_retry_backoff_ms runtime parameter. The back-off intervals can also be modified + by configuring the retry policy's + :ref:`retry back-off `. * If max retries is set both by header as well as in the route configuration, the maximum value is taken when determining the max retries to use for the request. @@ -156,7 +162,7 @@ x-envoy-retriable-status-codes Setting this header informs Envoy about what status codes should be considered retriable when used in conjunction with the :ref:`retriable-status-code ` retry policy. When the corresponding retry policy is set, the list of retriable status codes will be considered retriable -in addition to the status codes enabled for retry through other retry policies. +in addition to the status codes enabled for retry through other retry policies. The list is a comma delimited list of integers: "409" would cause 409 to be considered retriable, while "504,409" would consider both 504 and 409 retriable. @@ -239,7 +245,7 @@ x-envoy-ratelimited If this header is set by upstream, Envoy will not retry. Currently the value of the header is not looked at, only its presence. This header is set by :ref:`rate limit filter` -when the request is rate limited. +when the request is rate limited. .. _config_http_filters_router_x-envoy-decorator-operation: @@ -350,8 +356,9 @@ Runtime The router filter supports the following runtime settings: upstream.base_retry_backoff_ms - Base exponential retry back off time. See :ref:`here ` for more - information. Defaults to 25ms. + Base exponential retry back-off time. See :ref:`here ` and + :ref:`config_http_filters_router_x-envoy-max-retries` for more information. Defaults to 25ms. + The default maximum retry back-off time is 10 times this value. .. _config_http_filters_router_runtime_maintenance_mode: diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 35fba003bbae6..f8e90900ed605 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -6,6 +6,7 @@ Version history * dubbo_proxy: support the :ref:`Dubbo proxy filter `. * http: mitigated a race condition with the :ref:`delayed_close_timeout` where it could trigger while actively flushing a pending write buffer for a downstream connection. * redis: add support for zpopmax and zpopmin commands. +* router: added ability to control retry back-off intervals via :ref:`retry policy `. * upstream: added :ref:`upstream_cx_pool_overflow ` for the connection pool circuit breaker. 1.10.0 (Apr 5, 2019) diff --git a/include/envoy/router/router.h b/include/envoy/router/router.h index e090c9caadc63..acd9738ab08ee 100644 --- a/include/envoy/router/router.h +++ b/include/envoy/router/router.h @@ -201,6 +201,16 @@ class RetryPolicy { * policy is enabled. */ virtual const std::vector& retriableStatusCodes() const PURE; + + /** + * @return absl::optional base retry interval + */ + virtual absl::optional baseInterval() const PURE; + + /** + * @return absl::optional maximum retry interval + */ + virtual absl::optional maxInterval() const PURE; }; /** diff --git a/source/common/http/async_client_impl.h b/source/common/http/async_client_impl.h index 35fffefc2c473..10aed756001e9 100644 --- a/source/common/http/async_client_impl.h +++ b/source/common/http/async_client_impl.h @@ -148,6 +148,10 @@ class AsyncStreamImpl : public AsyncClient::Stream, const std::vector& retriableStatusCodes() const override { return retriable_status_codes_; } + absl::optional baseInterval() const override { + return absl::nullopt; + } + absl::optional maxInterval() const override { return absl::nullopt; } const std::vector retriable_status_codes_; }; diff --git a/source/common/router/config_impl.cc b/source/common/router/config_impl.cc index d17e0e56f4ca5..e941a1fafe56a 100644 --- a/source/common/router/config_impl.cc +++ b/source/common/router/config_impl.cc @@ -96,6 +96,27 @@ RetryPolicyImpl::RetryPolicyImpl(const envoy::api::v2::route::RetryPolicy& retry for (auto code : retry_policy.retriable_status_codes()) { retriable_status_codes_.emplace_back(code); } + + if (retry_policy.has_retry_back_off()) { + base_interval_ = std::chrono::milliseconds( + PROTOBUF_GET_MS_REQUIRED(retry_policy.retry_back_off(), base_interval)); + if ((*base_interval_).count() < 1) { + base_interval_ = std::chrono::milliseconds(1); + } + + max_interval_ = PROTOBUF_GET_OPTIONAL_MS(retry_policy.retry_back_off(), max_interval); + if (max_interval_) { + // Apply the same rounding to max interval in case both are set to sub-millisecond values. + if ((*max_interval_).count() < 1) { + max_interval_ = std::chrono::milliseconds(1); + } + + if ((*max_interval_).count() < (*base_interval_).count()) { + throw EnvoyException( + "retry_policy.max_interval must greater than or equal to the base_interval"); + } + } + } } std::vector RetryPolicyImpl::retryHostPredicates() const { diff --git a/source/common/router/config_impl.h b/source/common/router/config_impl.h index c9fdeb40403e3..2ded20d38e4fe 100644 --- a/source/common/router/config_impl.h +++ b/source/common/router/config_impl.h @@ -227,6 +227,8 @@ class RetryPolicyImpl : public RetryPolicy { const std::vector& retriableStatusCodes() const override { return retriable_status_codes_; } + absl::optional baseInterval() const override { return base_interval_; } + absl::optional maxInterval() const override { return max_interval_; } private: std::chrono::milliseconds per_try_timeout_{0}; @@ -241,6 +243,8 @@ class RetryPolicyImpl : public RetryPolicy { std::pair retry_priority_config_; uint32_t host_selection_attempts_{1}; std::vector retriable_status_codes_; + absl::optional base_interval_; + absl::optional max_interval_; }; /** diff --git a/source/common/router/retry_state_impl.cc b/source/common/router/retry_state_impl.cc index 84e1863176031..c5c75cfafe440 100644 --- a/source/common/router/retry_state_impl.cc +++ b/source/common/router/retry_state_impl.cc @@ -59,9 +59,22 @@ RetryStateImpl::RetryStateImpl(const RetryPolicy& route_policy, Http::HeaderMap& retry_on_ = route_policy.retryOn(); retries_remaining_ = std::max(retries_remaining_, route_policy.numRetries()); - const uint32_t base = runtime_.snapshot().getInteger("upstream.base_retry_backoff_ms", 25); - // Cap the max interval to 10 times the base interval to ensure reasonable backoff intervals. - backoff_strategy_ = std::make_unique(base, base * 10, random_); + + std::chrono::milliseconds base_interval( + runtime_.snapshot().getInteger("upstream.base_retry_backoff_ms", 25)); + if (route_policy.baseInterval()) { + base_interval = *route_policy.baseInterval(); + } + + // By default, cap the max interval to 10 times the base interval to ensure reasonable back-off + // intervals. + std::chrono::milliseconds max_interval = base_interval * 10; + if (route_policy.maxInterval()) { + max_interval = *route_policy.maxInterval(); + } + + backoff_strategy_ = std::make_unique(base_interval.count(), + max_interval.count(), random_); host_selection_max_attempts_ = route_policy.hostSelectionMaxAttempts(); // Merge in the headers. diff --git a/test/common/router/config_impl_test.cc b/test/common/router/config_impl_test.cc index 4215294deaf50..b2d2a1589262d 100644 --- a/test/common/router/config_impl_test.cc +++ b/test/common/router/config_impl_test.cc @@ -2399,6 +2399,115 @@ TEST_F(RouteMatcherTest, GrpcRetry) { .retryOn()); } +// Test route-specific retry back-off intervals. +TEST_F(RouteMatcherTest, RetryBackOffIntervals) { + const std::string yaml = R"EOF( +virtual_hosts: +- name: www2 + domains: + - www.lyft.com + routes: + - match: + prefix: "/foo" + route: + cluster: www2 + retry_policy: + retry_back_off: + base_interval: 0.050s + - match: + prefix: "/bar" + route: + cluster: www2 + retry_policy: + retry_back_off: + base_interval: 0.100s + max_interval: 0.500s + - match: + prefix: "/baz" + route: + cluster: www2 + retry_policy: + retry_back_off: + base_interval: 0.0001s # < 1 ms + max_interval: 0.0001s + - match: + prefix: "/" + route: + cluster: www2 + retry_policy: + retry_on: connect-failure + )EOF"; + + TestConfigImpl config(parseRouteConfigurationFromV2Yaml(yaml), factory_context_, true); + + EXPECT_EQ(absl::optional(50), + config.route(genHeaders("www.lyft.com", "/foo", "GET"), 0) + ->routeEntry() + ->retryPolicy() + .baseInterval()); + + EXPECT_EQ(absl::nullopt, config.route(genHeaders("www.lyft.com", "/foo", "GET"), 0) + ->routeEntry() + ->retryPolicy() + .maxInterval()); + + EXPECT_EQ(absl::optional(100), + config.route(genHeaders("www.lyft.com", "/bar", "GET"), 0) + ->routeEntry() + ->retryPolicy() + .baseInterval()); + + EXPECT_EQ(absl::optional(500), + config.route(genHeaders("www.lyft.com", "/bar", "GET"), 0) + ->routeEntry() + ->retryPolicy() + .maxInterval()); + + // Sub-millisecond interval converted to 1 ms. + EXPECT_EQ(absl::optional(1), + config.route(genHeaders("www.lyft.com", "/baz", "GET"), 0) + ->routeEntry() + ->retryPolicy() + .baseInterval()); + + EXPECT_EQ(absl::optional(1), + config.route(genHeaders("www.lyft.com", "/baz", "GET"), 0) + ->routeEntry() + ->retryPolicy() + .maxInterval()); + + EXPECT_EQ(absl::nullopt, config.route(genHeaders("www.lyft.com", "/", "GET"), 0) + ->routeEntry() + ->retryPolicy() + .baseInterval()); + + EXPECT_EQ(absl::nullopt, config.route(genHeaders("www.lyft.com", "/", "GET"), 0) + ->routeEntry() + ->retryPolicy() + .maxInterval()); +} + +// Test invalid route-specific retry back-off configs. +TEST_F(RouteMatcherTest, InvalidRetryBackOff) { + const std::string invalid_max = R"EOF( +virtual_hosts: + - name: backoff + domains: ["*"] + routes: + - match: { prefix: "/" } + route: + cluster: backoff + retry_policy: + retry_back_off: + base_interval: 10s + max_interval: 5s +)EOF"; + + EXPECT_THROW_WITH_MESSAGE( + TestConfigImpl(parseRouteConfigurationFromV2Yaml(invalid_max), factory_context_, true), + EnvoyException, "retry_policy.max_interval must greater than or equal to the base_interval"); +} + TEST_F(RouteMatcherTest, HedgeRouteLevel) { const std::string yaml = R"EOF( name: HedgeRouteLevel diff --git a/test/common/router/retry_state_impl_test.cc b/test/common/router/retry_state_impl_test.cc index af96861b8b9a7..415fa5b32ec28 100644 --- a/test/common/router/retry_state_impl_test.cc +++ b/test/common/router/retry_state_impl_test.cc @@ -505,6 +505,77 @@ TEST_F(RouterRetryStateImplTest, Backoff) { EXPECT_EQ(0UL, cluster_.circuit_breakers_stats_.rq_retry_open_.value()); } +// Test customized retry back-off intervals. +TEST_F(RouterRetryStateImplTest, CustomBackOffInterval) { + policy_.num_retries_ = 10; + policy_.retry_on_ = RetryPolicy::RETRY_ON_CONNECT_FAILURE; + policy_.base_interval_ = std::chrono::milliseconds(100); + policy_.max_interval_ = std::chrono::milliseconds(1200); + Http::TestHeaderMapImpl request_headers; + setup(request_headers); + EXPECT_TRUE(state_->enabled()); + + EXPECT_CALL(random_, random()).WillOnce(Return(149)); + retry_timer_ = new Event::MockTimer(&dispatcher_); + EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(49))); + EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_)); + EXPECT_CALL(callback_ready_, ready()); + retry_timer_->callback_(); + + EXPECT_CALL(random_, random()).WillOnce(Return(350)); + EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(50))); + EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_)); + EXPECT_CALL(callback_ready_, ready()); + retry_timer_->callback_(); + + EXPECT_CALL(random_, random()).WillOnce(Return(751)); + EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(51))); + EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_)); + EXPECT_CALL(callback_ready_, ready()); + retry_timer_->callback_(); + + EXPECT_CALL(random_, random()).WillOnce(Return(1499)); + EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(1200))); + EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_)); + EXPECT_CALL(callback_ready_, ready()); + retry_timer_->callback_(); +} + +// Test the default maximum retry back-off interval. +TEST_F(RouterRetryStateImplTest, CustomBackOffIntervalDefaultMax) { + policy_.num_retries_ = 10; + policy_.retry_on_ = RetryPolicy::RETRY_ON_CONNECT_FAILURE; + policy_.base_interval_ = std::chrono::milliseconds(100); + Http::TestHeaderMapImpl request_headers; + setup(request_headers); + EXPECT_TRUE(state_->enabled()); + + EXPECT_CALL(random_, random()).WillOnce(Return(149)); + retry_timer_ = new Event::MockTimer(&dispatcher_); + EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(49))); + EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_)); + EXPECT_CALL(callback_ready_, ready()); + retry_timer_->callback_(); + + EXPECT_CALL(random_, random()).WillOnce(Return(350)); + EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(50))); + EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_)); + EXPECT_CALL(callback_ready_, ready()); + retry_timer_->callback_(); + + EXPECT_CALL(random_, random()).WillOnce(Return(751)); + EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(51))); + EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_)); + EXPECT_CALL(callback_ready_, ready()); + retry_timer_->callback_(); + + EXPECT_CALL(random_, random()).WillOnce(Return(1499)); + EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(1000))); + EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_)); + EXPECT_CALL(callback_ready_, ready()); + retry_timer_->callback_(); +} + TEST_F(RouterRetryStateImplTest, HostSelectionAttempts) { policy_.host_selection_max_attempts_ = 2; policy_.retry_on_ = RetryPolicy::RETRY_ON_CONNECT_FAILURE; diff --git a/test/mocks/router/mocks.h b/test/mocks/router/mocks.h index ba40815aaab49..f49cd5f719363 100644 --- a/test/mocks/router/mocks.h +++ b/test/mocks/router/mocks.h @@ -91,12 +91,16 @@ class TestRetryPolicy : public RetryPolicy { const std::vector& retriableStatusCodes() const override { return retriable_status_codes_; } + absl::optional baseInterval() const override { return base_interval_; } + absl::optional maxInterval() const override { return max_interval_; } std::chrono::milliseconds per_try_timeout_{0}; uint32_t num_retries_{}; uint32_t retry_on_{}; uint32_t host_selection_max_attempts_; std::vector retriable_status_codes_; + absl::optional base_interval_{}; + absl::optional max_interval_{}; }; class MockRetryState : public RetryState {