envoyproxy · htuch · Apr 18, 2019 · Apr 12, 2019 · Apr 17, 2019 · htuch
diff --git a/api/envoy/api/v2/route/route.proto b/api/envoy/api/v2/route/route.proto
@@ -796,6 +796,7 @@ message RouteAction {
 }
 
 // HTTP retry :ref:`architecture overview <arch_overview_http_routing_retry>`.
+// [#comment:next free field: 9]
 message RetryPolicy {
   // Specifies the conditions under which retry takes place. These are the same
   // conditions documented for :ref:`config_http_filters_router_x-envoy-retry-on` and
@@ -856,6 +857,34 @@ message RetryPolicy {
 
   // HTTP status codes that should trigger a retry in addition to those specified by retry_on.
   repeated uint32 retriable_status_codes = 7;
+
+  message RetryBackOff {
+    // Specifies the base interval between retries. This parameter is required and must be greater
+    // than zero. Values less than 1 ms are rounded up to 1 ms.
+    // See :ref:`config_http_filters_router_x-envoy-max-retries` for a discussion of Envoy's
+    // back-off algorithm.
+    google.protobuf.Duration base_interval = 1 [
+      (validate.rules).duration = {
+        required: true,
+        gt: {seconds: 0}
+      },
+      (gogoproto.stdduration) = true
+    ];
+
+    // Specifies the maximum interval between retries. This parameter is optional, but must be
+    // greater than or equal to the `base_interval` if set. The default is 10 times the
+    // `base_interval`. See :ref:`config_http_filters_router_x-envoy-max-retries` for a discussion
+    // of Envoy's back-off algorithm.
+    google.protobuf.Duration max_interval = 2
+        [(validate.rules).duration.gt = {seconds: 0}, (gogoproto.stdduration) = true];
+  }
+
+  // Specifies parameters that control retry back off. This parameter is optional, in which case the
+  // default base interval is 25 milliseconds or, if set, the current value of the
+  // `upstream.base_retry_backoff_ms` runtime parameter. The default maximum interval is 10 times
+  // the base interval. The documentation for :ref:`config_http_filters_router_x-envoy-max-retries`
+  // describes Envoy's back-off algorithm.
+  RetryBackOff retry_back_off = 8;
 }
 
 // HTTP request hedging TODO(mpuncel) docs

diff --git a/docs/root/configuration/http_filters/router_filter.rst b/docs/root/configuration/http_filters/router_filter.rst
@@ -38,11 +38,17 @@ A few notes on how Envoy does retries:
 * The route timeout (set via :ref:`config_http_filters_router_x-envoy-upstream-rq-timeout-ms` or the
   :ref:`route configuration <envoy_api_field_route.RouteAction.timeout>`) **includes** all
   retries. Thus if the request timeout is set to 3s, and the first request attempt takes 2.7s, the
-  retry (including backoff) has .3s to complete. This is by design to avoid an exponential
+  retry (including back-off) has .3s to complete. This is by design to avoid an exponential
   retry/timeout explosion.
-* Envoy uses a fully jittered exponential backoff algorithm for retries with a base time of 25ms.
-  The first retry will be delayed randomly between 0-24ms, the 2nd between 0-74ms, the 3rd between
-  0-174ms and so on.
+* Envoy uses a fully jittered exponential back-off algorithm for retries with a default base
+  interval of 25ms. Given a base interval B and retry number N, the back-off for the retry is in
+  the range :math:`\big[0, (2^N-1)B\big)`. For example, given the default interval, the first retry
+  will be delayed randomly by 0-24ms, the 2nd by 0-74ms, the 3rd by 0-174ms, and so on. The
+  interval is capped at a maximum interval, which defaults to 10 times the base interval (250ms).
+  The default base interval (and therefore the maximum interval) can be manipulated by setting the
+  upstream.base_retry_backoff_ms runtime parameter. The back-off intervals can also be modified
+  by configuring the retry policy's
+  :ref:`retry back-off <envoy_api_field_route.RetryPolicy.retry_back_off>`.
 * If max retries is set both by header as well as in the route configuration, the maximum value is
   taken when determining the max retries to use for the request.
 
@@ -156,7 +162,7 @@ x-envoy-retriable-status-codes
 Setting this header informs Envoy about what status codes should be considered retriable when used in
 conjunction with the :ref:`retriable-status-code <config_http_filters_router_x-envoy-retry-on>` retry policy.
 When the corresponding retry policy is set, the list of retriable status codes will be considered retriable
-in addition to the status codes enabled for retry through other retry policies. 
+in addition to the status codes enabled for retry through other retry policies.
 
 The list is a comma delimited list of integers: "409" would cause 409 to be considered retriable, while "504,409"
 would consider both 504 and 409 retriable.
@@ -239,7 +245,7 @@ x-envoy-ratelimited
 
 If this header is set by upstream, Envoy will not retry. Currently the value of the header is not
 looked at, only its presence. This header is set by :ref:`rate limit filter<config_http_filters_rate_limit>`
-when the request is rate limited. 
+when the request is rate limited.
 
 .. _config_http_filters_router_x-envoy-decorator-operation:
 
@@ -350,8 +356,9 @@ Runtime
 The router filter supports the following runtime settings:
 
 upstream.base_retry_backoff_ms
-  Base exponential retry back off time. See :ref:`here <arch_overview_http_routing_retry>` for more
-  information. Defaults to 25ms.
+  Base exponential retry back-off time. See :ref:`here <arch_overview_http_routing_retry>` and
+  :ref:`config_http_filters_router_x-envoy-max-retries` for more information. Defaults to 25ms.
+  The default maximum retry back-off time is 10 times this value.
 
 .. _config_http_filters_router_runtime_maintenance_mode:
 

diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst
@@ -6,6 +6,7 @@ Version history
 * dubbo_proxy: support the :ref:`Dubbo proxy filter <config_network_filters_dubbo_proxy>`.
 * http: mitigated a race condition with the :ref:`delayed_close_timeout<envoy_api_field_config.filter.network.http_connection_manager.v2.HttpConnectionManager.delayed_close_timeout>` where it could trigger while actively flushing a pending write buffer for a downstream connection.
 * redis: add support for zpopmax and zpopmin commands.
+* router: added ability to control retry back-off intervals via :ref:`retry policy <envoy_api_msg_route.RetryPolicy.RetryBackOff>`.
 * upstream: added :ref:`upstream_cx_pool_overflow <config_cluster_manager_cluster_stats>` for the connection pool circuit breaker.
 
 1.10.0 (Apr 5, 2019)

diff --git a/include/envoy/router/router.h b/include/envoy/router/router.h
@@ -201,6 +201,16 @@ class RetryPolicy {
    * policy is enabled.
    */
   virtual const std::vector<uint32_t>& retriableStatusCodes() const PURE;
+
+  /**
+   * @return absl::optional<std::chrono::milliseconds> base retry interval
+   */
+  virtual absl::optional<std::chrono::milliseconds> baseInterval() const PURE;
+
+  /**
+   * @return absl::optional<std::chrono::milliseconds> maximum retry interval
+   */
+  virtual absl::optional<std::chrono::milliseconds> maxInterval() const PURE;
 };
 
 /**

diff --git a/source/common/http/async_client_impl.h b/source/common/http/async_client_impl.h
@@ -148,6 +148,10 @@ class AsyncStreamImpl : public AsyncClient::Stream,
     const std::vector<uint32_t>& retriableStatusCodes() const override {
       return retriable_status_codes_;
     }
+    absl::optional<std::chrono::milliseconds> baseInterval() const override {
+      return absl::nullopt;
+    }
+    absl::optional<std::chrono::milliseconds> maxInterval() const override { return absl::nullopt; }
 
     const std::vector<uint32_t> retriable_status_codes_;
   };

diff --git a/source/common/router/config_impl.cc b/source/common/router/config_impl.cc
@@ -96,6 +96,27 @@ RetryPolicyImpl::RetryPolicyImpl(const envoy::api::v2::route::RetryPolicy& retry
   for (auto code : retry_policy.retriable_status_codes()) {
     retriable_status_codes_.emplace_back(code);
   }
+
+  if (retry_policy.has_retry_back_off()) {
+    base_interval_ = std::chrono::milliseconds(
+        PROTOBUF_GET_MS_REQUIRED(retry_policy.retry_back_off(), base_interval));
+    if ((*base_interval_).count() < 1) {
+      base_interval_ = std::chrono::milliseconds(1);
+    }
+
+    max_interval_ = PROTOBUF_GET_OPTIONAL_MS(retry_policy.retry_back_off(), max_interval);
+    if (max_interval_) {
+      // Apply the same rounding to max interval in case both are set to sub-millisecond values.
+      if ((*max_interval_).count() < 1) {
+        max_interval_ = std::chrono::milliseconds(1);
+      }
+
+      if ((*max_interval_).count() < (*base_interval_).count()) {
+        throw EnvoyException(
+            "retry_policy.max_interval must greater than or equal to the base_interval");
+      }
+    }
+  }
 }
 
 std::vector<Upstream::RetryHostPredicateSharedPtr> RetryPolicyImpl::retryHostPredicates() const {

diff --git a/source/common/router/config_impl.h b/source/common/router/config_impl.h
@@ -227,6 +227,8 @@ class RetryPolicyImpl : public RetryPolicy {
   const std::vector<uint32_t>& retriableStatusCodes() const override {
     return retriable_status_codes_;
   }
+  absl::optional<std::chrono::milliseconds> baseInterval() const override { return base_interval_; }
+  absl::optional<std::chrono::milliseconds> maxInterval() const override { return max_interval_; }
 
 private:
   std::chrono::milliseconds per_try_timeout_{0};
@@ -241,6 +243,8 @@ class RetryPolicyImpl : public RetryPolicy {
   std::pair<std::string, ProtobufTypes::MessagePtr> retry_priority_config_;
   uint32_t host_selection_attempts_{1};
   std::vector<uint32_t> retriable_status_codes_;
+  absl::optional<std::chrono::milliseconds> base_interval_;
+  absl::optional<std::chrono::milliseconds> max_interval_;
 };
 
 /**

diff --git a/source/common/router/retry_state_impl.cc b/source/common/router/retry_state_impl.cc
@@ -59,9 +59,22 @@ RetryStateImpl::RetryStateImpl(const RetryPolicy& route_policy, Http::HeaderMap&
 
   retry_on_ = route_policy.retryOn();
   retries_remaining_ = std::max(retries_remaining_, route_policy.numRetries());
-  const uint32_t base = runtime_.snapshot().getInteger("upstream.base_retry_backoff_ms", 25);
-  // Cap the max interval to 10 times the base interval to ensure reasonable backoff intervals.
-  backoff_strategy_ = std::make_unique<JitteredBackOffStrategy>(base, base * 10, random_);
+
+  std::chrono::milliseconds base_interval(
+      runtime_.snapshot().getInteger("upstream.base_retry_backoff_ms", 25));
+  if (route_policy.baseInterval()) {
+    base_interval = *route_policy.baseInterval();
+  }
+
+  // By default, cap the max interval to 10 times the base interval to ensure reasonable back-off
+  // intervals.
+  std::chrono::milliseconds max_interval = base_interval * 10;
+  if (route_policy.maxInterval()) {
+    max_interval = *route_policy.maxInterval();
+  }
+
+  backoff_strategy_ = std::make_unique<JitteredBackOffStrategy>(base_interval.count(),
+                                                                max_interval.count(), random_);
   host_selection_max_attempts_ = route_policy.hostSelectionMaxAttempts();
 
   // Merge in the headers.

diff --git a/test/common/router/config_impl_test.cc b/test/common/router/config_impl_test.cc
@@ -2399,6 +2399,115 @@ TEST_F(RouteMatcherTest, GrpcRetry) {
                 .retryOn());
 }
 
+// Test route-specific retry back-off intervals.
+TEST_F(RouteMatcherTest, RetryBackOffIntervals) {
+  const std::string yaml = R"EOF(
+virtual_hosts:
+- name: www2
+  domains:
+  - www.lyft.com
+  routes:
+  - match:
+      prefix: "/foo"
+    route:
+      cluster: www2
+      retry_policy:
+        retry_back_off:
+          base_interval: 0.050s
+  - match:
+      prefix: "/bar"
+    route:
+      cluster: www2
+      retry_policy:
+        retry_back_off:
+          base_interval: 0.100s
+          max_interval: 0.500s
+  - match:
+      prefix: "/baz"
+    route:
+      cluster: www2
+      retry_policy:
+        retry_back_off:
+          base_interval: 0.0001s # < 1 ms
+          max_interval: 0.0001s
+  - match:
+      prefix: "/"
+    route:
+      cluster: www2
+      retry_policy:
+        retry_on: connect-failure
+  )EOF";
+
+  TestConfigImpl config(parseRouteConfigurationFromV2Yaml(yaml), factory_context_, true);
+
+  EXPECT_EQ(absl::optional<std::chrono::milliseconds>(50),
+            config.route(genHeaders("www.lyft.com", "/foo", "GET"), 0)
+                ->routeEntry()
+                ->retryPolicy()
+                .baseInterval());
+
+  EXPECT_EQ(absl::nullopt, config.route(genHeaders("www.lyft.com", "/foo", "GET"), 0)
+                               ->routeEntry()
+                               ->retryPolicy()
+                               .maxInterval());
+
+  EXPECT_EQ(absl::optional<std::chrono::milliseconds>(100),
+            config.route(genHeaders("www.lyft.com", "/bar", "GET"), 0)
+                ->routeEntry()
+                ->retryPolicy()
+                .baseInterval());
+
+  EXPECT_EQ(absl::optional<std::chrono::milliseconds>(500),
+            config.route(genHeaders("www.lyft.com", "/bar", "GET"), 0)
+                ->routeEntry()
+                ->retryPolicy()
+                .maxInterval());
+
+  // Sub-millisecond interval converted to 1 ms.
+  EXPECT_EQ(absl::optional<std::chrono::milliseconds>(1),
+            config.route(genHeaders("www.lyft.com", "/baz", "GET"), 0)
+                ->routeEntry()
+                ->retryPolicy()
+                .baseInterval());
+
+  EXPECT_EQ(absl::optional<std::chrono::milliseconds>(1),
+            config.route(genHeaders("www.lyft.com", "/baz", "GET"), 0)
+                ->routeEntry()
+                ->retryPolicy()
+                .maxInterval());
+
+  EXPECT_EQ(absl::nullopt, config.route(genHeaders("www.lyft.com", "/", "GET"), 0)
+                               ->routeEntry()
+                               ->retryPolicy()
+                               .baseInterval());
+
+  EXPECT_EQ(absl::nullopt, config.route(genHeaders("www.lyft.com", "/", "GET"), 0)
+                               ->routeEntry()
+                               ->retryPolicy()
+                               .maxInterval());
+}
+
+// Test invalid route-specific retry back-off configs.
+TEST_F(RouteMatcherTest, InvalidRetryBackOff) {
+  const std::string invalid_max = R"EOF(
+virtual_hosts:
+  - name: backoff
+    domains: ["*"]
+    routes:
+      - match: { prefix: "/" }
+        route:
+          cluster: backoff
+          retry_policy:
+            retry_back_off:
+              base_interval: 10s
+              max_interval: 5s
+)EOF";
+
+  EXPECT_THROW_WITH_MESSAGE(
+      TestConfigImpl(parseRouteConfigurationFromV2Yaml(invalid_max), factory_context_, true),
+      EnvoyException, "retry_policy.max_interval must greater than or equal to the base_interval");
+}
+
 TEST_F(RouteMatcherTest, HedgeRouteLevel) {
   const std::string yaml = R"EOF(
 name: HedgeRouteLevel

diff --git a/test/common/router/retry_state_impl_test.cc b/test/common/router/retry_state_impl_test.cc
@@ -505,6 +505,77 @@ TEST_F(RouterRetryStateImplTest, Backoff) {
   EXPECT_EQ(0UL, cluster_.circuit_breakers_stats_.rq_retry_open_.value());
 }
 
+// Test customized retry back-off intervals.
+TEST_F(RouterRetryStateImplTest, CustomBackOffInterval) {
+  policy_.num_retries_ = 10;
+  policy_.retry_on_ = RetryPolicy::RETRY_ON_CONNECT_FAILURE;
+  policy_.base_interval_ = std::chrono::milliseconds(100);
+  policy_.max_interval_ = std::chrono::milliseconds(1200);
+  Http::TestHeaderMapImpl request_headers;
+  setup(request_headers);
+  EXPECT_TRUE(state_->enabled());
+
+  EXPECT_CALL(random_, random()).WillOnce(Return(149));
+  retry_timer_ = new Event::MockTimer(&dispatcher_);
+  EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(49)));
+  EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_));
+  EXPECT_CALL(callback_ready_, ready());
+  retry_timer_->callback_();
+
+  EXPECT_CALL(random_, random()).WillOnce(Return(350));
+  EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(50)));
+  EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_));
+  EXPECT_CALL(callback_ready_, ready());
+  retry_timer_->callback_();
+
+  EXPECT_CALL(random_, random()).WillOnce(Return(751));
+  EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(51)));
+  EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_));
+  EXPECT_CALL(callback_ready_, ready());
+  retry_timer_->callback_();
+
+  EXPECT_CALL(random_, random()).WillOnce(Return(1499));
+  EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(1200)));
+  EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_));
+  EXPECT_CALL(callback_ready_, ready());
+  retry_timer_->callback_();
+}
+
+// Test the default maximum retry back-off interval.
+TEST_F(RouterRetryStateImplTest, CustomBackOffIntervalDefaultMax) {
+  policy_.num_retries_ = 10;
+  policy_.retry_on_ = RetryPolicy::RETRY_ON_CONNECT_FAILURE;
+  policy_.base_interval_ = std::chrono::milliseconds(100);
+  Http::TestHeaderMapImpl request_headers;
+  setup(request_headers);
+  EXPECT_TRUE(state_->enabled());
+
+  EXPECT_CALL(random_, random()).WillOnce(Return(149));
+  retry_timer_ = new Event::MockTimer(&dispatcher_);
+  EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(49)));
+  EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_));
+  EXPECT_CALL(callback_ready_, ready());
+  retry_timer_->callback_();
+
+  EXPECT_CALL(random_, random()).WillOnce(Return(350));
+  EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(50)));
+  EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_));
+  EXPECT_CALL(callback_ready_, ready());
+  retry_timer_->callback_();
+
+  EXPECT_CALL(random_, random()).WillOnce(Return(751));
+  EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(51)));
+  EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_));
+  EXPECT_CALL(callback_ready_, ready());
+  retry_timer_->callback_();
+
+  EXPECT_CALL(random_, random()).WillOnce(Return(1499));
+  EXPECT_CALL(*retry_timer_, enableTimer(std::chrono::milliseconds(1000)));
+  EXPECT_EQ(RetryStatus::Yes, state_->shouldRetryReset(connect_failure_, callback_));
+  EXPECT_CALL(callback_ready_, ready());
+  retry_timer_->callback_();
+}
+
 TEST_F(RouterRetryStateImplTest, HostSelectionAttempts) {
   policy_.host_selection_max_attempts_ = 2;
   policy_.retry_on_ = RetryPolicy::RETRY_ON_CONNECT_FAILURE;