Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions api/envoy/config/route/v3/route_components.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1164,7 +1164,7 @@ message RouteAction {
}

// HTTP retry :ref:`architecture overview <arch_overview_http_routing_retry>`.
// [#next-free-field: 12]
// [#next-free-field: 14]
message RetryPolicy {
option (udpa.annotations.versioning).previous_message_type = "envoy.api.v2.route.RetryPolicy";

Expand Down Expand Up @@ -1305,8 +1305,8 @@ message RetryPolicy {
google.protobuf.UInt32Value num_retries = 2
[(udpa.annotations.field_migrate).rename = "max_retries"];

// Specifies a non-zero upstream timeout per retry attempt. This parameter is optional. The
// same conditions documented for
// Specifies a non-zero upstream timeout per retry attempt (including the initial attempt). This
// parameter is optional. The same conditions documented for
// :ref:`config_http_filters_router_x-envoy-upstream-rq-per-try-timeout-ms` apply.
//
// .. note::
Expand All @@ -1318,6 +1318,27 @@ message RetryPolicy {
// would have been exhausted.
google.protobuf.Duration per_try_timeout = 3;

// Specifies an upstream idle timeout per retry attempt (including the initial attempt). This
// parameter is optional and if absent there is no per try idle timeout. The semantics of the per
// try idle timeout are similar to the
// :ref:`route idle timeout <envoy_v3_api_field_config.route.v3.RouteAction.idle_timeout>` and
// :ref:`stream idle timeout
// <envoy_v3_api_field_extensions.filters.network.http_connection_manager.v3.HttpConnectionManager.stream_idle_timeout>`
// both enforced by the HTTP connection manager. The difference is that this idle timeout
// is enforced by the router for each individual attempt and thus after all previous filters have
// run, as opposed to *before* all previous filters run for the other idle timeouts. This timeout
// is useful in cases in which total request timeout is bounded by a number of retries and a
// :ref:`per_try_timeout <envoy_v3_api_field_config.route.v3.RetryPolicy.per_try_timeout>`, but
// there is a desire to ensure each try is making incremental progress. Note also that similar
// to :ref:`per_try_timeout <envoy_v3_api_field_config.route.v3.RetryPolicy.per_try_timeout>`,
// this idle timeout does not start until after both the entire request has been received by the
// router *and* a connection pool connection has been obtained. Unlike
// :ref:`per_try_timeout <envoy_v3_api_field_config.route.v3.RetryPolicy.per_try_timeout>`,
// the idle timer continues once the response starts streaming back to the downstream client.
// This ensures that response data continues to make progress without using one of the HTTP
// connection manager idle timeouts.
google.protobuf.Duration per_try_idle_timeout = 13;

// Specifies an implementation of a RetryPriority which is used to determine the
// distribution of load across priorities used for retries. Refer to
// :ref:`retry plugin configuration <arch_overview_http_retry_plugins>` for more details.
Expand Down
5 changes: 5 additions & 0 deletions docs/root/faq/configuration/timeouts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ stream timeouts already introduced above.
is sent to the downstream, which normally happens after the upstream has sent response headers.
This timeout can be used with streaming endpoints to retry if the upstream fails to begin a
response within the timeout.
* The route :ref:`per_try_idle_timeout <envoy_v3_api_field_config.route.v3.RetryPolicy.per_try_idle_timeout>`
can be configured to ensure continued response progress of individual retry attempts (including
the first attempt). This is useful in cases where the total upstream request time is bounded
by the number of attempts multiplied by the per try timeout, but the user still wants to
ensure that individual attempts are making progress.
* The route :ref:`MaxStreamDuration proto <envoy_v3_api_msg_config.route.v3.RouteAction.MaxStreamDuration>`
can be used to override the HttpConnectionManager's
:ref:`max_stream_duration <envoy_v3_api_field_config.core.v3.HttpProtocolOptions.max_stream_duration>`
Expand Down
1 change: 1 addition & 0 deletions docs/root/version_history/current.rst
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ New Features
* overload: add a new overload action that resets streams using a lot of memory. To enable the tracking of allocated bytes in buffers that a stream is using we need to configure the minimum threshold for tracking via:ref:`buffer_factory_config <envoy_v3_api_field_config.overload.v3.OverloadManager.buffer_factory_config>`. We have an overload action ``Envoy::Server::OverloadActionNameValues::ResetStreams`` that takes advantage of the tracking to reset the most expensive stream first.
* rbac: added :ref:`destination_port_range <envoy_v3_api_field_config.rbac.v3.Permission.destination_port_range>` for matching range of destination ports.
* route config: added :ref:`dynamic_metadata <envoy_v3_api_field_config.route.v3.RouteMatch.dynamic_metadata>` for routing based on dynamic metadata.
* router: added :ref:`per_try_idle_timeout <envoy_v3_api_field_config.route.v3.RetryPolicy.per_try_idle_timeout>` timeout configuration.
* router: added an optional :ref:`override_auto_sni_header <envoy_v3_api_field_config.core.v3.UpstreamHttpProtocolOptions.override_auto_sni_header>` to support setting SNI value from an arbitrary header other than host/authority.
* sxg_filter: added filter to transform response to SXG package to :ref:`contrib images <install_contrib>`. This can be enabled by setting :ref:`SXG <envoy_v3_api_msg_extensions.filters.http.sxg.v3alpha.SXG>` configuration.
* thrift_proxy: added support for :ref:`mirroring requests <envoy_v3_api_field_extensions.filters.network.thrift_proxy.v3.RouteAction.request_mirror_policies>`.
Expand Down
7 changes: 6 additions & 1 deletion envoy/router/router.h
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,15 @@ class RetryPolicy {
virtual ~RetryPolicy() = default;

/**
* @return std::chrono::milliseconds timeout per retry attempt.
* @return std::chrono::milliseconds timeout per retry attempt. 0 is disabled.
*/
virtual std::chrono::milliseconds perTryTimeout() const PURE;

/**
* @return std::chrono::milliseconds the optional per try idle timeout. 0 is disabled.
*/
virtual std::chrono::milliseconds perTryIdleTimeout() const PURE;

/**
* @return uint32_t the number of retries to allow against the route.
*/
Expand Down
2 changes: 2 additions & 0 deletions envoy/stream_info/stream_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ struct ResponseCodeDetailValues {
const std::string UpstreamTimeout = "upstream_response_timeout";
// The final upstream try timed out.
const std::string UpstreamPerTryTimeout = "upstream_per_try_timeout";
// The final upstream try idle timed out.
const std::string UpstreamPerTryIdleTimeout = "upstream_per_try_idle_timeout";
// The request was destroyed because of user defined max stream duration.
const std::string UpstreamMaxStreamDurationReached = "upstream_max_stream_duration_reached";
// The upstream connection was reset before a response was started. This
Expand Down
2 changes: 1 addition & 1 deletion envoy/upstream/cluster_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ class ClusterManager {
virtual void drainConnections() PURE;

/**
* Check if the cluster is active and statically configured, and if not, throw excetion.
* Check if the cluster is active and statically configured, and if not, throw exception.
* @param cluster, the cluster to check.
*/
virtual void checkActiveStaticCluster(const std::string& cluster) PURE;
Expand Down
1 change: 1 addition & 0 deletions envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,7 @@ class PrioritySet {
COUNTER(upstream_rq_pending_overflow) \
COUNTER(upstream_rq_pending_total) \
COUNTER(upstream_rq_per_try_timeout) \
COUNTER(upstream_rq_per_try_idle_timeout) \
COUNTER(upstream_rq_retry) \
COUNTER(upstream_rq_retry_backoff_exponential) \
COUNTER(upstream_rq_retry_backoff_ratelimited) \
Expand Down
3 changes: 3 additions & 0 deletions source/common/http/async_client_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ class AsyncStreamImpl : public AsyncClient::Stream,
// Router::RetryPolicy: per-attempt upstream timeout. Always returns zero,
// which means "disabled" for async client streams.
std::chrono::milliseconds perTryTimeout() const override {
return std::chrono::milliseconds(0);
}
// Router::RetryPolicy: per-attempt upstream idle timeout. Always returns zero,
// which means "disabled" for async client streams.
std::chrono::milliseconds perTryIdleTimeout() const override {
return std::chrono::milliseconds(0);
}
std::vector<Upstream::RetryHostPredicateSharedPtr> retryHostPredicates() const override {
return {};
}
Expand Down
2 changes: 2 additions & 0 deletions source/common/router/config_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ RetryPolicyImpl::RetryPolicyImpl(const envoy::config::route::v3::RetryPolicy& re
validation_visitor_(&validation_visitor) {
per_try_timeout_ =
std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(retry_policy, per_try_timeout, 0));
per_try_idle_timeout_ =
std::chrono::milliseconds(PROTOBUF_GET_MS_OR_DEFAULT(retry_policy, per_try_idle_timeout, 0));
num_retries_ = PROTOBUF_GET_WRAPPED_OR_DEFAULT(retry_policy, num_retries, 1);
retry_on_ = RetryStateImpl::parseRetryOn(retry_policy.retry_on()).first;
retry_on_ |= RetryStateImpl::parseRetryGrpcOn(retry_policy.retry_on()).first;
Expand Down
2 changes: 2 additions & 0 deletions source/common/router/config_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ class RetryPolicyImpl : public RetryPolicy {

// Router::RetryPolicy
std::chrono::milliseconds perTryTimeout() const override { return per_try_timeout_; }
std::chrono::milliseconds perTryIdleTimeout() const override { return per_try_idle_timeout_; }
uint32_t numRetries() const override { return num_retries_; }
uint32_t retryOn() const override { return retry_on_; }
std::vector<Upstream::RetryHostPredicateSharedPtr> retryHostPredicates() const override;
Expand All @@ -320,6 +321,7 @@ class RetryPolicyImpl : public RetryPolicy {

private:
std::chrono::milliseconds per_try_timeout_{0};
std::chrono::milliseconds per_try_idle_timeout_{0};
// We set the number of retries to 1 by default (i.e. when no route or vhost level retry policy is
// set) so that when retries get enabled through the x-envoy-retry-on header we default to 1
// retry.
Expand Down
17 changes: 14 additions & 3 deletions source/common/router/router.cc
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ FilterUtility::finalTimeout(const RouteEntry& route, Http::RequestHeaderMap& req
}
}
timeout.per_try_timeout_ = route.retryPolicy().perTryTimeout();
timeout.per_try_idle_timeout_ = route.retryPolicy().perTryIdleTimeout();

uint64_t header_timeout;

Expand Down Expand Up @@ -969,13 +970,24 @@ void Filter::onSoftPerTryTimeout(UpstreamRequest& upstream_request) {
}
}

// Handles expiry of the per try *idle* timeout for a single upstream attempt.
// Delegates to the shared per-try timeout path, charging the idle-specific
// cluster stat and response code details so access logs can distinguish the
// two timeout types.
void Filter::onPerTryIdleTimeout(UpstreamRequest& upstream_request) {
onPerTryTimeoutCommon(upstream_request, cluster_->stats().upstream_rq_per_try_idle_timeout_,
StreamInfo::ResponseCodeDetails::get().UpstreamPerTryIdleTimeout);
}

// Handles expiry of the per try timeout for a single upstream attempt.
// Delegates to the shared per-try timeout path, charging the per-try timeout
// cluster stat and response code details.
void Filter::onPerTryTimeout(UpstreamRequest& upstream_request) {
onPerTryTimeoutCommon(upstream_request, cluster_->stats().upstream_rq_per_try_timeout_,
StreamInfo::ResponseCodeDetails::get().UpstreamPerTryTimeout);
}

void Filter::onPerTryTimeoutCommon(UpstreamRequest& upstream_request, Stats::Counter& error_counter,
const std::string& response_code_details) {
if (hedging_params_.hedge_on_per_try_timeout_) {
onSoftPerTryTimeout(upstream_request);
return;
}

cluster_->stats().upstream_rq_per_try_timeout_.inc();
error_counter.inc();
if (upstream_request.upstreamHost()) {
upstream_request.upstreamHost()->stats().rq_timeout_.inc();
}
Expand All @@ -993,8 +1005,7 @@ void Filter::onPerTryTimeout(UpstreamRequest& upstream_request) {

// Remove this upstream request from the list now that we're done with it.
upstream_request.removeFromList(upstream_requests_);
onUpstreamTimeoutAbort(StreamInfo::ResponseFlag::UpstreamRequestTimeout,
StreamInfo::ResponseCodeDetails::get().UpstreamPerTryTimeout);
onUpstreamTimeoutAbort(StreamInfo::ResponseFlag::UpstreamRequestTimeout, response_code_details);
}

void Filter::onStreamMaxDurationReached(UpstreamRequest& upstream_request) {
Expand Down
8 changes: 6 additions & 2 deletions source/common/router/router.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class FilterUtility {
struct TimeoutData {
std::chrono::milliseconds global_timeout_{0};
std::chrono::milliseconds per_try_timeout_{0};
std::chrono::milliseconds per_try_idle_timeout_{0};
};

struct HedgingParams {
Expand Down Expand Up @@ -271,6 +272,7 @@ class RouterFilterInterface {
UpstreamRequest& upstream_request) PURE;
virtual void onUpstreamHostSelected(Upstream::HostDescriptionConstSharedPtr host) PURE;
virtual void onPerTryTimeout(UpstreamRequest& upstream_request) PURE;
virtual void onPerTryIdleTimeout(UpstreamRequest& upstream_request) PURE;
virtual void onStreamMaxDurationReached(UpstreamRequest& upstream_request) PURE;

virtual Http::StreamDecoderFilterCallbacks* callbacks() PURE;
Expand Down Expand Up @@ -445,6 +447,7 @@ class Filter : Logger::Loggable<Logger::Id::router>,
UpstreamRequest& upstream_request) override;
void onUpstreamHostSelected(Upstream::HostDescriptionConstSharedPtr host) override;
void onPerTryTimeout(UpstreamRequest& upstream_request) override;
void onPerTryIdleTimeout(UpstreamRequest& upstream_request) override;
void onStreamMaxDurationReached(UpstreamRequest& upstream_request) override;
Http::StreamDecoderFilterCallbacks* callbacks() override { return callbacks_; }
Upstream::ClusterInfoConstSharedPtr cluster() override { return cluster_; }
Expand All @@ -469,8 +472,8 @@ class Filter : Logger::Loggable<Logger::Id::router>,
private:
friend class UpstreamRequest;

RetryStatePtr retry_state_;

void onPerTryTimeoutCommon(UpstreamRequest& upstream_request, Stats::Counter& error_counter,
const std::string& response_code_details);
Stats::StatName upstreamZone(Upstream::HostDescriptionConstSharedPtr upstream_host);
void chargeUpstreamCode(uint64_t response_status_code,
const Http::ResponseHeaderMap& response_headers,
Expand Down Expand Up @@ -528,6 +531,7 @@ class Filter : Logger::Loggable<Logger::Id::router>,
uint64_t grpc_to_http_status);
Http::Context& httpContext() { return config_.http_context_; }

RetryStatePtr retry_state_;
FilterConfig& config_;
Http::StreamDecoderFilterCallbacks* callbacks_{};
RouteConstSharedPtr route_;
Expand Down
25 changes: 25 additions & 0 deletions source/common/router/upstream_request.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ UpstreamRequest::~UpstreamRequest() {
// Allows for testing.
per_try_timeout_->disableTimer();
}
if (per_try_idle_timeout_ != nullptr) {
// Allows for testing.
per_try_idle_timeout_->disableTimer();
}
if (max_stream_duration_timer_ != nullptr) {
max_stream_duration_timer_->disableTimer();
}
Expand Down Expand Up @@ -136,6 +140,7 @@ void UpstreamRequest::decode100ContinueHeaders(Http::ResponseHeaderMapPtr&& head
void UpstreamRequest::decodeHeaders(Http::ResponseHeaderMapPtr&& headers, bool end_stream) {
ScopeTrackerScopeState scope(&parent_.callbacks()->scope(), parent_.callbacks()->dispatcher());

resetPerTryIdleTimer();
addResponseHeadersSize(headers->byteSize());

// We drop 1xx other than 101 on the floor; 101 upgrade headers need to be passed to the client as
Expand Down Expand Up @@ -177,6 +182,7 @@ void UpstreamRequest::decodeHeaders(Http::ResponseHeaderMapPtr&& headers, bool e
void UpstreamRequest::decodeData(Buffer::Instance& data, bool end_stream) {
ScopeTrackerScopeState scope(&parent_.callbacks()->scope(), parent_.callbacks()->dispatcher());

resetPerTryIdleTimer();
maybeEndDecode(end_stream);
stream_info_.addBytesReceived(data.length());
parent_.onUpstreamData(data, *this, end_stream);
Expand Down Expand Up @@ -331,13 +337,32 @@ void UpstreamRequest::resetStream() {
}
}

// Re-arms the per try idle timer for this attempt. Called whenever decode
// progress is observed; a no-op when no per try idle timeout is configured.
void UpstreamRequest::resetPerTryIdleTimer() {
  if (per_try_idle_timeout_ == nullptr) {
    // No per try idle timeout configured; nothing to re-arm.
    return;
  }
  per_try_idle_timeout_->enableTimer(parent_.timeout().per_try_idle_timeout_);
}

// Creates and arms the per try timeout and per try idle timeout timers for
// this attempt, if the route configured non-zero values (0 means disabled).
// Must only be called once per attempt (the ASSERTs enforce that the timers
// have not already been created).
void UpstreamRequest::setupPerTryTimeout() {
ASSERT(!per_try_timeout_);
if (parent_.timeout().per_try_timeout_.count() > 0) {
per_try_timeout_ =
parent_.callbacks()->dispatcher().createTimer([this]() -> void { onPerTryTimeout(); });
per_try_timeout_->enableTimer(parent_.timeout().per_try_timeout_);
}

ASSERT(!per_try_idle_timeout_);
if (parent_.timeout().per_try_idle_timeout_.count() > 0) {
per_try_idle_timeout_ =
parent_.callbacks()->dispatcher().createTimer([this]() -> void { onPerTryIdleTimeout(); });
// Arm via the shared reset path, which is also used on each decode event
// to push the deadline out as progress is made.
resetPerTryIdleTimer();
}
}

void UpstreamRequest::onPerTryIdleTimeout() {
ENVOY_STREAM_LOG(debug, "upstream per try idle timeout", *parent_.callbacks());
stream_info_.setResponseFlag(StreamInfo::ResponseFlag::StreamIdleTimeout);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there's 2 tries (hedging or retries) but we successfully proxy, should the response flag be set?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is stream info on the upstream request (for upstream logs), not the downstream stream info.

parent_.onPerTryIdleTimeout(*this);
}

void UpstreamRequest::onPerTryTimeout() {
Expand Down
5 changes: 4 additions & 1 deletion source/common/router/upstream_request.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ class UpstreamRequest : public Logger::Loggable<Logger::Id::router>,

void resetStream();
void setupPerTryTimeout();
void onPerTryTimeout();
void maybeEndDecode(bool end_stream);
void onUpstreamHostSelected(Upstream::HostDescriptionConstSharedPtr host);

Expand Down Expand Up @@ -132,11 +131,15 @@ class UpstreamRequest : public Logger::Loggable<Logger::Id::router>,
void addResponseHeadersSize(uint64_t size) {
response_headers_size_ = response_headers_size_.value_or(0) + size;
}
void resetPerTryIdleTimer();
void onPerTryTimeout();
void onPerTryIdleTimeout();

RouterFilterInterface& parent_;
std::unique_ptr<GenericConnPool> conn_pool_;
bool grpc_rq_success_deferred_;
Event::TimerPtr per_try_timeout_;
Event::TimerPtr per_try_idle_timeout_;
std::unique_ptr<GenericUpstream> upstream_;
absl::optional<Http::StreamResetReason> deferred_reset_reason_;
Buffer::InstancePtr buffered_request_body_;
Expand Down
11 changes: 11 additions & 0 deletions test/common/router/config_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3337,6 +3337,7 @@ TEST_F(RouteMatcherTest, Retry) {
cluster: www2
retry_policy:
per_try_timeout: 1s
per_try_idle_timeout: 5s
num_retries: 3
retry_on: 5xx,gateway-error,connect-failure,reset
)EOF";
Expand All @@ -3349,6 +3350,11 @@ TEST_F(RouteMatcherTest, Retry) {
->routeEntry()
->retryPolicy()
.perTryTimeout());
EXPECT_EQ(std::chrono::milliseconds(0),
config.route(genHeaders("www.lyft.com", "/foo", "GET"), 0)
->routeEntry()
->retryPolicy()
.perTryIdleTimeout());
EXPECT_EQ(1U, config.route(genHeaders("www.lyft.com", "/foo", "GET"), 0)
->routeEntry()
->retryPolicy()
Expand Down Expand Up @@ -3378,6 +3384,11 @@ TEST_F(RouteMatcherTest, Retry) {
->routeEntry()
->retryPolicy()
.perTryTimeout());
EXPECT_EQ(std::chrono::milliseconds(5000),
config.route(genHeaders("www.lyft.com", "/", "GET"), 0)
->routeEntry()
->retryPolicy()
.perTryIdleTimeout());
EXPECT_EQ(3U, config.route(genHeaders("www.lyft.com", "/", "GET"), 0)
->routeEntry()
->retryPolicy()
Expand Down
Loading