From 4ab06413d2349ab01db3e366c30f1d04d144e6cc Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Wed, 27 Feb 2019 13:27:02 -0500 Subject: [PATCH 01/70] router filter: implement hedge_on_per_try_timeout. Implements the hedge_on_per_try_timeout option in HedgePolicy config which instructs the router filter to not cancel requests that hit the per try timeout before sending a retry. This means the router must be able to manage multiple simultaneous upstream requests and handle deciding which response "wins" and canceling any other in flight requests. Finishes #5841. Signed-off-by: Michael Puncel --- api/envoy/api/v2/route/route.proto | 7 +- .../cluster_manager/cluster_stats.rst | 1 + .../root/intro/arch_overview/http_routing.rst | 11 + docs/root/intro/version_history.rst | 1 + docs/root/operations/admin.rst | 1 + include/envoy/router/router.h | 13 + include/envoy/upstream/host_description.h | 1 + include/envoy/upstream/upstream.h | 1 + source/common/router/retry_state_impl.cc | 7 + source/common/router/retry_state_impl.h | 1 + source/common/router/router.cc | 428 ++++++++++++------ source/common/router/router.h | 60 ++- test/common/router/router_test.cc | 278 +++++++++++- test/mocks/router/mocks.cc | 5 + test/mocks/router/mocks.h | 6 +- 15 files changed, 651 insertions(+), 170 deletions(-) diff --git a/api/envoy/api/v2/route/route.proto b/api/envoy/api/v2/route/route.proto index 3cbf726835e98..cb503608c1332 100644 --- a/api/envoy/api/v2/route/route.proto +++ b/api/envoy/api/v2/route/route.proto @@ -143,7 +143,6 @@ message VirtualHost { // Indicates the hedge policy for all routes in this virtual host. Note that setting a // route level entry will take precedence over this config and it'll be treated // independently (e.g.: values are not inherited). - // [#not-implemented-hide:] HedgePolicy hedge_policy = 17; } @@ -792,7 +791,6 @@ message RouteAction { // Indicates that the route has a hedge policy. 
Note that if this is set, // it'll take precedence over the virtual host level hedge policy entirely // (e.g.: policies are not merged, most internal one becomes the enforced policy). - // [#not-implemented-hide:] HedgePolicy hedge_policy = 27; } @@ -859,17 +857,18 @@ message RetryPolicy { repeated uint32 retriable_status_codes = 7; } -// HTTP request hedging TODO(mpuncel) docs -// [#not-implemented-hide:] +// HTTP request hedging :ref:`architecture overview <arch_overview_http_routing_hedging>`. message HedgePolicy { // Specifies the number of initial requests that should be sent upstream. // Must be at least 1. // Defaults to 1. + // [#not-implemented-hide:] google.protobuf.UInt32Value initial_requests = 1 [(validate.rules).uint32.gte = 1]; // Specifies a probability that an additional upstream request should be sent // on top of what is specified by initial_requests. // Defaults to 0. + // [#not-implemented-hide:] envoy.type.FractionalPercent additional_request_chance = 2; // Indicates that a hedged request should be sent when the per-try timeout diff --git a/docs/root/configuration/cluster_manager/cluster_stats.rst b/docs/root/configuration/cluster_manager/cluster_stats.rst index 38d0759aa2ca1..680a2ff067cb8 100644 --- a/docs/root/configuration/cluster_manager/cluster_stats.rst +++ b/docs/root/configuration/cluster_manager/cluster_stats.rst @@ -60,6 +60,7 @@ Every cluster has a statistics tree rooted at *cluster.<name>.* with the followi upstream_cx_max_requests, Counter, Total connections closed due to maximum requests upstream_cx_none_healthy, Counter, Total times connection not established due to no healthy hosts upstream_rq_total, Counter, Total requests + upstream_rq_hedge_abandoned, Counter, Number of hedged requests that were abandoned due to accepting another response.
upstream_rq_active, Gauge, Total active requests upstream_rq_pending_total, Counter, Total requests pending a connection pool connection upstream_rq_pending_overflow, Counter, Total requests that overflowed connection pool circuit breaking and were failed diff --git a/docs/root/intro/arch_overview/http_routing.rst b/docs/root/intro/arch_overview/http_routing.rst index 8d0ed1baea453..cf92be2828981 100644 --- a/docs/root/intro/arch_overview/http_routing.rst +++ b/docs/root/intro/arch_overview/http_routing.rst @@ -35,6 +35,7 @@ request. The router filter supports the following features: * Request timeout specified either via :ref:`HTTP header ` or via :ref:`route configuration `. +* :ref:`Request hedging ` in response to a request (per try) timeout. * Traffic shifting from one upstream cluster to another via :ref:`runtime values ` (see :ref:`traffic shifting/splitting `). @@ -87,6 +88,16 @@ headers `. The following configurat Note that retries may be disabled depending on the contents of the :ref:`x-envoy-overloaded `. +.. _arch_overview_http_routing_hedging: + +Request Hedging +--------------- + +Envoy supports request hedging via specifying a :ref:`hedge policy `. This means that Envoy +will race multiple simultaneous upstream requests and return the first valid response to the downstream. + +Currently hedging can only be performed in response to a request timeout. + .. _arch_overview_http_routing_priority: Priority routing diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 688cf3d22577e..1de7d66ef120c 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -47,6 +47,7 @@ Version history * router: added reset reason to response body when upstream reset happens. After this change, the response body will be of the form `upstream connect error or disconnect/reset before headers. reset reason:` * router: added :ref:`rq_reset_after_downstream_response_started ` counter stat to router stats. 
* router: added per-route configuration of :ref:`internal redirects `. +* router: added ability to issue a hedged retry in response to a per try timeout via a :ref:`hedge policy `. * stats: added support for histograms in prometheus * stats: added usedonly flag to prometheus stats to only output metrics which have been updated at least once. diff --git a/docs/root/operations/admin.rst b/docs/root/operations/admin.rst index a826fa1b75b03..5948fe70961da 100644 --- a/docs/root/operations/admin.rst +++ b/docs/root/operations/admin.rst @@ -84,6 +84,7 @@ modify different aspects of the server: cx_total, Counter, Total connections cx_active, Gauge, Total active connections cx_connect_fail, Counter, Total connection failures + rq_hedge_abandoned, Counter, Total hedged requests that were canceled and abandoned due to accepting another response. rq_total, Counter, Total requests rq_timeout, Counter, Total timed out requests rq_success, Counter, Total requests with non-5xx responses diff --git a/include/envoy/router/router.h b/include/envoy/router/router.h index e090c9caadc63..ad23563581906 100644 --- a/include/envoy/router/router.h +++ b/include/envoy/router/router.h @@ -253,6 +253,19 @@ class RetryState { virtual RetryStatus shouldRetryReset(const Http::StreamResetReason reset_reason, DoRetryCallback callback) PURE; + /** + * Determine whether a "hedged" retry should be sent after the per try + * timeout expires. This means the original request is not canceled, but a + * new one is sent to hedge against the original request taking even longer. + * @param callback supplies the callback that will be invoked when the retry should take place. + * This is used to add timed backoff, etc. The callback will never be called + * inline. + * @return RetryStatus if a retry should take place. @param callback will be called at some point + * in the future. Otherwise a retry should not take place and the callback will never be + * called. Calling code should proceed with error handling. 
+ */ + virtual RetryStatus shouldHedgeRetryPerTryTimeout(DoRetryCallback callback) PURE; + /** * Called when a host was attempted but the request failed and is eligible for another retry. * Should be used to update whatever internal state depends on previously attempted hosts. diff --git a/include/envoy/upstream/host_description.h b/include/envoy/upstream/host_description.h index 1fabc6686946e..db3a8359e82d5 100644 --- a/include/envoy/upstream/host_description.h +++ b/include/envoy/upstream/host_description.h @@ -24,6 +24,7 @@ namespace Upstream { COUNTER(cx_total) \ GAUGE (cx_active) \ COUNTER(cx_connect_fail) \ + COUNTER(rq_hedge_abandoned) \ COUNTER(rq_total) \ COUNTER(rq_timeout) \ COUNTER(rq_success) \ diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index 12b1229c0a21e..80f9204471ba1 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -474,6 +474,7 @@ class PrioritySet { COUNTER (upstream_cx_max_requests) \ COUNTER (upstream_cx_none_healthy) \ COUNTER (upstream_rq_total) \ + COUNTER (upstream_rq_hedge_abandoned) \ GAUGE (upstream_rq_active) \ COUNTER (upstream_rq_completed) \ COUNTER (upstream_rq_pending_total) \ diff --git a/source/common/router/retry_state_impl.cc b/source/common/router/retry_state_impl.cc index d826ae82f341a..3ce99640a561f 100644 --- a/source/common/router/retry_state_impl.cc +++ b/source/common/router/retry_state_impl.cc @@ -193,6 +193,13 @@ RetryStatus RetryStateImpl::shouldRetryReset(Http::StreamResetReason reset_reaso return shouldRetry(wouldRetryFromReset(reset_reason), callback); } +RetryStatus RetryStateImpl::shouldHedgeRetryPerTryTimeout(DoRetryCallback callback) { + // A hedged retry on per try timeout is always retried if there are retries + // left. 
NOTE: this is different than non-hedged per try timeouts which are only retried + // if RETRY_ON_5XX or RETRY_ON_GATEWAY_ERROR + return shouldRetry([]() -> bool { return true; }, callback); +} + bool RetryStateImpl::wouldRetryFromHeaders(const Http::HeaderMap& response_headers) { if (response_headers.EnvoyOverloaded() != nullptr) { return false; diff --git a/source/common/router/retry_state_impl.h b/source/common/router/retry_state_impl.h index 78d017cf8db9c..9d7ce7647322c 100644 --- a/source/common/router/retry_state_impl.h +++ b/source/common/router/retry_state_impl.h @@ -40,6 +40,7 @@ class RetryStateImpl : public RetryState { DoRetryCallback callback) override; RetryStatus shouldRetryReset(const Http::StreamResetReason reset_reason, DoRetryCallback callback) override; + RetryStatus shouldHedgeRetryPerTryTimeout(DoRetryCallback callback) override; void onHostAttempted(Upstream::HostDescriptionConstSharedPtr host) override { std::for_each(retry_host_predicates_.begin(), retry_host_predicates_.end(), diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 41ca824c5e989..43618c496368e 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -179,9 +179,25 @@ FilterUtility::finalTimeout(const RouteEntry& route, Http::HeaderMap& request_he return timeout; } +FilterUtility::HedgingParams FilterUtility::finalHedgingParams(const RouteEntry& route, + uint64_t random_value) { + HedgingParams hedgingParams; + hedgingParams.initial_requests_ = route.hedgePolicy().initialRequests(); + hedgingParams.hedge_on_per_try_timeout_ = route.hedgePolicy().hedgeOnPerTryTimeout(); + + if (ProtobufPercentHelper::evaluateFractionalPercent( + route.hedgePolicy().additionalRequestChance(), random_value)) { + hedgingParams.initial_requests_++; + } + + return hedgingParams; +} + Filter::~Filter() { // Upstream resources should already have been cleaned. 
- ASSERT(!upstream_request_); + for (unsigned long i = 0; i < upstream_requests_.size(); i++) { + ASSERT(!upstream_requests_[i]); + } ASSERT(!retry_state_); } @@ -372,6 +388,8 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e // Ensure an http transport scheme is selected before continuing with decoding. ASSERT(headers.Scheme()); + hedging_params_ = FilterUtility::finalHedgingParams(*route_entry_, callbacks_->streamId()); + retry_state_ = createRetryState(route_entry_->retryPolicy(), headers, *cluster_, config_.runtime_, config_.random_, callbacks_->dispatcher(), route_entry_->priority()); @@ -380,8 +398,9 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e ENVOY_STREAM_LOG(debug, "router decoding headers:\n{}", *callbacks_, headers); - upstream_request_ = std::make_unique(*this, *conn_pool); - upstream_request_->encodeHeaders(end_stream); + UpstreamRequestPtr upstream_request = std::make_unique(*this, *conn_pool); + upstream_requests_.emplace_back(std::move(upstream_request)); + upstream_requests_[0]->encodeHeaders(end_stream); if (end_stream) { onRequestComplete(); } @@ -423,20 +442,27 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea do_shadowing_ = false; } - if (buffering) { - // If we are going to buffer for retries or shadowing, we need to make a copy before encoding - // since it's all moves from here on. - Buffer::OwnedImpl copy(data); - upstream_request_->encodeData(copy, end_stream); + for (unsigned long i = 0; i < upstream_requests_.size(); i++) { + if (buffering) { + // If we are going to buffer for retries or shadowing, we need to make a copy before encoding + // since it's all moves from here on. 
+ Buffer::OwnedImpl copy(data); + upstream_requests_[i]->encodeData(copy, end_stream); + + if (i == 0) { + } + } else { + upstream_requests_[i]->encodeData(data, end_stream); + } + } + if (buffering) { // If we are potentially going to retry or shadow this request we need to buffer. // This will not cause the connection manager to 413 because before we hit the // buffer limit we give up on retries and buffering. We must buffer using addDecodedData() // so that all buffered data is available by the time we do request complete processing and // potentially shadow. callbacks_->addDecodedData(data, true); - } else { - upstream_request_->encodeData(data, end_stream); } if (end_stream) { @@ -449,7 +475,9 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea Http::FilterTrailersStatus Filter::decodeTrailers(Http::HeaderMap& trailers) { ENVOY_STREAM_LOG(debug, "router decoding trailers:\n{}", *callbacks_, trailers); downstream_trailers_ = &trailers; - upstream_request_->encodeTrailers(trailers); + for (unsigned long i = 0; i < upstream_requests_.size(); i++) { + upstream_requests_[i]->encodeTrailers(trailers); + } onRequestComplete(); return Http::FilterTrailersStatus::StopIteration; } @@ -463,13 +491,17 @@ void Filter::setDecoderFilterCallbacks(Http::StreamDecoderFilterCallbacks& callb } void Filter::cleanup() { - // upstream_request_ is only destroyed in this method (cleanup()) or when we - // do a retry (setupRetry()). In the latter case we don't want to save the - // upstream timings to the downstream info. 
- if (upstream_request_) { - callbacks_->streamInfo().setUpstreamTiming(upstream_request_->upstream_timing_); + for (unsigned long i = 0; i < upstream_requests_.size(); i++) { + UpstreamRequest* upstream_request = upstream_requests_[i].get(); + if (upstream_request) { + if (final_upstream_request_ != nullptr && upstream_request == final_upstream_request_) { + callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); + } else { + upstream_request->resetStream(); // Idempotent. + } + upstream_requests_[i].reset(); + } } - upstream_request_.reset(); retry_state_.reset(); if (response_timeout_) { response_timeout_->disableTimer(); @@ -502,7 +534,15 @@ void Filter::onRequestComplete() { downstream_request_complete_time_ = dispatcher.timeSource().monotonicTime(); // Possible that we got an immediate reset. - if (upstream_request_) { + bool any_upstreams = false; + for (unsigned long i = 0; i < upstream_requests_.size(); i++) { + if (upstream_requests_[i]) { + any_upstreams = true; + break; + } + } + + if (any_upstreams) { // Even if we got an immediate reset, we could still shadow, but that is a riskier change and // seems unnecessary right now. maybeDoShadowing(); @@ -515,44 +555,119 @@ void Filter::onRequestComplete() { } void Filter::onDestroy() { - if (upstream_request_ && !attempting_internal_redirect_with_complete_stream_) { - upstream_request_->resetStream(); + if (!attempting_internal_redirect_with_complete_stream_) { + for (unsigned long i = 0; i < upstream_requests_.size(); i++) { + if (upstream_requests_[i]) { + upstream_requests_[i]->resetStream(); + } + } } cleanup(); } void Filter::onResponseTimeout() { ENVOY_STREAM_LOG(debug, "upstream timeout", *callbacks_); - cluster_->stats().upstream_rq_timeout_.inc(); - // It's possible to timeout during a retry backoff delay when we have no upstream request. 
- if (upstream_request_) { - if (upstream_request_->upstream_host_) { - upstream_request_->upstream_host_->stats().rq_timeout_.inc(); + // Reset any upstream requests that are still in flight. + for (unsigned long i = 0; i < upstream_requests_.size(); i++) { + // Don't record a timeout for upstream requests we've already seen headers + // for. + UpstreamRequest* upstream_request = upstream_requests_[i].get(); + if (upstream_request && !upstream_request->upstream_headers_) { + cluster_->stats().upstream_rq_timeout_.inc(); + if (upstream_request->upstream_host_) { + upstream_request->upstream_host_->stats().rq_timeout_.inc(); + } + + // If this upstream request already hit a "soft" timeout, then it + // already recorded a timeout into outlier detection. Don't do it again. + if (!upstream_request->outlier_detection_timeout_recorded_) { + updateOutlierDetection(timeout_response_code_, upstream_request); + } + upstream_request->resetStream(); + + chargeUpstreamAbort(timeout_response_code_, false, upstream_request); } - upstream_request_->resetStream(); } - updateOutlierDetection(timeout_response_code_); onUpstreamTimeoutAbort(StreamInfo::ResponseFlag::UpstreamRequestTimeout); } -void Filter::onPerTryTimeout() { - updateOutlierDetection(timeout_response_code_); +// Called when the per try timeout is hit but we didn't reset the request +// (hedge_on_per_try_timeout enabled). +void Filter::onSoftPerTryTimeout(UpstreamRequest* upstream_request) { + // Even though we didn't cancel the request yet we still want to track it + // in outlier detection. + // TODO(mpuncel) is it weird to have a pretend response code here? we might + // get a 200 back from this request later. 
+ updateOutlierDetection(timeout_response_code_, upstream_request); + upstream_request->outlier_detection_timeout_recorded_ = true; + + Upstream::HostDescriptionConstSharedPtr upstream_host = upstream_request->upstream_host_; + + if (!downstream_response_started_ && retry_state_) { + RetryStatus retry_status = + retry_state_->shouldHedgeRetryPerTryTimeout([this]() -> void { doRetry(); }); + + if (retry_status == RetryStatus::Yes && setupRetry()) { + upstream_request->retried_ = true; + // Don't increment upstream_host->stats().rq_error_ here, we'll do that + // later if 1) we hit global timeout or 2) we get bad response headers + // back. + } else if (retry_status == RetryStatus::NoOverflow) { + callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); + } else if (retry_status == RetryStatus::NoRetryLimitExceeded) { + callbacks_->streamInfo().setResponseFlag( + StreamInfo::ResponseFlag::UpstreamRetryLimitExceeded); + } + } +} + +void Filter::onPerTryTimeout(UpstreamRequest* upstream_request) { + if (hedging_params_.hedge_on_per_try_timeout_) { + onSoftPerTryTimeout(upstream_request); + return; + } + + cluster_->stats().upstream_rq_per_try_timeout_.inc(); + if (upstream_request->upstream_host_) { + upstream_request->upstream_host_->stats().rq_timeout_.inc(); + } + + upstream_request->resetStream(); + + updateOutlierDetection(timeout_response_code_, upstream_request); - if (maybeRetryReset(Http::StreamResetReason::LocalReset)) { + if (maybeRetryReset(Http::StreamResetReason::LocalReset, upstream_request)) { return; } + chargeUpstreamAbort(timeout_response_code_, false, upstream_request); onUpstreamTimeoutAbort(StreamInfo::ResponseFlag::UpstreamRequestTimeout); } -void Filter::updateOutlierDetection(Http::Code code) { - Upstream::HostDescriptionConstSharedPtr upstream_host; - if (upstream_request_) { - upstream_host = upstream_request_->upstream_host_; - if (upstream_host) { - upstream_host->outlierDetector().putHttpResponseCode(enumToInt(code)); +void
Filter::updateOutlierDetection(Http::Code code, UpstreamRequest* upstream_request) { + if (upstream_request->upstream_host_) { + upstream_request->upstream_host_->outlierDetector().putHttpResponseCode(enumToInt(code)); + } +} + +void Filter::chargeUpstreamAbort(Http::Code code, bool dropped, UpstreamRequest* upstream_request) { + if (downstream_response_started_) { + if (upstream_request != nullptr && upstream_request->grpc_rq_success_deferred_) { + upstream_request->upstream_host_->stats().rq_error_.inc(); + config_.stats_.rq_reset_after_downstream_response_started_.inc(); + } + } else { + Upstream::HostDescriptionConstSharedPtr upstream_host = upstream_request->upstream_host_; + + chargeUpstreamCode(code, upstream_host, dropped); + // If we had non-5xx but still have been reset by backend or timeout before + // starting response, we treat this as an error. We only get non-5xx when + // timeout_response_code_ is used for code above, where this member can + // assume values such as 204 (NoContent). + if (upstream_host != nullptr && !Http::CodeUtility::is5xx(enumToInt(code))) { + upstream_host->stats().rq_error_.inc(); } } } @@ -568,32 +683,15 @@ void Filter::onUpstreamAbort(Http::Code code, StreamInfo::ResponseFlag response_ // If we have not yet sent anything downstream, send a response with an appropriate status code. // Otherwise just reset the ongoing response. if (downstream_response_started_) { - if (upstream_request_ != nullptr && upstream_request_->grpc_rq_success_deferred_) { - upstream_request_->upstream_host_->stats().rq_error_.inc(); - config_.stats_.rq_reset_after_downstream_response_started_.inc(); - } // This will destroy any created retry timers. cleanup(); callbacks_->resetStream(); } else { - Upstream::HostDescriptionConstSharedPtr upstream_host; - if (upstream_request_) { - upstream_host = upstream_request_->upstream_host_; - } - // This will destroy any created retry timers. 
cleanup(); callbacks_->streamInfo().setResponseFlag(response_flags); - chargeUpstreamCode(code, upstream_host, dropped); - // If we had non-5xx but still have been reset by backend or timeout before - // starting response, we treat this as an error. We only get non-5xx when - // timeout_response_code_ is used for code above, where this member can - // assume values such as 204 (NoContent). - if (upstream_host != nullptr && !Http::CodeUtility::is5xx(enumToInt(code))) { - upstream_host->stats().rq_error_.inc(); - } callbacks_->sendLocalReply(code, body, [dropped, this](Http::HeaderMap& headers) { if (dropped && !config_.suppress_envoy_headers_) { @@ -605,27 +703,18 @@ void Filter::onUpstreamAbort(Http::Code code, StreamInfo::ResponseFlag response_ } } -bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason) { +bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason, + UpstreamRequest* upstream_request) { // We don't retry if we already started the response. if (downstream_response_started_ || !retry_state_) { return false; } - Upstream::HostDescriptionConstSharedPtr upstream_host; - if (upstream_request_) { - upstream_host = upstream_request_->upstream_host_; - } - - // Notify retry modifiers about the attempted host. 
- if (upstream_host != nullptr) { - retry_state_->onHostAttempted(upstream_host); - } - const RetryStatus retry_status = retry_state_->shouldRetryReset(reset_reason, [this]() -> void { doRetry(); }); - if (retry_status == RetryStatus::Yes && setupRetry(true)) { - if (upstream_host) { - upstream_host->stats().rq_error_.inc(); + if (retry_status == RetryStatus::Yes && setupRetry()) { + if (upstream_request->upstream_host_) { + upstream_request->upstream_host_->stats().rq_error_.inc(); } return true; } else if (retry_status == RetryStatus::NoOverflow) { @@ -637,23 +726,25 @@ bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason) { return false; } -void Filter::onUpstreamReset(Http::StreamResetReason reset_reason) { - ASSERT(upstream_request_); +void Filter::onUpstreamReset(Http::StreamResetReason reset_reason, + UpstreamRequest* upstream_request) { ENVOY_STREAM_LOG(debug, "upstream reset: reset reason {}", *callbacks_, Http::Utility::resetReasonToString(reset_reason)); - updateOutlierDetection(Http::Code::ServiceUnavailable); + updateOutlierDetection(Http::Code::ServiceUnavailable, upstream_request); - if (maybeRetryReset(reset_reason)) { + if (maybeRetryReset(reset_reason, upstream_request)) { return; } + bool dropped = reset_reason == Http::StreamResetReason::Overflow; + chargeUpstreamAbort(Http::Code::ServiceUnavailable, dropped, upstream_request); + const StreamInfo::ResponseFlag response_flags = streamResetReasonToResponseFlag(reset_reason); const std::string body = absl::StrCat("upstream connect error or disconnect/reset before headers. 
reset reason: ", Http::Utility::resetReasonToString(reset_reason)); - const bool dropped = reset_reason == Http::StreamResetReason::Overflow; onUpstreamAbort(Http::Code::ServiceUnavailable, response_flags, body, dropped); } @@ -677,7 +768,8 @@ Filter::streamResetReasonToResponseFlag(Http::StreamResetReason reset_reason) { NOT_REACHED_GCOVR_EXCL_LINE; } -void Filter::handleNon5xxResponseHeaders(const Http::HeaderMap& headers, bool end_stream) { +void Filter::handleNon5xxResponseHeaders(const Http::HeaderMap& headers, + UpstreamRequest* upstream_request, bool end_stream) { // We need to defer gRPC success until after we have processed grpc-status in // the trailers. if (grpc_request_) { @@ -685,55 +777,82 @@ void Filter::handleNon5xxResponseHeaders(const Http::HeaderMap& headers, bool en absl::optional grpc_status = Grpc::Common::getGrpcStatus(headers); if (grpc_status && !Http::CodeUtility::is5xx(Grpc::Utility::grpcToHttpStatus(grpc_status.value()))) { - upstream_request_->upstream_host_->stats().rq_success_.inc(); + upstream_request->upstream_host_->stats().rq_success_.inc(); } else { - upstream_request_->upstream_host_->stats().rq_error_.inc(); + upstream_request->upstream_host_->stats().rq_error_.inc(); } } else { - upstream_request_->grpc_rq_success_deferred_ = true; + upstream_request->grpc_rq_success_deferred_ = true; } } else { - upstream_request_->upstream_host_->stats().rq_success_.inc(); + upstream_request->upstream_host_->stats().rq_success_.inc(); } } -void Filter::onUpstream100ContinueHeaders(Http::HeaderMapPtr&& headers) { +void Filter::onUpstream100ContinueHeaders(Http::HeaderMapPtr&& headers, + UpstreamRequest* upstream_request) { ENVOY_STREAM_LOG(debug, "upstream 100 continue", *callbacks_); - downstream_response_started_ = true; + if (!downstream_response_started_) { + downstream_response_started_ = true; + final_upstream_request_ = upstream_request; + resetOtherUpstreams(upstream_request); + } // Don't send retries after 100-Continue has been sent 
on. Arguably we could attempt to do a // retry, assume the next upstream would also send an 100-Continue and swallow the second one // but it's sketchy (as the subsequent upstream might not send a 100-Continue) and not worth // the complexity until someone asks for it. retry_state_.reset(); - callbacks_->encode100ContinueHeaders(std::move(headers)); + if (final_upstream_request_ == upstream_request) { + callbacks_->encode100ContinueHeaders(std::move(headers)); + } +} + +void Filter::resetOtherUpstreams(UpstreamRequest* upstream_request) { + UpstreamRequest* upstream_request_tmp; + for (unsigned long i = 0; i < upstream_requests_.size(); i++) { + upstream_request_tmp = upstream_requests_[i].get(); + if (upstream_request_tmp != nullptr && upstream_request_tmp != upstream_request) { + if (!upstream_request_tmp->encode_complete_ || !upstream_request_tmp->decode_complete_) { + upstream_request_tmp->resetStream(); + if (upstream_request_tmp->upstream_host_) { + upstream_request_tmp->upstream_host_->stats().rq_hedge_abandoned_.inc(); + } + cluster_->stats().upstream_rq_hedge_abandoned_.inc(); + } + } + } } void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& headers, - bool end_stream) { + UpstreamRequest* upstream_request, bool end_stream) { ENVOY_STREAM_LOG(debug, "upstream headers complete: end_stream={}", *callbacks_, end_stream); - upstream_request_->upstream_host_->outlierDetector().putHttpResponseCode(response_code); + upstream_request->upstream_host_->outlierDetector().putHttpResponseCode(response_code); if (headers->EnvoyImmediateHealthCheckFail() != nullptr) { - upstream_request_->upstream_host_->healthChecker().setUnhealthy(); + upstream_request->upstream_host_->healthChecker().setUnhealthy(); } - if (retry_state_) { - // Notify retry modifiers about the attempted host.
- retry_state_->onHostAttempted(upstream_request_->upstream_host_); - - const RetryStatus retry_status = + // Check if this upstream request was already retried, for instance after + // hitting a per try timeout. Don't retry it if we already have. + if (retry_state_ && !upstream_request->retried_) { + RetryStatus retry_status = retry_state_->shouldRetryHeaders(*headers, [this]() -> void { doRetry(); }); // Capture upstream_host since setupRetry() in the following line will clear - // upstream_request_. - const auto upstream_host = upstream_request_->upstream_host_; - if (retry_status == RetryStatus::Yes && setupRetry(end_stream)) { + // upstream_request. + const auto upstream_host = upstream_request->upstream_host_; + if (retry_status == RetryStatus::Yes && setupRetry()) { + if (!end_stream) { + upstream_request->resetStream(); + } + Http::CodeStats& code_stats = httpContext().codeStats(); code_stats.chargeBasicResponseStat(cluster_->statsScope(), "retry.", static_cast(response_code)); upstream_host->stats().rq_error_.inc(); + upstream_request->retried_ = true; return; } else if (retry_status == RetryStatus::NoOverflow) { callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); @@ -749,7 +868,7 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head if (static_cast(response_code) == Http::Code::Found && route_entry_->internalRedirectAction() == InternalRedirectAction::Handle && - setupRedirect(*headers)) { + setupRedirect(*headers, upstream_request)) { return; // If the redirect could not be handled, fail open and let it pass to the // next downstream. 
@@ -767,12 +886,12 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head } } - upstream_request_->upstream_canary_ = + upstream_request->upstream_canary_ = (headers->EnvoyUpstreamCanary() && headers->EnvoyUpstreamCanary()->value() == "true") || - upstream_request_->upstream_host_->canary(); - chargeUpstreamCode(response_code, *headers, upstream_request_->upstream_host_, false); + upstream_request->upstream_host_->canary(); + chargeUpstreamCode(response_code, *headers, upstream_request->upstream_host_, false); if (!Http::CodeUtility::is5xx(response_code)) { - handleNon5xxResponseHeaders(*headers, end_stream); + handleNon5xxResponseHeaders(*headers, upstream_request, end_stream); } // Append routing cookies @@ -785,47 +904,61 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head // provide finalizeResponseHeaders functions on the Router::Config and VirtualHost interfaces. route_entry_->finalizeResponseHeaders(*headers, callbacks_->streamInfo()); - downstream_response_started_ = true; + if (!downstream_response_started_) { + downstream_response_started_ = true; + final_upstream_request_ = upstream_request; + resetOtherUpstreams(upstream_request); + } if (end_stream) { - onUpstreamComplete(); + onUpstreamComplete(upstream_request); } - callbacks_->encodeHeaders(std::move(headers), end_stream); + if (final_upstream_request_ == upstream_request) { + callbacks_->encodeHeaders(std::move(headers), end_stream); + } } -void Filter::onUpstreamData(Buffer::Instance& data, bool end_stream) { +void Filter::onUpstreamData(Buffer::Instance& data, UpstreamRequest* upstream_request, + bool end_stream) { if (end_stream) { // gRPC request termination without trailers is an error. 
- if (upstream_request_->grpc_rq_success_deferred_) { - upstream_request_->upstream_host_->stats().rq_error_.inc(); + if (upstream_request->grpc_rq_success_deferred_) { + upstream_request->upstream_host_->stats().rq_error_.inc(); } - onUpstreamComplete(); + onUpstreamComplete(upstream_request); } - callbacks_->encodeData(data, end_stream); + if (final_upstream_request_ == upstream_request) { + callbacks_->encodeData(data, end_stream); + } } -void Filter::onUpstreamTrailers(Http::HeaderMapPtr&& trailers) { - if (upstream_request_->grpc_rq_success_deferred_) { +void Filter::onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest* upstream_request) { + if (upstream_request->grpc_rq_success_deferred_) { absl::optional grpc_status = Grpc::Common::getGrpcStatus(*trailers); if (grpc_status && !Http::CodeUtility::is5xx(Grpc::Utility::grpcToHttpStatus(grpc_status.value()))) { - upstream_request_->upstream_host_->stats().rq_success_.inc(); + upstream_request->upstream_host_->stats().rq_success_.inc(); } else { - upstream_request_->upstream_host_->stats().rq_error_.inc(); + upstream_request->upstream_host_->stats().rq_error_.inc(); } } - onUpstreamComplete(); - callbacks_->encodeTrailers(std::move(trailers)); + onUpstreamComplete(upstream_request); + if (final_upstream_request_ == upstream_request) { + callbacks_->encodeTrailers(std::move(trailers)); + } } -void Filter::onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map) { - callbacks_->encodeMetadata(std::move(metadata_map)); +void Filter::onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map, + UpstreamRequest* upstream_request) { + if (final_upstream_request_ == upstream_request) { + callbacks_->encodeMetadata(std::move(metadata_map)); + } } -void Filter::onUpstreamComplete() { +void Filter::onUpstreamComplete(UpstreamRequest* upstream_request) { if (!downstream_end_stream_) { - upstream_request_->resetStream(); + upstream_request->resetStream(); } if (config_.emit_dynamic_stats_ && 
!callbacks_->streamInfo().healthCheck() && @@ -834,7 +967,7 @@ void Filter::onUpstreamComplete() { std::chrono::milliseconds response_time = std::chrono::duration_cast( dispatcher.timeSource().monotonicTime() - downstream_request_complete_time_); - upstream_request_->upstream_host_->outlierDetector().putResponseTime(response_time); + upstream_request->upstream_host_->outlierDetector().putResponseTime(response_time); const Http::HeaderEntry* internal_request_header = downstream_headers_->EnvoyInternalRequest(); const bool internal_request = @@ -848,13 +981,13 @@ void Filter::onUpstreamComplete() { cluster_->statsScope(), EMPTY_STRING, response_time, - upstream_request_->upstream_canary_, + upstream_request->upstream_canary_, internal_request, route_entry_->virtualHost().name(), request_vcluster_ ? request_vcluster_->name() : EMPTY_STRING, zone_name, - upstreamZone(upstream_request_->upstream_host_)}; + upstreamZone(upstream_request->upstream_host_)}; code_stats.chargeResponseTiming(info); @@ -863,12 +996,12 @@ void Filter::onUpstreamComplete() { cluster_->statsScope(), alt_stat_prefix_, response_time, - upstream_request_->upstream_canary_, + upstream_request->upstream_canary_, internal_request, EMPTY_STRING, EMPTY_STRING, zone_name, - upstreamZone(upstream_request_->upstream_host_)}; + upstreamZone(upstream_request->upstream_host_)}; code_stats.chargeResponseTiming(info); } @@ -877,7 +1010,7 @@ void Filter::onUpstreamComplete() { cleanup(); } -bool Filter::setupRetry(bool end_stream) { +bool Filter::setupRetry() { // If we responded before the request was complete we don't bother doing a retry. This may not // catch certain cases where we are in full streaming mode and we have a connect timeout or an // overflow of some kind. 
However, in many cases deployments will use the buffer filter before @@ -888,15 +1021,11 @@ bool Filter::setupRetry(bool end_stream) { } ENVOY_STREAM_LOG(debug, "performing retry", *callbacks_); - if (!end_stream) { - upstream_request_->resetStream(); - } - upstream_request_.reset(); return true; } -bool Filter::setupRedirect(const Http::HeaderMap& headers) { +bool Filter::setupRedirect(const Http::HeaderMap& headers, UpstreamRequest* upstream_request) { ENVOY_STREAM_LOG(debug, "attempting internal redirect", *callbacks_); const Http::HeaderEntry* location = headers.Location(); @@ -910,7 +1039,7 @@ bool Filter::setupRedirect(const Http::HeaderMap& headers) { // completion here and check it in onDestroy. This is annoyingly complicated but is better than // needlessly resetting streams. attempting_internal_redirect_with_complete_stream_ = - upstream_request_->upstream_timing_.last_upstream_rx_byte_received_ && downstream_end_stream_; + upstream_request->upstream_timing_.last_upstream_rx_byte_received_ && downstream_end_stream_; // As with setupRetry, redirects are not supported for streaming requests yet. if (downstream_end_stream_ && @@ -945,19 +1074,20 @@ void Filter::doRetry() { } ASSERT(response_timeout_ || timeout_.global_timeout_.count() == 0); - ASSERT(!upstream_request_); - upstream_request_ = std::make_unique(*this, *conn_pool); - upstream_request_->encodeHeaders(!callbacks_->decodingBuffer() && !downstream_trailers_); + UpstreamRequestPtr upstream_request = std::make_unique(*this, *conn_pool); + UpstreamRequest* upstream_request_ptr = upstream_request.get(); + upstream_requests_.emplace_back(std::move(upstream_request)); + upstream_request_ptr->encodeHeaders(!callbacks_->decodingBuffer() && !downstream_trailers_); // It's possible we got immediately reset. - if (upstream_request_) { + if (upstream_request_ptr) { if (callbacks_->decodingBuffer()) { // If we are doing a retry we need to make a copy. 
Buffer::OwnedImpl copy(*callbacks_->decodingBuffer()); - upstream_request_->encodeData(copy, !downstream_trailers_); + upstream_request_ptr->encodeData(copy, !downstream_trailers_); } if (downstream_trailers_) { - upstream_request_->encodeTrailers(*downstream_trailers_); + upstream_request_ptr->encodeTrailers(*downstream_trailers_); } } } @@ -965,8 +1095,9 @@ void Filter::doRetry() { Filter::UpstreamRequest::UpstreamRequest(Filter& parent, Http::ConnectionPool::Instance& pool) : parent_(parent), conn_pool_(pool), grpc_rq_success_deferred_(false), stream_info_(pool.protocol(), parent_.callbacks_->dispatcher().timeSource()), - calling_encode_headers_(false), upstream_canary_(false), encode_complete_(false), - encode_trailers_(false) { + calling_encode_headers_(false), upstream_canary_(false), decode_complete_(false), + encode_complete_(false), encode_trailers_(false), retried_(false), + outlier_detection_timeout_recorded_(false) { if (parent_.config_.start_child_span_) { span_ = parent_.callbacks_->activeSpan().spawnChild( @@ -999,7 +1130,7 @@ Filter::UpstreamRequest::~UpstreamRequest() { void Filter::UpstreamRequest::decode100ContinueHeaders(Http::HeaderMapPtr&& headers) { ASSERT(100 == Http::Utility::getResponseStatus(*headers)); - parent_.onUpstream100ContinueHeaders(std::move(headers)); + parent_.onUpstream100ContinueHeaders(std::move(headers), this); } void Filter::UpstreamRequest::decodeHeaders(Http::HeaderMapPtr&& headers, bool end_stream) { @@ -1010,28 +1141,29 @@ void Filter::UpstreamRequest::decodeHeaders(Http::HeaderMapPtr&& headers, bool e upstream_headers_ = headers.get(); const uint64_t response_code = Http::Utility::getResponseStatus(*headers); stream_info_.response_code_ = static_cast(response_code); - parent_.onUpstreamHeaders(response_code, std::move(headers), end_stream); + parent_.onUpstreamHeaders(response_code, std::move(headers), this, end_stream); } void Filter::UpstreamRequest::decodeData(Buffer::Instance& data, bool end_stream) { 
maybeEndDecode(end_stream); stream_info_.addBytesReceived(data.length()); - parent_.onUpstreamData(data, end_stream); + parent_.onUpstreamData(data, this, end_stream); } void Filter::UpstreamRequest::decodeTrailers(Http::HeaderMapPtr&& trailers) { maybeEndDecode(true); upstream_trailers_ = trailers.get(); - parent_.onUpstreamTrailers(std::move(trailers)); + parent_.onUpstreamTrailers(std::move(trailers), this); } void Filter::UpstreamRequest::decodeMetadata(Http::MetadataMapPtr&& metadata_map) { - parent_.onUpstreamMetadata(std::move(metadata_map)); + parent_.onUpstreamMetadata(std::move(metadata_map), this); } void Filter::UpstreamRequest::maybeEndDecode(bool end_stream) { if (end_stream) { upstream_timing_.onLastUpstreamRxByteReceived(parent_.callbacks_->dispatcher().timeSource()); + decode_complete_ = true; } } @@ -1090,13 +1222,18 @@ void Filter::UpstreamRequest::onResetStream(Http::StreamResetReason reason) { clearRequestEncoder(); if (!calling_encode_headers_) { stream_info_.setResponseFlag(parent_.streamResetReasonToResponseFlag(reason)); - parent_.onUpstreamReset(reason); + parent_.onUpstreamReset(reason, this); } else { deferred_reset_reason_ = reason; } } void Filter::UpstreamRequest::resetStream() { + // Don't reset the stream if we're already done with it. 
+ if (encode_complete_ && decode_complete_) { + return; + } + if (conn_pool_stream_handle_) { ENVOY_STREAM_LOG(debug, "cancelling pool request", *parent_.callbacks_); ASSERT(!request_encoder_); @@ -1108,6 +1245,7 @@ void Filter::UpstreamRequest::resetStream() { ENVOY_STREAM_LOG(debug, "resetting pool request", *parent_.callbacks_); request_encoder_->getStream().removeCallbacks(*this); request_encoder_->getStream().resetStream(Http::StreamResetReason::LocalReset); + clearRequestEncoder(); } } @@ -1125,13 +1263,11 @@ void Filter::UpstreamRequest::onPerTryTimeout() { // to the global timeout if (!parent_.downstream_response_started_) { ENVOY_STREAM_LOG(debug, "upstream per try timeout", *parent_.callbacks_); - parent_.cluster_->stats().upstream_rq_per_try_timeout_.inc(); - if (upstream_host_) { - upstream_host_->stats().rq_timeout_.inc(); - } - resetStream(); + + // Set response flag to UT for now, but it might be overwritten if a + // response arrives later and hedge_on_per_try_timeout_ is set. stream_info_.setResponseFlag(StreamInfo::ResponseFlag::UpstreamRequestTimeout); - parent_.onPerTryTimeout(); + parent_.onPerTryTimeout(this); } else { ENVOY_STREAM_LOG(debug, "ignored upstream per try timeout due to already started downstream response", diff --git a/source/common/router/router.h b/source/common/router/router.h index 11b06e6bbdf59..b02b1ec475fab 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -61,6 +61,11 @@ class FilterUtility { std::chrono::milliseconds per_try_timeout_{0}; }; + struct HedgingParams { + uint32_t initial_requests_; + bool hedge_on_per_try_timeout_; + }; + /** * Set the :scheme header based on the properties of the upstream cluster. 
*/ @@ -88,6 +93,15 @@ class FilterUtility { */ static TimeoutData finalTimeout(const RouteEntry& route, Http::HeaderMap& request_headers, bool insert_envoy_expected_request_timeout_ms, bool grpc_request); + + /** + * Determine the final hedging settings after applying randomized behavior. + * @param route supplies the request route. + * @param random_value supplies a stable random value to use for evaluating whether an additional + * initial request should be sent + * @return HedgingParams the final parameters to use for request hedging + */ + static HedgingParams finalHedgingParams(const RouteEntry& route, uint64_t random_value); }; /** @@ -285,6 +299,9 @@ class Filter : Logger::Loggable, stream_info_.onUpstreamHostSelected(host); upstream_host_ = host; parent_.callbacks_->streamInfo().onUpstreamHostSelected(host); + if (parent_.retry_state_ && host) { + parent_.retry_state_->onHostAttempted(host); + } } // Http::StreamDecoder @@ -347,8 +364,11 @@ class Filter : Logger::Loggable, bool calling_encode_headers_ : 1; bool upstream_canary_ : 1; + bool decode_complete_ : 1; bool encode_complete_ : 1; bool encode_trailers_ : 1; + bool retried_ : 1; + bool outlier_detection_timeout_recorded_ : 1; }; typedef std::unique_ptr UpstreamRequestPtr; @@ -360,6 +380,7 @@ class Filter : Logger::Loggable, Upstream::HostDescriptionConstSharedPtr upstream_host, bool dropped); void chargeUpstreamCode(Http::Code code, Upstream::HostDescriptionConstSharedPtr upstream_host, bool dropped); + void chargeUpstreamAbort(Http::Code code, bool dropped, UpstreamRequest* upstream_request); void cleanup(); virtual RetryStatePtr createRetryState(const RetryPolicy& policy, Http::HeaderMap& request_headers, @@ -369,32 +390,39 @@ class Filter : Logger::Loggable, Upstream::ResourcePriority priority) PURE; Http::ConnectionPool::Instance* getConnPool(); void maybeDoShadowing(); - bool maybeRetryReset(Http::StreamResetReason reset_reason); - void onPerTryTimeout(); + bool 
maybeRetryReset(Http::StreamResetReason reset_reason, UpstreamRequest* upstream_request); + void onGlobalTimeout(); + void onPerTryTimeout(UpstreamRequest* upstream_request); void onRequestComplete(); void onResponseTimeout(); - void onUpstream100ContinueHeaders(Http::HeaderMapPtr&& headers); + void onUpstream100ContinueHeaders(Http::HeaderMapPtr&& headers, + UpstreamRequest* upstream_request); // Handle an upstream request aborted due to a local timeout. + void onSoftPerTryTimeout(); + void onSoftPerTryTimeout(UpstreamRequest* upstream_request); void onUpstreamTimeoutAbort(StreamInfo::ResponseFlag response_flag); // Handle an "aborted" upstream request, meaning we didn't see response // headers (e.g. due to a reset). Handles recording stats and responding // downstream if appropriate. void onUpstreamAbort(Http::Code code, StreamInfo::ResponseFlag response_flag, absl::string_view body, bool dropped); - void onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& headers, bool end_stream); - void onUpstreamData(Buffer::Instance& data, bool end_stream); - void onUpstreamTrailers(Http::HeaderMapPtr&& trailers); - void onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map); - void onUpstreamComplete(); - void onUpstreamReset(Http::StreamResetReason reset_reason); + void onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& headers, + UpstreamRequest* upstream_request, bool end_stream); + void onUpstreamData(Buffer::Instance& data, UpstreamRequest* upstream_request, bool end_stream); + void onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest* upstream_request); + void onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map, UpstreamRequest* upstream_request); + void onUpstreamComplete(UpstreamRequest* upstream_request); + void onUpstreamReset(Http::StreamResetReason reset_reason, UpstreamRequest* upstream_request); + void resetOtherUpstreams(UpstreamRequest* upstream_request); void sendNoHealthyUpstreamResponse(); - bool setupRetry(bool 
end_stream); - bool setupRedirect(const Http::HeaderMap& headers); - void updateOutlierDetection(Http::Code code); + bool setupRetry(); + bool setupRedirect(const Http::HeaderMap& headers, UpstreamRequest* upstream_request); + void updateOutlierDetection(Http::Code code, UpstreamRequest* upstream_request); void doRetry(); // Called immediately after a non-5xx header is received from upstream, performs stats accounting // and handle difference between gRPC and non-gRPC requests. - void handleNon5xxResponseHeaders(const Http::HeaderMap& headers, bool end_stream); + void handleNon5xxResponseHeaders(const Http::HeaderMap& headers, + UpstreamRequest* upstream_request, bool end_stream); TimeSource& timeSource() { return config_.timeSource(); } Http::Context& httpContext() { return config_.http_context_; } @@ -407,8 +435,12 @@ class Filter : Logger::Loggable, const VirtualCluster* request_vcluster_; Event::TimerPtr response_timeout_; FilterUtility::TimeoutData timeout_; + FilterUtility::HedgingParams hedging_params_; Http::Code timeout_response_code_ = Http::Code::GatewayTimeout; - UpstreamRequestPtr upstream_request_; + std::vector upstream_requests_; + // Tracks which upstream request "wins" and will have the corresponding + // response forwarded downstream + UpstreamRequest* final_upstream_request_; bool grpc_request_{}; Http::HeaderMap* downstream_headers_{}; Http::HeaderMap* downstream_trailers_{}; diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 83cad58665f41..dfbf907bfdad1 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -209,6 +209,16 @@ class RouterTestBase : public testing::Test { ON_CALL(callbacks_, connection()).WillByDefault(Return(&connection_)); } + void enableHedgeOnPerTryTimeout() { + callbacks_.route_->route_entry_.hedge_policy_.initial_requests_ = 1; + callbacks_.route_->route_entry_.hedge_policy_.hedge_on_per_try_timeout_ = true; + 
callbacks_.route_->route_entry_.hedge_policy_.additional_request_chance_ = + envoy::type::FractionalPercent{}; + callbacks_.route_->route_entry_.hedge_policy_.additional_request_chance_.set_numerator(0); + callbacks_.route_->route_entry_.hedge_policy_.additional_request_chance_.set_denominator( + envoy::type::FractionalPercent::HUNDRED); + } + Event::SimulatedTimeSystem test_time_; std::string upstream_zone_{"to_az"}; envoy::api::v2::core::Locality upstream_locality_; @@ -1225,6 +1235,168 @@ TEST_F(RouterTest, UpstreamPerTryTimeoutExcludesNewStream) { EXPECT_TRUE(verifyHostUpstreamStats(0, 1)); } +// Tests that a retry is sent after the first request hits the per try timeout, but then +// headers received in response to the first request are still used (and the 2nd request +// canceled). +TEST_F(RouterTest, HedgedPerTryTimeoutFirstRequestSucceeds) { + enableHedgeOnPerTryTimeout(); + + NiceMock encoder1; + Http::StreamDecoder* response_decoder1 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder1 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder1, cm_.conn_pool_.host_); + return nullptr; + })); + expectResponseTimerCreate(); + expectPerTryTimerCreate(); + + Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; + HttpTestUtility::addDefaultHeaders(headers); + router_.decodeHeaders(headers, true); + + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(0); + + NiceMock encoder2; + Http::StreamDecoder* response_decoder2 = nullptr; + router_.retry_state_->expectHedgedPerTryTimeoutRetry(); + per_try_timeout_->callback_(); + + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, 
Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder2 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder2, cm_.conn_pool_.host_); + return nullptr; + })); + expectPerTryTimerCreate(); + router_.retry_state_->callback_(); + + // We should not have updated any stats yet because no requests have been + // canceled + EXPECT_TRUE(verifyHostUpstreamStats(0, 0)); + + // Now write a 200 back. We expect the 2nd stream to be reset and stats to be + // incremented properly. + Http::HeaderMapPtr response_headers(new Http::TestHeaderMapImpl{{":status", "200"}}); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(200)); + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(0); + EXPECT_CALL(encoder2.stream_, resetStream(_)); + + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)) + .WillOnce(Invoke([&](Http::HeaderMap& headers, bool end_stream) -> void { + EXPECT_EQ(headers.Status()->value(), "200"); + EXPECT_TRUE(end_stream); + })); + response_decoder1->decodeHeaders(std::move(response_headers), true); + EXPECT_TRUE(verifyHostUpstreamStats(1, 0)); + EXPECT_EQ(1, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); + EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_abandoned") + .value()); +} + +// Three requests sent: 1) 5xx error, 2) per try timeout, 3) gets good response +// headers. 
+TEST_F(RouterTest, HedgedPerTryTimeoutThirdRequestSucceeds) { + enableHedgeOnPerTryTimeout(); + + NiceMock encoder1; + Http::StreamDecoder* response_decoder1 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder1 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder1, cm_.conn_pool_.host_); + return nullptr; + })); + expectResponseTimerCreate(); + expectPerTryTimerCreate(); + + Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; + HttpTestUtility::addDefaultHeaders(headers); + router_.decodeHeaders(headers, true); + + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(0); + + Http::HeaderMapPtr response_headers1(new Http::TestHeaderMapImpl{{":status", "500"}}); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(500)); + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(0); + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)).Times(0); + router_.retry_state_->expectHeadersRetry(); + response_decoder1->decodeHeaders(std::move(response_headers1), true); + + NiceMock encoder2; + Http::StreamDecoder* response_decoder2 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder2 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder2, cm_.conn_pool_.host_); + return nullptr; + })); + expectPerTryTimerCreate(); + router_.retry_state_->callback_(); + + EXPECT_TRUE(verifyHostUpstreamStats(0, 1)); + EXPECT_EQ(0, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); + EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_abandoned") + .value()); + + // 
Now trigger a per try timeout on the 2nd request, expect a 3rd + router_.retry_state_->expectHedgedPerTryTimeoutRetry(); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); + NiceMock encoder3; + Http::StreamDecoder* response_decoder3 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder3 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder3, cm_.conn_pool_.host_); + return nullptr; + })); + + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)).Times(0); + per_try_timeout_->callback_(); + expectPerTryTimerCreate(); + router_.retry_state_->callback_(); + EXPECT_TRUE(verifyHostUpstreamStats(0, 1)); + EXPECT_EQ(0, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); + EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_abandoned") + .value()); + + // Now write a 200 back. We expect the 2nd stream to be reset and stats to be + // incremented properly. 
+ Http::HeaderMapPtr response_headers2(new Http::TestHeaderMapImpl{{":status", "200"}}); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(200)); + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(0); + EXPECT_CALL(encoder2.stream_, resetStream(_)); + EXPECT_CALL(encoder3.stream_, resetStream(_)).Times(0); + + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)) + .WillOnce(Invoke([&](Http::HeaderMap& headers, bool end_stream) -> void { + EXPECT_EQ(headers.Status()->value(), "200"); + EXPECT_TRUE(end_stream); + })); + EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).WillOnce(Return(RetryStatus::No)); + response_decoder3->decodeHeaders(std::move(response_headers2), true); + EXPECT_TRUE(verifyHostUpstreamStats(1, 1)); + EXPECT_EQ(1, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); + EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_abandoned") + .value()); +} + TEST_F(RouterTest, RetryRequestNotComplete) { NiceMock encoder1; Http::StreamDecoder* response_decoder = nullptr; @@ -1252,6 +1424,73 @@ TEST_F(RouterTest, RetryRequestNotComplete) { EXPECT_TRUE(verifyHostUpstreamStats(0, 1)); } +// Two requests are sent (slow request + hedged retry) and then global timeout +// is hit. Verify everything gets cleaned up. 
+TEST_F(RouterTest, HedgedPerTryTimeoutGlobalTimeout) { + enableHedgeOnPerTryTimeout(); + + NiceMock encoder1; + Http::StreamDecoder* response_decoder1 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder1 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder1, cm_.conn_pool_.host_); + return nullptr; + })); + expectResponseTimerCreate(); + expectPerTryTimerCreate(); + + Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; + HttpTestUtility::addDefaultHeaders(headers); + router_.decodeHeaders(headers, true); + + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(0); + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)).Times(0); + router_.retry_state_->expectHedgedPerTryTimeoutRetry(); + per_try_timeout_->callback_(); + + NiceMock encoder2; + Http::StreamDecoder* response_decoder2 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder2 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder2, cm_.conn_pool_.host_); + return nullptr; + })); + expectPerTryTimerCreate(); + router_.retry_state_->callback_(); + + EXPECT_TRUE(verifyHostUpstreamStats(0, 0)); + EXPECT_EQ(0, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); + EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_abandoned") + .value()); + + // Now trigger global timeout, expect everything to be reset + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(1); + EXPECT_CALL(encoder2.stream_, resetStream(_)).Times(1); + 
EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); + + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)) + .WillOnce(Invoke([&](Http::HeaderMap& headers, bool) -> void { + EXPECT_EQ(headers.Status()->value(), "504"); + })); + response_timeout_->callback_(); + EXPECT_TRUE(verifyHostUpstreamStats(0, 2)); + EXPECT_EQ(0, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); + EXPECT_EQ(2, cm_.conn_pool_.host_->stats_store_.counter("rq_timeout").value()); + EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_abandoned") + .value()); + EXPECT_EQ(2, cm_.thread_local_cluster_.cluster_.info_->stats_store_.counter("upstream_rq_timeout") + .value()); +} + TEST_F(RouterTest, RetryNoneHealthy) { NiceMock encoder1; Http::StreamDecoder* response_decoder = nullptr; @@ -1338,6 +1577,7 @@ TEST_F(RouterTest, RetryUpstreamPerTryTimeout) { .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) -> Http::ConnectionPool::Cancellable* { response_decoder = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); callbacks.onPoolReady(encoder1, cm_.conn_pool_.host_); return nullptr; })); @@ -1350,7 +1590,6 @@ TEST_F(RouterTest, RetryUpstreamPerTryTimeout) { HttpTestUtility::addDefaultHeaders(headers); router_.decodeHeaders(headers, true); - EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); router_.retry_state_->expectResetRetry(); EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); per_try_timeout_->callback_(); @@ -1358,6 +1597,7 @@ TEST_F(RouterTest, RetryUpstreamPerTryTimeout) { // We expect this reset to kick off a new request. 
NiceMock encoder2; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) -> Http::ConnectionPool::Cancellable* { @@ -1368,7 +1608,6 @@ TEST_F(RouterTest, RetryUpstreamPerTryTimeout) { expectPerTryTimerCreate(); router_.retry_state_->callback_(); - EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); // Normal response. EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).WillOnce(Return(RetryStatus::No)); Http::HeaderMapPtr response_headers(new Http::TestHeaderMapImpl{{":status", "200"}}); @@ -1407,13 +1646,12 @@ TEST_F(RouterTest, RetryUpstreamConnectionFailure) { .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) -> Http::ConnectionPool::Cancellable* { response_decoder = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); callbacks.onPoolReady(encoder2, cm_.conn_pool_.host_); return nullptr; })); router_.retry_state_->callback_(); - EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); - // Normal response. 
EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).WillOnce(Return(RetryStatus::No)); Http::HeaderMapPtr response_headers(new Http::TestHeaderMapImpl{{":status", "200"}}); @@ -2412,6 +2650,38 @@ TEST_F(RouterTest, UpstreamTimingTimeout) { EXPECT_EQ(stream_info.firstUpstreamRxByteReceived().value(), std::chrono::milliseconds(56)); } +TEST(RouterFilterUtilityTest, FinalHedgingParams) { + { // no chance of additional request + NiceMock route; + route.hedge_policy_.initial_requests_ = 10; + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, 0); + EXPECT_EQ(10, hedgingParams.initial_requests_); + hedgingParams = FilterUtility::finalHedgingParams(route, 10); + EXPECT_EQ(10, hedgingParams.initial_requests_); + hedgingParams = FilterUtility::finalHedgingParams(route, 100); + EXPECT_EQ(10, hedgingParams.initial_requests_); + hedgingParams = FilterUtility::finalHedgingParams(route, 1000); + EXPECT_EQ(10, hedgingParams.initial_requests_); + } + { // 50% chance additional request + NiceMock route; + route.hedge_policy_.initial_requests_ = 10; + route.hedge_policy_.additional_request_chance_.set_numerator(50); + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, 0); + EXPECT_EQ(11, hedgingParams.initial_requests_); + hedgingParams = FilterUtility::finalHedgingParams(route, 49); + EXPECT_EQ(11, hedgingParams.initial_requests_); + hedgingParams = FilterUtility::finalHedgingParams(route, 50); + EXPECT_EQ(10, hedgingParams.initial_requests_); + hedgingParams = FilterUtility::finalHedgingParams(route, 99); + EXPECT_EQ(10, hedgingParams.initial_requests_); + hedgingParams = FilterUtility::finalHedgingParams(route, 100); + EXPECT_EQ(11, hedgingParams.initial_requests_); + } +} + TEST(RouterFilterUtilityTest, FinalTimeout) { { NiceMock route; 
diff --git a/test/mocks/router/mocks.cc b/test/mocks/router/mocks.cc index da2b3f22aa204..f667c3982e574 100644 --- a/test/mocks/router/mocks.cc +++ b/test/mocks/router/mocks.cc @@ -25,6 +25,11 @@ void MockRetryState::expectHeadersRetry() { .WillOnce(DoAll(SaveArg<1>(&callback_), Return(RetryStatus::Yes))); } +void MockRetryState::expectHedgedPerTryTimeoutRetry() { + EXPECT_CALL(*this, shouldHedgeRetryPerTryTimeout(_)) + .WillOnce(DoAll(SaveArg<0>(&callback_), Return(RetryStatus::Yes))); +} + void MockRetryState::expectResetRetry() { EXPECT_CALL(*this, shouldRetryReset(_, _)) .WillOnce(DoAll(SaveArg<1>(&callback_), Return(RetryStatus::Yes))); diff --git a/test/mocks/router/mocks.h b/test/mocks/router/mocks.h index 8a1e7d30653d8..cb90e04ed939b 100644 --- a/test/mocks/router/mocks.h +++ b/test/mocks/router/mocks.h @@ -73,11 +73,11 @@ class TestHedgePolicy : public HedgePolicy { const envoy::type::FractionalPercent& additionalRequestChance() const override { return additional_request_chance_; } - bool hedgeOnPerTryTimeout() const override { return hedge_on_per_try_timeout; } + bool hedgeOnPerTryTimeout() const override { return hedge_on_per_try_timeout_; } uint32_t initial_requests_{}; envoy::type::FractionalPercent additional_request_chance_{}; - bool hedge_on_per_try_timeout{}; + bool hedge_on_per_try_timeout_{}; }; class TestRetryPolicy : public RetryPolicy { @@ -106,6 +106,7 @@ class MockRetryState : public RetryState { ~MockRetryState(); void expectHeadersRetry(); + void expectHedgedPerTryTimeoutRetry(); void expectResetRetry(); MOCK_METHOD0(enabled, bool()); @@ -113,6 +114,7 @@ class MockRetryState : public RetryState { RetryStatus(const Http::HeaderMap& response_headers, DoRetryCallback callback)); MOCK_METHOD2(shouldRetryReset, RetryStatus(const Http::StreamResetReason reset_reason, DoRetryCallback callback)); + MOCK_METHOD1(shouldHedgeRetryPerTryTimeout, RetryStatus(DoRetryCallback callback)); MOCK_METHOD1(onHostAttempted, 
void(Upstream::HostDescriptionConstSharedPtr)); MOCK_METHOD1(shouldSelectAnotherHost, bool(const Upstream::Host& host)); MOCK_METHOD2(priorityLoadForRetry, From ac2e338e3436b150ff89f97812308e12275a2ef6 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 8 Mar 2019 20:07:06 -0500 Subject: [PATCH 02/70] remove empty if block Signed-off-by: Michael Puncel --- source/common/router/router.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 43618c496368e..e2bc4190b9b73 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -448,9 +448,6 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea // since it's all moves from here on. Buffer::OwnedImpl copy(data); upstream_requests_[i]->encodeData(copy, end_stream); - - if (i == 0) { - } } else { upstream_requests_[i]->encodeData(data, end_stream); } From b51e9022fac7098169994441a2f7003170359994 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 11 Mar 2019 13:40:36 -0400 Subject: [PATCH 03/70] null out conn pool stream handle after pool failure Signed-off-by: Michael Puncel --- source/common/router/router.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index e2bc4190b9b73..497537f92ca44 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -1275,6 +1275,7 @@ void Filter::UpstreamRequest::onPerTryTimeout() { void Filter::UpstreamRequest::onPoolFailure(Http::ConnectionPool::PoolFailureReason reason, Upstream::HostDescriptionConstSharedPtr host) { Http::StreamResetReason reset_reason = Http::StreamResetReason::ConnectionFailure; + conn_pool_stream_handle_ = nullptr; switch (reason) { case Http::ConnectionPool::PoolFailureReason::Overflow: reset_reason = Http::StreamResetReason::Overflow; From 19d76518d831c1fdb1bdddf6e1edca9bf2fdca41 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 11 
Mar 2019 14:10:51 -0400 Subject: [PATCH 04/70] fix format Signed-off-by: Michael Puncel --- source/common/router/router.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/common/router/router.h b/source/common/router/router.h index 3b3c907725c1e..836527347bd48 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -414,7 +414,8 @@ class Filter : Logger::Loggable, void onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest* upstream_request); void onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map, UpstreamRequest* upstream_request); void onUpstreamComplete(UpstreamRequest* upstream_request); - void onUpstreamReset(Http::StreamResetReason reset_reason, absl::string_view transport_failure, UpstreamRequest* upstream_request); + void onUpstreamReset(Http::StreamResetReason reset_reason, absl::string_view transport_failure, + UpstreamRequest* upstream_request); void resetOtherUpstreams(UpstreamRequest* upstream_request); void sendNoHealthyUpstreamResponse(); bool setupRetry(); From 6447c06343e7de847719744ad17bee01b5bb1d3e Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 11 Mar 2019 14:59:33 -0400 Subject: [PATCH 05/70] add bug fix and test for double retries Signed-off-by: Michael Puncel --- source/common/router/router.cc | 1 + test/common/router/router_test.cc | 51 +++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 089bb910e7e27..c924cd5e2e16f 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -611,6 +611,7 @@ void Filter::onSoftPerTryTimeout(UpstreamRequest* upstream_request) { // Don't increment upstream_host->stats().rq_error_ here, we'll do that // later if 1) we hit global timeout or 2) we get bad response headers // back. 
+ upstream_request->retried_ = true; } else if (retry_status == RetryStatus::NoOverflow) { callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); } else if (retry_status == RetryStatus::NoRetryLimitExceeded) { diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 063609f4adef1..35362c0b67f85 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -1397,6 +1397,57 @@ TEST_F(RouterTest, HedgedPerTryTimeoutThirdRequestSucceeds) { .value()); } +// First request times out and is retried, and then a response is received. +// Make sure we don't attempt to retry because we already retried for timeout. +TEST_F(RouterTest, RetryOnlyOnceForSameUpstreamRequest) { + enableHedgeOnPerTryTimeout(); + + NiceMock encoder1; + Http::StreamDecoder* response_decoder1 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder1 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder1, cm_.conn_pool_.host_); + return nullptr; + })); + expectResponseTimerCreate(); + expectPerTryTimerCreate(); + + Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; + HttpTestUtility::addDefaultHeaders(headers); + router_.decodeHeaders(headers, true); + + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(0); + + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); + router_.retry_state_->expectHedgedPerTryTimeoutRetry(); + per_try_timeout_->callback_(); + + NiceMock encoder2; + Http::StreamDecoder* response_decoder2 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder2 = &decoder; + 
EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder2, cm_.conn_pool_.host_); + return nullptr; + })); + + expectPerTryTimerCreate(); + router_.retry_state_->callback_(); + + // Now send a 5xx back and make sure we don't ask whether we should retry it. + Http::HeaderMapPtr response_headers1(new Http::TestHeaderMapImpl{{":status", "500"}}); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(500)); + EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).Times(0); + response_decoder1->decodeHeaders(std::move(response_headers1), true); + + response_timeout_->callback_(); +} + TEST_F(RouterTest, RetryRequestNotComplete) { NiceMock encoder1; Http::StreamDecoder* response_decoder = nullptr; From 676300085d488cc0fccab8fa9343272a18908ca6 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 12 Mar 2019 10:05:30 -0400 Subject: [PATCH 06/70] Fix "retry only once per upstream request" behavior. This required exposing wouldRetryFromHeaders in the retry state impl API so that the router can know if a response is "good" or not without actually scheduling a retry. If the response is "bad", it checks if there are any pending upstream requests that have not seen headers yet. If there are not, then it returns the response downstream. 
Signed-off-by: Michael Puncel --- include/envoy/router/router.h | 10 ++ source/common/router/BUILD | 1 + source/common/router/retry_state_impl.h | 4 +- source/common/router/router.cc | 138 +++++++++++++----------- source/common/router/router.h | 8 +- test/common/router/router_test.cc | 65 +++++++++++ test/mocks/router/mocks.h | 1 + 7 files changed, 162 insertions(+), 65 deletions(-) diff --git a/include/envoy/router/router.h b/include/envoy/router/router.h index ad23563581906..cdeaf68cf81cb 100644 --- a/include/envoy/router/router.h +++ b/include/envoy/router/router.h @@ -240,6 +240,16 @@ class RetryState { virtual RetryStatus shouldRetryHeaders(const Http::HeaderMap& response_headers, DoRetryCallback callback) PURE; + /** + * Determines whether given response headers would be retried by the retry policy, assuming + * sufficient retry budget and circuit breaker headroom. This is useful in cases where + * the information about whether a response is "good" or not is useful, but a retry should + * not be attempted for other reasons. + * @param response_headers supplies the response headers. + * @return bool true if a retry would be warranted based on the retry policy. + */ + virtual bool wouldRetryFromHeaders(const Http::HeaderMap& response_headers) PURE; + /** * Determine whether a request should be retried after a reset based on the reason for the reset. * @param reset_reason supplies the reset reason. 
diff --git a/source/common/router/BUILD b/source/common/router/BUILD index b861e6eeb3037..3214ca4fbc5d2 100644 --- a/source/common/router/BUILD +++ b/source/common/router/BUILD @@ -145,6 +145,7 @@ envoy_cc_library( "//source/common/common:enum_to_int", "//source/common/common:hash_lib", "//source/common/common:hex_lib", + "//source/common/common:linked_object", "//source/common/common:minimal_logger_lib", "//source/common/common:utility_lib", "//source/common/grpc:common_lib", diff --git a/source/common/router/retry_state_impl.h b/source/common/router/retry_state_impl.h index 9d7ce7647322c..647680da4f2f8 100644 --- a/source/common/router/retry_state_impl.h +++ b/source/common/router/retry_state_impl.h @@ -38,6 +38,9 @@ class RetryStateImpl : public RetryState { bool enabled() override { return retry_on_ != 0; } RetryStatus shouldRetryHeaders(const Http::HeaderMap& response_headers, DoRetryCallback callback) override; + // Returns true if the retry policy would retry the passed headers. Does not + // take into account circuit breaking or remaining tries. 
+ bool wouldRetryFromHeaders(const Http::HeaderMap& response_headers) override; RetryStatus shouldRetryReset(const Http::StreamResetReason reset_reason, DoRetryCallback callback) override; RetryStatus shouldHedgeRetryPerTryTimeout(DoRetryCallback callback) override; @@ -76,7 +79,6 @@ class RetryStateImpl : public RetryState { void enableBackoffTimer(); void resetRetry(); bool wouldRetryFromReset(const Http::StreamResetReason reset_reason); - bool wouldRetryFromHeaders(const Http::HeaderMap& response_headers); RetryStatus shouldRetry(bool would_retry, DoRetryCallback callback); const Upstream::ClusterInfo& cluster_; diff --git a/source/common/router/router.cc b/source/common/router/router.cc index c924cd5e2e16f..6a699403e85a5 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -195,9 +195,7 @@ FilterUtility::HedgingParams FilterUtility::finalHedgingParams(const RouteEntry& Filter::~Filter() { // Upstream resources should already have been cleaned. - for (unsigned long i = 0; i < upstream_requests_.size(); i++) { - ASSERT(!upstream_requests_[i]); - } + ASSERT(upstream_requests_.empty()); ASSERT(!retry_state_); } @@ -399,8 +397,9 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e ENVOY_STREAM_LOG(debug, "router decoding headers:\n{}", *callbacks_, headers); UpstreamRequestPtr upstream_request = std::make_unique<UpstreamRequest>(*this, *conn_pool); - upstream_requests_.emplace_back(std::move(upstream_request)); - upstream_requests_[0]->encodeHeaders(end_stream); + UpstreamRequest* upstream_request_ptr = upstream_request.get(); + upstream_request->moveIntoList(std::move(upstream_request), upstream_requests_); + upstream_request_ptr->encodeHeaders(end_stream); if (end_stream) { onRequestComplete(); } @@ -442,14 +441,15 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea do_shadowing_ = false; } - for (unsigned long i = 0; i < upstream_requests_.size(); i++) { + for (auto it = 
upstream_requests_.cbegin(); it != upstream_requests_.cend(); it++) { + UpstreamRequest* upstream_request = it->get(); if (buffering) { // If we are going to buffer for retries or shadowing, we need to make a copy before encoding // since it's all moves from here on. Buffer::OwnedImpl copy(data); - upstream_requests_[i]->encodeData(copy, end_stream); + upstream_request->encodeData(copy, end_stream); } else { - upstream_requests_[i]->encodeData(data, end_stream); + upstream_request->encodeData(data, end_stream); } } @@ -472,8 +472,8 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea Http::FilterTrailersStatus Filter::decodeTrailers(Http::HeaderMap& trailers) { ENVOY_STREAM_LOG(debug, "router decoding trailers:\n{}", *callbacks_, trailers); downstream_trailers_ = &trailers; - for (unsigned long i = 0; i < upstream_requests_.size(); i++) { - upstream_requests_[i]->encodeTrailers(trailers); + for (auto it = upstream_requests_.cbegin(); it != upstream_requests_.cend(); it++) { + it->get()->encodeTrailers(trailers); } onRequestComplete(); return Http::FilterTrailersStatus::StopIteration; @@ -488,15 +488,12 @@ void Filter::setDecoderFilterCallbacks(Http::StreamDecoderFilterCallbacks& callb } void Filter::cleanup() { - for (unsigned long i = 0; i < upstream_requests_.size(); i++) { - UpstreamRequest* upstream_request = upstream_requests_[i].get(); - if (upstream_request) { - if (final_upstream_request_ != nullptr && upstream_request == final_upstream_request_) { - callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); - } else { - upstream_request->resetStream(); // Idempotent. 
- } - upstream_requests_[i].reset(); + while (!upstream_requests_.empty()) { + UpstreamRequestPtr upstream_request = upstream_requests_.front()->removeFromList(upstream_requests_); + if (final_upstream_request_ != nullptr && upstream_request.get() == final_upstream_request_) { + callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); + } else { + upstream_request->resetStream(); // Idempotent. } } retry_state_.reset(); @@ -531,15 +528,7 @@ void Filter::onRequestComplete() { downstream_request_complete_time_ = dispatcher.timeSource().monotonicTime(); // Possible that we got an immediate reset. - bool any_upstreams = false; - for (unsigned long i = 0; i < upstream_requests_.size(); i++) { - if (upstream_requests_[i]) { - any_upstreams = true; - break; - } - } - - if (any_upstreams) { + if (!upstream_requests_.empty()) { // Even if we got an immediate reset, we could still shadow, but that is a riskier change and // seems unnecessary right now. maybeDoShadowing(); @@ -552,13 +541,6 @@ void Filter::onRequestComplete() { } void Filter::onDestroy() { - if (!attempting_internal_redirect_with_complete_stream_) { - for (unsigned long i = 0; i < upstream_requests_.size(); i++) { - if (upstream_requests_[i]) { - upstream_requests_[i]->resetStream(); - } - } - } cleanup(); } @@ -566,11 +548,11 @@ void Filter::onResponseTimeout() { ENVOY_STREAM_LOG(debug, "upstream timeout", *callbacks_); // Reset any upstream requests that are still in flight. - for (unsigned long i = 0; i < upstream_requests_.size(); i++) { + for (auto it = upstream_requests_.cbegin(); it != upstream_requests_.cend(); it++) { + UpstreamRequest* upstream_request = it->get(); // Don't record a timeout for upstream requests we've already seen headers // for. 
- UpstreamRequest* upstream_request = upstream_requests_[i].get(); - if (upstream_request && !upstream_request->upstream_headers_) { + if (!upstream_request->upstream_headers_) { cluster_->stats().upstream_rq_timeout_.inc(); if (upstream_request->upstream_host_) { upstream_request->upstream_host_->stats().rq_timeout_.inc(); @@ -811,8 +793,8 @@ void Filter::onUpstream100ContinueHeaders(Http::HeaderMapPtr&& headers, void Filter::resetOtherUpstreams(UpstreamRequest* upstream_request) { UpstreamRequest* upstream_request_tmp; - for (unsigned long i = 0; i < upstream_requests_.size(); i++) { - upstream_request_tmp = upstream_requests_[i].get(); + for (auto it = upstream_requests_.cbegin(); it != upstream_requests_.cend(); it++) { + upstream_request_tmp = it->get(); if (upstream_request_tmp != upstream_request) { if (!upstream_request_tmp->encode_complete_ || !upstream_request_tmp->decode_complete_) { upstream_request_tmp->resetStream(); @@ -835,30 +817,41 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head upstream_request->upstream_host_->healthChecker().setUnhealthy(); } + bool could_not_retry = false; + // Check if this upstream request was already retried, for instance after // hitting a per try timeout. Don't retry it if we already have. - if (retry_state_ && !upstream_request->retried_) { - RetryStatus retry_status = - retry_state_->shouldRetryHeaders(*headers, [this]() -> void { doRetry(); }); - // Capture upstream_host since setupRetry() in the following line will clear - // upstream_request. - const auto upstream_host = upstream_request->upstream_host_; - if (retry_status == RetryStatus::Yes && setupRetry()) { - if (!end_stream) { - upstream_request->resetStream(); - } + if (retry_state_) { + if (upstream_request->retried_) { + // We already retried this request (presumably for a per try timeout) so + // we definitely won't retry it again. Check if we would have retried it + // if we could. 
+ could_not_retry = retry_state_->wouldRetryFromHeaders(*headers); + } else { + RetryStatus retry_status = + retry_state_->shouldRetryHeaders(*headers, [this]() -> void { doRetry(); }); + // Capture upstream_host since setupRetry() in the following line will clear + // upstream_request. + const auto upstream_host = upstream_request->upstream_host_; + if (retry_status == RetryStatus::Yes && setupRetry()) { + if (!end_stream) { + upstream_request->resetStream(); + } - Http::CodeStats& code_stats = httpContext().codeStats(); - code_stats.chargeBasicResponseStat(cluster_->statsScope(), "retry.", - static_cast<Http::Code>(response_code)); - upstream_host->stats().rq_error_.inc(); - upstream_request->retried_ = true; - return; - } else if (retry_status == RetryStatus::NoOverflow) { - callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); - } else if (retry_status == RetryStatus::NoRetryLimitExceeded) { - callbacks_->streamInfo().setResponseFlag( - StreamInfo::ResponseFlag::UpstreamRetryLimitExceeded); + Http::CodeStats& code_stats = httpContext().codeStats(); + code_stats.chargeBasicResponseStat(cluster_->statsScope(), "retry.", + static_cast<Http::Code>(response_code)); + upstream_host->stats().rq_error_.inc(); + upstream_request->retried_ = true; + return; + } else if (retry_status == RetryStatus::NoOverflow) { + callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); + could_not_retry = true; + } else if (retry_status == RetryStatus::NoRetryLimitExceeded) { + callbacks_->streamInfo().setResponseFlag( + StreamInfo::ResponseFlag::UpstreamRetryLimitExceeded); + could_not_retry = true; + } } // Make sure any retry timers are destroyed since we may not call cleanup() if end_stream is @@ -874,6 +867,13 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head // next downstream. } + // Check if we got a "bad" response, but there are still upstream requests in + // flight awaiting headers or scheduled retries. 
If so, exit to give them a + // chance to return before returning a response downstream. + if (could_not_retry && (numRequestsAwaitingHeaders() > 0 || pending_retries_ > 0)) { + return; + } + // Only send upstream service time if we received the complete request and this is not a // premature response. if (DateUtil::timePointValid(downstream_request_complete_time_)) { @@ -1011,6 +1011,7 @@ void Filter::onUpstreamComplete(UpstreamRequest* upstream_request) { } bool Filter::setupRetry() { + pending_retries_++; // If we responded before the request was complete we don't bother doing a retry. This may not // catch certain cases where we are in full streaming mode and we have a connect timeout or an // overflow of some kind. However, in many cases deployments will use the buffer filter before @@ -1062,6 +1063,8 @@ bool Filter::setupRedirect(const Http::HeaderMap& headers, UpstreamRequest* upst void Filter::doRetry() { is_retry_ = true; attempt_count_++; + ASSERT(pending_retries_ > 0); + pending_retries_--; Http::ConnectionPool::Instance* conn_pool = getConnPool(); if (!conn_pool) { sendNoHealthyUpstreamResponse(); @@ -1076,7 +1079,7 @@ void Filter::doRetry() { ASSERT(response_timeout_ || timeout_.global_timeout_.count() == 0); UpstreamRequestPtr upstream_request = std::make_unique<UpstreamRequest>(*this, *conn_pool); UpstreamRequest* upstream_request_ptr = upstream_request.get(); - upstream_requests_.emplace_back(std::move(upstream_request)); + upstream_request->moveIntoList(std::move(upstream_request), upstream_requests_); upstream_request_ptr->encodeHeaders(!callbacks_->decodingBuffer() && !downstream_trailers_); // It's possible we got immediately reset. 
if (upstream_request_ptr) { @@ -1092,6 +1095,17 @@ void Filter::doRetry() { } } +uint32_t Filter::numRequestsAwaitingHeaders() { + uint32_t ret = 0; + for (auto upstream_request = upstream_requests_.cbegin(); upstream_request != upstream_requests_.cend(); upstream_request++) { + if (!upstream_request->get()->upstream_headers_) { + ret++; + } + } + + return ret; +} + Filter::UpstreamRequest::UpstreamRequest(Filter& parent, Http::ConnectionPool::Instance& pool) : parent_(parent), conn_pool_(pool), grpc_rq_success_deferred_(false), stream_info_(pool.protocol(), parent_.callbacks_->dispatcher().timeSource()), diff --git a/source/common/router/router.h b/source/common/router/router.h index 836527347bd48..98c4b8ca96ed3 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -21,6 +21,7 @@ #include "common/buffer/watermark_buffer.h" #include "common/common/hash.h" #include "common/common/hex.h" +#include "common/common/linked_object.h" #include "common/common/logger.h" #include "common/config/well_known_names.h" #include "common/http/utility.h" @@ -282,7 +283,8 @@ class Filter : Logger::Loggable, private: struct UpstreamRequest : public Http::StreamDecoder, public Http::StreamCallbacks, - public Http::ConnectionPool::Callbacks { + public Http::ConnectionPool::Callbacks, + public LinkedObject { UpstreamRequest(Filter& parent, Http::ConnectionPool::Instance& pool); ~UpstreamRequest(); @@ -393,6 +395,7 @@ class Filter : Logger::Loggable, Http::ConnectionPool::Instance* getConnPool(); void maybeDoShadowing(); bool maybeRetryReset(Http::StreamResetReason reset_reason, UpstreamRequest* upstream_request); + uint32_t numRequestsAwaitingHeaders(); void onGlobalTimeout(); void onPerTryTimeout(UpstreamRequest* upstream_request); void onRequestComplete(); @@ -440,7 +443,7 @@ class Filter : Logger::Loggable, FilterUtility::TimeoutData timeout_; FilterUtility::HedgingParams hedging_params_; Http::Code timeout_response_code_ = Http::Code::GatewayTimeout; - 
std::vector<UpstreamRequestPtr> upstream_requests_; + std::list<UpstreamRequestPtr> upstream_requests_; // Tracks which upstream request "wins" and will have the corresponding // response forwarded downstream UpstreamRequest* final_upstream_request_; @@ -461,6 +464,7 @@ class Filter : Logger::Loggable, bool include_attempt_count_ : 1; bool attempting_internal_redirect_with_complete_stream_ : 1; uint32_t attempt_count_{1}; + int32_t pending_retries_{0}; }; class ProdFilter : public Filter { diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 35362c0b67f85..2b6b0379c0c17 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -1443,11 +1443,76 @@ TEST_F(RouterTest, RetryOnlyOnceForSameUpstreamRequest) { Http::HeaderMapPtr response_headers1(new Http::TestHeaderMapImpl{{":status", "500"}}); EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(500)); EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).Times(0); + EXPECT_CALL(*router_.retry_state_, wouldRetryFromHeaders(_)).WillOnce(Return(true)); response_decoder1->decodeHeaders(std::move(response_headers1), true); response_timeout_->callback_(); } +// Sequence: upstream request hits soft per try timeout and is retried, and +// then "bad" response headers come back before the retry has been scheduled. +// Ensures that the "bad" headers are not sent downstream because there is +// still an attempt pending. 
+TEST_F(RouterTest, BadHeadersDroppedIfPreviousRetryScheduled) { + enableHedgeOnPerTryTimeout(); + + NiceMock encoder1; + Http::StreamDecoder* response_decoder1 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder1 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder1, cm_.conn_pool_.host_); + return nullptr; + })); + expectResponseTimerCreate(); + expectPerTryTimerCreate(); + + Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; + HttpTestUtility::addDefaultHeaders(headers); + router_.decodeHeaders(headers, true); + + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(0); + + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); + router_.retry_state_->expectHedgedPerTryTimeoutRetry(); + per_try_timeout_->callback_(); + + expectPerTryTimerCreate(); + + // Now send a 5xx back and make sure we don't ask whether we should retry it + // and also that we don't respond downstream with it. + Http::HeaderMapPtr response_headers1(new Http::TestHeaderMapImpl{{":status", "500"}}); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(500)); + EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).Times(0); + EXPECT_CALL(*router_.retry_state_, wouldRetryFromHeaders(_)).WillOnce(Return(true)); + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)).Times(0); + response_decoder1->decodeHeaders(std::move(response_headers1), true); + + // Now trigger the retry for the per try timeout earlier. 
+ NiceMock encoder2; + Http::StreamDecoder* response_decoder2 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder2 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder2, cm_.conn_pool_.host_); + return nullptr; + })); + router_.retry_state_->callback_(); + + Http::HeaderMapPtr response_headers2(new Http::TestHeaderMapImpl{{":status", "200"}}); + EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).WillOnce(Return(RetryStatus::No)); + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)) + .WillOnce(Invoke([&](Http::HeaderMap& headers, bool end_stream) -> void { + EXPECT_EQ(headers.Status()->value(), "200"); + EXPECT_TRUE(end_stream); + })); + response_decoder2->decodeHeaders(std::move(response_headers2), true); +} + TEST_F(RouterTest, RetryRequestNotComplete) { NiceMock encoder1; Http::StreamDecoder* response_decoder = nullptr; diff --git a/test/mocks/router/mocks.h b/test/mocks/router/mocks.h index cb90e04ed939b..2672c543158b7 100644 --- a/test/mocks/router/mocks.h +++ b/test/mocks/router/mocks.h @@ -112,6 +112,7 @@ class MockRetryState : public RetryState { MOCK_METHOD0(enabled, bool()); MOCK_METHOD2(shouldRetryHeaders, RetryStatus(const Http::HeaderMap& response_headers, DoRetryCallback callback)); + MOCK_METHOD1(wouldRetryFromHeaders, bool(const Http::HeaderMap& response_headers)); MOCK_METHOD2(shouldRetryReset, RetryStatus(const Http::StreamResetReason reset_reason, DoRetryCallback callback)); MOCK_METHOD1(shouldHedgeRetryPerTryTimeout, RetryStatus(DoRetryCallback callback)); From 47771436c1f666fc689480f8f3e8d5d88f4baa1e Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 12 Mar 2019 12:09:02 -0400 Subject: [PATCH 07/70] fix format Signed-off-by: Michael Puncel --- source/common/router/router.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 
deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 6a699403e85a5..f58ff82c9496c 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -489,7 +489,8 @@ void Filter::setDecoderFilterCallbacks(Http::StreamDecoderFilterCallbacks& callb void Filter::cleanup() { while (!upstream_requests_.empty()) { - UpstreamRequestPtr upstream_request = upstream_requests_.front()->removeFromList(upstream_requests_); + UpstreamRequestPtr upstream_request = + upstream_requests_.front()->removeFromList(upstream_requests_); if (final_upstream_request_ != nullptr && upstream_request.get() == final_upstream_request_) { callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); } else { @@ -540,9 +541,7 @@ void Filter::onRequestComplete() { } } -void Filter::onDestroy() { - cleanup(); -} +void Filter::onDestroy() { cleanup(); } void Filter::onResponseTimeout() { ENVOY_STREAM_LOG(debug, "upstream timeout", *callbacks_); @@ -1097,7 +1096,8 @@ void Filter::doRetry() { uint32_t Filter::numRequestsAwaitingHeaders() { uint32_t ret = 0; - for (auto upstream_request = upstream_requests_.cbegin(); upstream_request != upstream_requests_.cend(); upstream_request++) { + for (auto upstream_request = upstream_requests_.cbegin(); + upstream_request != upstream_requests_.cend(); upstream_request++) { if (!upstream_request->get()->upstream_headers_) { ret++; } From 140d4ad42737c1e2f97a2d84e8c9449e87fc6522 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 12 Mar 2019 12:24:25 -0400 Subject: [PATCH 08/70] fix spelling in a comment Signed-off-by: Michael Puncel --- source/common/router/router.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index f58ff82c9496c..b2fafabae5784 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -1280,7 +1280,7 @@ void 
Filter::UpstreamRequest::onPerTryTimeout() { ENVOY_STREAM_LOG(debug, "upstream per try timeout", *parent_.callbacks_); // Set response flag to UT for now, but it might be overwritten if a - // response arrives later and hedg_on_per_try_timeout_ is set + // response arrives later and hedge_on_per_try_timeout_ is set stream_info_.setResponseFlag(StreamInfo::ResponseFlag::UpstreamRequestTimeout); parent_.onPerTryTimeout(this); } else { From 655d126bd54dcf3d5ddb2fc8d636cf08bd19b958 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 12 Mar 2019 15:29:25 -0400 Subject: [PATCH 09/70] more build fixes Signed-off-by: Michael Puncel --- source/common/router/router.cc | 13 ++++++++----- test/common/router/router_test.cc | 3 +++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index b2fafabae5784..deef56f2a5b6d 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -490,7 +490,7 @@ void Filter::setDecoderFilterCallbacks(Http::StreamDecoderFilterCallbacks& callb void Filter::cleanup() { while (!upstream_requests_.empty()) { UpstreamRequestPtr upstream_request = - upstream_requests_.front()->removeFromList(upstream_requests_); + upstream_requests_.back()->removeFromList(upstream_requests_); if (final_upstream_request_ != nullptr && upstream_request.get() == final_upstream_request_) { callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); } else { @@ -851,11 +851,8 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head StreamInfo::ResponseFlag::UpstreamRetryLimitExceeded); could_not_retry = true; } - } - // Make sure any retry timers are destroyed since we may not call cleanup() if end_stream is - // false. 
- retry_state_.reset(); + } } if (static_cast(response_code) == Http::Code::Found && @@ -873,6 +870,12 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head return; } + // Make sure any retry timers are destroyed since we may not call cleanup() if end_stream is + // false. + if (retry_state_) { + retry_state_.reset(); + } + // Only send upstream service time if we received the complete request and this is not a // premature response. if (DateUtil::timePointValid(downstream_request_complete_time_)) { diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 2b6b0379c0c17..ad35c169579e9 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -1446,6 +1446,8 @@ TEST_F(RouterTest, RetryOnlyOnceForSameUpstreamRequest) { EXPECT_CALL(*router_.retry_state_, wouldRetryFromHeaders(_)).WillOnce(Return(true)); response_decoder1->decodeHeaders(std::move(response_headers1), true); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); + response_timeout_->callback_(); } @@ -1510,6 +1512,7 @@ TEST_F(RouterTest, BadHeadersDroppedIfPreviousRetryScheduled) { EXPECT_EQ(headers.Status()->value(), "200"); EXPECT_TRUE(end_stream); })); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(200)); response_decoder2->decodeHeaders(std::move(response_headers2), true); } From 86d61e52765181887a69d94c8a36f612c3dbd55f Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 12 Mar 2019 16:26:04 -0400 Subject: [PATCH 10/70] fix format Signed-off-by: Michael Puncel --- source/common/router/router.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index deef56f2a5b6d..9d27fa1a650b2 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -851,7 +851,6 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head 
StreamInfo::ResponseFlag::UpstreamRetryLimitExceeded); could_not_retry = true; } - } } From a12c667d0d0724a45ec77083fb22c996db32700b Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 12 Mar 2019 16:34:28 -0400 Subject: [PATCH 11/70] run resetStream always on upstream requests to ensure callbacks are removed Signed-off-by: Michael Puncel --- source/common/router/router.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 9d27fa1a650b2..c4ea6ebcf418d 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -493,9 +493,8 @@ void Filter::cleanup() { upstream_requests_.back()->removeFromList(upstream_requests_); if (final_upstream_request_ != nullptr && upstream_request.get() == final_upstream_request_) { callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); - } else { - upstream_request->resetStream(); // Idempotent. } + upstream_request->resetStream(); // Idempotent. 
} retry_state_.reset(); if (response_timeout_) { From 1617ffc7b21600efedbf6a141eb7a2d088fe3d10 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 12 Mar 2019 17:25:10 -0400 Subject: [PATCH 12/70] proxy all upstream metadata downstream Signed-off-by: Michael Puncel --- source/common/router/router.cc | 9 +++------ source/common/router/router.h | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index c4ea6ebcf418d..002cd3ac35fc8 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -949,11 +949,8 @@ void Filter::onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest* } } -void Filter::onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map, - UpstreamRequest* upstream_request) { - if (final_upstream_request_ == upstream_request) { - callbacks_->encodeMetadata(std::move(metadata_map)); - } +void Filter::onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map) { + callbacks_->encodeMetadata(std::move(metadata_map)); } void Filter::onUpstreamComplete(UpstreamRequest* upstream_request) { @@ -1172,7 +1169,7 @@ void Filter::UpstreamRequest::decodeTrailers(Http::HeaderMapPtr&& trailers) { } void Filter::UpstreamRequest::decodeMetadata(Http::MetadataMapPtr&& metadata_map) { - parent_.onUpstreamMetadata(std::move(metadata_map), this); + parent_.onUpstreamMetadata(std::move(metadata_map)); } void Filter::UpstreamRequest::maybeEndDecode(bool end_stream) { diff --git a/source/common/router/router.h b/source/common/router/router.h index 98c4b8ca96ed3..5edf94e7fe1f4 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -415,7 +415,7 @@ class Filter : Logger::Loggable, UpstreamRequest* upstream_request, bool end_stream); void onUpstreamData(Buffer::Instance& data, UpstreamRequest* upstream_request, bool end_stream); void onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest* upstream_request); - void 
onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map, UpstreamRequest* upstream_request); + void onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map); void onUpstreamComplete(UpstreamRequest* upstream_request); void onUpstreamReset(Http::StreamResetReason reset_reason, absl::string_view transport_failure, UpstreamRequest* upstream_request); From 89d0fb4c7c0770b1f522c5eae39552168db4130c Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Wed, 13 Mar 2019 09:04:59 -0400 Subject: [PATCH 13/70] use for each syntax to loop over upstream request list Signed-off-by: Michael Puncel --- source/common/router/router.cc | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 002cd3ac35fc8..33c4efc13cfd6 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -441,8 +441,7 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea do_shadowing_ = false; } - for (auto it = upstream_requests_.cbegin(); it != upstream_requests_.cend(); it++) { - UpstreamRequest* upstream_request = it->get(); + for (auto& upstream_request : upstream_requests_) { if (buffering) { // If we are going to buffer for retries or shadowing, we need to make a copy before encoding // since it's all moves from here on. 
@@ -472,8 +471,8 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea Http::FilterTrailersStatus Filter::decodeTrailers(Http::HeaderMap& trailers) { ENVOY_STREAM_LOG(debug, "router decoding trailers:\n{}", *callbacks_, trailers); downstream_trailers_ = &trailers; - for (auto it = upstream_requests_.cbegin(); it != upstream_requests_.cend(); it++) { - it->get()->encodeTrailers(trailers); + for (auto& upstream_request : upstream_requests_) { + upstream_request->encodeTrailers(trailers); } onRequestComplete(); return Http::FilterTrailersStatus::StopIteration; @@ -546,8 +545,7 @@ void Filter::onResponseTimeout() { ENVOY_STREAM_LOG(debug, "upstream timeout", *callbacks_); // Reset any upstream requests that are still in flight. - for (auto it = upstream_requests_.cbegin(); it != upstream_requests_.cend(); it++) { - UpstreamRequest* upstream_request = it->get(); + for (auto& upstream_request : upstream_requests_) { // Don't record a timeout for upstream requests we've already seen headers // for. if (!upstream_request->upstream_headers_) { @@ -559,11 +557,11 @@ void Filter::onResponseTimeout() { // If this upstream request already hit a "soft" timeout, then it // already recorded a timeout into outlier detection. Don't do it again. 
if (!upstream_request->outlier_detection_timeout_recorded_) { - updateOutlierDetection(timeout_response_code_, upstream_request); + updateOutlierDetection(timeout_response_code_, upstream_request.get()); } upstream_request->resetStream(); - chargeUpstreamAbort(timeout_response_code_, false, upstream_request); + chargeUpstreamAbort(timeout_response_code_, false, upstream_request.get()); } } @@ -790,10 +788,8 @@ void Filter::onUpstream100ContinueHeaders(Http::HeaderMapPtr&& headers, } void Filter::resetOtherUpstreams(UpstreamRequest* upstream_request) { - UpstreamRequest* upstream_request_tmp; - for (auto it = upstream_requests_.cbegin(); it != upstream_requests_.cend(); it++) { - upstream_request_tmp = it->get(); - if (upstream_request_tmp != upstream_request) { + for (auto& upstream_request_tmp : upstream_requests_) { + if (upstream_request_tmp.get() != upstream_request) { if (!upstream_request_tmp->encode_complete_ || !upstream_request_tmp->decode_complete_) { upstream_request_tmp->resetStream(); if (upstream_request_tmp->upstream_host_) { @@ -1094,9 +1090,8 @@ void Filter::doRetry() { uint32_t Filter::numRequestsAwaitingHeaders() { uint32_t ret = 0; - for (auto upstream_request = upstream_requests_.cbegin(); - upstream_request != upstream_requests_.cend(); upstream_request++) { - if (!upstream_request->get()->upstream_headers_) { + for (auto& upstream_request : upstream_requests_) { + if (!upstream_request->upstream_headers_) { ret++; } } From 02bc109e31540824f24bd7ff80fb2b70d4c22572 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Thu, 14 Mar 2019 08:40:42 -0400 Subject: [PATCH 14/70] PR feedback Signed-off-by: Michael Puncel --- api/envoy/api/v2/route/route.proto | 6 +++++- source/common/router/router.cc | 27 +++++++++++---------------- source/common/router/router.h | 2 +- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/api/envoy/api/v2/route/route.proto b/api/envoy/api/v2/route/route.proto index cb503608c1332..ce6c6de9ab650 100644 --- 
a/api/envoy/api/v2/route/route.proto +++ b/api/envoy/api/v2/route/route.proto @@ -873,7 +873,11 @@ message HedgePolicy { // Indicates that a hedged request should be sent when the per-try timeout // is hit. This will only occur if the retry policy also indicates that a - // timed out request should be retried. Defaults to false. + // timed out request should be retried. + // Once a timed out request is retried due to per try timeout, it will not be + // retried again even if the returned response headers would otherwise be retried + // according to the specified :ref:`RetryPolicy `. + // Defaults to false. bool hedge_on_per_try_timeout = 3; } diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 33c4efc13cfd6..8d1a0a313a18f 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -397,9 +397,8 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e ENVOY_STREAM_LOG(debug, "router decoding headers:\n{}", *callbacks_, headers); UpstreamRequestPtr upstream_request = std::make_unique(*this, *conn_pool); - UpstreamRequest* upstream_request_ptr = upstream_request.get(); upstream_request->moveIntoList(std::move(upstream_request), upstream_requests_); - upstream_request_ptr->encodeHeaders(end_stream); + upstream_requests_.front()->encodeHeaders(end_stream); if (end_stream) { onRequestComplete(); } @@ -442,14 +441,11 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea } for (auto& upstream_request : upstream_requests_) { - if (buffering) { - // If we are going to buffer for retries or shadowing, we need to make a copy before encoding - // since it's all moves from here on.
- Buffer::OwnedImpl copy(data); - upstream_request->encodeData(copy, end_stream); - } else { - upstream_request->encodeData(data, end_stream); - } + // We need to make a copy before encoding since it's all moves from here on + // and we might have multiple upstream requests or traffic + // shadowing/retries. + Buffer::OwnedImpl copy(data); + upstream_request->encodeData(copy, end_stream); } if (buffering) { @@ -822,7 +818,7 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head // if we could. could_not_retry = retry_state_->wouldRetryFromHeaders(*headers); } else { - RetryStatus retry_status = + const RetryStatus retry_status = retry_state_->shouldRetryHeaders(*headers, [this]() -> void { doRetry(); }); // Capture upstream_host since setupRetry() in the following line will clear // upstream_request. @@ -1071,19 +1067,18 @@ void Filter::doRetry() { ASSERT(response_timeout_ || timeout_.global_timeout_.count() == 0); UpstreamRequestPtr upstream_request = std::make_unique(*this, *conn_pool); - UpstreamRequest* upstream_request_ptr = upstream_request.get(); upstream_request->moveIntoList(std::move(upstream_request), upstream_requests_); - upstream_request_ptr->encodeHeaders(!callbacks_->decodingBuffer() && !downstream_trailers_); + upstream_requests_.front()->encodeHeaders(!callbacks_->decodingBuffer() && !downstream_trailers_); // It's possible we got immediately reset. - if (upstream_request_ptr) { + if (upstream_requests_.front()) { if (callbacks_->decodingBuffer()) { // If we are doing a retry we need to make a copy. 
Buffer::OwnedImpl copy(*callbacks_->decodingBuffer()); - upstream_request_ptr->encodeData(copy, !downstream_trailers_); + upstream_requests_.front()->encodeData(copy, !downstream_trailers_); } if (downstream_trailers_) { - upstream_request_ptr->encodeTrailers(*downstream_trailers_); + upstream_requests_.front()->encodeTrailers(*downstream_trailers_); } } } diff --git a/source/common/router/router.h b/source/common/router/router.h index 5edf94e7fe1f4..b3886622ec9cc 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -464,7 +464,7 @@ class Filter : Logger::Loggable, bool include_attempt_count_ : 1; bool attempting_internal_redirect_with_complete_stream_ : 1; uint32_t attempt_count_{1}; - int32_t pending_retries_{0}; + uint32_t pending_retries_{0}; }; class ProdFilter : public Filter { From 7b50e77cb5fd25522b65d08ab4a5f1f7e7547c0b Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Sun, 17 Mar 2019 16:09:15 -0400 Subject: [PATCH 15/70] PR feedback Signed-off-by: Michael Puncel --- .../root/intro/arch_overview/http_routing.rst | 6 +- source/common/router/router.cc | 167 +++++++++--------- source/common/router/router.h | 31 ++-- 3 files changed, 106 insertions(+), 98 deletions(-) diff --git a/docs/root/intro/arch_overview/http_routing.rst b/docs/root/intro/arch_overview/http_routing.rst index cf92be2828981..977699eacf719 100644 --- a/docs/root/intro/arch_overview/http_routing.rst +++ b/docs/root/intro/arch_overview/http_routing.rst @@ -35,7 +35,7 @@ request. The router filter supports the following features: * Request timeout specified either via :ref:`HTTP header ` or via :ref:`route configuration `. -* :ref:`Request hedging ` in response to a request (per try) timeout. +* :ref:`Request hedging ` for retries in response to a request (per try) timeout. * Traffic shifting from one upstream cluster to another via :ref:`runtime values ` (see :ref:`traffic shifting/splitting `). 
@@ -96,7 +96,9 @@ Request Hedging Envoy supports request hedging via specifying a :ref:`hedge policy `. This means that Envoy will race multiple simultaneous upstream requests and return the first valid response to the downstream. -Currently hedging can only be performed in response to a request timeout. +Currently hedging can only be applied to retries performed in response to a request timeout. The implementation ensures that +the same upstream request is not retried twice, for instance if it times out and then later receives a 5xx response and the +retry policy calls for retrying on 5xx. .. _arch_overview_http_routing_priority: diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 8d1a0a313a18f..adbd17d338d15 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -430,7 +430,8 @@ void Filter::sendNoHealthyUpstreamResponse() { } Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_stream) { - bool buffering = (retry_state_ && retry_state_->enabled()) || do_shadowing_; + bool buffering = + (retry_state_ && retry_state_->enabled()) || do_shadowing_ || upstream_requests_.size() > 1; if (buffering && buffer_limit_ > 0 && getLength(callbacks_->decodingBuffer()) + data.length() > buffer_limit_) { // The request is larger than we should buffer. Give up on the retry/shadow @@ -441,11 +442,15 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea } for (auto& upstream_request : upstream_requests_) { - // We need to make a copy before encoding since it's all moves from here on - // and we might have multiple upstream requests or traffic - // shadowing/retries. - Buffer::OwnedImpl copy(data); - upstream_request->encodeData(copy, end_stream); + if (buffering) { + // We need to make a copy before encoding since it's all moves from here + // on if we might have multiple upstream requests or traffic + // shadowing/retries. 
+ Buffer::OwnedImpl copy(data); + upstream_request->encodeData(copy, end_stream); + } else { + upstream_request->encodeData(data, end_stream); + } } if (buffering) { @@ -486,7 +491,7 @@ void Filter::cleanup() { while (!upstream_requests_.empty()) { UpstreamRequestPtr upstream_request = upstream_requests_.back()->removeFromList(upstream_requests_); - if (final_upstream_request_ != nullptr && upstream_request.get() == final_upstream_request_) { + if (upstream_request.get() == final_upstream_request_) { callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); } upstream_request->resetStream(); // Idempotent. @@ -553,11 +558,11 @@ void Filter::onResponseTimeout() { // If this upstream request already hit a "soft" timeout, then it // already recorded a timeout into outlier detection. Don't do it again. if (!upstream_request->outlier_detection_timeout_recorded_) { - updateOutlierDetection(timeout_response_code_, upstream_request.get()); + updateOutlierDetection(timeout_response_code_, *upstream_request.get()); } upstream_request->resetStream(); - chargeUpstreamAbort(timeout_response_code_, false, upstream_request.get()); + chargeUpstreamAbort(timeout_response_code_, false, *upstream_request.get()); } } @@ -566,15 +571,13 @@ void Filter::onResponseTimeout() { // Called when the per try timeout is hit but we didn't reset the request // (hedge_on_per_try_timeout enabled). -void Filter::onSoftPerTryTimeout(UpstreamRequest* upstream_request) { +void Filter::onSoftPerTryTimeout(UpstreamRequest& upstream_request) { // Even though we didn't cancel the request yet we still want to track it // in outlier detection. // TODO(mpuncel) is it weird to have a pretend response code here? we might // get a 200 back from this request later. 
updateOutlierDetection(timeout_response_code_, upstream_request); - upstream_request->outlier_detection_timeout_recorded_ = true; - - Upstream::HostDescriptionConstSharedPtr upstream_host = upstream_request->upstream_host_; + upstream_request.outlier_detection_timeout_recorded_ = true; if (!downstream_response_started_ && retry_state_) { RetryStatus retry_status = @@ -585,7 +588,7 @@ void Filter::onSoftPerTryTimeout(UpstreamRequest* upstream_request) { // Don't increment upstream_host->stats().rq_error_ here, we'll do that // later if 1) we hit global timeout or 2) we get bad response headers // back. - upstream_request->retried_ = true; + upstream_request.retried_ = true; } else if (retry_status == RetryStatus::NoOverflow) { callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); } else if (retry_status == RetryStatus::NoRetryLimitExceeded) { @@ -595,22 +598,23 @@ void Filter::onSoftPerTryTimeout(UpstreamRequest* upstream_request) { } } -void Filter::onPerTryTimeout(UpstreamRequest* upstream_request) { +void Filter::onPerTryTimeout(UpstreamRequest& upstream_request) { if (hedging_params_.hedge_on_per_try_timeout_) { onSoftPerTryTimeout(upstream_request); return; } cluster_->stats().upstream_rq_per_try_timeout_.inc(); - if (upstream_request->upstream_host_) { - upstream_request->upstream_host_->stats().rq_timeout_.inc(); + if (upstream_request.upstream_host_) { + upstream_request.upstream_host_->stats().rq_timeout_.inc(); } - upstream_request->resetStream(); + upstream_request.resetStream(); updateOutlierDetection(timeout_response_code_, upstream_request); if (maybeRetryReset(Http::StreamResetReason::LocalReset, upstream_request)) { + upstream_request.removeFromList(upstream_requests_); return; } @@ -618,20 +622,20 @@ void Filter::onPerTryTimeout(UpstreamRequest* upstream_request) { onUpstreamTimeoutAbort(StreamInfo::ResponseFlag::UpstreamRequestTimeout); } -void Filter::updateOutlierDetection(Http::Code code, UpstreamRequest* 
upstream_request) { - if (upstream_request->upstream_host_) { - upstream_request->upstream_host_->outlierDetector().putHttpResponseCode(enumToInt(code)); +void Filter::updateOutlierDetection(Http::Code code, UpstreamRequest& upstream_request) { + if (upstream_request.upstream_host_) { + upstream_request.upstream_host_->outlierDetector().putHttpResponseCode(enumToInt(code)); } } -void Filter::chargeUpstreamAbort(Http::Code code, bool dropped, UpstreamRequest* upstream_request) { +void Filter::chargeUpstreamAbort(Http::Code code, bool dropped, UpstreamRequest& upstream_request) { if (downstream_response_started_) { - if (upstream_request != nullptr && upstream_request->grpc_rq_success_deferred_) { - upstream_request->upstream_host_->stats().rq_error_.inc(); + if (upstream_request.grpc_rq_success_deferred_) { + upstream_request.upstream_host_->stats().rq_error_.inc(); config_.stats_.rq_reset_after_downstream_response_started_.inc(); } } else { - Upstream::HostDescriptionConstSharedPtr upstream_host = upstream_request->upstream_host_; + Upstream::HostDescriptionConstSharedPtr upstream_host = upstream_request.upstream_host_; chargeUpstreamCode(code, upstream_host, dropped); // If we had non-5xx but still have been reset by backend or timeout before @@ -676,7 +680,7 @@ void Filter::onUpstreamAbort(Http::Code code, StreamInfo::ResponseFlag response_ } bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason, - UpstreamRequest* upstream_request) { + UpstreamRequest& upstream_request) { // We don't retry if we already started the response. 
if (downstream_response_started_ || !retry_state_) { return false; @@ -685,8 +689,8 @@ bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason, const RetryStatus retry_status = retry_state_->shouldRetryReset(reset_reason, [this]() -> void { doRetry(); }); if (retry_status == RetryStatus::Yes && setupRetry()) { - if (upstream_request->upstream_host_) { - upstream_request->upstream_host_->stats().rq_error_.inc(); + if (upstream_request.upstream_host_) { + upstream_request.upstream_host_->stats().rq_error_.inc(); } return true; } else if (retry_status == RetryStatus::NoOverflow) { @@ -700,7 +704,7 @@ bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason, void Filter::onUpstreamReset(Http::StreamResetReason reset_reason, absl::string_view transport_failure_reason, - UpstreamRequest* upstream_request) { + UpstreamRequest& upstream_request) { ENVOY_STREAM_LOG(debug, "upstream reset: reset reason {}", *callbacks_, Http::Utility::resetReasonToString(reset_reason)); @@ -710,7 +714,7 @@ void Filter::onUpstreamReset(Http::StreamResetReason reset_reason, return; } - bool dropped = reset_reason == Http::StreamResetReason::Overflow; + const bool dropped = reset_reason == Http::StreamResetReason::Overflow; chargeUpstreamAbort(Http::Code::ServiceUnavailable, dropped, upstream_request); const StreamInfo::ResponseFlag response_flags = streamResetReasonToResponseFlag(reset_reason); @@ -743,7 +747,7 @@ Filter::streamResetReasonToResponseFlag(Http::StreamResetReason reset_reason) { } void Filter::handleNon5xxResponseHeaders(const Http::HeaderMap& headers, - UpstreamRequest* upstream_request, bool end_stream) { + UpstreamRequest& upstream_request, bool end_stream) { // We need to defer gRPC success until after we have processed grpc-status in // the trailers. 
if (grpc_request_) { @@ -751,25 +755,25 @@ void Filter::handleNon5xxResponseHeaders(const Http::HeaderMap& headers, absl::optional grpc_status = Grpc::Common::getGrpcStatus(headers); if (grpc_status && !Http::CodeUtility::is5xx(Grpc::Utility::grpcToHttpStatus(grpc_status.value()))) { - upstream_request->upstream_host_->stats().rq_success_.inc(); + upstream_request.upstream_host_->stats().rq_success_.inc(); } else { - upstream_request->upstream_host_->stats().rq_error_.inc(); + upstream_request.upstream_host_->stats().rq_error_.inc(); } } else { - upstream_request->grpc_rq_success_deferred_ = true; + upstream_request.grpc_rq_success_deferred_ = true; } } else { - upstream_request->upstream_host_->stats().rq_success_.inc(); + upstream_request.upstream_host_->stats().rq_success_.inc(); } } void Filter::onUpstream100ContinueHeaders(Http::HeaderMapPtr&& headers, - UpstreamRequest* upstream_request) { + UpstreamRequest& upstream_request) { ENVOY_STREAM_LOG(debug, "upstream 100 continue", *callbacks_); if (!downstream_response_started_) { downstream_response_started_ = true; - final_upstream_request_ = upstream_request; + final_upstream_request_ = &upstream_request; resetOtherUpstreams(upstream_request); } // Don't send retries after 100-Continue has been sent on. Arguably we could attempt to do a @@ -778,33 +782,31 @@ void Filter::onUpstream100ContinueHeaders(Http::HeaderMapPtr&& headers, // the complexity until someone asks for it. 
retry_state_.reset(); - if (final_upstream_request_ == upstream_request) { + if (final_upstream_request_ == &upstream_request) { callbacks_->encode100ContinueHeaders(std::move(headers)); } } -void Filter::resetOtherUpstreams(UpstreamRequest* upstream_request) { +void Filter::resetOtherUpstreams(UpstreamRequest& upstream_request) { for (auto& upstream_request_tmp : upstream_requests_) { - if (upstream_request_tmp.get() != upstream_request) { - if (!upstream_request_tmp->encode_complete_ || !upstream_request_tmp->decode_complete_) { - upstream_request_tmp->resetStream(); - if (upstream_request_tmp->upstream_host_) { - upstream_request_tmp->upstream_host_->stats().rq_hedge_abandoned_.inc(); - } - cluster_->stats().upstream_rq_hedge_abandoned_.inc(); + if (upstream_request_tmp.get() != &upstream_request) { + upstream_request_tmp->resetStream(); + if (upstream_request_tmp->upstream_host_) { + upstream_request_tmp->upstream_host_->stats().rq_hedge_abandoned_.inc(); } + cluster_->stats().upstream_rq_hedge_abandoned_.inc(); } } } void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& headers, - UpstreamRequest* upstream_request, bool end_stream) { + UpstreamRequest& upstream_request, bool end_stream) { ENVOY_STREAM_LOG(debug, "upstream headers complete: end_stream={}", *callbacks_, end_stream); - upstream_request->upstream_host_->outlierDetector().putHttpResponseCode(response_code); + upstream_request.upstream_host_->outlierDetector().putHttpResponseCode(response_code); if (headers->EnvoyImmediateHealthCheckFail() != nullptr) { - upstream_request->upstream_host_->healthChecker().setUnhealthy(); + upstream_request.upstream_host_->healthChecker().setUnhealthy(); } bool could_not_retry = false; @@ -812,7 +814,7 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head // Check if this upstream request was already retried, for instance after // hitting a per try timeout. Don't retry it if we already have. 
if (retry_state_) { - if (upstream_request->retried_) { + if (upstream_request.retried_) { // We already retried this request (presumably for a per try timeout) so // we definitely won't retry it again. Check if we would have retried it // if we could. @@ -822,17 +824,18 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head retry_state_->shouldRetryHeaders(*headers, [this]() -> void { doRetry(); }); // Capture upstream_host since setupRetry() in the following line will clear // upstream_request. - const auto upstream_host = upstream_request->upstream_host_; + const auto upstream_host = upstream_request.upstream_host_; if (retry_status == RetryStatus::Yes && setupRetry()) { if (!end_stream) { - upstream_request->resetStream(); + upstream_request.resetStream(); } + upstream_request.removeFromList(upstream_requests_); Http::CodeStats& code_stats = httpContext().codeStats(); code_stats.chargeBasicResponseStat(cluster_->statsScope(), "retry.", static_cast(response_code)); upstream_host->stats().rq_error_.inc(); - upstream_request->retried_ = true; + upstream_request.retried_ = true; return; } else if (retry_status == RetryStatus::NoOverflow) { callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); @@ -878,10 +881,10 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head } } - upstream_request->upstream_canary_ = + upstream_request.upstream_canary_ = (headers->EnvoyUpstreamCanary() && headers->EnvoyUpstreamCanary()->value() == "true") || - upstream_request->upstream_host_->canary(); - chargeUpstreamCode(response_code, *headers, upstream_request->upstream_host_, false); + upstream_request.upstream_host_->canary(); + chargeUpstreamCode(response_code, *headers, upstream_request.upstream_host_, false); if (!Http::CodeUtility::is5xx(response_code)) { handleNon5xxResponseHeaders(*headers, upstream_request, end_stream); } @@ -898,45 +901,45 @@ void Filter::onUpstreamHeaders(uint64_t 
response_code, Http::HeaderMapPtr&& head if (!downstream_response_started_) { downstream_response_started_ = true; - final_upstream_request_ = upstream_request; + final_upstream_request_ = &upstream_request; resetOtherUpstreams(upstream_request); } if (end_stream) { onUpstreamComplete(upstream_request); } - if (final_upstream_request_ == upstream_request) { + if (final_upstream_request_ == &upstream_request) { callbacks_->encodeHeaders(std::move(headers), end_stream); } } -void Filter::onUpstreamData(Buffer::Instance& data, UpstreamRequest* upstream_request, +void Filter::onUpstreamData(Buffer::Instance& data, UpstreamRequest& upstream_request, bool end_stream) { if (end_stream) { // gRPC request termination without trailers is an error. - if (upstream_request->grpc_rq_success_deferred_) { - upstream_request->upstream_host_->stats().rq_error_.inc(); + if (upstream_request.grpc_rq_success_deferred_) { + upstream_request.upstream_host_->stats().rq_error_.inc(); } onUpstreamComplete(upstream_request); } - if (final_upstream_request_ == upstream_request) { + if (final_upstream_request_ == &upstream_request) { callbacks_->encodeData(data, end_stream); } } -void Filter::onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest* upstream_request) { - if (upstream_request->grpc_rq_success_deferred_) { +void Filter::onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest& upstream_request) { + if (upstream_request.grpc_rq_success_deferred_) { absl::optional grpc_status = Grpc::Common::getGrpcStatus(*trailers); if (grpc_status && !Http::CodeUtility::is5xx(Grpc::Utility::grpcToHttpStatus(grpc_status.value()))) { - upstream_request->upstream_host_->stats().rq_success_.inc(); + upstream_request.upstream_host_->stats().rq_success_.inc(); } else { - upstream_request->upstream_host_->stats().rq_error_.inc(); + upstream_request.upstream_host_->stats().rq_error_.inc(); } } onUpstreamComplete(upstream_request); - if (final_upstream_request_ == upstream_request) { + 
if (final_upstream_request_ == &upstream_request) { callbacks_->encodeTrailers(std::move(trailers)); } } @@ -945,9 +948,9 @@ void Filter::onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map) { callbacks_->encodeMetadata(std::move(metadata_map)); } -void Filter::onUpstreamComplete(UpstreamRequest* upstream_request) { +void Filter::onUpstreamComplete(UpstreamRequest& upstream_request) { if (!downstream_end_stream_) { - upstream_request->resetStream(); + upstream_request.resetStream(); } if (config_.emit_dynamic_stats_ && !callbacks_->streamInfo().healthCheck() && @@ -956,7 +959,7 @@ void Filter::onUpstreamComplete(UpstreamRequest* upstream_request) { std::chrono::milliseconds response_time = std::chrono::duration_cast( dispatcher.timeSource().monotonicTime() - downstream_request_complete_time_); - upstream_request->upstream_host_->outlierDetector().putResponseTime(response_time); + upstream_request.upstream_host_->outlierDetector().putResponseTime(response_time); const Http::HeaderEntry* internal_request_header = downstream_headers_->EnvoyInternalRequest(); const bool internal_request = @@ -970,13 +973,13 @@ void Filter::onUpstreamComplete(UpstreamRequest* upstream_request) { cluster_->statsScope(), EMPTY_STRING, response_time, - upstream_request->upstream_canary_, + upstream_request.upstream_canary_, internal_request, route_entry_->virtualHost().name(), request_vcluster_ ? 
request_vcluster_->name() : EMPTY_STRING, zone_name, - upstreamZone(upstream_request->upstream_host_)}; + upstreamZone(upstream_request.upstream_host_)}; code_stats.chargeResponseTiming(info); @@ -985,12 +988,12 @@ void Filter::onUpstreamComplete(UpstreamRequest* upstream_request) { cluster_->statsScope(), alt_stat_prefix_, response_time, - upstream_request->upstream_canary_, + upstream_request.upstream_canary_, internal_request, EMPTY_STRING, EMPTY_STRING, zone_name, - upstreamZone(upstream_request->upstream_host_)}; + upstreamZone(upstream_request.upstream_host_)}; code_stats.chargeResponseTiming(info); } @@ -1015,7 +1018,7 @@ bool Filter::setupRetry() { return true; } -bool Filter::setupRedirect(const Http::HeaderMap& headers, UpstreamRequest* upstream_request) { +bool Filter::setupRedirect(const Http::HeaderMap& headers, UpstreamRequest& upstream_request) { ENVOY_STREAM_LOG(debug, "attempting internal redirect", *callbacks_); const Http::HeaderEntry* location = headers.Location(); @@ -1029,7 +1032,7 @@ bool Filter::setupRedirect(const Http::HeaderMap& headers, UpstreamRequest* upst // completion here and check it in onDestroy. This is annoyingly complicated but is better than // needlessly resetting streams. attempting_internal_redirect_with_complete_stream_ = - upstream_request->upstream_timing_.last_upstream_rx_byte_received_ && downstream_end_stream_; + upstream_request.upstream_timing_.last_upstream_rx_byte_received_ && downstream_end_stream_; // As with setupRetry, redirects are not supported for streaming requests yet. 
if (downstream_end_stream_ && @@ -1132,7 +1135,7 @@ Filter::UpstreamRequest::~UpstreamRequest() { void Filter::UpstreamRequest::decode100ContinueHeaders(Http::HeaderMapPtr&& headers) { ASSERT(100 == Http::Utility::getResponseStatus(*headers)); - parent_.onUpstream100ContinueHeaders(std::move(headers), this); + parent_.onUpstream100ContinueHeaders(std::move(headers), *this); } void Filter::UpstreamRequest::decodeHeaders(Http::HeaderMapPtr&& headers, bool end_stream) { @@ -1143,19 +1146,19 @@ void Filter::UpstreamRequest::decodeHeaders(Http::HeaderMapPtr&& headers, bool e upstream_headers_ = headers.get(); const uint64_t response_code = Http::Utility::getResponseStatus(*headers); stream_info_.response_code_ = static_cast(response_code); - parent_.onUpstreamHeaders(response_code, std::move(headers), this, end_stream); + parent_.onUpstreamHeaders(response_code, std::move(headers), *this, end_stream); } void Filter::UpstreamRequest::decodeData(Buffer::Instance& data, bool end_stream) { maybeEndDecode(end_stream); stream_info_.addBytesReceived(data.length()); - parent_.onUpstreamData(data, this, end_stream); + parent_.onUpstreamData(data, *this, end_stream); } void Filter::UpstreamRequest::decodeTrailers(Http::HeaderMapPtr&& trailers) { maybeEndDecode(true); upstream_trailers_ = trailers.get(); - parent_.onUpstreamTrailers(std::move(trailers), this); + parent_.onUpstreamTrailers(std::move(trailers), *this); } void Filter::UpstreamRequest::decodeMetadata(Http::MetadataMapPtr&& metadata_map) { @@ -1225,7 +1228,7 @@ void Filter::UpstreamRequest::onResetStream(Http::StreamResetReason reason, clearRequestEncoder(); if (!calling_encode_headers_) { stream_info_.setResponseFlag(parent_.streamResetReasonToResponseFlag(reason)); - parent_.onUpstreamReset(reason, transport_failure_reason, this); + parent_.onUpstreamReset(reason, transport_failure_reason, *this); } else { deferred_reset_reason_ = reason; } @@ -1270,7 +1273,7 @@ void Filter::UpstreamRequest::onPerTryTimeout() { // 
Set response flag to UT for now, but it might be overwritten if a // response arrives later and hedge_on_per_try_timeout_ is set stream_info_.setResponseFlag(StreamInfo::ResponseFlag::UpstreamRequestTimeout); - parent_.onPerTryTimeout(this); + parent_.onPerTryTimeout(*this); } else { ENVOY_STREAM_LOG(debug, "ignored upstream per try timeout due to already started downstream response", diff --git a/source/common/router/router.h b/source/common/router/router.h index b3886622ec9cc..0cdfdb7dac30d 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -384,7 +384,7 @@ class Filter : Logger::Loggable, Upstream::HostDescriptionConstSharedPtr upstream_host, bool dropped); void chargeUpstreamCode(Http::Code code, Upstream::HostDescriptionConstSharedPtr upstream_host, bool dropped); - void chargeUpstreamAbort(Http::Code code, bool dropped, UpstreamRequest* upstream_request); + void chargeUpstreamAbort(Http::Code code, bool dropped, UpstreamRequest& upstream_request); void cleanup(); virtual RetryStatePtr createRetryState(const RetryPolicy& policy, Http::HeaderMap& request_headers, @@ -394,17 +394,17 @@ class Filter : Logger::Loggable, Upstream::ResourcePriority priority) PURE; Http::ConnectionPool::Instance* getConnPool(); void maybeDoShadowing(); - bool maybeRetryReset(Http::StreamResetReason reset_reason, UpstreamRequest* upstream_request); + bool maybeRetryReset(Http::StreamResetReason reset_reason, UpstreamRequest& upstream_request); uint32_t numRequestsAwaitingHeaders(); void onGlobalTimeout(); - void onPerTryTimeout(UpstreamRequest* upstream_request); + void onPerTryTimeout(UpstreamRequest& upstream_request); void onRequestComplete(); void onResponseTimeout(); void onUpstream100ContinueHeaders(Http::HeaderMapPtr&& headers, - UpstreamRequest* upstream_request); + UpstreamRequest& upstream_request); // Handle an upstream request aborted due to a local timeout. 
void onSoftPerTryTimeout(); - void onSoftPerTryTimeout(UpstreamRequest* upstream_request); + void onSoftPerTryTimeout(UpstreamRequest& upstream_request); void onUpstreamTimeoutAbort(StreamInfo::ResponseFlag response_flag); // Handle an "aborted" upstream request, meaning we didn't see response // headers (e.g. due to a reset). Handles recording stats and responding @@ -412,23 +412,26 @@ class Filter : Logger::Loggable, void onUpstreamAbort(Http::Code code, StreamInfo::ResponseFlag response_flag, absl::string_view body, bool dropped); void onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& headers, - UpstreamRequest* upstream_request, bool end_stream); - void onUpstreamData(Buffer::Instance& data, UpstreamRequest* upstream_request, bool end_stream); - void onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest* upstream_request); + UpstreamRequest& upstream_request, bool end_stream); + void onUpstreamData(Buffer::Instance& data, UpstreamRequest& upstream_request, bool end_stream); + void onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest& upstream_request); void onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map); - void onUpstreamComplete(UpstreamRequest* upstream_request); + void onUpstreamComplete(UpstreamRequest& upstream_request); void onUpstreamReset(Http::StreamResetReason reset_reason, absl::string_view transport_failure, - UpstreamRequest* upstream_request); - void resetOtherUpstreams(UpstreamRequest* upstream_request); + UpstreamRequest& upstream_request); + // Reset all in-flight upstream requests that do NOT match the passed argument. This is used + // if a "good" response comes back and we return downstream, so there is no point in waiting + // for the remaining upstream requests to return. 
+ void resetOtherUpstreams(UpstreamRequest& upstream_request); void sendNoHealthyUpstreamResponse(); bool setupRetry(); - bool setupRedirect(const Http::HeaderMap& headers, UpstreamRequest* upstream_request); - void updateOutlierDetection(Http::Code code, UpstreamRequest* upstream_request); + bool setupRedirect(const Http::HeaderMap& headers, UpstreamRequest& upstream_request); + void updateOutlierDetection(Http::Code code, UpstreamRequest& upstream_request); void doRetry(); // Called immediately after a non-5xx header is received from upstream, performs stats accounting // and handle difference between gRPC and non-gRPC requests. void handleNon5xxResponseHeaders(const Http::HeaderMap& headers, - UpstreamRequest* upstream_request, bool end_stream); + UpstreamRequest& upstream_request, bool end_stream); TimeSource& timeSource() { return config_.timeSource(); } Http::Context& httpContext() { return config_.http_context_; } From 4b687401a6cb15ee1f833a367ce18301259a5672 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 18 Mar 2019 12:41:26 -0400 Subject: [PATCH 16/70] fix upstream_request reference after destruction Signed-off-by: Michael Puncel --- source/common/router/router.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index adbd17d338d15..16043dc7354a7 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -835,7 +835,6 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head code_stats.chargeBasicResponseStat(cluster_->statsScope(), "retry.", static_cast(response_code)); upstream_host->stats().rq_error_.inc(); - upstream_request.retried_ = true; return; } else if (retry_status == RetryStatus::NoOverflow) { callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); From 71ecc252242cf2e13bdf32ad944aa7a7d8676345 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 18 Mar 2019 14:00:12 -0400 Subject: [PATCH 
17/70] remove unnecessary get() for clang-tidy Signed-off-by: Michael Puncel --- source/common/router/router.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 16043dc7354a7..b89d3950870d2 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -558,11 +558,11 @@ void Filter::onResponseTimeout() { // If this upstream request already hit a "soft" timeout, then it // already recorded a timeout into outlier detection. Don't do it again. if (!upstream_request->outlier_detection_timeout_recorded_) { - updateOutlierDetection(timeout_response_code_, *upstream_request.get()); + updateOutlierDetection(timeout_response_code_, *upstream_request); } upstream_request->resetStream(); - chargeUpstreamAbort(timeout_response_code_, false, *upstream_request.get()); + chargeUpstreamAbort(timeout_response_code_, false, *upstream_request); } } From fce805d36446d541746b9e3e476791c13069a5db Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 18 Mar 2019 14:13:22 -0400 Subject: [PATCH 18/70] docs fixes Signed-off-by: Michael Puncel --- api/envoy/api/v2/route/route.proto | 7 ++++--- docs/root/intro/arch_overview/http_routing.rst | 15 +++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/api/envoy/api/v2/route/route.proto b/api/envoy/api/v2/route/route.proto index ce6c6de9ab650..6a38a197bbe17 100644 --- a/api/envoy/api/v2/route/route.proto +++ b/api/envoy/api/v2/route/route.proto @@ -874,9 +874,10 @@ message HedgePolicy { // Indicates that a hedged request should be sent when the per-try timeout // is hit. This will only occur if the retry policy also indicates that a // timed out request should be retried. - // Once a timed out request is retried due to per try timeout, it will not be - // retried again even if the returned response headers would otherwise b - // according the specified :ref:`RetryPolicy `. 
+ // Once a timed out request is retried due to per try timeout, the router + // filter will ensure that it is not retried again even if the returned + // response headers would otherwise be retried according to the specified + // :ref:`RetryPolicy `. // Defaults to false. bool hedge_on_per_try_timeout = 3; } diff --git a/docs/root/intro/arch_overview/http_routing.rst b/docs/root/intro/arch_overview/http_routing.rst index 977699eacf719..e900592fefd13 100644 --- a/docs/root/intro/arch_overview/http_routing.rst +++ b/docs/root/intro/arch_overview/http_routing.rst @@ -93,12 +93,15 @@ Note that retries may be disabled depending on the contents of the :ref:`x-envoy Request Hedging --------------- -Envoy supports request hedging via specifying a :ref:`hedge policy `. This means that Envoy -will race multiple simultaneous upstream requests and return the first valid response to the downstream. - -Currently hedging can only be applied to retries performed in response to a request timeout. The implementation ensures that -the same upstream request is not retried twice, for instance if it times out and then later receives a 5xx response and the -retry policy calls for retrying on 5xx. +Envoy supports request hedging via specifying a :ref:`hedge policy +`. This means that Envoy will race multiple +simultaneous upstream requests and return the first valid response to the +downstream according to retry policy. + +Currently hedging can only be applied to retries performed in response to a +request timeout. The implementation ensures that the same upstream request is +not retried twice, for instance if it times out and then later receives a 5xx +response and the retry policy calls for retrying on 5xx. .. 
_arch_overview_http_routing_priority: From eae3f0a045e93f30135b5c93de0bae5a148a0892 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 19 Mar 2019 08:44:02 -0400 Subject: [PATCH 19/70] make downstream watermark callbacks a list in conn manager Signed-off-by: Michael Puncel --- source/common/http/conn_manager_impl.cc | 21 ++++++++++----------- source/common/http/conn_manager_impl.h | 2 +- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/source/common/http/conn_manager_impl.cc b/source/common/http/conn_manager_impl.cc index 42951c2c6b84c..708c81a928bb4 100644 --- a/source/common/http/conn_manager_impl.cc +++ b/source/common/http/conn_manager_impl.cc @@ -1477,16 +1477,16 @@ bool ConnectionManagerImpl::ActiveStream::verbose() const { void ConnectionManagerImpl::ActiveStream::callHighWatermarkCallbacks() { ++high_watermark_count_; - if (watermark_callbacks_) { - watermark_callbacks_->onAboveWriteBufferHighWatermark(); + for (auto watermark_callbacks : watermark_callbacks_) { + watermark_callbacks->onAboveWriteBufferHighWatermark(); } } void ConnectionManagerImpl::ActiveStream::callLowWatermarkCallbacks() { ASSERT(high_watermark_count_ > 0); --high_watermark_count_; - if (watermark_callbacks_) { - watermark_callbacks_->onBelowWriteBufferLowWatermark(); + for (auto watermark_callbacks : watermark_callbacks_) { + watermark_callbacks->onBelowWriteBufferLowWatermark(); } } @@ -1801,19 +1801,18 @@ void ConnectionManagerImpl::ActiveStreamDecoderFilter:: void ConnectionManagerImpl::ActiveStreamDecoderFilter::addDownstreamWatermarkCallbacks( DownstreamWatermarkCallbacks& watermark_callbacks) { - // This is called exactly once per stream, by the router filter. - // If there's ever a need for another filter to subscribe to watermark callbacks this can be - // turned into a vector. - ASSERT(parent_.watermark_callbacks_ == nullptr); - parent_.watermark_callbacks_ = &watermark_callbacks; + // This is called exactly once per stream, by the router filter. 
Therefore we + // expect the same callbacks to not be registered twice. + ASSERT(std::find(parent_.watermark_callbacks_.begin(), parent_.watermark_callbacks_.end(), &watermark_callbacks) == parent_.watermark_callbacks_.end()); + parent_.watermark_callbacks_.emplace(parent_.watermark_callbacks_.end(), &watermark_callbacks); for (uint32_t i = 0; i < parent_.high_watermark_count_; ++i) { watermark_callbacks.onAboveWriteBufferHighWatermark(); } } void ConnectionManagerImpl::ActiveStreamDecoderFilter::removeDownstreamWatermarkCallbacks( DownstreamWatermarkCallbacks& watermark_callbacks) { - ASSERT(parent_.watermark_callbacks_ == &watermark_callbacks); - parent_.watermark_callbacks_ = nullptr; + ASSERT(std::find(parent_.watermark_callbacks_.begin(), parent_.watermark_callbacks_.end(), &watermark_callbacks) != parent_.watermark_callbacks_.end()); + parent_.watermark_callbacks_.remove(&watermark_callbacks); } bool ConnectionManagerImpl::ActiveStreamDecoderFilter::recreateStream() { diff --git a/source/common/http/conn_manager_impl.h b/source/common/http/conn_manager_impl.h index 5c15c9a9711c8..71569d8dfa45a 100644 --- a/source/common/http/conn_manager_impl.h +++ b/source/common/http/conn_manager_impl.h @@ -442,7 +442,7 @@ class ConnectionManagerImpl : Logger::Loggable, StreamInfo::StreamInfoImpl stream_info_; absl::optional cached_route_; absl::optional cached_cluster_info_; - DownstreamWatermarkCallbacks* watermark_callbacks_{nullptr}; + std::list watermark_callbacks_{}; uint32_t buffer_limit_{0}; uint32_t high_watermark_count_{0}; const std::string* decorated_operation_{nullptr}; From c8cd2ce068e78dec87fe7be5b8010ba1b1e97abe Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 19 Mar 2019 09:14:58 -0400 Subject: [PATCH 20/70] fix format Signed-off-by: Michael Puncel --- source/common/http/conn_manager_impl.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/common/http/conn_manager_impl.cc b/source/common/http/conn_manager_impl.cc index 
708c81a928bb4..26d614da5be8f 100644 --- a/source/common/http/conn_manager_impl.cc +++ b/source/common/http/conn_manager_impl.cc @@ -1803,7 +1803,8 @@ void ConnectionManagerImpl::ActiveStreamDecoderFilter::addDownstreamWatermarkCal DownstreamWatermarkCallbacks& watermark_callbacks) { // This is called exactly once per stream, by the router filter. Therefore we // expect the same callbacks to not be registered twice. - ASSERT(std::find(parent_.watermark_callbacks_.begin(), parent_.watermark_callbacks_.end(), &watermark_callbacks) == parent_.watermark_callbacks_.end()); + ASSERT(std::find(parent_.watermark_callbacks_.begin(), parent_.watermark_callbacks_.end(), + &watermark_callbacks) == parent_.watermark_callbacks_.end()); parent_.watermark_callbacks_.emplace(parent_.watermark_callbacks_.end(), &watermark_callbacks); for (uint32_t i = 0; i < parent_.high_watermark_count_; ++i) { watermark_callbacks.onAboveWriteBufferHighWatermark(); @@ -1811,7 +1812,8 @@ void ConnectionManagerImpl::ActiveStreamDecoderFilter::addDownstreamWatermarkCal } void ConnectionManagerImpl::ActiveStreamDecoderFilter::removeDownstreamWatermarkCallbacks( DownstreamWatermarkCallbacks& watermark_callbacks) { - ASSERT(std::find(parent_.watermark_callbacks_.begin(), parent_.watermark_callbacks_.end(), &watermark_callbacks) != parent_.watermark_callbacks_.end()); + ASSERT(std::find(parent_.watermark_callbacks_.begin(), parent_.watermark_callbacks_.end(), + &watermark_callbacks) != parent_.watermark_callbacks_.end()); parent_.watermark_callbacks_.remove(&watermark_callbacks); } From e0755316b32c61efb0855abef21dcc10b1d627a4 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 8 Apr 2019 11:14:33 -0400 Subject: [PATCH 21/70] clarify some docs and comments Signed-off-by: Michael Puncel --- docs/root/intro/arch_overview/http_routing.rst | 6 ++++-- source/common/http/conn_manager_impl.cc | 2 +- source/common/router/retry_state_impl.cc | 8 ++++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff 
--git a/docs/root/intro/arch_overview/http_routing.rst b/docs/root/intro/arch_overview/http_routing.rst index e900592fefd13..fddcd31e251bd 100644 --- a/docs/root/intro/arch_overview/http_routing.rst +++ b/docs/root/intro/arch_overview/http_routing.rst @@ -95,8 +95,10 @@ Request Hedging Envoy supports request hedging via specifying a :ref:`hedge policy `. This means that Envoy will race multiple -simultaneous upstream requests and return the first valid response to the -downstream according to retry policy. +simultaneous upstream requests and return the response associated with the +first acceptable response headers to the downstream. The retry policy is used +to determine whether a response should be returned or whether more responses +should be awaited. Currently hedging can only be applied to retries performed in response to a request timeout. The implementation ensures that the same upstream request is diff --git a/source/common/http/conn_manager_impl.cc b/source/common/http/conn_manager_impl.cc index e34d759e222fc..63d279529f41b 100644 --- a/source/common/http/conn_manager_impl.cc +++ b/source/common/http/conn_manager_impl.cc @@ -1891,7 +1891,7 @@ void ConnectionManagerImpl::ActiveStreamDecoderFilter:: void ConnectionManagerImpl::ActiveStreamDecoderFilter::addDownstreamWatermarkCallbacks( DownstreamWatermarkCallbacks& watermark_callbacks) { - // This is called exactly once per stream, by the router filter. Therefore we + // This is called exactly once per upstream-stream, by the router filter. Therefore, we // expect the same callbacks to not be registered twice. 
ASSERT(std::find(parent_.watermark_callbacks_.begin(), parent_.watermark_callbacks_.end(), &watermark_callbacks) == parent_.watermark_callbacks_.end()); diff --git a/source/common/router/retry_state_impl.cc b/source/common/router/retry_state_impl.cc index 3d7fdea12ecfa..fc3f5dfe38cbe 100644 --- a/source/common/router/retry_state_impl.cc +++ b/source/common/router/retry_state_impl.cc @@ -196,8 +196,12 @@ RetryStatus RetryStateImpl::shouldRetryReset(Http::StreamResetReason reset_reaso RetryStatus RetryStateImpl::shouldHedgeRetryPerTryTimeout(DoRetryCallback callback) { // A hedged retry on per try timeout is always retried if there are retries - // left. NOTE: this is different than non-hedged per try timeouts which are only retried - // if RETRY_ON_5XX or RETRY_ON_GATEWAY_ERROR + // left. NOTE: this is a bit different than non-hedged per try timeouts which + // are only retried if the applicable retry policy specifies either + // RETRY_ON_5XX or RETRY_ON_GATEWAY_ERROR. This is because these types of + // retries are associated with a stream reset which is analogous to a gateway + // error. When hedging on per try timeout is enabled, however, there is no + // stream reset. 
return shouldRetry([]() -> bool { return true; }, callback); } From df47fffc1345de918ae906de09707d001bbe49ba Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Wed, 10 Apr 2019 08:24:16 -0400 Subject: [PATCH 22/70] add HTTP header for enabling hedging on per try timeout Signed-off-by: Michael Puncel --- .../http_filters/router_filter.rst | 12 ++ include/envoy/http/header_map.h | 1 + source/common/http/conn_manager_utility.cc | 1 + source/common/http/headers.h | 1 + source/common/router/router.cc | 16 ++- source/common/router/router.h | 3 +- test/common/http/conn_manager_utility_test.cc | 2 + test/common/router/router_test.cc | 116 ++++++++++++++++-- 8 files changed, 139 insertions(+), 13 deletions(-) diff --git a/docs/root/configuration/http_filters/router_filter.rst b/docs/root/configuration/http_filters/router_filter.rst index cb5ab6a5941f6..2a2156da19fbd 100644 --- a/docs/root/configuration/http_filters/router_filter.rst +++ b/docs/root/configuration/http_filters/router_filter.rst @@ -211,6 +211,18 @@ requests. This timeout must be <= the global route timeout (see caller to set a tight per try timeout to allow for retries while maintaining a reasonable overall timeout. +x-envoy-hedge-on-per-try-timeout +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Setting this header on egress requests will cause Envoy to use a request +hedging strategy in the case of a per try timeout. This overrides the value set +in the :ref:`route configuration +`. This means that a retry +will be issued without resetting the original request, leaving multiple upstream requests +in flight. + +The value of the header should be "true" or "false", and is ignored if invalid. + .. 
_config_http_filters_router_x-envoy-immediate-health-check-fail: x-envoy-immediate-health-check-fail diff --git a/include/envoy/http/header_map.h b/include/envoy/http/header_map.h index b68967ed8492c..c03a837c66661 100644 --- a/include/envoy/http/header_map.h +++ b/include/envoy/http/header_map.h @@ -278,6 +278,7 @@ class HeaderEntry { HEADER_FUNC(EnvoyExpectedRequestTimeoutMs) \ HEADER_FUNC(EnvoyExternalAddress) \ HEADER_FUNC(EnvoyForceTrace) \ + HEADER_FUNC(EnvoyHedgeOnPerTryTimeout) \ HEADER_FUNC(EnvoyImmediateHealthCheckFail) \ HEADER_FUNC(EnvoyInternalRequest) \ HEADER_FUNC(EnvoyIpTags) \ diff --git a/source/common/http/conn_manager_utility.cc b/source/common/http/conn_manager_utility.cc index 164b8712c2954..99e280b5129ad 100644 --- a/source/common/http/conn_manager_utility.cc +++ b/source/common/http/conn_manager_utility.cc @@ -170,6 +170,7 @@ Network::Address::InstanceConstSharedPtr ConnectionManagerUtility::mutateRequest request_headers.removeEnvoyForceTrace(); request_headers.removeEnvoyIpTags(); request_headers.removeEnvoyOriginalUrl(); + request_headers.removeEnvoyHedgeOnPerTryTimeout(); for (const LowerCaseString& header : route_config.internalOnlyHeaders()) { request_headers.remove(header); diff --git a/source/common/http/headers.h b/source/common/http/headers.h index 79c0b1cd153c7..d372fd789798f 100644 --- a/source/common/http/headers.h +++ b/source/common/http/headers.h @@ -41,6 +41,7 @@ class HeaderValues { const LowerCaseString EnvoyDownstreamServiceNode{"x-envoy-downstream-service-node"}; const LowerCaseString EnvoyExternalAddress{"x-envoy-external-address"}; const LowerCaseString EnvoyForceTrace{"x-envoy-force-trace"}; + const LowerCaseString EnvoyHedgeOnPerTryTimeout{"x-envoy-hedge-on-per-try-timeout"}; const LowerCaseString EnvoyImmediateHealthCheckFail{"x-envoy-immediate-health-check-fail"}; const LowerCaseString EnvoyOriginalUrl{"x-envoy-original-url"}; const LowerCaseString EnvoyInternalRequest{"x-envoy-internal"}; diff --git 
a/source/common/router/router.cc b/source/common/router/router.cc index 992c7dcc6081e..df076fe44bc60 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -180,6 +180,7 @@ FilterUtility::finalTimeout(const RouteEntry& route, Http::HeaderMap& request_he } FilterUtility::HedgingParams FilterUtility::finalHedgingParams(const RouteEntry& route, + Http::HeaderMap& request_headers, uint64_t random_value) { HedgingParams hedgingParams; hedgingParams.initial_requests_ = route.hedgePolicy().initialRequests(); @@ -190,6 +191,18 @@ FilterUtility::HedgingParams FilterUtility::finalHedgingParams(const RouteEntry& hedgingParams.initial_requests_++; } + Http::HeaderEntry* hedge_on_per_try_timeout_entry = request_headers.EnvoyHedgeOnPerTryTimeout(); + if (hedge_on_per_try_timeout_entry) { + if (hedge_on_per_try_timeout_entry->value() == "true") { + hedgingParams.hedge_on_per_try_timeout_ = true; + } + if (hedge_on_per_try_timeout_entry->value() == "false") { + hedgingParams.hedge_on_per_try_timeout_ = false; + } + + request_headers.removeEnvoyHedgeOnPerTryTimeout(); + } + return hedgingParams; } @@ -386,7 +399,8 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e // Ensure an http transport scheme is selected before continuing with decoding. 
ASSERT(headers.Scheme()); - hedging_params_ = FilterUtility::finalHedgingParams(*route_entry_, callbacks_->streamId()); + hedging_params_ = + FilterUtility::finalHedgingParams(*route_entry_, headers, callbacks_->streamId()); retry_state_ = createRetryState(route_entry_->retryPolicy(), headers, *cluster_, config_.runtime_, diff --git a/source/common/router/router.h b/source/common/router/router.h index 0cdfdb7dac30d..3a43f52220515 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -102,7 +102,8 @@ class FilterUtility { * initial request should be sent * @return HedgingParams the final parameters to use for request hedging */ - static HedgingParams finalHedgingParams(const RouteEntry& route, uint64_t random_value); + static HedgingParams finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers, + uint64_t random_value); }; /** diff --git a/test/common/http/conn_manager_utility_test.cc b/test/common/http/conn_manager_utility_test.cc index d1bc974e8b8c3..21eab5a47dc53 100644 --- a/test/common/http/conn_manager_utility_test.cc +++ b/test/common/http/conn_manager_utility_test.cc @@ -520,6 +520,7 @@ TEST_F(ConnectionManagerUtilityTest, ExternalAddressExternalRequestUseRemote) { route_config_.internal_only_headers_.push_back(LowerCaseString("custom_header")); TestHeaderMapImpl headers{{"x-envoy-decorator-operation", "foo"}, {"x-envoy-downstream-service-cluster", "foo"}, + {"x-envoy-hedge-on-per-try-timeout", "foo"}, {"x-envoy-retriable-status-codes", "123,456"}, {"x-envoy-retry-on", "foo"}, {"x-envoy-retry-grpc-on", "foo"}, @@ -537,6 +538,7 @@ TEST_F(ConnectionManagerUtilityTest, ExternalAddressExternalRequestUseRemote) { EXPECT_EQ("50.0.0.1", headers.get_("x-envoy-external-address")); EXPECT_FALSE(headers.has("x-envoy-decorator-operation")); EXPECT_FALSE(headers.has("x-envoy-downstream-service-cluster")); + EXPECT_FALSE(headers.has("x-envoy-hedge-on-per-try-timeout")); 
EXPECT_FALSE(headers.has("x-envoy-retriable-status-codes")); EXPECT_FALSE(headers.has("x-envoy-retry-on")); EXPECT_FALSE(headers.has("x-envoy-retry-grpc-on")); diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 871a79667c432..ba4e4b1fe4126 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -2769,18 +2769,20 @@ TEST_F(RouterTest, UpstreamTimingTimeout) { EXPECT_EQ(stream_info.firstUpstreamRxByteReceived().value(), std::chrono::milliseconds(56)); } -TEST(RouterFilterUtilityTest, FinalHedgingParams) { - { // no chance of additional request +TEST(RouterFilterUtilityTest, FinalHedgingParamsInitialRequests) { + Http::TestHeaderMapImpl empty_headers; + { // no chance of additional request, header not present NiceMock route; route.hedge_policy_.initial_requests_ = 10; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, 0); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, empty_headers, 0); EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, 10); + hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 10); EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, 100); + hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 100); EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, 1000); + hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 1000); EXPECT_EQ(10, hedgingParams.initial_requests_); } { // 50% chance additional request @@ -2788,15 +2790,107 @@ TEST(RouterFilterUtilityTest, FinalHedgingParams) { route.hedge_policy_.initial_requests_ = 10; route.hedge_policy_.additional_request_chance_.set_numerator(50); EXPECT_CALL(route, 
hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, 0); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, empty_headers, 0); EXPECT_EQ(11, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, 49); + hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 49); EXPECT_EQ(11, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, 50); + hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 50); EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, 99); + hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 99); EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, 100); + hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 100); + EXPECT_EQ(11, hedgingParams.initial_requests_); + } +} + +TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { + Http::TestHeaderMapImpl empty_headers; + { // route says true, header not present, expect true. + NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = true; + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, empty_headers, 0); + EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); + } + { // route says false, header not present, expect false. 
+ NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = false; + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, empty_headers, 0); + EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); + } + { // route says false, header says true, expect true. + Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "true"}}; + NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = false; + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, 0); + EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); + } + { // route says false, header says false, expect false. + Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "false"}}; + NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = false; + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, 0); + EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); + } + { // route says true, header says false, expect false. + Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "false"}}; + NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = true; + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, 0); + EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); + } + { // route says true, header says true, expect true. 
+ Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "true"}}; + NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = true; + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, 0); + EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); + } + { // route says true, header is invalid, expect true. + Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "bad"}}; + NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = true; + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, 0); + EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); + } + { // route says false, header is invalid, expect false. + Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "bad"}}; + NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = false; + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, 0); + EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); + } + { // 50% chance additional request + NiceMock route; + route.hedge_policy_.initial_requests_ = 10; + route.hedge_policy_.additional_request_chance_.set_numerator(50); + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, empty_headers, 0); + EXPECT_EQ(11, hedgingParams.initial_requests_); + hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 49); + EXPECT_EQ(11, hedgingParams.initial_requests_); + hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 50); + EXPECT_EQ(10, hedgingParams.initial_requests_); + hedgingParams = 
FilterUtility::finalHedgingParams(route, empty_headers, 99); + EXPECT_EQ(10, hedgingParams.initial_requests_); + hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 100); EXPECT_EQ(11, hedgingParams.initial_requests_); } } From bb6a289753ee6d482d69ff73d3c81175b022bd46 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Wed, 10 Apr 2019 08:43:34 -0400 Subject: [PATCH 23/70] add integration tests for router timeouts including the hedging case Signed-off-by: Michael Puncel --- test/integration/BUILD | 11 ++ .../http_timeout_integration_test.cc | 157 ++++++++++++++++++ .../http_timeout_integration_test.h | 22 +++ 3 files changed, 190 insertions(+) create mode 100644 test/integration/http_timeout_integration_test.cc create mode 100644 test/integration/http_timeout_integration_test.h diff --git a/test/integration/BUILD b/test/integration/BUILD index 6c547da012143..e7846a1391cb4 100644 --- a/test/integration/BUILD +++ b/test/integration/BUILD @@ -171,6 +171,17 @@ envoy_cc_test( ], ) +envoy_cc_test( + name = "http_timeout_integration_test", + srcs = [ + "http_timeout_integration_test.cc", + "http_timeout_integration_test.h", + ], + deps = [ + ":http_integration_lib", + ], +) + envoy_cc_test( name = "protocol_integration_test", srcs = [ diff --git a/test/integration/http_timeout_integration_test.cc b/test/integration/http_timeout_integration_test.cc new file mode 100644 index 0000000000000..c45651c8cc8a7 --- /dev/null +++ b/test/integration/http_timeout_integration_test.cc @@ -0,0 +1,157 @@ +#include "test/integration/http_timeout_integration_test.h" + +#include "gtest/gtest.h" + +namespace Envoy { + +INSTANTIATE_TEST_SUITE_P(IpVersions, HttpTimeoutIntegrationTest, + testing::ValuesIn(TestEnvironment::getIpVersionsForTest()), + TestUtility::ipTestParamsToString); + +// Sends a request with a global timeout specified, sleeps for longer than the +// timeout, and ensures that a timeout is received. 
+TEST_P(HttpTimeoutIntegrationTest, GlobalTimeout) { + initialize(); + + codec_client_ = makeHttpConnection(makeClientConnection(lookupPort("http"))); + auto encoder_decoder = codec_client_->startRequest( + Http::TestHeaderMapImpl{{":method", "POST"}, + {":path", "/test/long/url"}, + {":scheme", "http"}, + {":authority", "host"}, + {"x-forwarded-for", "10.0.0.1"}, + {"x-envoy-upstream-rq-timeout-ms", "500"}}); + auto response = std::move(encoder_decoder.second); + request_encoder_ = &encoder_decoder.first; + + ASSERT_TRUE(fake_upstreams_[0]->waitForHttpConnection(*dispatcher_, fake_upstream_connection_)); + ASSERT_TRUE(fake_upstream_connection_->waitForNewStream(*dispatcher_, upstream_request_)); + ASSERT_TRUE(upstream_request_->waitForHeadersComplete()); + codec_client_->sendData(*request_encoder_, 0, true); + + ASSERT_TRUE(upstream_request_->waitForEndStream(*dispatcher_)); + + // Trigger global timeout. + timeSystem().sleep(std::chrono::milliseconds(501)); + + // Ensure we got a timeout downstream and canceled the upstream request. + response->waitForHeaders(); + ASSERT_TRUE(upstream_request_->waitForReset(std::chrono::milliseconds(0))); + + codec_client_->close(); + + EXPECT_TRUE(upstream_request_->complete()); + EXPECT_EQ(0U, upstream_request_->bodyLength()); + + EXPECT_TRUE(response->complete()); + EXPECT_STREQ("504", response->headers().Status()->value().c_str()); +} + +// Sends a request with a global timeout and per try timeout specified, sleeps +// for longer than the per try but slightly less than the global timeout. +// Ensures that two requests are attempted and a timeout is returned +// downstream. 
+TEST_P(HttpTimeoutIntegrationTest, PerTryTimeout) { + initialize(); + + codec_client_ = makeHttpConnection(makeClientConnection(lookupPort("http"))); + auto encoder_decoder = codec_client_->startRequest( + Http::TestHeaderMapImpl{{":method", "POST"}, + {":path", "/test/long/url"}, + {":scheme", "http"}, + {":authority", "host"}, + {"x-forwarded-for", "10.0.0.1"}, + {"x-envoy-retry-on", "5xx"}, + {"x-envoy-upstream-rq-timeout-ms", "500"}, + {"x-envoy-upstream-rq-per-try-timeout-ms", "400"}}); + auto response = std::move(encoder_decoder.second); + request_encoder_ = &encoder_decoder.first; + + ASSERT_TRUE(fake_upstreams_[0]->waitForHttpConnection(*dispatcher_, fake_upstream_connection_)); + ASSERT_TRUE(fake_upstream_connection_->waitForNewStream(*dispatcher_, upstream_request_)); + ASSERT_TRUE(upstream_request_->waitForHeadersComplete()); + codec_client_->sendData(*request_encoder_, 0, true); + + ASSERT_TRUE(upstream_request_->waitForEndStream(*dispatcher_)); + + // Trigger per try timeout (but not global timeout). + timeSystem().sleep(std::chrono::milliseconds(400)); + + // Wait for a second request to be sent upstream + ASSERT_TRUE(fake_upstream_connection_->waitForNewStream(*dispatcher_, upstream_request_)); + ASSERT_TRUE(upstream_request_->waitForHeadersComplete()); + ASSERT_TRUE(upstream_request_->waitForEndStream(*dispatcher_)); + + // Trigger global timeout. + timeSystem().sleep(std::chrono::milliseconds(100)); + response->waitForHeaders(); + + codec_client_->close(); + + EXPECT_TRUE(upstream_request_->complete()); + EXPECT_EQ(0U, upstream_request_->bodyLength()); + + EXPECT_TRUE(response->complete()); + EXPECT_STREQ("504", response->headers().Status()->value().c_str()); +} + +// With hedge_on_per_try_timeout enabled via config, sends a request with a +// global timeout and per try timeout specified, sleeps for longer than the per +// try but slightly less than the global timeout. 
We then have the first +// upstream request return headers and expect those to be returned downstream +// (which proves the request was not canceled when the timeout was hit). +TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeout) { + initialize(); + + codec_client_ = makeHttpConnection(makeClientConnection(lookupPort("http"))); + auto encoder_decoder = codec_client_->startRequest( + Http::TestHeaderMapImpl{{":method", "POST"}, + {":path", "/test/long/url"}, + {":scheme", "http"}, + {":authority", "host"}, + {"x-forwarded-for", "10.0.0.1"}, + {"x-envoy-retry-on", "5xx"}, + {"x-envoy-hedge-on-per-try-timeout", "true"}, + {"x-envoy-upstream-rq-timeout-ms", "500"}, + {"x-envoy-upstream-rq-per-try-timeout-ms", "400"}}); + auto response = std::move(encoder_decoder.second); + request_encoder_ = &encoder_decoder.first; + + ASSERT_TRUE(fake_upstreams_[0]->waitForHttpConnection(*dispatcher_, fake_upstream_connection_)); + ASSERT_TRUE(fake_upstream_connection_->waitForNewStream(*dispatcher_, upstream_request_)); + ASSERT_TRUE(upstream_request_->waitForHeadersComplete()); + codec_client_->sendData(*request_encoder_, 0, true); + + ASSERT_TRUE(upstream_request_->waitForEndStream(*dispatcher_)); + + // Trigger per try timeout (but not global timeout). + timeSystem().sleep(std::chrono::milliseconds(400)); + + // Trigger retry (there's a 25ms backoff before it's issued). + timeSystem().sleep(std::chrono::milliseconds(26)); + + // Wait for a second request to be sent upstream + FakeStreamPtr upstream_request2; + ASSERT_TRUE(fake_upstream_connection_->waitForNewStream(*dispatcher_, upstream_request2)); + ASSERT_TRUE(upstream_request2->waitForHeadersComplete()); + ASSERT_TRUE(upstream_request2->waitForEndStream(*dispatcher_)); + + // Encode 200 response headers for the first (timed out) request. 
+ Http::TestHeaderMapImpl response_headers{{":status", "200"}}; + upstream_request_->encodeHeaders(response_headers, true); + + response->waitForHeaders(); + + // The second request should be reset since we used the response from the first request. + ASSERT_TRUE(upstream_request2->waitForReset(std::chrono::milliseconds(0))); + + codec_client_->close(); + + EXPECT_TRUE(upstream_request_->complete()); + EXPECT_EQ(0U, upstream_request_->bodyLength()); + + EXPECT_TRUE(response->complete()); + EXPECT_STREQ("200", response->headers().Status()->value().c_str()); +} + +} // namespace Envoy diff --git a/test/integration/http_timeout_integration_test.h b/test/integration/http_timeout_integration_test.h new file mode 100644 index 0000000000000..fd378f4ce7f57 --- /dev/null +++ b/test/integration/http_timeout_integration_test.h @@ -0,0 +1,22 @@ +#pragma once + +#include "test/integration/http_integration.h" + +#include "gtest/gtest.h" + +namespace Envoy { +class HttpTimeoutIntegrationTest : public testing::TestWithParam, + public Event::TestUsingSimulatedTime, + public HttpIntegrationTest { +public: + // Arbitrarily choose HTTP2 here, the tests for this class are around + // timeouts which don't have version specific behavior. 
+ HttpTimeoutIntegrationTest() : HttpIntegrationTest(Http::CodecClient::Type::HTTP2, GetParam()) {} + + void SetUp() override { + setDownstreamProtocol(Http::CodecClient::Type::HTTP2); + setUpstreamProtocol(FakeHttpConnection::Type::HTTP2); + } +}; + +} // namespace Envoy From 08c093ce0cd6cdd19f50f26da4cfa74f14ad085a Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Wed, 10 Apr 2019 09:01:22 -0400 Subject: [PATCH 24/70] fix spelling in comment Signed-off-by: Michael Puncel --- source/common/router/retry_state_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/router/retry_state_impl.cc b/source/common/router/retry_state_impl.cc index fc3f5dfe38cbe..a99878be56eb9 100644 --- a/source/common/router/retry_state_impl.cc +++ b/source/common/router/retry_state_impl.cc @@ -199,7 +199,7 @@ RetryStatus RetryStateImpl::shouldHedgeRetryPerTryTimeout(DoRetryCallback callba // left. NOTE: this is a bit different than non-hedged per try timeouts which // are only retried if the applicable retry policy specifies either // RETRY_ON_5XX or RETRY_ON_GATEWAY_ERROR. This is because these types of - // retries are associated with a stream reset which is analagous to a gateway + // retries are associated with a stream reset which is analogous to a gateway // error. When hedging on per try timeout is enabled, however, there is no // stream reset. 
return shouldRetry([]() -> bool { return true; }, callback); From 12f24b5fe6f79e66be4c557b86ba33ea2eaa917c Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Wed, 10 Apr 2019 09:20:14 -0400 Subject: [PATCH 25/70] clarify documentation Signed-off-by: Michael Puncel --- docs/root/intro/arch_overview/http_routing.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/root/intro/arch_overview/http_routing.rst b/docs/root/intro/arch_overview/http_routing.rst index fddcd31e251bd..3fd78b6dda849 100644 --- a/docs/root/intro/arch_overview/http_routing.rst +++ b/docs/root/intro/arch_overview/http_routing.rst @@ -93,17 +93,21 @@ Note that retries may be disabled depending on the contents of the :ref:`x-envoy Request Hedging --------------- -Envoy supports request hedging via specifying a :ref:`hedge policy +Envoy supports request hedging which can be enabled by specifying a :ref:`hedge policy `. This means that Envoy will race multiple simultaneous upstream requests and return the response associated with the first acceptable response headers to the downstream. The retry policy is used to determine whether a response should be returned or whether more responses should be awaited. -Currently hedging can only be applied to retries performed in response to a -request timeout. The implementation ensures that the same upstream request is -not retried twice, for instance if it times out and then later receives a 5xx -response and the retry policy calls for retrying on 5xx. +Currently hedging can only be performed in response to a request timeout. This +means that a retry request will be issued without canceling the initial +timed-out request and a late response will be awaited. The first "good" +response according to retry policy will be returned downstream. + +The implementation ensures that the same upstream request is not retried twice. 
+This might otherwise occur if a request times out and then results in a 5xx +response, creating two retriable events. .. _arch_overview_http_routing_priority: From e4df7be8f577b1bff0be8f68db7074740569f2b8 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Wed, 10 Apr 2019 17:04:28 -0400 Subject: [PATCH 26/70] add integration tests using low buffer sizes Signed-off-by: Michael Puncel --- .../http_timeout_integration_test.cc | 82 +++++++++++++++++++ .../http_timeout_integration_test.h | 3 + 2 files changed, 85 insertions(+) diff --git a/test/integration/http_timeout_integration_test.cc b/test/integration/http_timeout_integration_test.cc index c45651c8cc8a7..b73b2707287d1 100644 --- a/test/integration/http_timeout_integration_test.cc +++ b/test/integration/http_timeout_integration_test.cc @@ -154,4 +154,86 @@ TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeout) { EXPECT_STREQ("200", response->headers().Status()->value().c_str()); } +TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutWithBodyNoBuffer) { + testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 512); +} + +TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowUpstreamBufferLimitLargeRequest) { + config_helper_.setBufferLimits(1024, 1024 * 1024); // Set buffer limits upstream and downstream. + testRouterRequestAndResponseWithHedgedPerTryTimeout(1024 * 1024, 1024); +} + +TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowDownstreamBufferLimitLargeResponse) { + config_helper_.setBufferLimits(1024 * 1024, 1024); // Set buffer limits upstream and downstream. + testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 1024 * 1024); +} + +// Sends a request with x-envoy-hedge-on-per-try-timeout, sleeps (with +// simulated time) for longer than the per try timeout but shorter than the +// global timeout, asserts that a retry is sent, and then responds with a 200 +// response on the original request and ensures the downstream sees it. 
+// Request and response body sizes are configurable to test flow control. +void HttpTimeoutIntegrationTest::testRouterRequestAndResponseWithHedgedPerTryTimeout( + uint64_t request_size, uint64_t response_size) { + initialize(); + + codec_client_ = makeHttpConnection(makeClientConnection(lookupPort("http"))); + Http::TestHeaderMapImpl request_headers{{":method", "POST"}, + {":path", "/test/long/url"}, + {":scheme", "http"}, + {":authority", "host"}, + {"x-forwarded-for", "10.0.0.1"}, + {"x-envoy-retry-on", "5xx"}, + {"x-envoy-hedge-on-per-try-timeout", "true"}, + {"x-envoy-upstream-rq-timeout-ms", "5000"}, + {"x-envoy-upstream-rq-per-try-timeout-ms", "400"}}; + auto encoder_decoder = codec_client_->startRequest(request_headers); + + auto response = std::move(encoder_decoder.second); + request_encoder_ = &encoder_decoder.first; + + ASSERT_TRUE(fake_upstreams_[0]->waitForHttpConnection(*dispatcher_, fake_upstream_connection_)); + ASSERT_TRUE(fake_upstream_connection_->waitForNewStream(*dispatcher_, upstream_request_)); + ASSERT_TRUE(upstream_request_->waitForHeadersComplete()); + + codec_client_->sendData(*request_encoder_, request_size, true); + + ASSERT_TRUE(upstream_request_->waitForEndStream(*dispatcher_)); + + // Trigger per try timeout (but not global timeout). + timeSystem().sleep(std::chrono::milliseconds(400)); + + FakeStreamPtr upstream_request2; + // Trigger retry (there's a 25ms backoff before it's issued). + timeSystem().sleep(std::chrono::milliseconds(26)); + + // Wait for a second request to be sent upstream + ASSERT_TRUE(fake_upstream_connection_->waitForNewStream(*dispatcher_, upstream_request2)); + ASSERT_TRUE(upstream_request2->waitForHeadersComplete()); + ASSERT_TRUE(upstream_request2->waitForEndStream(*dispatcher_)); + + // Encode 200 response headers for the first (timed out) request. 
+ Http::TestHeaderMapImpl response_headers{{":status", "200"}}; + upstream_request_->encodeHeaders(response_headers, response_size == 0); + + response->waitForHeaders(); + + // The second request should be reset since we used the response from the first request. + ASSERT_TRUE(upstream_request2->waitForReset(std::chrono::milliseconds(0))); + + if (response_size) { + upstream_request_->encodeData(response_size, true); + } + + response->waitForEndStream(); + + codec_client_->close(); + + EXPECT_TRUE(upstream_request_->complete()); + EXPECT_EQ(request_size, upstream_request_->bodyLength()); + + EXPECT_TRUE(response->complete()); + EXPECT_STREQ("200", response->headers().Status()->value().c_str()); +} + } // namespace Envoy diff --git a/test/integration/http_timeout_integration_test.h b/test/integration/http_timeout_integration_test.h index fd378f4ce7f57..356a9f905011d 100644 --- a/test/integration/http_timeout_integration_test.h +++ b/test/integration/http_timeout_integration_test.h @@ -17,6 +17,9 @@ class HttpTimeoutIntegrationTest : public testing::TestWithParam Date: Wed, 10 Apr 2019 17:11:25 -0400 Subject: [PATCH 27/70] remove unnecessary buffering case Signed-off-by: Michael Puncel --- source/common/router/router.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index df076fe44bc60..ebde76996e620 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -445,7 +445,7 @@ void Filter::sendNoHealthyUpstreamResponse() { Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_stream) { bool buffering = - (retry_state_ && retry_state_->enabled()) || do_shadowing_ || upstream_requests_.size() > 1; + (retry_state_ && retry_state_->enabled()) || do_shadowing_; if (buffering && buffer_limit_ > 0 && getLength(callbacks_->decodingBuffer()) + data.length() > buffer_limit_) { // The request is larger than we should buffer. 
Give up on the retry/shadow From 01aa62a6357d724c1626d223459a6961c2203c39 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Thu, 18 Apr 2019 20:57:36 -0400 Subject: [PATCH 28/70] Add asserts in upstream watermark callbacks Signed-off-by: Michael Puncel --- source/common/router/router.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/source/common/router/router.h b/source/common/router/router.h index e45394064c133..5070be9688213 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -321,12 +321,21 @@ class Filter : Logger::Loggable, void onBelowWriteBufferLowWatermark() override { enableDataFromDownstream(); } void disableDataFromDownstream() { - ASSERT(parent_.upstream_requests_.size() == 1); + // If there is only one upstream request, we can be assured that + // disabling reads will not slow down other upstream requests. If we've + // already seen the full downstream requst (downstream_end_stream_) then + // disabling reads is a no-op. + ASSERT(parent_.upstream_requests_.size() == 1 || downstream_end_stream_); parent_.cluster_->stats().upstream_flow_control_backed_up_total_.inc(); parent_.callbacks_->onDecoderFilterAboveWriteBufferHighWatermark(); } + void enableDataFromDownstream() { - ASSERT(parent_.upstream_requests_.size() == 1); + // If there is only one upstream request, we can be assured that + // disabling reads will not overflow any write buffers in other upstream + // requests. If we've already seen the full downstream requst + // (downstream_end_stream_) then enabling reads is a no-op. 
+ ASSERT(parent_.upstream_requests_.size() == 1 || downstream_end_stream_); parent_.cluster_->stats().upstream_flow_control_drained_total_.inc(); parent_.callbacks_->onDecoderFilterBelowWriteBufferLowWatermark(); } From 8fd42970a3fc7b7d07f24037239f0f1d224933b3 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 19 Apr 2019 13:31:10 -0400 Subject: [PATCH 29/70] remove outdated asserts around 1 upstream request at a time Signed-off-by: Michael Puncel --- source/common/router/router.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 9912171e93a47..39b013536830d 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -444,7 +444,6 @@ void Filter::sendNoHealthyUpstreamResponse() { } Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_stream) { - ASSERT(upstream_requests_.size() == 1); bool buffering = (retry_state_ && retry_state_->enabled()) || do_shadowing_; if (buffering && buffer_limit_ > 0 && getLength(callbacks_->decodingBuffer()) + data.length() > buffer_limit_) { @@ -1030,7 +1029,6 @@ bool Filter::setupRetry() { return false; } - ASSERT(upstream_requests_.size() == 1); ENVOY_STREAM_LOG(debug, "performing retry", *callbacks_); return true; @@ -1396,7 +1394,12 @@ void Filter::UpstreamRequest::clearRequestEncoder() { void Filter::UpstreamRequest::DownstreamWatermarkManager::onAboveWriteBufferHighWatermark() { ASSERT(parent_.request_encoder_); - ASSERT(parent_.parent_.upstream_requests_.size() == 1); + + // We only write response data downstream for the "winning" upstream request, + // so we shouldn't get the watermark callback invoked on the non-winning + // upstream request. + ASSERT(&parent_ == parent_.parent_.final_upstream_request_); + // The downstream connection is overrun. Pause reads from upstream. 
parent_.parent_.cluster_->stats().upstream_flow_control_paused_reading_total_.inc(); parent_.request_encoder_->getStream().readDisable(true); @@ -1404,7 +1407,7 @@ void Filter::UpstreamRequest::DownstreamWatermarkManager::onAboveWriteBufferHigh void Filter::UpstreamRequest::DownstreamWatermarkManager::onBelowWriteBufferLowWatermark() { ASSERT(parent_.request_encoder_); - ASSERT(parent_.parent_.upstream_requests_.size() == 1); + // The downstream connection has buffer available. Resume reads from upstream. parent_.parent_.cluster_->stats().upstream_flow_control_resumed_reading_total_.inc(); parent_.request_encoder_->getStream().readDisable(false); From b86bd470a4afa0209658364421c9109ef2a54fc6 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 19 Apr 2019 13:09:42 -0400 Subject: [PATCH 30/70] increase waitForReset timeouts in integration tests Signed-off-by: Michael Puncel --- test/integration/http_timeout_integration_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/integration/http_timeout_integration_test.cc b/test/integration/http_timeout_integration_test.cc index b73b2707287d1..edf4eb9b3d054 100644 --- a/test/integration/http_timeout_integration_test.cc +++ b/test/integration/http_timeout_integration_test.cc @@ -36,7 +36,7 @@ TEST_P(HttpTimeoutIntegrationTest, GlobalTimeout) { // Ensure we got a timeout downstream and canceled the upstream request. response->waitForHeaders(); - ASSERT_TRUE(upstream_request_->waitForReset(std::chrono::milliseconds(0))); + ASSERT_TRUE(upstream_request_->waitForReset(std::chrono::seconds(15))); codec_client_->close(); @@ -143,7 +143,7 @@ TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeout) { response->waitForHeaders(); // The second request should be reset since we used the response from the first request. 
- ASSERT_TRUE(upstream_request2->waitForReset(std::chrono::milliseconds(0))); + ASSERT_TRUE(upstream_request2->waitForReset(std::chrono::seconds(15))); codec_client_->close(); @@ -219,7 +219,7 @@ void HttpTimeoutIntegrationTest::testRouterRequestAndResponseWithHedgedPerTryTim response->waitForHeaders(); // The second request should be reset since we used the response from the first request. - ASSERT_TRUE(upstream_request2->waitForReset(std::chrono::milliseconds(0))); + ASSERT_TRUE(upstream_request2->waitForReset(std::chrono::seconds(15))); if (response_size) { upstream_request_->encodeData(response_size, true); From 59fe7b89536ff249e65fe32f0f95eba2d5a3c9a9 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 19 Apr 2019 16:55:36 -0400 Subject: [PATCH 31/70] fix assert in upstream watermark callbacks and unit test that doesn't fit the assumption Signed-off-by: Michael Puncel --- source/common/router/router.h | 4 ++-- test/common/router/router_test.cc | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/source/common/router/router.h b/source/common/router/router.h index 5070be9688213..e90c590746503 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -325,7 +325,7 @@ class Filter : Logger::Loggable, // disabling reads will not slow down other upstream requests. If we've // already seen the full downstream requst (downstream_end_stream_) then // disabling reads is a no-op. - ASSERT(parent_.upstream_requests_.size() == 1 || downstream_end_stream_); + ASSERT(parent_.upstream_requests_.size() == 1 || parent_.downstream_end_stream_); parent_.cluster_->stats().upstream_flow_control_backed_up_total_.inc(); parent_.callbacks_->onDecoderFilterAboveWriteBufferHighWatermark(); } @@ -335,7 +335,7 @@ class Filter : Logger::Loggable, // disabling reads will not overflow any write buffers in other upstream // requests. If we've already seen the full downstream requst // (downstream_end_stream_) then enabling reads is a no-op. 
- ASSERT(parent_.upstream_requests_.size() == 1 || downstream_end_stream_); + ASSERT(parent_.upstream_requests_.size() == 1 || parent_.downstream_end_stream_); parent_.cluster_->stats().upstream_flow_control_drained_total_.inc(); parent_.callbacks_->onDecoderFilterBelowWriteBufferLowWatermark(); } diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index ec9540f197534..1842a15ab3038 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -3422,7 +3422,10 @@ TEST_F(WatermarkTest, DownstreamWatermarks) { } TEST_F(WatermarkTest, UpstreamWatermarks) { - sendRequest(); + sendRequest(false); + + response_decoder_->decodeHeaders( + Http::HeaderMapPtr{new Http::TestHeaderMapImpl{{":status", "200"}}}, false); ASSERT(callbacks_.callbacks_.begin() != callbacks_.callbacks_.end()); Envoy::Http::DownstreamWatermarkCallbacks* watermark_callbacks = *callbacks_.callbacks_.begin(); @@ -3441,7 +3444,9 @@ TEST_F(WatermarkTest, UpstreamWatermarks) { .counter("upstream_flow_control_resumed_reading_total") .value()); - sendResponse(); + Buffer::OwnedImpl data; + EXPECT_CALL(encoder_, getStream()).Times(2).WillRepeatedly(ReturnRef(stream_)); + response_decoder_->decodeData(data, true); } TEST_F(WatermarkTest, FilterWatermarks) { From 899daccaaa3534c0b331d09e8ec6adf74e60bcbf Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 19 Apr 2019 18:52:06 -0400 Subject: [PATCH 32/70] fix format Signed-off-by: Michael Puncel --- source/common/router/router.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 04c0dc2ff4c8b..046cdc38c7afc 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -512,7 +512,8 @@ void Filter::cleanup() { if (upstream_request.get() == final_upstream_request_) { callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); } - if ((upstream_request.get() != 
final_upstream_request_) || !attempting_internal_redirect_with_complete_stream_) { + if ((upstream_request.get() != final_upstream_request_) || + !attempting_internal_redirect_with_complete_stream_) { upstream_request->resetStream(); // Idempotent. } } From dc9fff41ccaec9433770b605d0df7f0327855f5e Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 22 Apr 2019 11:39:38 -0400 Subject: [PATCH 33/70] fix format Signed-off-by: Michael Puncel --- source/common/router/router.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 1075323333f6e..d114ab4c90cc5 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -1143,7 +1143,8 @@ Filter::UpstreamRequest::UpstreamRequest(Filter& parent, Http::ConnectionPool::I stream_info_(pool.protocol(), parent_.callbacks_->dispatcher().timeSource()), calling_encode_headers_(false), upstream_canary_(false), decode_complete_(false), encode_complete_(false), encode_trailers_(false), retried_(false), - outlier_detection_timeout_recorded_(false), create_per_try_timeout_on_request_complete_(false) { + outlier_detection_timeout_recorded_(false), + create_per_try_timeout_on_request_complete_(false) { if (parent_.config_.start_child_span_) { span_ = parent_.callbacks_->activeSpan().spawnChild( From e3b4396859c3404e506eec5b9bd32867843c6880 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 22 Apr 2019 11:46:40 -0400 Subject: [PATCH 34/70] fix typos Signed-off-by: Michael Puncel --- source/common/router/router.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/common/router/router.h b/source/common/router/router.h index cc5c18554287d..8e08622439efa 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -323,7 +323,7 @@ class Filter : Logger::Loggable, void disableDataFromDownstream() { // If there is only one upstream request, we can be assured that // disabling reads 
will not slow down other upstream requests. If we've - // already seen the full downstream requst (downstream_end_stream_) then + // already seen the full downstream request (downstream_end_stream_) then // disabling reads is a no-op. ASSERT(parent_.upstream_requests_.size() == 1 || parent_.downstream_end_stream_); parent_.cluster_->stats().upstream_flow_control_backed_up_total_.inc(); @@ -333,7 +333,7 @@ class Filter : Logger::Loggable, void enableDataFromDownstream() { // If there is only one upstream request, we can be assured that // disabling reads will not overflow any write buffers in other upstream - // requests. If we've already seen the full downstream requst + // requests. If we've already seen the full downstream request // (downstream_end_stream_) then enabling reads is a no-op. ASSERT(parent_.upstream_requests_.size() == 1 || parent_.downstream_end_stream_); parent_.cluster_->stats().upstream_flow_control_drained_total_.inc(); From 3ba2e572698692540ba877338e1a6f55e8d91a19 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 22 Apr 2019 11:54:44 -0400 Subject: [PATCH 35/70] switch to getStringView() for header comparisons Signed-off-by: Michael Puncel --- test/integration/http_timeout_integration_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/integration/http_timeout_integration_test.cc b/test/integration/http_timeout_integration_test.cc index edf4eb9b3d054..bbe8e08d46422 100644 --- a/test/integration/http_timeout_integration_test.cc +++ b/test/integration/http_timeout_integration_test.cc @@ -44,7 +44,7 @@ TEST_P(HttpTimeoutIntegrationTest, GlobalTimeout) { EXPECT_EQ(0U, upstream_request_->bodyLength()); EXPECT_TRUE(response->complete()); - EXPECT_STREQ("504", response->headers().Status()->value().c_str()); + EXPECT_STR("504", response->headers().Status()->value().getStringView()); } // Sends a request with a global timeout and per try timeout specified, sleeps @@ -92,7 +92,7 @@ 
TEST_P(HttpTimeoutIntegrationTest, PerTryTimeout) { EXPECT_EQ(0U, upstream_request_->bodyLength()); EXPECT_TRUE(response->complete()); - EXPECT_STREQ("504", response->headers().Status()->value().c_str()); + EXPECT_STR("504", response->headers().Status()->value().getStringView()); } // With hedge_on_per_try_timeout enabled via config, sends a request with a @@ -151,7 +151,7 @@ TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeout) { EXPECT_EQ(0U, upstream_request_->bodyLength()); EXPECT_TRUE(response->complete()); - EXPECT_STREQ("200", response->headers().Status()->value().c_str()); + EXPECT_STR("200", response->headers().Status()->value().getStringView()); } TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutWithBodyNoBuffer) { @@ -233,7 +233,7 @@ void HttpTimeoutIntegrationTest::testRouterRequestAndResponseWithHedgedPerTryTim EXPECT_EQ(request_size, upstream_request_->bodyLength()); EXPECT_TRUE(response->complete()); - EXPECT_STREQ("200", response->headers().Status()->value().c_str()); + EXPECT_STR("200", response->headers().Status()->value().getStringView()); } } // namespace Envoy From 7e41cd1942a106750c93c4c6d434a9e21ffcc057 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 22 Apr 2019 12:09:27 -0400 Subject: [PATCH 36/70] fix typo Signed-off-by: Michael Puncel --- test/integration/http_timeout_integration_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/integration/http_timeout_integration_test.cc b/test/integration/http_timeout_integration_test.cc index bbe8e08d46422..fdb5a9586b51b 100644 --- a/test/integration/http_timeout_integration_test.cc +++ b/test/integration/http_timeout_integration_test.cc @@ -44,7 +44,7 @@ TEST_P(HttpTimeoutIntegrationTest, GlobalTimeout) { EXPECT_EQ(0U, upstream_request_->bodyLength()); EXPECT_TRUE(response->complete()); - EXPECT_STR("504", response->headers().Status()->value().getStringView()); + EXPECT_EQ("504", response->headers().Status()->value().getStringView()); } // Sends a 
request with a global timeout and per try timeout specified, sleeps @@ -92,7 +92,7 @@ TEST_P(HttpTimeoutIntegrationTest, PerTryTimeout) { EXPECT_EQ(0U, upstream_request_->bodyLength()); EXPECT_TRUE(response->complete()); - EXPECT_STR("504", response->headers().Status()->value().getStringView()); + EXPECT_EQ("504", response->headers().Status()->value().getStringView()); } // With hedge_on_per_try_timeout enabled via config, sends a request with a @@ -151,7 +151,7 @@ TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeout) { EXPECT_EQ(0U, upstream_request_->bodyLength()); EXPECT_TRUE(response->complete()); - EXPECT_STR("200", response->headers().Status()->value().getStringView()); + EXPECT_EQ("200", response->headers().Status()->value().getStringView()); } TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutWithBodyNoBuffer) { @@ -233,7 +233,7 @@ void HttpTimeoutIntegrationTest::testRouterRequestAndResponseWithHedgedPerTryTim EXPECT_EQ(request_size, upstream_request_->bodyLength()); EXPECT_TRUE(response->complete()); - EXPECT_STR("200", response->headers().Status()->value().getStringView()); + EXPECT_EQ("200", response->headers().Status()->value().getStringView()); } } // namespace Envoy From 57593fb103b8a73d2234bc9d754b20391f9afdd1 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 22 Apr 2019 14:27:38 -0400 Subject: [PATCH 37/70] fix faulty assumption in assert. 
it's possible to get a downstream watermark callback that was caused by another simultaneous request over the same downstream connection Signed-off-by: Michael Puncel --- source/common/router/router.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index d114ab4c90cc5..ada66208875f2 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -1423,10 +1423,12 @@ void Filter::UpstreamRequest::clearRequestEncoder() { void Filter::UpstreamRequest::DownstreamWatermarkManager::onAboveWriteBufferHighWatermark() { ASSERT(parent_.request_encoder_); - // We only write response data downstream for the "winning" upstream request, - // so we shouldn't get the watermark callback invoked on the non-winning - // upstream request. - ASSERT(&parent_ == parent_.parent_.final_upstream_request_); + // There are two states we should get this callback in: 1) the watermark was + // hit due to writes from a different filter instance over a shared + // downstream connection, or 2) the watermark was hit due to THIS filter + // instance due to writing back the "winning" upstream request. In either + // case we can disable reads from upstream. + ASSERT(!parent_.parent_.final_upstream_request_ || &parent_ == parent_.parent_.final_upstream_request_); // The downstream connection is overrun. Pause reads from upstream. 
parent_.parent_.cluster_->stats().upstream_flow_control_paused_reading_total_.inc(); From f84a666105f7bcb6463c21d4eb2a24e30c142c33 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 22 Apr 2019 14:45:01 -0400 Subject: [PATCH 38/70] fix format Signed-off-by: Michael Puncel --- source/common/router/router.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index ada66208875f2..21543e1d8d19d 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -1428,7 +1428,8 @@ void Filter::UpstreamRequest::DownstreamWatermarkManager::onAboveWriteBufferHigh // downstream connection, or 2) the watermark was hit due to THIS filter // instance due to writing back the "winning" upstream request. In either // case we can disable reads from upstream. - ASSERT(!parent_.parent_.final_upstream_request_ || &parent_ == parent_.parent_.final_upstream_request_); + ASSERT(!parent_.parent_.final_upstream_request_ || + &parent_ == parent_.parent_.final_upstream_request_); // The downstream connection is overrun. Pause reads from upstream. 
parent_.parent_.cluster_->stats().upstream_flow_control_paused_reading_total_.inc(); From 6eafa00570f6062b140e0b9f24cfd6cfd2359951 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 22 Apr 2019 15:23:38 -0400 Subject: [PATCH 39/70] add test case where downstream watermark callbacks are deregistered from HCM filter Signed-off-by: Michael Puncel --- test/common/http/conn_manager_impl_test.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/common/http/conn_manager_impl_test.cc b/test/common/http/conn_manager_impl_test.cc index dd6e61bcccf34..9f6796a6f2eaf 100644 --- a/test/common/http/conn_manager_impl_test.cc +++ b/test/common/http/conn_manager_impl_test.cc @@ -3104,10 +3104,13 @@ TEST_F(HttpConnectionManagerImplTest, HitFilterWatermarkLimits) { .WillOnce(Return(FilterDataStatus::StopIterationAndWatermark)); decoder_filters_[0]->callbacks_->encodeData(fake_response, false); + // deregister callbacks2 + decoder_filters_[0]->callbacks_->removeDownstreamWatermarkCallbacks(callbacks2); + // Change the limit so the buffered data is below the new watermark. 
buffer_len = encoder_filters_[1]->callbacks_->encodingBuffer()->length(); EXPECT_CALL(callbacks, onBelowWriteBufferLowWatermark()); - EXPECT_CALL(callbacks2, onBelowWriteBufferLowWatermark()); + EXPECT_CALL(callbacks2, onBelowWriteBufferLowWatermark()).Times(0); encoder_filters_[1]->callbacks_->setEncoderBufferLimit((buffer_len + 1) * 2); } From 13444325a840745ad43259f1d100649c90efbfca Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 23 Apr 2019 11:01:00 -0400 Subject: [PATCH 40/70] deregister -> unregister Signed-off-by: Michael Puncel --- test/common/http/conn_manager_impl_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/common/http/conn_manager_impl_test.cc b/test/common/http/conn_manager_impl_test.cc index 9f6796a6f2eaf..fd784cdf410e0 100644 --- a/test/common/http/conn_manager_impl_test.cc +++ b/test/common/http/conn_manager_impl_test.cc @@ -3104,7 +3104,7 @@ TEST_F(HttpConnectionManagerImplTest, HitFilterWatermarkLimits) { .WillOnce(Return(FilterDataStatus::StopIterationAndWatermark)); decoder_filters_[0]->callbacks_->encodeData(fake_response, false); - // deregister callbacks2 + // unregister callbacks2 decoder_filters_[0]->callbacks_->removeDownstreamWatermarkCallbacks(callbacks2); // Change the limit so the buffered data is below the new watermark. 
From 0d7a7fcf4c8532bbeb84c551e6641d1d7354f32d Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 23 Apr 2019 12:55:58 -0400 Subject: [PATCH 41/70] initialize final_upstream_request_ to nullptr Signed-off-by: Michael Puncel --- source/common/router/router.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/common/router/router.h b/source/common/router/router.h index 8e08622439efa..e2d59874daba9 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -165,8 +165,8 @@ class Filter : Logger::Loggable, public Upstream::LoadBalancerContextBase { public: Filter(FilterConfig& config) - : config_(config), downstream_response_started_(false), downstream_end_stream_(false), - do_shadowing_(false), is_retry_(false), + : config_(config), final_upstream_request_(nullptr), downstream_response_started_(false), + downstream_end_stream_(false), do_shadowing_(false), is_retry_(false), attempting_internal_redirect_with_complete_stream_(false) {} ~Filter(); From 00f473f7e96fb7e7c3a4bb386935712dad5cace1 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Wed, 24 Apr 2019 18:16:59 -0400 Subject: [PATCH 42/70] PR feedback Signed-off-by: Michael Puncel --- docs/root/intro/version_history.rst | 2 +- source/common/router/router.cc | 42 +++++++----------- source/common/router/router.h | 17 ++++--- test/common/router/router_test.cc | 69 ++++------------------------- 4 files changed, 38 insertions(+), 92 deletions(-) diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 7674401e4d17a..b5076d6da05c9 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -3,7 +3,6 @@ Version history 1.11.0 (Pending) ================ -* router: added ability to issue a hedged retry in response to a per try timeout via a :ref:`hedge policy `. * access log: added a new field for response code details in :ref:`file access logger` and :ref:`gRPC access logger`. 
* dubbo_proxy: support the :ref:`Dubbo proxy filter `. * eds: added support to specify max time for which endpoints can be used :ref:`gRPC filter `. @@ -19,6 +18,7 @@ Version history :ref:`buffer_flush_timeout ` to control how quickly the buffer is flushed if it is not full. * router: add support for configuring a :ref:`grpc timeout offset ` on incoming requests. * router: added ability to control retry back-off intervals via :ref:`retry policy `. +* router: added ability to issue a hedged retry in response to a per try timeout via a :ref:`hedge policy `. * router: per try timeouts will no longer start before the downstream request has been received in full by the router. This ensures that the per try timeout does not account for slow downstreams and that will not start before the global timeout. diff --git a/source/common/router/router.cc b/source/common/router/router.cc index c62c993ac3daf..d8c3cea1edff1 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -194,17 +194,10 @@ FilterUtility::finalTimeout(const RouteEntry& route, Http::HeaderMap& request_he } FilterUtility::HedgingParams FilterUtility::finalHedgingParams(const RouteEntry& route, - Http::HeaderMap& request_headers, - uint64_t random_value) { + Http::HeaderMap& request_headers) { HedgingParams hedgingParams; - hedgingParams.initial_requests_ = route.hedgePolicy().initialRequests(); hedgingParams.hedge_on_per_try_timeout_ = route.hedgePolicy().hedgeOnPerTryTimeout(); - if (ProtobufPercentHelper::evaluateFractionalPercent( - route.hedgePolicy().additionalRequestChance(), random_value)) { - hedgingParams.initial_requests_++; - } - Http::HeaderEntry* hedge_on_per_try_timeout_entry = request_headers.EnvoyHedgeOnPerTryTimeout(); if (hedge_on_per_try_timeout_entry) { if (hedge_on_per_try_timeout_entry->value() == "true") { @@ -414,7 +407,7 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e ASSERT(headers.Scheme()); hedging_params_ = - 
FilterUtility::finalHedgingParams(*route_entry_, headers, callbacks_->streamId()); + FilterUtility::finalHedgingParams(*route_entry_, headers); retry_state_ = createRetryState(route_entry_->retryPolicy(), headers, *cluster_, config_.runtime_, @@ -458,6 +451,12 @@ void Filter::sendNoHealthyUpstreamResponse() { } Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_stream) { + // upstream_requests_.size() cannot be 0 because we add to it unconditionally + // in decodeHeaders(). It cannot be > 1 because that only happens when a per + // try timeout occurs with hedge_on_per_try_timeout enabled but the the per + // try timeout timer is not started until onUpstreamComplete(). + ASSERT(upstream_requests_.size() == 1); + bool buffering = (retry_state_ && retry_state_->enabled()) || do_shadowing_; if (buffering && buffer_limit_ > 0 && getLength(callbacks_->decodingBuffer()) + data.length() > buffer_limit_) { @@ -468,25 +467,20 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea do_shadowing_ = false; } - for (auto& upstream_request : upstream_requests_) { - if (buffering) { - // We need to make a copy before encoding since it's all moves from here - // on if we might have multiple upstream requests or traffic - // shadowing/retries. - Buffer::OwnedImpl copy(data); - upstream_request->encodeData(copy, end_stream); - } else { - upstream_request->encodeData(data, end_stream); - } - } - if (buffering) { + // If we are going to buffer for retries or shadowing, we need to make a copy before encoding + // since it's all moves from here on. + Buffer::OwnedImpl copy(data); + upstream_requests_.front()->encodeData(copy, end_stream); + // If we are potentially going to retry or shadow this request we need to buffer. // This will not cause the connection manager to 413 because before we hit the // buffer limit we give up on retries and buffering. 
We must buffer using addDecodedData() // so that all buffered data is available by the time we do request complete processing and // potentially shadow. callbacks_->addDecodedData(data, true); + } else { + upstream_requests_.front()->encodeData(data, end_stream); } if (end_stream) { @@ -610,10 +604,8 @@ void Filter::onResponseTimeout() { // Called when the per try timeout is hit but we didn't reset the request // (hedge_on_per_try_timeout enabled). void Filter::onSoftPerTryTimeout(UpstreamRequest& upstream_request) { - // Even though we didn't cancel the request yet we still want to track it - // in outlier detection. - // TODO(mpuncel) is it weird to have a pretend response code here? we might - // get a 200 back from this request later. + // Track this as a timeout for outlier detection purposes even though we didn't + // cancel the request yet and might get a 2xx later. updateOutlierDetection(timeout_response_code_, upstream_request); upstream_request.outlier_detection_timeout_recorded_ = true; diff --git a/source/common/router/router.h b/source/common/router/router.h index e2d59874daba9..eb24abb7a1b0b 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -63,7 +63,6 @@ class FilterUtility { }; struct HedgingParams { - uint32_t initial_requests_; bool hedge_on_per_try_timeout_; }; @@ -98,12 +97,10 @@ class FilterUtility { /** * Determine the final hedging settings after applying randomized behavior. * @param route supplies the request route. - * @param random_value supplies a stable random value to use for evaluating whether an additional - * initial request should be sent + * @param request_headers supplies the request headers. 
* @return HedgingParams the final parameters to use for request hedging */ - static HedgingParams finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers, - uint64_t random_value); + static HedgingParams finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers); }; /** @@ -325,6 +322,11 @@ class Filter : Logger::Loggable, // disabling reads will not slow down other upstream requests. If we've // already seen the full downstream request (downstream_end_stream_) then // disabling reads is a no-op. + // This assert condition must be true because + // parent_.upstream_requests_.size() can only be greater than 1 in the + // case of a per-try-timeout with hedge_on_per_try_timeout enabled, and + // the per try timeout timer is started only after downstream_end_stream_ + // is true. ASSERT(parent_.upstream_requests_.size() == 1 || parent_.downstream_end_stream_); parent_.cluster_->stats().upstream_flow_control_backed_up_total_.inc(); parent_.callbacks_->onDecoderFilterAboveWriteBufferHighWatermark(); @@ -335,6 +337,11 @@ class Filter : Logger::Loggable, // disabling reads will not overflow any write buffers in other upstream // requests. If we've already seen the full downstream request // (downstream_end_stream_) then enabling reads is a no-op. + // This assert condition must be true because + // parent_.upstream_requests_.size() can only be greater than 1 in the + // case of a per-try-timeout with hedge_on_per_try_timeout enabled, and + // the per try timeout timer is started only after downstream_end_stream_ + // is true. 
ASSERT(parent_.upstream_requests_.size() == 1 || parent_.downstream_end_stream_); parent_.cluster_->stats().upstream_flow_control_drained_total_.inc(); parent_.callbacks_->onDecoderFilterBelowWriteBufferLowWatermark(); diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 31c3167386e50..8ad01f3797538 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -210,7 +210,6 @@ class RouterTestBase : public testing::Test { } void enableHedgeOnPerTryTimeout() { - callbacks_.route_->route_entry_.hedge_policy_.initial_requests_ = 1; callbacks_.route_->route_entry_.hedge_policy_.hedge_on_per_try_timeout_ = true; callbacks_.route_->route_entry_.hedge_policy_.additional_request_chance_ = envoy::type::FractionalPercent{}; @@ -2846,41 +2845,6 @@ TEST_F(RouterTest, UpstreamTimingTimeout) { EXPECT_EQ(stream_info.firstUpstreamRxByteReceived().value(), std::chrono::milliseconds(56)); } -TEST(RouterFilterUtilityTest, FinalHedgingParamsInitialRequests) { - Http::TestHeaderMapImpl empty_headers; - { // no chance of additional request, header not present - NiceMock route; - route.hedge_policy_.initial_requests_ = 10; - EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, empty_headers, 0); - EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 10); - EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 100); - EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 1000); - EXPECT_EQ(10, hedgingParams.initial_requests_); - } - { // 50% chance additional request - NiceMock route; - route.hedge_policy_.initial_requests_ = 10; - route.hedge_policy_.additional_request_chance_.set_numerator(50); - 
EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, empty_headers, 0); - EXPECT_EQ(11, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 49); - EXPECT_EQ(11, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 50); - EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 99); - EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 100); - EXPECT_EQ(11, hedgingParams.initial_requests_); - } -} - TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { Http::TestHeaderMapImpl empty_headers; { // route says true, header not present, expect true. @@ -2888,7 +2852,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, empty_headers, 0); + FilterUtility::finalHedgingParams(route, empty_headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header not present, expect false. @@ -2896,7 +2860,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, empty_headers, 0); + FilterUtility::finalHedgingParams(route, empty_headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header says true, expect true. 
@@ -2905,7 +2869,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, 0); + FilterUtility::finalHedgingParams(route, headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header says false, expect false. @@ -2914,7 +2878,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, 0); + FilterUtility::finalHedgingParams(route, headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says false, expect false. @@ -2923,7 +2887,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, 0); + FilterUtility::finalHedgingParams(route, headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says true, expect true. @@ -2932,7 +2896,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, 0); + FilterUtility::finalHedgingParams(route, headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header is invalid, expect true. 
@@ -2941,7 +2905,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, 0); + FilterUtility::finalHedgingParams(route, headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header is invalid, expect false. @@ -2950,26 +2914,9 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, 0); + FilterUtility::finalHedgingParams(route, headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } - { // 50% chance additional request - NiceMock route; - route.hedge_policy_.initial_requests_ = 10; - route.hedge_policy_.additional_request_chance_.set_numerator(50); - EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, empty_headers, 0); - EXPECT_EQ(11, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 49); - EXPECT_EQ(11, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 50); - EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 99); - EXPECT_EQ(10, hedgingParams.initial_requests_); - hedgingParams = FilterUtility::finalHedgingParams(route, empty_headers, 100); - EXPECT_EQ(11, hedgingParams.initial_requests_); - } } TEST(RouterFilterUtilityTest, FinalTimeout) { From 9b3398b8e0fb113742562c2fd33abaf8606911df Mon Sep 17 00:00:00 2001 From: Michael 
Puncel Date: Thu, 25 Apr 2019 10:27:41 -0400 Subject: [PATCH 43/70] fix format Signed-off-by: Michael Puncel --- source/common/router/router.cc | 3 +-- source/common/router/router.h | 3 ++- test/common/router/router_test.cc | 18 ++++++------------ 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index d8c3cea1edff1..f79697d02a96a 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -406,8 +406,7 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e // Ensure an http transport scheme is selected before continuing with decoding. ASSERT(headers.Scheme()); - hedging_params_ = - FilterUtility::finalHedgingParams(*route_entry_, headers); + hedging_params_ = FilterUtility::finalHedgingParams(*route_entry_, headers); retry_state_ = createRetryState(route_entry_->retryPolicy(), headers, *cluster_, config_.runtime_, diff --git a/source/common/router/router.h b/source/common/router/router.h index eb24abb7a1b0b..5f4694ea9de8b 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -100,7 +100,8 @@ class FilterUtility { * @param request_headers supplies the request headers. 
* @return HedgingParams the final parameters to use for request hedging */ - static HedgingParams finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers); + static HedgingParams finalHedgingParams(const RouteEntry& route, + Http::HeaderMap& request_headers); }; /** diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 8ad01f3797538..c9d876a44b7a8 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -2868,8 +2868,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header says false, expect false. @@ -2877,8 +2876,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says false, expect false. 
@@ -2886,8 +2884,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says true, expect true. @@ -2895,8 +2892,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header is invalid, expect true. @@ -2904,8 +2900,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header is invalid, expect false. 
@@ -2913,8 +2908,7 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } } From 8ecf8542e0b40a1e191f31909ec9d3de69deca72 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Thu, 25 Apr 2019 10:55:40 -0400 Subject: [PATCH 44/70] add stat for count of hedged requests Signed-off-by: Michael Puncel --- .../cluster_manager/cluster_stats.rst | 1 + include/envoy/upstream/upstream.h | 1 + source/common/router/router.cc | 2 ++ test/common/router/router_test.cc | 18 ++++++++++++++++++ 4 files changed, 22 insertions(+) diff --git a/docs/root/configuration/cluster_manager/cluster_stats.rst b/docs/root/configuration/cluster_manager/cluster_stats.rst index 0e2ff5676fda8..a485d37ab4913 100644 --- a/docs/root/configuration/cluster_manager/cluster_stats.rst +++ b/docs/root/configuration/cluster_manager/cluster_stats.rst @@ -62,6 +62,7 @@ Every cluster has a statistics tree rooted at *cluster..* with the followi upstream_cx_none_healthy, Counter, Total times connection not established due to no healthy hosts upstream_rq_total, Counter, Total requests upstream_rq_hedge_abandoned, Counter, Number of hedged requests that were abandoned due to accepting another response. + upstream_rq_hedge_attempted, Counter, Total number of hedged requests that were attempted. 
upstream_rq_active, Gauge, Total active requests upstream_rq_pending_total, Counter, Total requests pending a connection pool connection upstream_rq_pending_overflow, Counter, Total requests that overflowed connection pool circuit breaking and were failed diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index 0d44adc52eaf4..486cec19311a6 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -490,6 +490,7 @@ class PrioritySet { COUNTER (upstream_cx_pool_overflow) \ COUNTER (upstream_rq_total) \ COUNTER (upstream_rq_hedge_abandoned) \ + COUNTER (upstream_rq_hedge_attempted) \ GAUGE (upstream_rq_active) \ COUNTER (upstream_rq_completed) \ COUNTER (upstream_rq_pending_total) \ diff --git a/source/common/router/router.cc b/source/common/router/router.cc index f79697d02a96a..03b9efbab5dba 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -618,6 +618,8 @@ void Filter::onSoftPerTryTimeout(UpstreamRequest& upstream_request) { // later if 1) we hit global timeout or 2) we get bad response headers // back. 
upstream_request.retried_ = true; + + cluster_->stats().upstream_rq_hedge_attempted_.inc(); } else if (retry_status == RetryStatus::NoOverflow) { callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); } else if (retry_status == RetryStatus::NoRetryLimitExceeded) { diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index c9d876a44b7a8..b07204eb8cc38 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -1375,6 +1375,9 @@ TEST_F(RouterTest, HedgedPerTryTimeoutFirstRequestSucceeds) { EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ .counter("upstream_rq_hedge_abandoned") .value()); + EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_attempted") + .value()); } // Three requests sent: 1) 5xx error, 2) per try timeout, 3) gets good response @@ -1426,6 +1429,9 @@ TEST_F(RouterTest, HedgedPerTryTimeoutThirdRequestSucceeds) { EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ .counter("upstream_rq_hedge_abandoned") .value()); + EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_attempted") + .value()); // Now trigger a per try timeout on the 2nd request, expect a 3rd router_.retry_state_->expectHedgedPerTryTimeoutRetry(); @@ -1450,6 +1456,9 @@ TEST_F(RouterTest, HedgedPerTryTimeoutThirdRequestSucceeds) { EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ .counter("upstream_rq_hedge_abandoned") .value()); + EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_attempted") + .value()); // Now write a 200 back. We expect the 2nd stream to be reset and stats to be // incremented properly. 
@@ -1471,6 +1480,9 @@ TEST_F(RouterTest, HedgedPerTryTimeoutThirdRequestSucceeds) { EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ .counter("upstream_rq_hedge_abandoned") .value()); + EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_attempted") + .value()); } // First request times out and is retried, and then a response is received. @@ -1665,6 +1677,9 @@ TEST_F(RouterTest, HedgedPerTryTimeoutGlobalTimeout) { EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ .counter("upstream_rq_hedge_abandoned") .value()); + EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_attempted") + .value()); // Now trigger global timeout, expect everything to be reset EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(1); @@ -1682,6 +1697,9 @@ TEST_F(RouterTest, HedgedPerTryTimeoutGlobalTimeout) { EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ .counter("upstream_rq_hedge_abandoned") .value()); + EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ + .counter("upstream_rq_hedge_attempted") + .value()); EXPECT_EQ(2, cm_.thread_local_cluster_.cluster_.info_->stats_store_.counter("upstream_rq_timeout") .value()); } From f4e3b9f3c872c92e50962f7903c89fafd5e86f45 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Thu, 25 Apr 2019 11:07:14 -0400 Subject: [PATCH 45/70] do removal of upstream request from list inside maybeRetryReset Signed-off-by: Michael Puncel --- source/common/router/router.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 03b9efbab5dba..eec9566e7af19 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -645,7 +645,6 @@ void Filter::onPerTryTimeout(UpstreamRequest& upstream_request) { updateOutlierDetection(timeout_response_code_, upstream_request); if 
(maybeRetryReset(Http::StreamResetReason::LocalReset, upstream_request)) { - upstream_request.removeFromList(upstream_requests_); return; } @@ -723,6 +722,7 @@ bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason, if (upstream_request.upstream_host_) { upstream_request.upstream_host_->stats().rq_error_.inc(); } + upstream_request.removeFromList(upstream_requests_); return true; } else if (retry_status == RetryStatus::NoOverflow) { callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); From 647e18f1dd60cdedfbb4528b0e01c275e2229e91 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 29 Apr 2019 16:29:37 -0400 Subject: [PATCH 46/70] for hedged requests, set x-envoy-expected-timeout and grpc-timeout to be global timeout not per try timeout. This will ensure that the upstream server doesn't cancel the request remotely while we're still waiting for a reply. Signed-off-by: Michael Puncel --- source/common/router/router.cc | 14 ++++--- source/common/router/router.h | 3 +- test/common/router/router_test.cc | 69 +++++++++++++++++++++---------- 3 files changed, 59 insertions(+), 27 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index eec9566e7af19..479ef4ba5b8b0 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -115,7 +115,8 @@ bool FilterUtility::shouldShadow(const ShadowPolicy& policy, Runtime::Loader& ru FilterUtility::TimeoutData FilterUtility::finalTimeout(const RouteEntry& route, Http::HeaderMap& request_headers, - bool insert_envoy_expected_request_timeout_ms, bool grpc_request) { + bool insert_envoy_expected_request_timeout_ms, bool grpc_request, + bool per_try_timeout_hedging_enabled) { // See if there is a user supplied timeout in a request header. If there is we take that. 
// Otherwise if the request is gRPC and a maximum gRPC timeout is configured we use the timeout // in the gRPC headers (or infinity when gRPC headers have no timeout), but cap that timeout to @@ -173,7 +174,10 @@ FilterUtility::finalTimeout(const RouteEntry& route, Http::HeaderMap& request_he // See if there is any timeout to write in the expected timeout header. uint64_t expected_timeout = timeout.per_try_timeout_.count(); - if (expected_timeout == 0) { + // Use the global timeout if no per try timeout was specified or if we're + // doing hedging when there are per try timeouts. Either of these scenarios + // mean that the upstream server can use the full global timeout. + if (per_try_timeout_hedging_enabled || expected_timeout == 0) { expected_timeout = timeout.global_timeout_.count(); } @@ -382,8 +386,10 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e return Http::FilterHeadersStatus::StopIteration; } + hedging_params_ = FilterUtility::finalHedgingParams(*route_entry_, headers); + timeout_ = FilterUtility::finalTimeout(*route_entry_, headers, !config_.suppress_envoy_headers_, - grpc_request_); + grpc_request_, hedging_params_.hedge_on_per_try_timeout_); // If this header is set with any value, use an alternate response code on timeout if (headers.EnvoyUpstreamRequestTimeoutAltResponse()) { @@ -406,8 +412,6 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e // Ensure an http transport scheme is selected before continuing with decoding. 
ASSERT(headers.Scheme()); - hedging_params_ = FilterUtility::finalHedgingParams(*route_entry_, headers); - retry_state_ = createRetryState(route_entry_->retryPolicy(), headers, *cluster_, config_.runtime_, config_.random_, callbacks_->dispatcher(), route_entry_->priority()); diff --git a/source/common/router/router.h b/source/common/router/router.h index 5f4694ea9de8b..8cf8276e502ee 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -92,7 +92,8 @@ class FilterUtility { * @return TimeoutData for both the global and per try timeouts. */ static TimeoutData finalTimeout(const RouteEntry& route, Http::HeaderMap& request_headers, - bool insert_envoy_expected_request_timeout_ms, bool grpc_request); + bool insert_envoy_expected_request_timeout_ms, bool grpc_request, + bool per_try_timeout_hedging_enabled); /** * Determine the final hedging settings after applying randomized behavior. diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index b07204eb8cc38..0b9e90c05b63d 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -2936,7 +2936,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { NiceMock route; EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(10), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); } @@ -2944,7 +2944,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { NiceMock route; EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false); + 
FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2955,7 +2955,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { NiceMock route; EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "bad"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(10), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2967,7 +2967,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2980,7 +2980,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false); + FilterUtility::TimeoutData 
timeout = FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2988,12 +2988,39 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_EQ("5", headers.get_("x-envoy-expected-rq-timeout-ms")); EXPECT_FALSE(headers.has("grpc-timeout")); } + { + NiceMock route; + EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); + Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}, + {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, true); + EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); + EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); + EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); + EXPECT_FALSE(headers.has("x-envoy-upstream-rq-per-try-timeout-ms")); + EXPECT_EQ("15", headers.get_("x-envoy-expected-rq-timeout-ms")); + EXPECT_FALSE(headers.has("grpc-timeout")); + } + { + NiceMock route; + EXPECT_CALL(route, maxGrpcTimeout()) + .WillRepeatedly(Return(absl::optional(10))); + Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}, + {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, true); + EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); + EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); + EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); + EXPECT_FALSE(headers.has("x-envoy-upstream-rq-per-try-timeout-ms")); + EXPECT_EQ("15", headers.get_("x-envoy-expected-rq-timeout-ms")); + EXPECT_EQ("15m", headers.get_("grpc-timeout")); + } { NiceMock route; route.retry_policy_.per_try_timeout_ = std::chrono::milliseconds(7); 
EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(7), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3007,7 +3034,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3020,7 +3047,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, maxGrpcTimeout()) .WillRepeatedly(Return(absl::optional(0))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(0), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("grpc-timeout")); @@ -3030,7 +3057,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, maxGrpcTimeout()).WillRepeatedly(Return(absl::nullopt)); EXPECT_CALL(route, 
timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(10), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("grpc-timeout")); @@ -3041,7 +3068,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { .WillRepeatedly(Return(absl::optional(0))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1000m"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(1000), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_EQ("1000m", headers.get_("grpc-timeout")); @@ -3052,7 +3079,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { .WillRepeatedly(Return(absl::optional(999))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1000m"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(999), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_EQ("999m", headers.get_("grpc-timeout")); @@ -3062,7 +3089,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, maxGrpcTimeout()) .WillRepeatedly(Return(absl::optional(999))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "0m"}}; - FilterUtility::TimeoutData timeout = 
FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(999), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_EQ("999m", headers.get_("grpc-timeout")); @@ -3074,7 +3101,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, grpcTimeoutOffset()) .WillRepeatedly(Return(absl::optional(10))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "100m"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(90), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); } @@ -3085,7 +3112,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, grpcTimeoutOffset()) .WillRepeatedly(Return(absl::optional(10))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1m"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(1), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); } @@ -3096,7 +3123,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); 
EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3110,7 +3137,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "bad"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(1000), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3125,7 +3152,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3141,7 +3168,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3157,7 +3184,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { Http::TestHeaderMapImpl 
headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(7), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3174,7 +3201,7 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3189,7 +3216,7 @@ TEST(RouterFilterUtilityTest, FinalTimeoutSupressEnvoyHeaders) { NiceMock route; EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false); + FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); From 73b8f5331361cf1a8b73a6125e25d2b5767bead7 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 30 Apr 2019 10:55:02 -0400 Subject: [PATCH 47/70] remove hedging stats (for now) Signed-off-by: Michael Puncel --- .../cluster_manager/cluster_stats.rst 
| 2 - docs/root/operations/admin.rst | 1 - include/envoy/upstream/host_description.h | 1 - include/envoy/upstream/upstream.h | 2 - source/common/router/router.cc | 8 ++- test/common/router/router_test.cc | 49 ++----------------- 6 files changed, 8 insertions(+), 55 deletions(-) diff --git a/docs/root/configuration/cluster_manager/cluster_stats.rst b/docs/root/configuration/cluster_manager/cluster_stats.rst index a485d37ab4913..b5b6554be7b63 100644 --- a/docs/root/configuration/cluster_manager/cluster_stats.rst +++ b/docs/root/configuration/cluster_manager/cluster_stats.rst @@ -61,8 +61,6 @@ Every cluster has a statistics tree rooted at *cluster..* with the followi upstream_cx_max_requests, Counter, Total connections closed due to maximum requests upstream_cx_none_healthy, Counter, Total times connection not established due to no healthy hosts upstream_rq_total, Counter, Total requests - upstream_rq_hedge_abandoned, Counter, Number of hedged requests that were abandoned due to accepting another response. - upstream_rq_hedge_attempted, Counter, Total number of hedged requests that were attempted. upstream_rq_active, Gauge, Total active requests upstream_rq_pending_total, Counter, Total requests pending a connection pool connection upstream_rq_pending_overflow, Counter, Total requests that overflowed connection pool circuit breaking and were failed diff --git a/docs/root/operations/admin.rst b/docs/root/operations/admin.rst index ea644e36dca81..5d99e09bd7c76 100644 --- a/docs/root/operations/admin.rst +++ b/docs/root/operations/admin.rst @@ -84,7 +84,6 @@ modify different aspects of the server: cx_total, Counter, Total connections cx_active, Gauge, Total active connections cx_connect_fail, Counter, Total connection failures - rq_hedge_abandoned, Counter, Total hedged requests that were canceled and abandoned due to accepting another response. 
rq_total, Counter, Total requests rq_timeout, Counter, Total timed out requests rq_success, Counter, Total requests with non-5xx responses diff --git a/include/envoy/upstream/host_description.h b/include/envoy/upstream/host_description.h index db3a8359e82d5..1fabc6686946e 100644 --- a/include/envoy/upstream/host_description.h +++ b/include/envoy/upstream/host_description.h @@ -24,7 +24,6 @@ namespace Upstream { COUNTER(cx_total) \ GAUGE (cx_active) \ COUNTER(cx_connect_fail) \ - COUNTER(rq_hedge_abandoned) \ COUNTER(rq_total) \ COUNTER(rq_timeout) \ COUNTER(rq_success) \ diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index 486cec19311a6..25f431860fbc9 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -489,8 +489,6 @@ class PrioritySet { COUNTER (upstream_cx_none_healthy) \ COUNTER (upstream_cx_pool_overflow) \ COUNTER (upstream_rq_total) \ - COUNTER (upstream_rq_hedge_abandoned) \ - COUNTER (upstream_rq_hedge_attempted) \ GAUGE (upstream_rq_active) \ COUNTER (upstream_rq_completed) \ COUNTER (upstream_rq_pending_total) \ diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 479ef4ba5b8b0..882a518433a23 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -623,7 +623,7 @@ void Filter::onSoftPerTryTimeout(UpstreamRequest& upstream_request) { // back. upstream_request.retried_ = true; - cluster_->stats().upstream_rq_hedge_attempted_.inc(); + // TODO: cluster stat for hedge attempted. 
} else if (retry_status == RetryStatus::NoOverflow) { callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); } else if (retry_status == RetryStatus::NoRetryLimitExceeded) { @@ -826,10 +826,8 @@ void Filter::resetOtherUpstreams(UpstreamRequest& upstream_request) { for (auto& upstream_request_tmp : upstream_requests_) { if (upstream_request_tmp.get() != &upstream_request) { upstream_request_tmp->resetStream(); - if (upstream_request_tmp->upstream_host_) { - upstream_request_tmp->upstream_host_->stats().rq_hedge_abandoned_.inc(); - } - cluster_->stats().upstream_rq_hedge_abandoned_.inc(); + // TODO: per-host stat for hedge abandoned. + // TODO: cluster stat for hedge abandoned. } } } diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 0b9e90c05b63d..59e7913d80769 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -1371,13 +1371,8 @@ TEST_F(RouterTest, HedgedPerTryTimeoutFirstRequestSucceeds) { })); response_decoder1->decodeHeaders(std::move(response_headers), true); EXPECT_TRUE(verifyHostUpstreamStats(1, 0)); - EXPECT_EQ(1, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); - EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_abandoned") - .value()); - EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_attempted") - .value()); + + // TODO: Verify hedge stats here once they are implemented. 
} // Three requests sent: 1) 5xx error, 2) per try timeout, 3) gets good response @@ -1425,13 +1420,6 @@ TEST_F(RouterTest, HedgedPerTryTimeoutThirdRequestSucceeds) { router_.retry_state_->callback_(); EXPECT_TRUE(verifyHostUpstreamStats(0, 1)); - EXPECT_EQ(0, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); - EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_abandoned") - .value()); - EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_attempted") - .value()); // Now trigger a per try timeout on the 2nd request, expect a 3rd router_.retry_state_->expectHedgedPerTryTimeoutRetry(); @@ -1452,13 +1440,6 @@ TEST_F(RouterTest, HedgedPerTryTimeoutThirdRequestSucceeds) { expectPerTryTimerCreate(); router_.retry_state_->callback_(); EXPECT_TRUE(verifyHostUpstreamStats(0, 1)); - EXPECT_EQ(0, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); - EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_abandoned") - .value()); - EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_attempted") - .value()); // Now write a 200 back. We expect the 2nd stream to be reset and stats to be // incremented properly. 
@@ -1476,13 +1457,8 @@ TEST_F(RouterTest, HedgedPerTryTimeoutThirdRequestSucceeds) { EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).WillOnce(Return(RetryStatus::No)); response_decoder3->decodeHeaders(std::move(response_headers2), true); EXPECT_TRUE(verifyHostUpstreamStats(1, 1)); - EXPECT_EQ(1, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); - EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_abandoned") - .value()); - EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_attempted") - .value()); + + // TODO: Verify hedge stats here once they are implemented. } // First request times out and is retried, and then a response is received. @@ -1673,13 +1649,6 @@ TEST_F(RouterTest, HedgedPerTryTimeoutGlobalTimeout) { router_.retry_state_->callback_(); EXPECT_TRUE(verifyHostUpstreamStats(0, 0)); - EXPECT_EQ(0, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); - EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_abandoned") - .value()); - EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_attempted") - .value()); // Now trigger global timeout, expect everything to be reset EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(1); @@ -1692,16 +1661,8 @@ TEST_F(RouterTest, HedgedPerTryTimeoutGlobalTimeout) { })); response_timeout_->callback_(); EXPECT_TRUE(verifyHostUpstreamStats(0, 2)); - EXPECT_EQ(0, cm_.conn_pool_.host_->stats_store_.counter("rq_hedge_abandoned").value()); EXPECT_EQ(2, cm_.conn_pool_.host_->stats_store_.counter("rq_timeout").value()); - EXPECT_EQ(0, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_abandoned") - .value()); - EXPECT_EQ(1, cm_.thread_local_cluster_.cluster_.info_->stats_store_ - .counter("upstream_rq_hedge_attempted") - .value()); - EXPECT_EQ(2, 
cm_.thread_local_cluster_.cluster_.info_->stats_store_.counter("upstream_rq_timeout") - .value()); + // TODO: Verify hedge stats here once they are implemented. } TEST_F(RouterTest, RetryNoneHealthy) { From f7aa9908c927f631ea8b0001b5758de1da9d560b Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 30 Apr 2019 10:58:58 -0400 Subject: [PATCH 48/70] fix format Signed-off-by: Michael Puncel --- test/common/router/router_test.cc | 69 ++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 59e7913d80769..851bfd5b09169 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -2897,7 +2897,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { NiceMock route; EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(10), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); } @@ -2905,7 +2906,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { NiceMock route; EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2916,7 +2918,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { NiceMock route; 
EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "bad"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(10), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2928,7 +2931,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2941,7 +2945,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2954,7 +2959,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, 
timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, true); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, false, true); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2968,7 +2974,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { .WillRepeatedly(Return(absl::optional(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, true); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, true); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2981,7 +2988,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { route.retry_policy_.per_try_timeout_ = std::chrono::milliseconds(7); EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(7), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -2995,7 +3003,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { 
EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3008,7 +3017,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, maxGrpcTimeout()) .WillRepeatedly(Return(absl::optional(0))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(0), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("grpc-timeout")); @@ -3018,7 +3028,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, maxGrpcTimeout()).WillRepeatedly(Return(absl::nullopt)); EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(10), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("grpc-timeout")); @@ -3029,7 +3040,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { 
.WillRepeatedly(Return(absl::optional(0))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1000m"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(1000), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_EQ("1000m", headers.get_("grpc-timeout")); @@ -3040,7 +3052,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { .WillRepeatedly(Return(absl::optional(999))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1000m"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(999), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_EQ("999m", headers.get_("grpc-timeout")); @@ -3050,7 +3063,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, maxGrpcTimeout()) .WillRepeatedly(Return(absl::optional(999))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "0m"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(999), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_EQ("999m", headers.get_("grpc-timeout")); @@ -3062,7 +3076,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, grpcTimeoutOffset()) .WillRepeatedly(Return(absl::optional(10))); Http::TestHeaderMapImpl headers{{"content-type", 
"application/grpc"}, {"grpc-timeout", "100m"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(90), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); } @@ -3073,7 +3088,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { EXPECT_CALL(route, grpcTimeoutOffset()) .WillRepeatedly(Return(absl::optional(10))); Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1m"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(1), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); } @@ -3084,7 +3100,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3098,7 +3115,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "bad"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, 
true, false); EXPECT_EQ(std::chrono::milliseconds(1000), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3113,7 +3131,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3129,7 +3148,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3145,7 +3165,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { Http::TestHeaderMapImpl headers{{"content-type", "application/grpc"}, {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(7), timeout.per_try_timeout_); 
EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3162,7 +3183,8 @@ TEST(RouterFilterUtilityTest, FinalTimeout) { {"grpc-timeout", "1000m"}, {"x-envoy-upstream-rq-timeout-ms", "15"}, {"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, true, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, true, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(5), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); @@ -3177,7 +3199,8 @@ TEST(RouterFilterUtilityTest, FinalTimeoutSupressEnvoyHeaders) { NiceMock route; EXPECT_CALL(route, timeout()).WillOnce(Return(std::chrono::milliseconds(10))); Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-timeout-ms", "15"}}; - FilterUtility::TimeoutData timeout = FilterUtility::finalTimeout(route, headers, true, false, false); + FilterUtility::TimeoutData timeout = + FilterUtility::finalTimeout(route, headers, true, false, false); EXPECT_EQ(std::chrono::milliseconds(15), timeout.global_timeout_); EXPECT_EQ(std::chrono::milliseconds(0), timeout.per_try_timeout_); EXPECT_FALSE(headers.has("x-envoy-upstream-rq-timeout-ms")); From 701ecce618a4535e4a9dca459f859865725ec178 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 30 Apr 2019 12:33:09 -0400 Subject: [PATCH 49/70] make hedging require a cluster flag This will be useful for avoiding a backwards compatibility issue when we want to add stats for hedging. We want people to opt-in to both the stats and the functionality at once, not just the stats. 
Signed-off-by: Michael Puncel --- api/envoy/api/v2/cds.proto | 6 ++- include/envoy/upstream/upstream.h | 5 ++ source/common/router/router.cc | 9 +++- source/common/router/router.h | 6 ++- source/common/upstream/upstream_impl.cc | 3 +- source/common/upstream/upstream_impl.h | 3 ++ test/common/router/router_test.cc | 54 +++++++++++++++---- .../http_timeout_integration_test.h | 4 ++ test/mocks/upstream/cluster_info.h | 1 + 9 files changed, 76 insertions(+), 15 deletions(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index 6fb858efd6420..ae30310188f12 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -51,7 +51,7 @@ service ClusterDiscoveryService { // [#protodoc-title: Clusters] // Configuration for a single upstream cluster. -// [#comment:next free field: 39] +// [#comment:next free field: 40] message Cluster { // Supplies the name of the cluster which must be unique across all clusters. // The cluster name is used when emitting @@ -581,6 +581,10 @@ message Cluster { // If this flag is not set to true, Envoy will wait until the hosts fail active health // checking before removing it from the cluster. bool drain_connections_on_host_removal = 32; + + // If enabled, allow HTTP/gRPC requests to this cluster to use a hedging + // strategy in which multiple upstream requests may be sent simultaneously. + bool allow_request_hedging = 39; } // An extensible structure containing the address Envoy should bind to when diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index 25f431860fbc9..8ad43767b3a05 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -777,6 +777,11 @@ class ClusterInfo { */ virtual absl::optional eds_service_name() const PURE; + /** + * @return whether to allow request hedging to this cluster to occur. + */ + virtual bool allowRequestHedging() const PURE; + protected: /** * Invoked by extensionProtocolOptionsTyped. 
diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 882a518433a23..c0731c1863e9e 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -198,8 +198,13 @@ FilterUtility::finalTimeout(const RouteEntry& route, Http::HeaderMap& request_he } FilterUtility::HedgingParams FilterUtility::finalHedgingParams(const RouteEntry& route, - Http::HeaderMap& request_headers) { + Http::HeaderMap& request_headers, + const Upstream::ClusterInfo& cluster) { HedgingParams hedgingParams; + if (!cluster.allowRequestHedging()) { + return hedgingParams; + } + hedgingParams.hedge_on_per_try_timeout_ = route.hedgePolicy().hedgeOnPerTryTimeout(); Http::HeaderEntry* hedge_on_per_try_timeout_entry = request_headers.EnvoyHedgeOnPerTryTimeout(); @@ -386,7 +391,7 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e return Http::FilterHeadersStatus::StopIteration; } - hedging_params_ = FilterUtility::finalHedgingParams(*route_entry_, headers); + hedging_params_ = FilterUtility::finalHedgingParams(*route_entry_, headers, *cluster_); timeout_ = FilterUtility::finalTimeout(*route_entry_, headers, !config_.suppress_envoy_headers_, grpc_request_, hedging_params_.hedge_on_per_try_timeout_); diff --git a/source/common/router/router.h b/source/common/router/router.h index 8cf8276e502ee..9d3a216444213 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -99,10 +99,12 @@ class FilterUtility { * Determine the final hedging settings after applying randomized behavior. * @param route supplies the request route. * @param request_headers supplies the request headers. - * @return HedgingParams the final parameters to use for request hedging + * @param cluster supplies the cluster info the request is destined for. + * @return HedgingParams the final parameters to use for request hedging. 
*/ static HedgingParams finalHedgingParams(const RouteEntry& route, - Http::HeaderMap& request_headers); + Http::HeaderMap& request_headers, + const Upstream::ClusterInfo& cluster); }; /** diff --git a/source/common/upstream/upstream_impl.cc b/source/common/upstream/upstream_impl.cc index fbc27861a3f4f..6af55bf5532d2 100644 --- a/source/common/upstream/upstream_impl.cc +++ b/source/common/upstream/upstream_impl.cc @@ -567,7 +567,8 @@ ClusterInfoImpl::ClusterInfoImpl(const envoy::api::v2::Cluster& config, metadata_(config.metadata()), typed_metadata_(config.metadata()), common_lb_config_(config.common_lb_config()), cluster_socket_options_(parseClusterSocketOptions(config, bind_config)), - drain_connections_on_host_removal_(config.drain_connections_on_host_removal()) { + drain_connections_on_host_removal_(config.drain_connections_on_host_removal()), + allow_request_hedging_(config.allow_request_hedging()) { switch (config.lb_policy()) { case envoy::api::v2::Cluster::ROUND_ROBIN: lb_type_ = LoadBalancerType::RoundRobin; diff --git a/source/common/upstream/upstream_impl.h b/source/common/upstream/upstream_impl.h index cb10291ea233f..8520220328b95 100644 --- a/source/common/upstream/upstream_impl.h +++ b/source/common/upstream/upstream_impl.h @@ -551,6 +551,8 @@ class ClusterInfoImpl : public ClusterInfo { absl::optional eds_service_name() const override { return eds_service_name_; } + bool allowRequestHedging() const override { return allow_request_hedging_; } + private: struct ResourceManagers { ResourceManagers(const envoy::api::v2::Cluster& config, Runtime::Loader& runtime, @@ -593,6 +595,7 @@ class ClusterInfoImpl : public ClusterInfo { const envoy::api::v2::Cluster::CommonLbConfig common_lb_config_; const Network::ConnectionSocket::OptionsSharedPtr cluster_socket_options_; const bool drain_connections_on_host_removal_; + const bool allow_request_hedging_; absl::optional eds_service_name_; }; diff --git a/test/common/router/router_test.cc 
b/test/common/router/router_test.cc index 851bfd5b09169..9b7adbe1880e6 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -2826,68 +2826,104 @@ TEST_F(RouterTest, UpstreamTimingTimeout) { TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { Http::TestHeaderMapImpl empty_headers; - { // route says true, header not present, expect true. + std::shared_ptr cluster{new Upstream::MockClusterInfo()}; + { // route says true, header not present, cluster allows it, expect true. NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, empty_headers); + FilterUtility::finalHedgingParams(route, empty_headers, *cluster); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } + { // route says true, header not present, cluster does not allow it, expect false. + NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = true; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(false)); + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, empty_headers, *cluster); + EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); + } { // route says false, header not present, expect false. 
NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, empty_headers); + FilterUtility::finalHedgingParams(route, empty_headers, *cluster); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header says true, expect true. Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "true"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } + { // route says false, header says true, cluster does not allow it, expect false. + Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "true"}}; + NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = false; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(false)); + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); + EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); + } { // route says false, header says false, expect false.
Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "false"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says false, expect false. Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "false"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says true, expect true. Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "true"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } + { // route says true, header says true, cluster does not allow it, expect false. 
+ Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "true"}}; + NiceMock route; + route.hedge_policy_.hedge_on_per_try_timeout_ = true; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(false)); + EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); + EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); + } { // route says true, header is invalid, expect true. Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "bad"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header is invalid, expect false. 
Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "bad"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; + EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } } diff --git a/test/integration/http_timeout_integration_test.h b/test/integration/http_timeout_integration_test.h index 356a9f905011d..36d6c4ac9dfcf 100644 --- a/test/integration/http_timeout_integration_test.h +++ b/test/integration/http_timeout_integration_test.h @@ -16,6 +16,10 @@ class HttpTimeoutIntegrationTest : public testing::TestWithParammutable_clusters()[0][0].set_allow_request_hedging(true); + }); } void testRouterRequestAndResponseWithHedgedPerTryTimeout(uint64_t request_size, diff --git a/test/mocks/upstream/cluster_info.h b/test/mocks/upstream/cluster_info.h index dd498b8fae26c..085589da712d2 100644 --- a/test/mocks/upstream/cluster_info.h +++ b/test/mocks/upstream/cluster_info.h @@ -85,6 +85,7 @@ class MockClusterInfo : public ClusterInfo { MOCK_CONST_METHOD0(clusterSocketOptions, const Network::ConnectionSocket::OptionsSharedPtr&()); MOCK_CONST_METHOD0(drainConnectionsOnHostRemoval, bool()); MOCK_CONST_METHOD0(eds_service_name, absl::optional()); + MOCK_CONST_METHOD0(allowRequestHedging, bool()); std::string name_{"fake_cluster"}; absl::optional eds_service_name_; From 8312bb70f8a8ef9bd10dc1eb3aa58f5d6e0e8e3a Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 30 Apr 2019 14:20:10 -0400 Subject: [PATCH 50/70] add more test permutations to hedging integration tests. Previously they only tested the first (timed out) request eventually getting a 200 response. 
Now both scenarios are tested, first and second request getting a response. Signed-off-by: Michael Puncel --- .../http_timeout_integration_test.cc | 64 +++++++++++++++---- .../http_timeout_integration_test.h | 3 +- 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/test/integration/http_timeout_integration_test.cc b/test/integration/http_timeout_integration_test.cc index fdb5a9586b51b..54b554f1a971c 100644 --- a/test/integration/http_timeout_integration_test.cc +++ b/test/integration/http_timeout_integration_test.cc @@ -154,27 +154,44 @@ TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeout) { EXPECT_EQ("200", response->headers().Status()->value().getStringView()); } -TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutWithBodyNoBuffer) { - testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 512); +TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutWithBodyNoBufferFirstRequestWins) { + testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 512, true); } -TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowUpstreamBufferLimitLargeRequest) { +TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutWithBodyNoBufferSecondRequestWins) { + testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 512, false); +} + +TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowUpstreamBufferLimitLargeRequestFirstRequestWins) { config_helper_.setBufferLimits(1024, 1024 * 1024); // Set buffer limits upstream and downstream. - testRouterRequestAndResponseWithHedgedPerTryTimeout(1024 * 1024, 1024); + testRouterRequestAndResponseWithHedgedPerTryTimeout(1024 * 1024, 1024, true); +} + +TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowUpstreamBufferLimitLargeRequestSecondRequestWins) { + config_helper_.setBufferLimits(1024, 1024 * 1024); // Set buffer limits upstream and downstream. 
+ testRouterRequestAndResponseWithHedgedPerTryTimeout(1024 * 1024, 1024, false); +} + +TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowDownstreamBufferLimitLargeResponseFirstRequestWins) { + config_helper_.setBufferLimits(1024 * 1024, 1024); // Set buffer limits upstream and downstream. + testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 1024 * 1024, true); } -TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowDownstreamBufferLimitLargeResponse) { +TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowDownstreamBufferLimitLargeResponseSecondRequestWins) { config_helper_.setBufferLimits(1024 * 1024, 1024); // Set buffer limits upstream and downstream. - testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 1024 * 1024); + testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 1024 * 1024, false); } // Sends a request with x-envoy-hedge-on-per-try-timeout, sleeps (with // simulated time) for longer than the per try timeout but shorter than the // global timeout, asserts that a retry is sent, and then responds with a 200 // response on the original request and ensures the downstream sees it. -// Request/response/header size are configurable to test flow control. +// Request/response/header size are configurable to test flow control. If +// first_request_wins is true, then the "winning" response will be sent in +// response to the first (timed out) request. If false, the second request will +// get the good response. 
void HttpTimeoutIntegrationTest::testRouterRequestAndResponseWithHedgedPerTryTimeout( - uint64_t request_size, uint64_t response_size) { + uint64_t request_size, uint64_t response_size, bool first_request_wins) { initialize(); codec_client_ = makeHttpConnection(makeClientConnection(lookupPort("http"))); @@ -212,17 +229,31 @@ void HttpTimeoutIntegrationTest::testRouterRequestAndResponseWithHedgedPerTryTim ASSERT_TRUE(upstream_request2->waitForHeadersComplete()); ASSERT_TRUE(upstream_request2->waitForEndStream(*dispatcher_)); - // Encode 200 response headers for the first (timed out) request. Http::TestHeaderMapImpl response_headers{{":status", "200"}}; - upstream_request_->encodeHeaders(response_headers, response_size == 0); + if (first_request_wins) { + // Encode 200 response headers for the first (timed out) request. + upstream_request_->encodeHeaders(response_headers, response_size == 0); + } else { + // Encode 200 response headers for the second request. + upstream_request2->encodeHeaders(response_headers, response_size == 0); + } response->waitForHeaders(); - // The second request should be reset since we used the response from the first request. - ASSERT_TRUE(upstream_request2->waitForReset(std::chrono::seconds(15))); + if (first_request_wins) { + // The second request should be reset since we used the response from the first request. + ASSERT_TRUE(upstream_request2->waitForReset(std::chrono::seconds(15))); + } else { + // The first request should be reset since we used the response from the second request. 
+ ASSERT_TRUE(upstream_request_->waitForReset(std::chrono::seconds(15))); + } if (response_size) { - upstream_request_->encodeData(response_size, true); + if (first_request_wins) { + upstream_request_->encodeData(response_size, true); + } else { + upstream_request2->encodeData(response_size, true); + } } response->waitForEndStream(); @@ -230,7 +261,12 @@ void HttpTimeoutIntegrationTest::testRouterRequestAndResponseWithHedgedPerTryTim codec_client_->close(); EXPECT_TRUE(upstream_request_->complete()); - EXPECT_EQ(request_size, upstream_request_->bodyLength()); + EXPECT_TRUE(upstream_request2->complete()); + if (first_request_wins) { + EXPECT_EQ(request_size, upstream_request_->bodyLength()); + } else { + EXPECT_EQ(request_size, upstream_request2->bodyLength()); + } EXPECT_TRUE(response->complete()); EXPECT_EQ("200", response->headers().Status()->value().getStringView()); diff --git a/test/integration/http_timeout_integration_test.h b/test/integration/http_timeout_integration_test.h index 36d6c4ac9dfcf..2f198f7ff48d0 100644 --- a/test/integration/http_timeout_integration_test.h +++ b/test/integration/http_timeout_integration_test.h @@ -23,7 +23,8 @@ class HttpTimeoutIntegrationTest : public testing::TestWithParam Date: Tue, 30 Apr 2019 14:34:52 -0400 Subject: [PATCH 51/70] fix format Signed-off-by: Michael Puncel --- source/common/router/router.cc | 6 ++--- source/common/router/router.h | 3 +-- test/common/router/router_test.cc | 24 ++++++++++++------- .../http_timeout_integration_test.cc | 12 ++++++---- .../http_timeout_integration_test.h | 5 ++-- 5 files changed, 31 insertions(+), 19 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index c0731c1863e9e..304ed00ac25b5 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -197,9 +197,9 @@ FilterUtility::finalTimeout(const RouteEntry& route, Http::HeaderMap& request_he return timeout; } -FilterUtility::HedgingParams 
FilterUtility::finalHedgingParams(const RouteEntry& route, - Http::HeaderMap& request_headers, - const Upstream::ClusterInfo& cluster) { +FilterUtility::HedgingParams +FilterUtility::finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers, + const Upstream::ClusterInfo& cluster) { HedgingParams hedgingParams; if (!cluster.allowRequestHedging()) { return hedgingParams; diff --git a/source/common/router/router.h b/source/common/router/router.h index 9d3a216444213..056f514d3dea1 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -102,8 +102,7 @@ class FilterUtility { * @param cluster supplies the cluster info the request is destined for. * @return HedgingParams the final parameters to use for request hedging. */ - static HedgingParams finalHedgingParams(const RouteEntry& route, - Http::HeaderMap& request_headers, + static HedgingParams finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers, const Upstream::ClusterInfo& cluster); }; diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 9b7adbe1880e6..244a85d8f4bcc 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -2860,7 +2860,8 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header says true, cluster does not allow it, expect true. 
@@ -2869,7 +2870,8 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(false)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header says false, expect false. @@ -2878,7 +2880,8 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says false, expect false. @@ -2887,7 +2890,8 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says true, expect true. 
@@ -2896,7 +2900,8 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says true, cluster does not allow it, expect false. @@ -2905,7 +2910,8 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(false)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header is invalid, expect true. @@ -2914,7 +2920,8 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = true; EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header is invalid, expect false. 
@@ -2923,7 +2930,8 @@ TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { route.hedge_policy_.hedge_on_per_try_timeout_ = false; EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = + FilterUtility::finalHedgingParams(route, headers, *cluster); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } } diff --git a/test/integration/http_timeout_integration_test.cc b/test/integration/http_timeout_integration_test.cc index 54b554f1a971c..70ecd9036ea15 100644 --- a/test/integration/http_timeout_integration_test.cc +++ b/test/integration/http_timeout_integration_test.cc @@ -162,22 +162,26 @@ TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutWithBodyNoBufferSecondRequ testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 512, false); } -TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowUpstreamBufferLimitLargeRequestFirstRequestWins) { +TEST_P(HttpTimeoutIntegrationTest, + HedgedPerTryTimeoutLowUpstreamBufferLimitLargeRequestFirstRequestWins) { config_helper_.setBufferLimits(1024, 1024 * 1024); // Set buffer limits upstream and downstream. testRouterRequestAndResponseWithHedgedPerTryTimeout(1024 * 1024, 1024, true); } -TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowUpstreamBufferLimitLargeRequestSecondRequestWins) { +TEST_P(HttpTimeoutIntegrationTest, + HedgedPerTryTimeoutLowUpstreamBufferLimitLargeRequestSecondRequestWins) { config_helper_.setBufferLimits(1024, 1024 * 1024); // Set buffer limits upstream and downstream. 
testRouterRequestAndResponseWithHedgedPerTryTimeout(1024 * 1024, 1024, false); } -TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowDownstreamBufferLimitLargeResponseFirstRequestWins) { +TEST_P(HttpTimeoutIntegrationTest, + HedgedPerTryTimeoutLowDownstreamBufferLimitLargeResponseFirstRequestWins) { config_helper_.setBufferLimits(1024 * 1024, 1024); // Set buffer limits upstream and downstream. testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 1024 * 1024, true); } -TEST_P(HttpTimeoutIntegrationTest, HedgedPerTryTimeoutLowDownstreamBufferLimitLargeResponseSecondRequestWins) { +TEST_P(HttpTimeoutIntegrationTest, + HedgedPerTryTimeoutLowDownstreamBufferLimitLargeResponseSecondRequestWins) { config_helper_.setBufferLimits(1024 * 1024, 1024); // Set buffer limits upstream and downstream. testRouterRequestAndResponseWithHedgedPerTryTimeout(1024, 1024 * 1024, false); } diff --git a/test/integration/http_timeout_integration_test.h b/test/integration/http_timeout_integration_test.h index 2f198f7ff48d0..8f4348c99bda1 100644 --- a/test/integration/http_timeout_integration_test.h +++ b/test/integration/http_timeout_integration_test.h @@ -18,8 +18,9 @@ class HttpTimeoutIntegrationTest : public testing::TestWithParammutable_clusters()[0][0].set_allow_request_hedging(true); - }); + bootstrap.mutable_static_resources()->mutable_clusters()[0][0].set_allow_request_hedging( + true); + }); } void testRouterRequestAndResponseWithHedgedPerTryTimeout(uint64_t request_size, From 068508837f501f2eacbc4cbcdd2f60074e989199 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 30 Apr 2019 15:29:24 -0400 Subject: [PATCH 52/70] initialize hedge_on_per_try_timeout_ properly and fix router tests Signed-off-by: Michael Puncel --- source/common/router/router.cc | 1 + test/common/router/router_test.cc | 1 + 2 files changed, 2 insertions(+) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 304ed00ac25b5..a39989ba56dbf 100644 --- 
a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -202,6 +202,7 @@ FilterUtility::finalHedgingParams(const RouteEntry& route, Http::HeaderMap& requ const Upstream::ClusterInfo& cluster) { HedgingParams hedgingParams; if (!cluster.allowRequestHedging()) { + hedgingParams.hedge_on_per_try_timeout_ = false; return hedgingParams; } diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 244a85d8f4bcc..0c834d7c6fba7 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -216,6 +216,7 @@ class RouterTestBase : public testing::Test { callbacks_.route_->route_entry_.hedge_policy_.additional_request_chance_.set_numerator(0); callbacks_.route_->route_entry_.hedge_policy_.additional_request_chance_.set_denominator( envoy::type::FractionalPercent::HUNDRED); + EXPECT_CALL(*cm_.thread_local_cluster_.cluster_.info_, allowRequestHedging).WillRepeatedly(Return(true)); } Event::SimulatedTimeSystem test_time_; From b29d6ca96bb783f3124ab0edaf73f6dedd2ae3ef Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 30 Apr 2019 15:59:02 -0400 Subject: [PATCH 53/70] update docs with allow_request_hedging cluster option Signed-off-by: Michael Puncel --- .../configuration/http_filters/router_filter.rst | 4 ++++ docs/root/intro/arch_overview/http_routing.rst | 14 ++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/docs/root/configuration/http_filters/router_filter.rst b/docs/root/configuration/http_filters/router_filter.rst index 0d416ac865026..360d269408293 100644 --- a/docs/root/configuration/http_filters/router_filter.rst +++ b/docs/root/configuration/http_filters/router_filter.rst @@ -229,6 +229,10 @@ in flight. The value of the header should be "true" or "false", and is ignored if invalid. +Hedging cannot be enabled by this header unless :ref:`allow_request_hedging +` has been enabled for the +cluster the request is destined for. + .. 
_config_http_filters_router_x-envoy-immediate-health-check-fail: x-envoy-immediate-health-check-fail diff --git a/docs/root/intro/arch_overview/http_routing.rst b/docs/root/intro/arch_overview/http_routing.rst index 3fd78b6dda849..6ba07f108e52b 100644 --- a/docs/root/intro/arch_overview/http_routing.rst +++ b/docs/root/intro/arch_overview/http_routing.rst @@ -93,12 +93,14 @@ Note that retries may be disabled depending on the contents of the :ref:`x-envoy Request Hedging --------------- -Envoy supports request hedging which can be enabled by specifying a :ref:`hedge policy -`. This means that Envoy will race multiple -simultaneous upstream requests and return the response associated with the -first acceptable response headers to the downstream. The retry policy is used -to determine whether a response should be returned or whether more responses -should be awaited. +Envoy supports request hedging which can be enabled by specifying a :ref:`hedge +policy ` along with enabling +:ref:`allow_request_hedging ` on +the relevant cluster. This means that Envoy will race multiple simultaneous +upstream requests and return the response associated with the first acceptable +response headers to the downstream. The retry policy is used to determine +whether a response should be returned or whether more responses should be +awaited. Currently hedging can only be performed in response to a request timeout. 
This means that a retry request will be issued without canceling the initial From 73652ee5fb87c42c530418caf49b3630d5cac399 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Wed, 1 May 2019 11:29:35 -0400 Subject: [PATCH 54/70] fix format Signed-off-by: Michael Puncel --- test/common/router/router_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index a74484ec96f51..f48f9c55fb1e6 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -216,7 +216,8 @@ class RouterTestBase : public testing::Test { callbacks_.route_->route_entry_.hedge_policy_.additional_request_chance_.set_numerator(0); callbacks_.route_->route_entry_.hedge_policy_.additional_request_chance_.set_denominator( envoy::type::FractionalPercent::HUNDRED); - EXPECT_CALL(*cm_.thread_local_cluster_.cluster_.info_, allowRequestHedging).WillRepeatedly(Return(true)); + EXPECT_CALL(*cm_.thread_local_cluster_.cluster_.info_, allowRequestHedging) + .WillRepeatedly(Return(true)); } Event::SimulatedTimeSystem test_time_; From ca87a4a0b6d59f07eaa21232398ca3f73fa1da19 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 10 May 2019 09:01:12 -0400 Subject: [PATCH 55/70] revert cluster config option to allow request hedging Signed-off-by: Michael Puncel --- api/envoy/api/v2/cds.proto | 6 +- .../http_filters/router_filter.rst | 4 -- .../root/intro/arch_overview/http_routing.rst | 12 ++-- include/envoy/upstream/upstream.h | 5 -- source/common/router/router.cc | 10 +-- source/common/router/router.h | 4 +- source/common/upstream/upstream_impl.cc | 3 +- source/common/upstream/upstream_impl.h | 3 - test/common/router/router_test.cc | 64 +++---------------- .../http_timeout_integration_test.h | 5 -- test/mocks/upstream/cluster_info.h | 1 - 11 files changed, 19 insertions(+), 98 deletions(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index ae30310188f12..6fb858efd6420 
100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -51,7 +51,7 @@ service ClusterDiscoveryService { // [#protodoc-title: Clusters] // Configuration for a single upstream cluster. -// [#comment:next free field: 40] +// [#comment:next free field: 39] message Cluster { // Supplies the name of the cluster which must be unique across all clusters. // The cluster name is used when emitting @@ -581,10 +581,6 @@ message Cluster { // If this flag is not set to true, Envoy will wait until the hosts fail active health // checking before removing it from the cluster. bool drain_connections_on_host_removal = 32; - - // If enabled, allow HTTP/gRPC requests to this cluster to use a hedging - // strategy in which multiple upstream requests may be sent simultaneously. - bool allow_request_hedging = 39; } // An extensible structure containing the address Envoy should bind to when diff --git a/docs/root/configuration/http_filters/router_filter.rst b/docs/root/configuration/http_filters/router_filter.rst index 360d269408293..0d416ac865026 100644 --- a/docs/root/configuration/http_filters/router_filter.rst +++ b/docs/root/configuration/http_filters/router_filter.rst @@ -229,10 +229,6 @@ in flight. The value of the header should be "true" or "false", and is ignored if invalid. -Hedging cannot be enabled by this header unless :ref:`allow_request_hedging -` has been enabled for the -cluster the request is destined for. - .. 
_config_http_filters_router_x-envoy-immediate-health-check-fail: x-envoy-immediate-health-check-fail diff --git a/docs/root/intro/arch_overview/http_routing.rst b/docs/root/intro/arch_overview/http_routing.rst index 6ba07f108e52b..6a191be268214 100644 --- a/docs/root/intro/arch_overview/http_routing.rst +++ b/docs/root/intro/arch_overview/http_routing.rst @@ -94,13 +94,11 @@ Request Hedging --------------- Envoy supports request hedging which can be enabled by specifying a :ref:`hedge -policy ` along with enabling -:ref:`allow_request_hedging ` on -the relevant cluster. This means that Envoy will race multiple simultaneous -upstream requests and return the response associated with the first acceptable -response headers to the downstream. The retry policy is used to determine -whether a response should be returned or whether more responses should be -awaited. +policy `. This means that Envoy will race +multiple simultaneous upstream requests and return the response associated with +the first acceptable response headers to the downstream. The retry policy is +used to determine whether a response should be returned or whether more +responses should be awaited. Currently hedging can only be performed in response to a request timeout. This means that a retry request will be issued without canceling the initial diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index 8ad43767b3a05..25f431860fbc9 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -777,11 +777,6 @@ class ClusterInfo { */ virtual absl::optional eds_service_name() const PURE; - /** - * @return whether to allow request hedging to this cluster to occur. - */ - virtual bool allowRequestHedging() const PURE; - protected: /** * Invoked by extensionProtocolOptionsTyped. 
diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 8d51edeb7e15f..d97c11f839592 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -194,14 +194,8 @@ FilterUtility::finalTimeout(const RouteEntry& route, Http::HeaderMap& request_he } FilterUtility::HedgingParams -FilterUtility::finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers, - const Upstream::ClusterInfo& cluster) { +FilterUtility::finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers) { HedgingParams hedgingParams; - if (!cluster.allowRequestHedging()) { - hedgingParams.hedge_on_per_try_timeout_ = false; - return hedgingParams; - } - hedgingParams.hedge_on_per_try_timeout_ = route.hedgePolicy().hedgeOnPerTryTimeout(); Http::HeaderEntry* hedge_on_per_try_timeout_entry = request_headers.EnvoyHedgeOnPerTryTimeout(); @@ -390,7 +384,7 @@ Http::FilterHeadersStatus Filter::decodeHeaders(Http::HeaderMap& headers, bool e return Http::FilterHeadersStatus::StopIteration; } - hedging_params_ = FilterUtility::finalHedgingParams(*route_entry_, headers, *cluster_); + hedging_params_ = FilterUtility::finalHedgingParams(*route_entry_, headers); timeout_ = FilterUtility::finalTimeout(*route_entry_, headers, !config_.suppress_envoy_headers_, grpc_request_, hedging_params_.hedge_on_per_try_timeout_); diff --git a/source/common/router/router.h b/source/common/router/router.h index 63f1a018b34ca..80232f757d2eb 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -99,11 +99,9 @@ class FilterUtility { * Determine the final hedging settings after applying randomized behavior. * @param route supplies the request route. * @param request_headers supplies the request headers. - * @param cluster supplies the cluster info the request is destined for. * @return HedgingParams the final parameters to use for request hedging. 
*/ - static HedgingParams finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers, - const Upstream::ClusterInfo& cluster); + static HedgingParams finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers); }; /** diff --git a/source/common/upstream/upstream_impl.cc b/source/common/upstream/upstream_impl.cc index 5e519359afc71..0e1cf05619b7f 100644 --- a/source/common/upstream/upstream_impl.cc +++ b/source/common/upstream/upstream_impl.cc @@ -567,8 +567,7 @@ ClusterInfoImpl::ClusterInfoImpl(const envoy::api::v2::Cluster& config, metadata_(config.metadata()), typed_metadata_(config.metadata()), common_lb_config_(config.common_lb_config()), cluster_socket_options_(parseClusterSocketOptions(config, bind_config)), - drain_connections_on_host_removal_(config.drain_connections_on_host_removal()), - allow_request_hedging_(config.allow_request_hedging()) { + drain_connections_on_host_removal_(config.drain_connections_on_host_removal()) { switch (config.lb_policy()) { case envoy::api::v2::Cluster::ROUND_ROBIN: lb_type_ = LoadBalancerType::RoundRobin; diff --git a/source/common/upstream/upstream_impl.h b/source/common/upstream/upstream_impl.h index 8520220328b95..cb10291ea233f 100644 --- a/source/common/upstream/upstream_impl.h +++ b/source/common/upstream/upstream_impl.h @@ -551,8 +551,6 @@ class ClusterInfoImpl : public ClusterInfo { absl::optional eds_service_name() const override { return eds_service_name_; } - bool allowRequestHedging() const override { return allow_request_hedging_; } - private: struct ResourceManagers { ResourceManagers(const envoy::api::v2::Cluster& config, Runtime::Loader& runtime, @@ -595,7 +593,6 @@ class ClusterInfoImpl : public ClusterInfo { const envoy::api::v2::Cluster::CommonLbConfig common_lb_config_; const Network::ConnectionSocket::OptionsSharedPtr cluster_socket_options_; const bool drain_connections_on_host_removal_; - const bool allow_request_hedging_; absl::optional eds_service_name_; }; diff 
--git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index f48f9c55fb1e6..3acdf5f0ceac2 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -216,8 +216,6 @@ class RouterTestBase : public testing::Test { callbacks_.route_->route_entry_.hedge_policy_.additional_request_chance_.set_numerator(0); callbacks_.route_->route_entry_.hedge_policy_.additional_request_chance_.set_denominator( envoy::type::FractionalPercent::HUNDRED); - EXPECT_CALL(*cm_.thread_local_cluster_.cluster_.info_, allowRequestHedging) - .WillRepeatedly(Return(true)); } Event::SimulatedTimeSystem test_time_; @@ -2840,112 +2838,68 @@ TEST_F(RouterTest, UpstreamTimingTimeout) { TEST(RouterFilterUtilityTest, FinalHedgingParamsHedgeOnPerTryTimeout) { Http::TestHeaderMapImpl empty_headers; - std::shared_ptr cluster{new Upstream::MockClusterInfo()}; - { // route says true, header not present, cluster allows it, expect true. + { // route says true, header not present, expect true. NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, empty_headers, *cluster); + FilterUtility::finalHedgingParams(route, empty_headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } - { // route says true, header not present, cluster does not allow it, expect false. 
- NiceMock route; - route.hedge_policy_.hedge_on_per_try_timeout_ = true; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(false)); - EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, empty_headers, *cluster); - EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); - } { // route says false, header not present, expect false. NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, empty_headers, *cluster); + FilterUtility::finalHedgingParams(route, empty_headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header says true, expect true. Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "true"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } - { // route says false, header says true, cluster does not allow it, expect true. 
- Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "true"}}; - NiceMock route; - route.hedge_policy_.hedge_on_per_try_timeout_ = false; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(false)); - EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, *cluster); - EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); - } { // route says false, header says false, expect false. Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "false"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says false, expect false. Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "false"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } { // route says true, header says true, expect true. 
Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "true"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } - { // route says true, header says true, cluster does not allow it, expect false. - Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "true"}}; - NiceMock route; - route.hedge_policy_.hedge_on_per_try_timeout_ = true; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(false)); - EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, *cluster); - EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); - } { // route says true, header is invalid, expect true. Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "bad"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = true; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_TRUE(hedgingParams.hedge_on_per_try_timeout_); } { // route says false, header is invalid, expect false. 
Http::TestHeaderMapImpl headers{{"x-envoy-hedge-on-per-try-timeout", "bad"}}; NiceMock route; route.hedge_policy_.hedge_on_per_try_timeout_ = false; - EXPECT_CALL(*cluster, allowRequestHedging).WillRepeatedly(Return(true)); EXPECT_CALL(route, hedgePolicy).WillRepeatedly(ReturnRef(route.hedge_policy_)); - FilterUtility::HedgingParams hedgingParams = - FilterUtility::finalHedgingParams(route, headers, *cluster); + FilterUtility::HedgingParams hedgingParams = FilterUtility::finalHedgingParams(route, headers); EXPECT_FALSE(hedgingParams.hedge_on_per_try_timeout_); } } diff --git a/test/integration/http_timeout_integration_test.h b/test/integration/http_timeout_integration_test.h index 8f4348c99bda1..230a82d2577ae 100644 --- a/test/integration/http_timeout_integration_test.h +++ b/test/integration/http_timeout_integration_test.h @@ -16,11 +16,6 @@ class HttpTimeoutIntegrationTest : public testing::TestWithParammutable_clusters()[0][0].set_allow_request_hedging( - true); - }); } void testRouterRequestAndResponseWithHedgedPerTryTimeout(uint64_t request_size, diff --git a/test/mocks/upstream/cluster_info.h b/test/mocks/upstream/cluster_info.h index 085589da712d2..dd498b8fae26c 100644 --- a/test/mocks/upstream/cluster_info.h +++ b/test/mocks/upstream/cluster_info.h @@ -85,7 +85,6 @@ class MockClusterInfo : public ClusterInfo { MOCK_CONST_METHOD0(clusterSocketOptions, const Network::ConnectionSocket::OptionsSharedPtr&()); MOCK_CONST_METHOD0(drainConnectionsOnHostRemoval, bool()); MOCK_CONST_METHOD0(eds_service_name, absl::optional()); - MOCK_CONST_METHOD0(allowRequestHedging, bool()); std::string name_{"fake_cluster"}; absl::optional eds_service_name_; From 597786054315f296cc430f4464f29b18e6e1b6bf Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 10 May 2019 09:30:10 -0400 Subject: [PATCH 56/70] fix format Signed-off-by: Michael Puncel --- source/common/router/router.cc | 4 ++-- source/common/router/router.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) 
diff --git a/source/common/router/router.cc b/source/common/router/router.cc index d97c11f839592..e3108953de146 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -193,8 +193,8 @@ FilterUtility::finalTimeout(const RouteEntry& route, Http::HeaderMap& request_he return timeout; } -FilterUtility::HedgingParams -FilterUtility::finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers) { +FilterUtility::HedgingParams FilterUtility::finalHedgingParams(const RouteEntry& route, + Http::HeaderMap& request_headers) { HedgingParams hedgingParams; hedgingParams.hedge_on_per_try_timeout_ = route.hedgePolicy().hedgeOnPerTryTimeout(); diff --git a/source/common/router/router.h b/source/common/router/router.h index 80232f757d2eb..8d038539c9e41 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -101,7 +101,8 @@ class FilterUtility { * @param request_headers supplies the request headers. * @return HedgingParams the final parameters to use for request hedging. 
*/ - static HedgingParams finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers); + static HedgingParams finalHedgingParams(const RouteEntry& route, + Http::HeaderMap& request_headers); }; /** From a9db675799368f5b5d0381bc441fe830fa2ec43e Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 10 May 2019 22:37:39 -0400 Subject: [PATCH 57/70] fix logical merge where upstream_headers_ isn't always set when headers are seen Signed-off-by: Michael Puncel --- source/common/router/router.cc | 7 ++++--- source/common/router/router.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index e65a8c3347055..6b50d04ed76e7 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -584,7 +584,7 @@ void Filter::onResponseTimeout() { for (auto& upstream_request : upstream_requests_) { // Don't record a timeout for upstream requests we've already seen headers // for. - if (!upstream_request->upstream_headers_) { + if (!upstream_request->seen_headers_) { cluster_->stats().upstream_rq_timeout_.inc(); if (upstream_request->upstream_host_) { upstream_request->upstream_host_->stats().rq_timeout_.inc(); @@ -1133,7 +1133,7 @@ void Filter::doRetry() { uint32_t Filter::numRequestsAwaitingHeaders() { uint32_t ret = 0; for (auto& upstream_request : upstream_requests_) { - if (!upstream_request->upstream_headers_) { + if (!upstream_request->seen_headers_) { ret++; } } @@ -1146,7 +1146,7 @@ Filter::UpstreamRequest::UpstreamRequest(Filter& parent, Http::ConnectionPool::I stream_info_(pool.protocol(), parent_.callbacks_->dispatcher().timeSource()), calling_encode_headers_(false), upstream_canary_(false), decode_complete_(false), encode_complete_(false), encode_trailers_(false), retried_(false), - outlier_detection_timeout_recorded_(false), + seen_headers_(false), outlier_detection_timeout_recorded_(false), create_per_try_timeout_on_request_complete_(false) { if 
(parent_.config_.start_child_span_) { @@ -1188,6 +1188,7 @@ void Filter::UpstreamRequest::decodeHeaders(Http::HeaderMapPtr&& headers, bool e upstream_timing_.onFirstUpstreamRxByteReceived(parent_.callbacks_->dispatcher().timeSource()); maybeEndDecode(end_stream); + seen_headers_ = true; if (!parent_.config_.upstream_logs_.empty()) { upstream_headers_ = std::make_unique(*headers); } diff --git a/source/common/router/router.h b/source/common/router/router.h index 1404061160f28..480c88d182a03 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -399,6 +399,7 @@ class Filter : Logger::Loggable, bool encode_complete_ : 1; bool encode_trailers_ : 1; bool retried_ : 1; + bool seen_headers_ : 1; bool outlier_detection_timeout_recorded_ : 1; // Tracks whether we deferred a per try timeout because the downstream request // had not been completed yet. From 6d8e7be0cf05e03ddb43baff29041989a40c0872 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 10 May 2019 23:12:07 -0400 Subject: [PATCH 58/70] fix case where rq_error_ would not be incremented if other requests are in flight when a request fails Signed-off-by: Michael Puncel --- source/common/router/router.cc | 1 + test/common/router/router_test.cc | 70 +++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 6b50d04ed76e7..477e648bcded8 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -903,6 +903,7 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head // flight awaiting headers or scheduled retries. If so, exit to give them a // chance to return before returning a response downstream. 
if (could_not_retry && (numRequestsAwaitingHeaders() > 0 || pending_retries_ > 0)) { + upstream_request.upstream_host_->stats().rq_error_.inc(); return; } diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 35ca7209e4e17..86170e1f4c81a 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -1674,6 +1674,76 @@ TEST_F(RouterTest, HedgedPerTryTimeoutGlobalTimeout) { // TODO: Verify hedge stats here once they are implemented. } +// Sequence: 1) per try timeout w/ hedge retry, 2) second request gets a 5xx +// response, no retries remaining 3) first request gets a 5xx response. +TEST_F(RouterTest, HedgingRetriesExhaustedBadResponse) { + enableHedgeOnPerTryTimeout(); + + NiceMock encoder1; + Http::StreamDecoder* response_decoder1 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder1 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder1, cm_.conn_pool_.host_); + return nullptr; + })); + expectPerTryTimerCreate(); + expectResponseTimerCreate(); + + Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; + HttpTestUtility::addDefaultHeaders(headers); + router_.decodeHeaders(headers, true); + + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(0); + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)).Times(0); + router_.retry_state_->expectHedgedPerTryTimeoutRetry(); + per_try_timeout_->callback_(); + + NiceMock encoder2; + Http::StreamDecoder* response_decoder2 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder2 = &decoder; + 
EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder2, cm_.conn_pool_.host_); + return nullptr; + })); + expectPerTryTimerCreate(); + router_.retry_state_->callback_(); + + EXPECT_TRUE(verifyHostUpstreamStats(0, 0)); + + // Now trigger a 503 n response to the second request. + Http::HeaderMapPtr bad_response_headers1(new Http::TestHeaderMapImpl{{":status", "503"}}); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(503)); + + EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)) + .WillOnce(Return(RetryStatus::NoRetryLimitExceeded)); + response_decoder2->decodeHeaders(std::move(bad_response_headers1), true); + + EXPECT_TRUE(verifyHostUpstreamStats(0, 1)); + + // Now trigger a 502 n response to the first request. + Http::HeaderMapPtr bad_response_headers2(new Http::TestHeaderMapImpl{{":status", "502"}}); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(502)); + + // We should not call shouldRetryHeaders() because you never retry the same + // request twice. 
+ EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).Times(0); + + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)) + .WillOnce(Invoke([&](Http::HeaderMap& headers, bool) -> void { + EXPECT_EQ(headers.Status()->value(), "502"); + })); + response_decoder1->decodeHeaders(std::move(bad_response_headers2), true); + + EXPECT_TRUE(verifyHostUpstreamStats(0, 2)); +} + TEST_F(RouterTest, RetryNoneHealthy) { NiceMock encoder1; Http::StreamDecoder* response_decoder = nullptr; From 35ce08194f3d387d2acdefdd4243fa9263c8c649 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 10 May 2019 23:35:50 -0400 Subject: [PATCH 59/70] fix format Signed-off-by: Michael Puncel --- source/common/router/router.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 477e648bcded8..8993c07035848 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -1146,8 +1146,8 @@ Filter::UpstreamRequest::UpstreamRequest(Filter& parent, Http::ConnectionPool::I : parent_(parent), conn_pool_(pool), grpc_rq_success_deferred_(false), stream_info_(pool.protocol(), parent_.callbacks_->dispatcher().timeSource()), calling_encode_headers_(false), upstream_canary_(false), decode_complete_(false), - encode_complete_(false), encode_trailers_(false), retried_(false), - seen_headers_(false), outlier_detection_timeout_recorded_(false), + encode_complete_(false), encode_trailers_(false), retried_(false), seen_headers_(false), + outlier_detection_timeout_recorded_(false), create_per_try_timeout_on_request_complete_(false) { if (parent_.config_.start_child_span_) { From ec80324cf3960dd2a6c2017a589c5911dbea3007 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 13 May 2019 12:37:52 -0400 Subject: [PATCH 60/70] fix handling of resets Resets didn't have the same handling that upstream headers had. 
In particular, a reset would have always returned early even if other requests were in flight, and we might retry the same request twice if there was first a per try timeout and then a reset. Signed-off-by: Michael Puncel --- source/common/router/router.cc | 22 +++++++--- source/common/router/router.h | 2 +- test/common/router/router_test.cc | 73 ++++++++++++++++++++++++++++++- 3 files changed, 88 insertions(+), 9 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 8993c07035848..4fd2f7f31b39c 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -584,7 +584,7 @@ void Filter::onResponseTimeout() { for (auto& upstream_request : upstream_requests_) { // Don't record a timeout for upstream requests we've already seen headers // for. - if (!upstream_request->seen_headers_) { + if (upstream_request->awaiting_headers_) { cluster_->stats().upstream_rq_timeout_.inc(); if (upstream_request->upstream_host_) { upstream_request->upstream_host_->stats().rq_timeout_.inc(); @@ -719,8 +719,10 @@ void Filter::onUpstreamAbort(Http::Code code, StreamInfo::ResponseFlag response_ bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason, UpstreamRequest& upstream_request) { - // We don't retry if we already started the response. - if (downstream_response_started_ || !retry_state_) { + // We don't retry if we already started the response, don't have a retry policy defined, + // or if we've already retried this upstream request (currently only possible if a per + // try timeout occurred and hedge_on_per_try_timeout is enabled). 
+ if (downstream_response_started_ || !retry_state_ || upstream_request.retried_) { return false; } @@ -756,6 +758,13 @@ void Filter::onUpstreamReset(Http::StreamResetReason reset_reason, const bool dropped = reset_reason == Http::StreamResetReason::Overflow; chargeUpstreamAbort(Http::Code::ServiceUnavailable, dropped, upstream_request); + // If there are other in-flight requests that might see an upstream response, + // don't return anything downstream. + if (numRequestsAwaitingHeaders() > 0 || pending_retries_ > 0) { + upstream_request.removeFromList(upstream_requests_); + return; + } + const StreamInfo::ResponseFlag response_flags = streamResetReasonToResponseFlag(reset_reason); const std::string body = absl::StrCat("upstream connect error or disconnect/reset before headers. reset reason: ", @@ -904,6 +913,7 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head // chance to return before returning a response downstream. if (could_not_retry && (numRequestsAwaitingHeaders() > 0 || pending_retries_ > 0)) { upstream_request.upstream_host_->stats().rq_error_.inc(); + upstream_request.removeFromList(upstream_requests_); return; } @@ -1134,7 +1144,7 @@ void Filter::doRetry() { uint32_t Filter::numRequestsAwaitingHeaders() { uint32_t ret = 0; for (auto& upstream_request : upstream_requests_) { - if (!upstream_request->seen_headers_) { + if (upstream_request->awaiting_headers_) { ret++; } } @@ -1146,7 +1156,7 @@ Filter::UpstreamRequest::UpstreamRequest(Filter& parent, Http::ConnectionPool::I : parent_(parent), conn_pool_(pool), grpc_rq_success_deferred_(false), stream_info_(pool.protocol(), parent_.callbacks_->dispatcher().timeSource()), calling_encode_headers_(false), upstream_canary_(false), decode_complete_(false), - encode_complete_(false), encode_trailers_(false), retried_(false), seen_headers_(false), + encode_complete_(false), encode_trailers_(false), retried_(false), awaiting_headers_(true), 
outlier_detection_timeout_recorded_(false), create_per_try_timeout_on_request_complete_(false) { @@ -1189,7 +1199,7 @@ void Filter::UpstreamRequest::decodeHeaders(Http::HeaderMapPtr&& headers, bool e upstream_timing_.onFirstUpstreamRxByteReceived(parent_.callbacks_->dispatcher().timeSource()); maybeEndDecode(end_stream); - seen_headers_ = true; + awaiting_headers_ = false; if (!parent_.config_.upstream_logs_.empty()) { upstream_headers_ = std::make_unique(*headers); } diff --git a/source/common/router/router.h b/source/common/router/router.h index 480c88d182a03..7466a99f0ce59 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -399,7 +399,7 @@ class Filter : Logger::Loggable, bool encode_complete_ : 1; bool encode_trailers_ : 1; bool retried_ : 1; - bool seen_headers_ : 1; + bool awaiting_headers_ : 1; bool outlier_detection_timeout_recorded_ : 1; // Tracks whether we deferred a per try timeout because the downstream request // had not been completed yet. diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 86170e1f4c81a..ee68b634e5620 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -1717,7 +1717,7 @@ TEST_F(RouterTest, HedgingRetriesExhaustedBadResponse) { EXPECT_TRUE(verifyHostUpstreamStats(0, 0)); - // Now trigger a 503 n response to the second request. + // Now trigger a 503 in response to the second request. Http::HeaderMapPtr bad_response_headers1(new Http::TestHeaderMapImpl{{":status", "503"}}); EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(503)); @@ -1727,7 +1727,7 @@ TEST_F(RouterTest, HedgingRetriesExhaustedBadResponse) { EXPECT_TRUE(verifyHostUpstreamStats(0, 1)); - // Now trigger a 502 n response to the first request. + // Now trigger a 502 in response to the first request. 
Http::HeaderMapPtr bad_response_headers2(new Http::TestHeaderMapImpl{{":status", "502"}}); EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(502)); @@ -1744,6 +1744,75 @@ TEST_F(RouterTest, HedgingRetriesExhaustedBadResponse) { EXPECT_TRUE(verifyHostUpstreamStats(0, 2)); } +// Sequence: 1) per try timeout w/ hedge retry, 2) first request gets reset by upstream, +// 3) 2nd request gets a 200 which should be sent downstream. +TEST_F(RouterTest, HedgingRetriesProceedAfterReset) { + enableHedgeOnPerTryTimeout(); + + NiceMock encoder1; + Http::StreamDecoder* response_decoder1 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder1 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder1, cm_.conn_pool_.host_); + return nullptr; + })); + expectPerTryTimerCreate(); + expectResponseTimerCreate(); + + Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; + HttpTestUtility::addDefaultHeaders(headers); + router_.decodeHeaders(headers, true); + + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); + EXPECT_CALL(encoder1.stream_, resetStream(_)).Times(0); + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)).Times(0); + router_.retry_state_->expectHedgedPerTryTimeoutRetry(); + per_try_timeout_->callback_(); + + NiceMock encoder2; + Http::StreamDecoder* response_decoder2 = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder2 = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder2, cm_.conn_pool_.host_); + return nullptr; + })); + expectPerTryTimerCreate(); + router_.retry_state_->callback_(); + + 
EXPECT_TRUE(verifyHostUpstreamStats(0, 0)); + + // Now trigger an upstream reset in response to the first request. + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(503)); + EXPECT_CALL(encoder1.stream_, resetStream(_)); + encoder1.stream_.resetStream(Http::StreamResetReason::RemoteReset); + + EXPECT_TRUE(verifyHostUpstreamStats(0, 1)); + + // We should not call shouldRetryReset() because you never retry the same + // request twice. + EXPECT_CALL(*router_.retry_state_, shouldRetryReset(_, _)).Times(0); + + // Now trigger a 200 in response to the second request. + Http::HeaderMapPtr response_headers(new Http::TestHeaderMapImpl{{":status", "200"}}); + + EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)) + .WillOnce(Return(RetryStatus::No)); + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)) + .WillOnce(Invoke([&](Http::HeaderMap& headers, bool) -> void { + EXPECT_EQ(headers.Status()->value(), "200"); + })); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(200)); + response_decoder2->decodeHeaders(std::move(response_headers), true); + + EXPECT_TRUE(verifyHostUpstreamStats(1, 1)); +} + TEST_F(RouterTest, RetryNoneHealthy) { NiceMock encoder1; Http::StreamDecoder* response_decoder = nullptr; From f2c645c4cc00958597334172ae7ada1461b73698 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 13 May 2019 13:03:52 -0400 Subject: [PATCH 61/70] fix pending_retries counting Signed-off-by: Michael Puncel --- source/common/router/router.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 4fd2f7f31b39c..3abce68a3767e 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -913,7 +913,7 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head // chance to return before returning a response downstream. 
if (could_not_retry && (numRequestsAwaitingHeaders() > 0 || pending_retries_ > 0)) { upstream_request.upstream_host_->stats().rq_error_.inc(); - upstream_request.removeFromList(upstream_requests_); + upstream_request.removeFromList(upstream_requests_); return; } @@ -1056,7 +1056,6 @@ void Filter::onUpstreamComplete(UpstreamRequest& upstream_request) { } bool Filter::setupRetry() { - pending_retries_++; // If we responded before the request was complete we don't bother doing a retry. This may not // catch certain cases where we are in full streaming mode and we have a connect timeout or an // overflow of some kind. However, in many cases deployments will use the buffer filter before @@ -1065,6 +1064,7 @@ bool Filter::setupRetry() { if (!downstream_end_stream_) { return false; } + pending_retries_++; ENVOY_STREAM_LOG(debug, "performing retry", *callbacks_); @@ -1287,6 +1287,7 @@ void Filter::UpstreamRequest::encodeTrailers(const Http::HeaderMap& trailers) { void Filter::UpstreamRequest::onResetStream(Http::StreamResetReason reason, absl::string_view transport_failure_reason) { clearRequestEncoder(); + awaiting_headers_ = false; if (!calling_encode_headers_) { stream_info_.setResponseFlag(parent_.streamResetReasonToResponseFlag(reason)); parent_.onUpstreamReset(reason, transport_failure_reason, *this); From 67b7fa99a74f1b1fe8288a81bdf5cc2dea6c1997 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Mon, 13 May 2019 13:06:34 -0400 Subject: [PATCH 62/70] fix format Signed-off-by: Michael Puncel --- test/common/router/router_test.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index ee68b634e5620..676728cfb3d49 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -1801,8 +1801,7 @@ TEST_F(RouterTest, HedgingRetriesProceedAfterReset) { // Now trigger a 200 in response to the second request. 
Http::HeaderMapPtr response_headers(new Http::TestHeaderMapImpl{{":status", "200"}}); - EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)) - .WillOnce(Return(RetryStatus::No)); + EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).WillOnce(Return(RetryStatus::No)); EXPECT_CALL(callbacks_, encodeHeaders_(_, _)) .WillOnce(Invoke([&](Http::HeaderMap& headers, bool) -> void { EXPECT_EQ(headers.Status()->value(), "200"); From fcd8ba719370755a74482f0cc8b7e28c136a4760 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 14 May 2019 13:30:32 -0400 Subject: [PATCH 63/70] partial PR feedback Signed-off-by: Michael Puncel --- source/common/router/router.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 3abce68a3767e..784d6b1067066 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -195,22 +195,22 @@ FilterUtility::finalTimeout(const RouteEntry& route, Http::HeaderMap& request_he FilterUtility::HedgingParams FilterUtility::finalHedgingParams(const RouteEntry& route, Http::HeaderMap& request_headers) { - HedgingParams hedgingParams; - hedgingParams.hedge_on_per_try_timeout_ = route.hedgePolicy().hedgeOnPerTryTimeout(); + HedgingParams hedging_params; + hedging_params.hedge_on_per_try_timeout_ = route.hedgePolicy().hedgeOnPerTryTimeout(); Http::HeaderEntry* hedge_on_per_try_timeout_entry = request_headers.EnvoyHedgeOnPerTryTimeout(); if (hedge_on_per_try_timeout_entry) { if (hedge_on_per_try_timeout_entry->value() == "true") { - hedgingParams.hedge_on_per_try_timeout_ = true; + hedging_params.hedge_on_per_try_timeout_ = true; } if (hedge_on_per_try_timeout_entry->value() == "false") { - hedgingParams.hedge_on_per_try_timeout_ = false; + hedging_params.hedge_on_per_try_timeout_ = false; } request_headers.removeEnvoyHedgeOnPerTryTimeout(); } - return hedgingParams; + return hedging_params; } Filter::~Filter() { @@ -495,6 
+495,12 @@ Http::FilterDataStatus Filter::decodeData(Buffer::Instance& data, bool end_strea Http::FilterTrailersStatus Filter::decodeTrailers(Http::HeaderMap& trailers) { ENVOY_STREAM_LOG(debug, "router decoding trailers:\n{}", *callbacks_, trailers); + + // upstream_requests_.size() cannot be 0 because we add to it unconditionally + // in decodeHeaders(). It cannot be > 1 because that only happens when a per + // try timeout occurs with hedge_on_per_try_timeout enabled but the the per + // try timeout timer is not started until onUpstreamComplete(). + ASSERT(upstream_requests_.size() == 1); downstream_trailers_ = &trailers; for (auto& upstream_request : upstream_requests_) { upstream_request->encodeTrailers(trailers); From 3dafb66ec0ff73e61d97a4008f06433335b42847 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 14 May 2019 13:31:03 -0400 Subject: [PATCH 64/70] fix bug in doRetry() checking for an immediate reset that could result in calling encodeData on the wrong upstream_request Signed-off-by: Michael Puncel --- source/common/router/router.cc | 7 +++- test/common/router/router_test.cc | 68 +++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 784d6b1067066..77156f4172322 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -1131,10 +1131,13 @@ void Filter::doRetry() { ASSERT(response_timeout_ || timeout_.global_timeout_.count() == 0); UpstreamRequestPtr upstream_request = std::make_unique(*this, *conn_pool); + UpstreamRequest* upstream_request_tmp = upstream_request.get(); upstream_request->moveIntoList(std::move(upstream_request), upstream_requests_); upstream_requests_.front()->encodeHeaders(!callbacks_->decodingBuffer() && !downstream_trailers_); - // It's possible we got immediately reset. 
- if (upstream_requests_.front()) { + // It's possible we got immediately reset which means the upstream request we just + // added to the front of the list might have been removed, so we need to check to make + // sure we don't encodeData on the wrong request. + if (!upstream_requests_.empty() && (upstream_requests_.front().get() == upstream_request_tmp)) { if (callbacks_->decodingBuffer()) { // If we are doing a retry we need to make a copy. Buffer::OwnedImpl copy(*callbacks_->decodingBuffer()); diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index 676728cfb3d49..ca01416e64f0f 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -1812,6 +1812,74 @@ TEST_F(RouterTest, HedgingRetriesProceedAfterReset) { EXPECT_TRUE(verifyHostUpstreamStats(1, 1)); } +// Sequence: 1) request with data hits per try timeout w/ hedge retry, 2) +// second request is immediately reset 3) 1st request gets a 200. +// The goal of this test is to ensure that the router can properly detect that an immediate +// reset happens and that we don't accidentally write data twice on the first request. 
+TEST_F(RouterTest, HedgingRetryImmediatelyReset) { + enableHedgeOnPerTryTimeout(); + + NiceMock encoder; + Http::StreamDecoder* response_decoder = nullptr; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder& decoder, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + response_decoder = &decoder; + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + callbacks.onPoolReady(encoder, cm_.conn_pool_.host_); + return nullptr; + })); + + Http::TestHeaderMapImpl headers{{"x-envoy-upstream-rq-per-try-timeout-ms", "5"}}; + HttpTestUtility::addDefaultHeaders(headers); + router_.decodeHeaders(headers, false); + + expectPerTryTimerCreate(); + expectResponseTimerCreate(); + Buffer::OwnedImpl body("test body"); + EXPECT_CALL(encoder, encodeData(_, _)).Times(1); + Buffer::InstancePtr body_data(new Buffer::OwnedImpl("hello")); + router_.retry_state_->expectHedgedPerTryTimeoutRetry(); + EXPECT_EQ(Http::FilterDataStatus::StopIterationNoBuffer, router_.decodeData(*body_data, true)); + + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(504)); + EXPECT_CALL(encoder.stream_, resetStream(_)).Times(0); + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)).Times(0); + per_try_timeout_->callback_(); + + NiceMock encoder2; + EXPECT_CALL(cm_.conn_pool_, newStream(_, _)) + .WillOnce(Invoke([&](Http::StreamDecoder&, Http::ConnectionPool::Callbacks& callbacks) + -> Http::ConnectionPool::Cancellable* { + EXPECT_CALL(*router_.retry_state_, onHostAttempted(_)); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(503)); + callbacks.onPoolFailure(Http::ConnectionPool::PoolFailureReason::ConnectionFailure, + absl::string_view(), cm_.conn_pool_.host_); + return nullptr; + })); + EXPECT_CALL(*router_.retry_state_, shouldRetryReset(_, _)) + .WillOnce(Return(RetryStatus::NoRetryLimitExceeded)); + ON_CALL(callbacks_, decodingBuffer()).WillByDefault(Return(body_data.get())); + 
router_.retry_state_->callback_(); + + EXPECT_TRUE(verifyHostUpstreamStats(0, 1)); + + // Now trigger a 200 in response to the first request. + Http::HeaderMapPtr response_headers(new Http::TestHeaderMapImpl{{":status", "200"}}); + + // The request was already retried when the per try timeout occured so it + // should't even consult the retry state. + EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).Times(0); + EXPECT_CALL(callbacks_, encodeHeaders_(_, _)) + .WillOnce(Invoke([&](Http::HeaderMap& headers, bool) -> void { + EXPECT_EQ(headers.Status()->value(), "200"); + })); + EXPECT_CALL(cm_.conn_pool_.host_->outlier_detector_, putHttpResponseCode(200)); + response_decoder->decodeHeaders(std::move(response_headers), true); + + EXPECT_TRUE(verifyHostUpstreamStats(1, 1)); +} + TEST_F(RouterTest, RetryNoneHealthy) { NiceMock encoder1; Http::StreamDecoder* response_decoder = nullptr; From 3d0f2a911f9cf961e014520445bb24dd9f498c5d Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Tue, 14 May 2019 16:00:55 -0400 Subject: [PATCH 65/70] fix spelling Signed-off-by: Michael Puncel --- test/common/router/router_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/common/router/router_test.cc b/test/common/router/router_test.cc index ca01416e64f0f..c2df55a261127 100644 --- a/test/common/router/router_test.cc +++ b/test/common/router/router_test.cc @@ -1867,7 +1867,7 @@ TEST_F(RouterTest, HedgingRetryImmediatelyReset) { // Now trigger a 200 in response to the first request. Http::HeaderMapPtr response_headers(new Http::TestHeaderMapImpl{{":status", "200"}}); - // The request was already retried when the per try timeout occured so it + // The request was already retried when the per try timeout occurred so it // should't even consult the retry state. 
EXPECT_CALL(*router_.retry_state_, shouldRetryHeaders(_, _)).Times(0); EXPECT_CALL(callbacks_, encodeHeaders_(_, _)) From 5b92d57f623d70f89943405bcdc44ebf1718ccd3 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Wed, 22 May 2019 18:00:21 -0400 Subject: [PATCH 66/70] address PR feedback Signed-off-by: Michael Puncel --- source/common/router/router.cc | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 93153ea9454f9..af6553befc390 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -1155,14 +1155,8 @@ void Filter::doRetry() { } uint32_t Filter::numRequestsAwaitingHeaders() { - uint32_t ret = 0; - for (auto& upstream_request : upstream_requests_) { - if (upstream_request->awaiting_headers_) { - ret++; - } - } - - return ret; + return std::count_if(upstream_requests_.begin(), upstream_requests_.end(), + [](const auto& req) -> bool { return req.get()->awaiting_headers_; }); } Filter::UpstreamRequest::UpstreamRequest(Filter& parent, Http::ConnectionPool::Instance& pool) @@ -1345,8 +1339,6 @@ void Filter::UpstreamRequest::onPerTryTimeout() { if (!parent_.downstream_response_started_) { ENVOY_STREAM_LOG(debug, "upstream per try timeout", *parent_.callbacks_); - // Set response flag to UT for now, but it might be overwritten if a - // response arrives later and hedge_on_per_try_timeout_ is set stream_info_.setResponseFlag(StreamInfo::ResponseFlag::UpstreamRequestTimeout); parent_.onPerTryTimeout(*this); } else { @@ -1459,8 +1451,8 @@ void Filter::UpstreamRequest::DownstreamWatermarkManager::onAboveWriteBufferHigh // There are two states we should get this callback in: 1) the watermark was // hit due to writes from a different filter instance over a shared // downstream connection, or 2) the watermark was hit due to THIS filter - // instance due to writing back the "winning" upstream request. 
In either - // case we can disable reads from upstream. + // instance writing back the "winning" upstream request. In either case we + // can disable reads from upstream. ASSERT(!parent_.parent_.final_upstream_request_ || &parent_ == parent_.parent_.final_upstream_request_); From 80cf6a1b962ad5215a8abf2a12d1032459e68eb3 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Thu, 23 May 2019 10:10:56 -0400 Subject: [PATCH 67/70] Clean up unnecessary final_upstream_request_ checks. Additionally moves the handling of in-flight upstream requests out of cleanup() and into the various applicable places of the request lifecycle. Hopefully this makes the lifecycle clear instead of having a cleanup() function that handles a varying number of states. Signed-off-by: Michael Puncel --- source/common/router/router.cc | 106 ++++++++++++++++++++------------- source/common/router/router.h | 2 + 2 files changed, 68 insertions(+), 40 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index af6553befc390..59efee39382de 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -522,17 +522,10 @@ void Filter::setDecoderFilterCallbacks(Http::StreamDecoderFilterCallbacks& callb } void Filter::cleanup() { - while (!upstream_requests_.empty()) { - UpstreamRequestPtr upstream_request = - upstream_requests_.back()->removeFromList(upstream_requests_); - if (upstream_request.get() == final_upstream_request_) { - callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); - } - if ((upstream_request.get() != final_upstream_request_) || - !attempting_internal_redirect_with_complete_stream_) { - upstream_request->resetStream(); // Idempotent. - } - } + // All callers of cleanup() should have cleaned out the upstream_requests_ + // list as appropriate. 
+ ASSERT(upstream_requests_.size() == 0); + retry_state_.reset(); if (response_timeout_) { response_timeout_->disableTimer(); @@ -585,13 +578,26 @@ void Filter::onRequestComplete() { } } -void Filter::onDestroy() { cleanup(); } +void Filter::onDestroy() { + // Reset any in-flight upstream requests. + resetAll(); + cleanup(); +} void Filter::onResponseTimeout() { ENVOY_STREAM_LOG(debug, "upstream timeout", *callbacks_); + // If we had an upstream request that got a "good" response, save its + // upstream timing information into the downstream stream info. + if (final_upstream_request_) { + callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); + } + // Reset any upstream requests that are still in flight. - for (auto& upstream_request : upstream_requests_) { + while (!upstream_requests_.empty()) { + UpstreamRequestPtr upstream_request = + upstream_requests_.back()->removeFromList(upstream_requests_); + // Don't record a timeout for upstream requests we've already seen headers // for. if (upstream_request->awaiting_headers_) { @@ -664,6 +670,9 @@ void Filter::onPerTryTimeout(UpstreamRequest& upstream_request) { } chargeUpstreamAbort(timeout_response_code_, false, upstream_request); + + // Remove this upstream request from the list now that we're done with it. 
+ upstream_request.removeFromList(upstream_requests_); onUpstreamTimeoutAbort(StreamInfo::ResponseFlag::UpstreamRequestTimeout, StreamInfo::ResponseCodeDetails::get().UpstreamPerTryTimeout); } @@ -742,7 +751,7 @@ bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason, if (upstream_request.upstream_host_) { upstream_request.upstream_host_->stats().rq_error_.inc(); } - upstream_request.removeFromList(upstream_requests_); + upstream_request.removeFromList(upstream_requests_); return true; } else if (retry_status == RetryStatus::NoOverflow) { callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); @@ -767,11 +776,11 @@ void Filter::onUpstreamReset(Http::StreamResetReason reset_reason, const bool dropped = reset_reason == Http::StreamResetReason::Overflow; chargeUpstreamAbort(Http::Code::ServiceUnavailable, dropped, upstream_request); + upstream_request.removeFromList(upstream_requests_); // If there are other in-flight requests that might see an upstream response, // don't return anything downstream. if (numRequestsAwaitingHeaders() > 0 || pending_retries_ > 0) { - upstream_request.removeFromList(upstream_requests_); return; } @@ -836,30 +845,43 @@ void Filter::onUpstream100ContinueHeaders(Http::HeaderMapPtr&& headers, chargeUpstreamCode(100, *headers, upstream_request.upstream_host_, false); ENVOY_STREAM_LOG(debug, "upstream 100 continue", *callbacks_); - if (!downstream_response_started_) { - downstream_response_started_ = true; - final_upstream_request_ = &upstream_request; - resetOtherUpstreams(upstream_request); - } + downstream_response_started_ = true; + final_upstream_request_ = &upstream_request; + resetOtherUpstreams(upstream_request); + // Don't send retries after 100-Continue has been sent on. 
Arguably we could attempt to do a // retry, assume the next upstream would also send an 100-Continue and swallow the second one // but it's sketchy (as the subsequent upstream might not send a 100-Continue) and not worth // the complexity until someone asks for it. retry_state_.reset(); - if (final_upstream_request_ == &upstream_request) { - callbacks_->encode100ContinueHeaders(std::move(headers)); + callbacks_->encode100ContinueHeaders(std::move(headers)); +} + +void Filter::resetAll() { + while (!upstream_requests_.empty()) { + upstream_requests_.back()->removeFromList(upstream_requests_)->resetStream(); } } void Filter::resetOtherUpstreams(UpstreamRequest& upstream_request) { - for (auto& upstream_request_tmp : upstream_requests_) { + // Pop each upstream request on the list and reset it if it's not the one + // provided. At the end we'll move it back into the list. + UpstreamRequestPtr final_upstream_request; + while (!upstream_requests_.empty()) { + UpstreamRequestPtr upstream_request_tmp = + upstream_requests_.back()->removeFromList(upstream_requests_); if (upstream_request_tmp.get() != &upstream_request) { upstream_request_tmp->resetStream(); // TODO: per-host stat for hedge abandoned. // TODO: cluster stat for hedge abandoned. + } else { + final_upstream_request = std::move(upstream_request_tmp); } } + + // Now put the final request back on thie list. + final_upstream_request->moveIntoList(std::move(final_upstream_request), upstream_requests_); } void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& headers, @@ -963,24 +985,24 @@ void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& head // provide finalizeResponseHeaders functions on the Router::Config and VirtualHost interfaces. 
route_entry_->finalizeResponseHeaders(*headers, callbacks_->streamInfo()); - if (!downstream_response_started_) { - downstream_response_started_ = true; - final_upstream_request_ = &upstream_request; - resetOtherUpstreams(upstream_request); - } + downstream_response_started_ = true; + final_upstream_request_ = &upstream_request; + resetOtherUpstreams(upstream_request); if (end_stream) { onUpstreamComplete(upstream_request); } - if (final_upstream_request_ == &upstream_request) { - callbacks_->streamInfo().setResponseCodeDetails( - StreamInfo::ResponseCodeDetails::get().ViaUpstream); - callbacks_->encodeHeaders(std::move(headers), end_stream); - } + callbacks_->streamInfo().setResponseCodeDetails( + StreamInfo::ResponseCodeDetails::get().ViaUpstream); + callbacks_->encodeHeaders(std::move(headers), end_stream); } void Filter::onUpstreamData(Buffer::Instance& data, UpstreamRequest& upstream_request, bool end_stream) { + // This should be true because when we saw headers we either reset the stream + // (hence wouldn't have made it to onUpstreamData) or all other in-flight + // streams. + ASSERT(upstream_requests_.size() == 1); if (end_stream) { // gRPC request termination without trailers is an error. if (upstream_request.grpc_rq_success_deferred_) { @@ -989,12 +1011,15 @@ void Filter::onUpstreamData(Buffer::Instance& data, UpstreamRequest& upstream_re onUpstreamComplete(upstream_request); } - if (final_upstream_request_ == &upstream_request) { - callbacks_->encodeData(data, end_stream); - } + callbacks_->encodeData(data, end_stream); } void Filter::onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest& upstream_request) { + // This should be true because when we saw headers we either reset the stream + // (hence wouldn't have made it to onUpstreamTrailers) or all other in-flight + // streams. 
+ ASSERT(upstream_requests_.size() == 1); + if (upstream_request.grpc_rq_success_deferred_) { absl::optional grpc_status = Grpc::Common::getGrpcStatus(*trailers); if (grpc_status && @@ -1004,10 +1029,10 @@ void Filter::onUpstreamTrailers(Http::HeaderMapPtr&& trailers, UpstreamRequest& upstream_request.upstream_host_->stats().rq_error_.inc(); } } + onUpstreamComplete(upstream_request); - if (final_upstream_request_ == &upstream_request) { - callbacks_->encodeTrailers(std::move(trailers)); - } + + callbacks_->encodeTrailers(std::move(trailers)); } void Filter::onUpstreamMetadata(Http::MetadataMapPtr&& metadata_map) { @@ -1018,6 +1043,7 @@ void Filter::onUpstreamComplete(UpstreamRequest& upstream_request) { if (!downstream_end_stream_) { upstream_request.resetStream(); } + callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); if (config_.emit_dynamic_stats_ && !callbacks_->streamInfo().healthCheck() && DateUtil::timePointValid(downstream_request_complete_time_)) { @@ -1062,6 +1088,7 @@ void Filter::onUpstreamComplete(UpstreamRequest& upstream_request) { } } + upstream_request.removeFromList(upstream_requests_); cleanup(); } @@ -1352,7 +1379,6 @@ void Filter::UpstreamRequest::onPoolFailure(Http::ConnectionPool::PoolFailureRea absl::string_view transport_failure_reason, Upstream::HostDescriptionConstSharedPtr host) { Http::StreamResetReason reset_reason = Http::StreamResetReason::ConnectionFailure; - conn_pool_stream_handle_ = nullptr; switch (reason) { case Http::ConnectionPool::PoolFailureReason::Overflow: reset_reason = Http::StreamResetReason::Overflow; diff --git a/source/common/router/router.h b/source/common/router/router.h index 504dc7092a896..cf3e181428eff 100644 --- a/source/common/router/router.h +++ b/source/common/router/router.h @@ -458,6 +458,8 @@ class Filter : Logger::Loggable, void onUpstreamComplete(UpstreamRequest& upstream_request); void onUpstreamReset(Http::StreamResetReason reset_reason, absl::string_view 
transport_failure, UpstreamRequest& upstream_request); + // Reset all in-flight upstream requests. + void resetAll(); // Reset all in-flight upstream requests that do NOT match the passed argument. This is used // if a "good" response comes back and we return downstream, so there is no point in waiting // for the remaining upstream requests to return. From d364c47363a0783adcdc52680100ce53ba9cfe44 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Thu, 23 May 2019 16:18:18 -0400 Subject: [PATCH 68/70] fix format and missing null guard Signed-off-by: Michael Puncel --- source/common/router/router.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index 053e64dc5501c..d905198634e19 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -593,7 +593,7 @@ void Filter::onResponseTimeout() { // If we had an upstream request that got a "good" response, save its // upstream timing information into the downstream stream info. if (final_upstream_request_) { - callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); + callbacks_->streamInfo().setUpstreamTiming(final_upstream_request_->upstream_timing_); } // Reset any upstream requests that are still in flight. @@ -754,7 +754,7 @@ bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason, if (upstream_request.upstream_host_) { upstream_request.upstream_host_->stats().rq_error_.inc(); } - upstream_request.removeFromList(upstream_requests_); + upstream_request.removeFromList(upstream_requests_); return true; } else if (retry_status == RetryStatus::NoOverflow) { callbacks_->streamInfo().setResponseFlag(StreamInfo::ResponseFlag::UpstreamOverflow); @@ -884,7 +884,9 @@ void Filter::resetOtherUpstreams(UpstreamRequest& upstream_request) { } // Now put the final request back on thie list. 
- final_upstream_request->moveIntoList(std::move(final_upstream_request), upstream_requests_); + if (final_upstream_request) { + final_upstream_request->moveIntoList(std::move(final_upstream_request), upstream_requests_); + } } void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& headers, From 1847f331ade5b7cdfff814e395b58a580e7e02f4 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 24 May 2019 10:22:36 -0400 Subject: [PATCH 69/70] fix redirect Signed-off-by: Michael Puncel --- source/common/router/router.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index d905198634e19..f3d229d356517 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -883,10 +883,9 @@ void Filter::resetOtherUpstreams(UpstreamRequest& upstream_request) { } } + ASSERT(final_upstream_request); // Now put the final request back on thie list. - if (final_upstream_request) { - final_upstream_request->moveIntoList(std::move(final_upstream_request), upstream_requests_); - } + final_upstream_request->moveIntoList(std::move(final_upstream_request), upstream_requests_); } void Filter::onUpstreamHeaders(uint64_t response_code, Http::HeaderMapPtr&& headers, @@ -1136,8 +1135,6 @@ bool Filter::setupRedirect(const Http::HeaderMap& headers, UpstreamRequest& upst convertRequestHeadersForInternalRedirect(*downstream_headers_, *location, *callbacks_->connection()) && callbacks_->recreateStream()) { - final_upstream_request_ = &upstream_request; - resetOtherUpstreams(upstream_request); cluster_->stats().upstream_internal_redirect_succeeded_total_.inc(); return true; } From 597719712ffcdb758a66651bd2e1e78191463693 Mon Sep 17 00:00:00 2001 From: Michael Puncel Date: Fri, 24 May 2019 10:27:49 -0400 Subject: [PATCH 70/70] fix spelling/clang tidy errors Signed-off-by: Michael Puncel --- source/common/router/router.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/source/common/router/router.cc b/source/common/router/router.cc index f3d229d356517..e3b0da51f971a 100644 --- a/source/common/router/router.cc +++ b/source/common/router/router.cc @@ -527,7 +527,7 @@ void Filter::setDecoderFilterCallbacks(Http::StreamDecoderFilterCallbacks& callb void Filter::cleanup() { // All callers of cleanup() should have cleaned out the upstream_requests_ // list as appropriate. - ASSERT(upstream_requests_.size() == 0); + ASSERT(upstream_requests_.empty()); retry_state_.reset(); if (response_timeout_) { @@ -884,7 +884,7 @@ void Filter::resetOtherUpstreams(UpstreamRequest& upstream_request) { } ASSERT(final_upstream_request); - // Now put the final request back on thie list. + // Now put the final request back on this list. final_upstream_request->moveIntoList(std::move(final_upstream_request), upstream_requests_); } @@ -1185,7 +1185,7 @@ void Filter::doRetry() { uint32_t Filter::numRequestsAwaitingHeaders() { return std::count_if(upstream_requests_.begin(), upstream_requests_.end(), - [](const auto& req) -> bool { return req.get()->awaiting_headers_; }); + [](const auto& req) -> bool { return req->awaiting_headers_; }); } Filter::UpstreamRequest::UpstreamRequest(Filter& parent, Http::ConnectionPool::Instance& pool)