From c926f9830ac3407544e1cbdd0ec2ace5a87e8544 Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Wed, 26 Oct 2022 15:50:15 +0200 Subject: [PATCH 1/7] WIP: retries for subgraph queries --- .../src/plugins/traffic_shaping/mod.rs | 16 ++++++- .../src/plugins/traffic_shaping/retry.rs | 44 +++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 apollo-router/src/plugins/traffic_shaping/retry.rs diff --git a/apollo-router/src/plugins/traffic_shaping/mod.rs b/apollo-router/src/plugins/traffic_shaping/mod.rs index d1b21c5ef2..43c5924053 100644 --- a/apollo-router/src/plugins/traffic_shaping/mod.rs +++ b/apollo-router/src/plugins/traffic_shaping/mod.rs @@ -9,6 +9,7 @@ pub(crate) mod deduplication; mod rate; +mod retry; mod timeout; use std::collections::HashMap; @@ -27,6 +28,7 @@ use tower::ServiceExt; use self::rate::RateLimitLayer; pub(crate) use self::rate::RateLimited; +use self::retry::RetryPolicy; pub(crate) use self::timeout::Elapsed; use self::timeout::TimeoutLayer; use crate::error::ConfigurationError; @@ -57,6 +59,7 @@ struct Shaping { #[schemars(with = "String", default)] /// Enable timeout for incoming requests timeout: Option, + // retri } impl Merge for Shaping { @@ -136,6 +139,7 @@ pub(crate) struct TrafficShaping { config: Config, rate_limit_router: Option, rate_limit_subgraphs: Mutex>, + retry_policy: Option, } #[async_trait::async_trait] @@ -170,6 +174,7 @@ impl Plugin for TrafficShaping { config: init.config, rate_limit_router, rate_limit_subgraphs: Mutex::new(HashMap::new()), + retry_policy: Some(RetryPolicy::default()), }) } @@ -204,13 +209,20 @@ impl Plugin for TrafficShaping { }) .clone() }); + /*let retry = self + .retry_policy + .as_ref() + .map(|policy| tower::retry::RetryLayer::new(policy.clone()));*/ + let retry = tower::retry::RetryLayer::new(self.policy.clone().unwrap()); ServiceBuilder::new() - .layer(TimeoutLayer::new( + /*.layer(TimeoutLayer::new( config .timeout .unwrap_or(DEFAULT_TIMEOUT), )) - .option_layer(rate_limit) + .option_layer(rate_limit)*/ + //.option_layer(retry) + .layer(retry) .service(service) .map_request(move |mut req: SubgraphRequest| { if let Some(compression) = config.compression { diff --git a/apollo-router/src/plugins/traffic_shaping/retry.rs b/apollo-router/src/plugins/traffic_shaping/retry.rs new file mode 100644 index 0000000000..5dbdd3b82d --- /dev/null +++ b/apollo-router/src/plugins/traffic_shaping/retry.rs @@ -0,0 +1,44 @@ +use std::sync::Arc; +use std::{future, time::Duration}; + +use tower::retry::{budget::Budget, Policy}; + +#[derive(Clone, Default)] +pub(crate) struct RetryPolicy { + budget: Arc, +} + +impl RetryPolicy { + pub fn new(duration: Duration, min_per_sec: u32, retry_percent: f32) -> Self { + Self { + budget: Arc::new(Budget::new(duration, min_per_sec, retry_percent)), + } + } +} + +impl Policy for RetryPolicy { + type Future = future::Ready; + + fn retry(&self, req: &Req, result: Result<&Res, &E>) -> Option { + match result { + Ok(_) => { + // Treat all `Response`s as success, + // so deposit budget and don't retry... + self.budget.deposit(); + None + } + Err(e) => { + let withdrew = self.budget.withdraw(); + if withdrew.is_err() { + return None; + } + + Some(future::ready(self.clone())) + } + } + } + + fn clone_request(&self, req: &Req) -> Option { + Some(req.clone()) + } +} From 43290e0e79039ac7e2d9cdfe0dc63b84f9cf8a41 Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Mon, 7 Nov 2022 16:29:49 +0100 Subject: [PATCH 2/7] wip --- apollo-router/src/plugins/traffic_shaping/mod.rs | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/apollo-router/src/plugins/traffic_shaping/mod.rs b/apollo-router/src/plugins/traffic_shaping/mod.rs index 729ea6ba09..a4bd08f49e 100644 --- a/apollo-router/src/plugins/traffic_shaping/mod.rs +++ b/apollo-router/src/plugins/traffic_shaping/mod.rs @@ -289,23 +289,13 @@ impl TrafficShaping { }) .clone() }); - /*let retry = self - .retry_policy - .as_ref() - .map(|policy| tower::retry::RetryLayer::new(policy.clone()));*/ - /*let retry = tower::retry::RetryLayer::new(self.policy.clone().unwrap()); - ServiceBuilder::new()*/ - /*.layer(TimeoutLayer::new( - config - .timeout - .unwrap_or(DEFAULT_TIMEOUT), - )) - .option_layer(rate_limit)*/ - //.option_layer(retry) + let retry = tower::retry::RetryLayer::new(self.retry_policy.clone().unwrap()); + Either::A(ServiceBuilder::new() .option_layer(config.deduplicate_query.unwrap_or_default().then( QueryDeduplicationLayer::default )) + //.option_layer(Some(retry)) .layer(TimeoutLayer::new( config .timeout From fabcfd785e240b68d9bbc30d3ff3494aba3343f0 Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Mon, 7 Nov 2022 17:56:54 +0100 Subject: [PATCH 3/7] integrate the layer in traffic shaping retries require a clonable request so it is now implemented on subgraph::Request --- .../src/plugins/traffic_shaping/mod.rs | 30 +++++++------------ .../src/plugins/traffic_shaping/retry.rs | 2 +- apollo-router/src/services/subgraph.rs | 29 ++++++++++++++++++ 3 files changed, 41 insertions(+), 20 deletions(-) diff --git a/apollo-router/src/plugins/traffic_shaping/mod.rs b/apollo-router/src/plugins/traffic_shaping/mod.rs index a4bd08f49e..7a8ab0e72b 100644 --- a/apollo-router/src/plugins/traffic_shaping/mod.rs +++ b/apollo-router/src/plugins/traffic_shaping/mod.rs @@ -14,15 +14,16 @@ mod timeout; use std::collections::HashMap; use std::num::NonZeroU64; -use std::pin::Pin; use std::sync::Mutex; use std::time::Duration; +use futures::future::BoxFuture; use http::header::ACCEPT_ENCODING; use http::header::CONTENT_ENCODING; use http::HeaderValue; use schemars::JsonSchema; use serde::Deserialize; +use tower::retry::Retry; use tower::util::Either; use tower::util::Oneshot; use tower::BoxError; @@ -239,24 +240,15 @@ impl TrafficShaping { Error = BoxError, Future = tower::util::Either< tower::util::Either< - Pin< - Box< - (dyn futures::Future< - Output = std::result::Result< - subgraph::Response, - Box< - (dyn std::error::Error - + std::marker::Send - + std::marker::Sync - + 'static), - >, - >, - > + std::marker::Send - + 'static), - >, - >, + BoxFuture<'static, Result>, timeout::future::ResponseFuture< - Oneshot, S>, subgraph::Request>, + Oneshot< + tower::util::Either< + Retry, S>>, + tower::util::Either, S>, + >, + subgraph::Request, + >, >, >, >::Future, @@ -295,12 +287,12 @@ impl TrafficShaping { .option_layer(config.deduplicate_query.unwrap_or_default().then( QueryDeduplicationLayer::default )) - //.option_layer(Some(retry)) .layer(TimeoutLayer::new( config .timeout .unwrap_or(DEFAULT_TIMEOUT), )) + .option_layer(Some(retry)) .option_layer(rate_limit) .service(service) .map_request(move |mut req: SubgraphRequest| { diff --git a/apollo-router/src/plugins/traffic_shaping/retry.rs b/apollo-router/src/plugins/traffic_shaping/retry.rs index 5dbdd3b82d..3014898d50 100644 --- a/apollo-router/src/plugins/traffic_shaping/retry.rs +++ b/apollo-router/src/plugins/traffic_shaping/retry.rs @@ -9,7 +9,7 @@ pub(crate) struct RetryPolicy { } impl RetryPolicy { - pub fn new(duration: Duration, min_per_sec: u32, retry_percent: f32) -> Self { + pub(crate) fn new(duration: Duration, min_per_sec: u32, retry_percent: f32) -> Self { Self { budget: Arc::new(Budget::new(duration, min_per_sec, retry_percent)), } diff --git a/apollo-router/src/services/subgraph.rs b/apollo-router/src/services/subgraph.rs index e7f247f502..6e7f6d72c7 100644 --- a/apollo-router/src/services/subgraph.rs +++ b/apollo-router/src/services/subgraph.rs @@ -74,6 +74,35 @@ impl Request { } } +impl Clone for Request { + fn clone(&self) -> Self { + // http::Request is not clonable so we have to rebuild a new one + // we don't use the extensions field for now + let mut builder = http::Request::builder() + .method(self.subgraph_request.method()) + .version(self.subgraph_request.version()) + .uri(self.subgraph_request.uri()); + + { + let headers = builder.headers_mut().unwrap(); + headers.extend( + self.subgraph_request + .headers() + .iter() + .map(|(name, value)| (name.clone(), value.clone())), + ); + } + let subgraph_request = builder.body(self.subgraph_request.body().clone()).unwrap(); + + Self { + supergraph_request: self.supergraph_request.clone(), + subgraph_request, + operation_kind: self.operation_kind.clone(), + context: self.context.clone(), + } + } +} + assert_impl_all!(Response: Send); #[derive(Debug)] #[non_exhaustive] From adedfd0b14fa77b3b82cb158508a6a211aaaf069 Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Thu, 17 Nov 2022 18:12:20 +0100 Subject: [PATCH 4/7] add configuration options marking request retry as experimental --- ...nfiguration__tests__schema_generation.snap | 50 +++++++++++++++++++ .../src/plugins/traffic_shaping/mod.rs | 50 +++++++++++++++++-- .../src/plugins/traffic_shaping/retry.rs | 22 +++++--- apollo-router/src/services/subgraph.rs | 2 +- 4 files changed, 112 insertions(+), 12 deletions(-) diff --git a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap index 8a249cfcc5..370402806e 100644 --- a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap +++ b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap @@ -2080,6 +2080,31 @@ expression: "&schema" "type": "boolean", "nullable": true }, + "experimental_retry": { + "type": "object", + "properties": { + "min_per_sec": { + "description": "minimum rate of retries allowed to accomodate clients that have just started issuing requests, or clients that do not issue many requests per window. The default value is 10", + "type": "integer", + "format": "uint32", + "minimum": 0.0, + "nullable": true + }, + "retry_percent": { + "description": "percentage of calls to deposit that can be retried. This is in addition to any retries allowed for via min_per_sec. Must be between 0 and 1000, default value is 0.2", + "type": "number", + "format": "float", + "nullable": true + }, + "ttl": { + "description": "how long a single deposit should be considered. Must be between 1 and 60 seconds, default value is 10 seconds", + "default": null, + "type": "string" + } + }, + "additionalProperties": false, + "nullable": true + }, "global_rate_limit": { "description": "Enable global rate limiting", "type": "object", @@ -2189,6 +2214,31 @@ expression: "&schema" "type": "boolean", "nullable": true }, + "experimental_retry": { + "type": "object", + "properties": { + "min_per_sec": { + "description": "minimum rate of retries allowed to accomodate clients that have just started issuing requests, or clients that do not issue many requests per window. The default value is 10", + "type": "integer", + "format": "uint32", + "minimum": 0.0, + "nullable": true + }, + "retry_percent": { + "description": "percentage of calls to deposit that can be retried. This is in addition to any retries allowed for via min_per_sec. Must be between 0 and 1000, default value is 0.2", + "type": "number", + "format": "float", + "nullable": true + }, + "ttl": { + "description": "how long a single deposit should be considered. Must be between 1 and 60 seconds, default value is 10 seconds", + "default": null, + "type": "string" + } + }, + "additionalProperties": false, + "nullable": true + }, "global_rate_limit": { "description": "Enable global rate limiting", "type": "object", diff --git a/apollo-router/src/plugins/traffic_shaping/mod.rs b/apollo-router/src/plugins/traffic_shaping/mod.rs index 7a8ab0e72b..be952aaaf1 100644 --- a/apollo-router/src/plugins/traffic_shaping/mod.rs +++ b/apollo-router/src/plugins/traffic_shaping/mod.rs @@ -67,7 +67,8 @@ struct Shaping { #[schemars(with = "String", default)] /// Enable timeout for incoming requests timeout: Option, - // retri + // *experimental feature*: Enables request retry + experimental_retry: Option, } impl Merge for Shaping { @@ -83,6 +84,42 @@ impl Merge for Shaping { .as_ref() .or(fallback.global_rate_limit.as_ref()) .cloned(), + experimental_retry: self + .experimental_retry + .as_ref() + .or(fallback.experimental_retry.as_ref()) + .cloned(), + }, + } + } +} + +#[derive(PartialEq, Debug, Clone, Deserialize, JsonSchema)] +#[serde(deny_unknown_fields)] +struct RetryConfig { + #[serde(deserialize_with = "humantime_serde::deserialize", default)] + #[schemars(with = "String", default)] + /// how long a single deposit should be considered. Must be between 1 and 60 seconds, + /// default value is 10 seconds + ttl: Option, + /// minimum rate of retries allowed to accomodate clients that have just started + /// issuing requests, or clients that do not issue many requests per window. The + /// default value is 10 + min_per_sec: Option, + /// percentage of calls to deposit that can be retried. This is in addition to any + /// retries allowed for via min_per_sec. Must be between 0 and 1000, default value + /// is 0.2 + retry_percent: Option, +} + +impl Merge for RetryConfig { + fn merge(&self, fallback: Option<&Self>) -> Self { + match fallback { + None => self.clone(), + Some(fallback) => RetryConfig { + ttl: self.ttl.or(fallback.ttl), + min_per_sec: self.min_per_sec.or(fallback.min_per_sec), + retry_percent: self.retry_percent.or(fallback.retry_percent), }, } } @@ -147,7 +184,6 @@ pub(crate) struct TrafficShaping { config: Config, rate_limit_router: Option, rate_limit_subgraphs: Mutex>, - retry_policy: Option, } #[async_trait::async_trait] @@ -182,7 +218,6 @@ impl Plugin for TrafficShaping { config: init.config, rate_limit_router, rate_limit_subgraphs: Mutex::new(HashMap::new()), - retry_policy: Some(RetryPolicy::default()), }) } } @@ -281,7 +316,12 @@ impl TrafficShaping { }) .clone() }); - let retry = tower::retry::RetryLayer::new(self.retry_policy.clone().unwrap()); + + let retry = config.experimental_retry.as_ref().map(|config| { + let retry_policy = + RetryPolicy::new(config.ttl, config.min_per_sec, config.retry_percent); + tower::retry::RetryLayer::new(retry_policy) + }); Either::A(ServiceBuilder::new() .option_layer(config.deduplicate_query.unwrap_or_default().then( @@ -292,7 +332,7 @@ impl TrafficShaping { .timeout .unwrap_or(DEFAULT_TIMEOUT), )) - .option_layer(Some(retry)) + .option_layer(retry) .option_layer(rate_limit) .service(service) .map_request(move |mut req: SubgraphRequest| { diff --git a/apollo-router/src/plugins/traffic_shaping/retry.rs b/apollo-router/src/plugins/traffic_shaping/retry.rs index 3014898d50..6138581c21 100644 --- a/apollo-router/src/plugins/traffic_shaping/retry.rs +++ b/apollo-router/src/plugins/traffic_shaping/retry.rs @@ -1,7 +1,9 @@ +use std::future; use std::sync::Arc; -use std::{future, time::Duration}; +use std::time::Duration; -use tower::retry::{budget::Budget, Policy}; +use tower::retry::budget::Budget; +use tower::retry::Policy; #[derive(Clone, Default)] pub(crate) struct RetryPolicy { @@ -9,9 +11,17 @@ pub(crate) struct RetryPolicy { } impl RetryPolicy { - pub(crate) fn new(duration: Duration, min_per_sec: u32, retry_percent: f32) -> Self { + pub(crate) fn new( + duration: Option, + min_per_sec: Option, + retry_percent: Option, + ) -> Self { Self { - budget: Arc::new(Budget::new(duration, min_per_sec, retry_percent)), + budget: Arc::new(Budget::new( + duration.unwrap_or_else(|| Duration::from_secs(10)), + min_per_sec.unwrap_or(10), + retry_percent.unwrap_or(0.2), + )), } } } @@ -19,7 +29,7 @@ impl RetryPolicy { impl Policy for RetryPolicy { type Future = future::Ready; - fn retry(&self, req: &Req, result: Result<&Res, &E>) -> Option { + fn retry(&self, _req: &Req, result: Result<&Res, &E>) -> Option { match result { Ok(_) => { // Treat all `Response`s as success, @@ -27,7 +37,7 @@ impl Policy for RetryPolicy { self.budget.deposit(); None } - Err(e) => { + Err(_e) => { let withdrew = self.budget.withdraw(); if withdrew.is_err() { return None; diff --git a/apollo-router/src/services/subgraph.rs b/apollo-router/src/services/subgraph.rs index 6e7f6d72c7..1b9ea53a7c 100644 --- a/apollo-router/src/services/subgraph.rs +++ b/apollo-router/src/services/subgraph.rs @@ -97,7 +97,7 @@ impl Clone for Request { Self { supergraph_request: self.supergraph_request.clone(), subgraph_request, - operation_kind: self.operation_kind.clone(), + operation_kind: self.operation_kind, context: self.context.clone(), } } From 972d09bd8c886f77c477b894c52d4df8f8e27dec Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Wed, 23 Nov 2022 10:52:41 +0100 Subject: [PATCH 5/7] changelog --- NEXT_CHANGELOG.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 62868a2d42..f0c9ec0459 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -64,6 +64,31 @@ helm upgrade --install --create-namespace --namespace router-test --set-file sup By [@garypen](https://github.com/garypen) in https://github.com/apollographql/router/pull/2119 +### *Experimental* subgraph request retry ([Issue #338](https://github.com/apollographql/router/issues/338), [Issue #1956](https://github.com/apollographql/router/issues/1956)) + +Implements subgraph request retries, using Finagle's retry buckets algorithm: +- it defines a minimal number of retries per second (`min_per_sec`, default is 10 retries per second), to +bootstrap the system or for low traffic deployments +- for each successful request, we add a "token" to the bucket, those tokens expire after `ttl` (default: 10 seconds) +- the number of available additional retries is a part of the number of tokens, defined by `retry_percent` (default is 0.2) + +This is activated in the `traffic_shaping` plugin, either globally or per subgraph: + +```yaml +traffic_shaping: + all: + experimental_retry: + min_per_sec: 10 + ttl: 10s + retry_percent: 0.2 + subgraphs: + accounts: + experimental_retry: + min_per_sec: 20 +``` + +By [@Geal](https://github.com/Geal) in https://github.com/apollographql/router/pull/2006 + ## 🐛 Fixes ### Improve errors when subgraph returns non-GraphQL response with a non-2xx status code ([Issue #2117](https://github.com/apollographql/router/issues/2117)) From 780a08dff6d306b3f78b1189ba95597a68ef53e5 Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Thu, 24 Nov 2022 15:54:15 +0100 Subject: [PATCH 6/7] add docs --- docs/source/configuration/traffic-shaping.mdx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/configuration/traffic-shaping.mdx b/docs/source/configuration/traffic-shaping.mdx index c01b7b6168..99bac609cc 100644 --- a/docs/source/configuration/traffic-shaping.mdx +++ b/docs/source/configuration/traffic-shaping.mdx @@ -12,6 +12,7 @@ The Apollo Router supports the following types of traffic shaping between itself - The router currently supports `gzip`, `br`, and `deflate`. - **Global rate limiting** - If you want to rate limit requests to subgraphs or to the router itself. - **Timeout**: - Set a timeout to subgraphs and router requests. +- **Request Retry**: - retry subgraph requests if they fail due to network errors. This implements Finagle's retry buckets mechanism. Each of these optimizations can reduce network bandwidth and CPU usage for your subgraphs. @@ -38,6 +39,11 @@ traffic_shaping: capacity: 10 interval: 5s # Must not be greater than 18_446_744_073_709_551_615 milliseconds and not less than 0 milliseconds timeout: 50s # If a request to the subgraph 'products' takes more than 50secs then cancel the request (30 sec by default) + experimental_retry: + min_per_sec: 10 # minimal number of retries per second (`min_per_sec`, default is 10 retries per second) + ttl: 10s # for each successful request, we register a token, that expires according to this option (default: 10s) + retry_percent: 0.2 # defines the proportion of available retries to the current number of tokens + ``` Any configuration under the `subgraphs` key takes precedence over configuration under the `all` key. In the example above, query deduplication is enabled for all subgraphs _except_ the `products` subgraph. From 0fac976b9c834a4066c606a2e9f187824baf60b1 Mon Sep 17 00:00:00 2001 From: Geoffroy Couprie Date: Thu, 24 Nov 2022 15:55:22 +0100 Subject: [PATCH 7/7] mark as experimental in docs --- docs/source/configuration/traffic-shaping.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/configuration/traffic-shaping.mdx b/docs/source/configuration/traffic-shaping.mdx index 99bac609cc..c61f48e698 100644 --- a/docs/source/configuration/traffic-shaping.mdx +++ b/docs/source/configuration/traffic-shaping.mdx @@ -12,7 +12,7 @@ The Apollo Router supports the following types of traffic shaping between itself - The router currently supports `gzip`, `br`, and `deflate`. - **Global rate limiting** - If you want to rate limit requests to subgraphs or to the router itself. - **Timeout**: - Set a timeout to subgraphs and router requests. -- **Request Retry**: - retry subgraph requests if they fail due to network errors. This implements Finagle's retry buckets mechanism. +- **Request Retry**: - retry subgraph requests if they fail due to network errors. This implements Finagle's retry buckets mechanism. **Experimental feature**: retry configuration might change in the future. Each of these optimizations can reduce network bandwidth and CPU usage for your subgraphs.