Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changesets/fix_renee_router_1343.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
### Reliably distinguish GraphQL errors and transport errors in subscriptions ([PR #7901](https://github.com/apollographql/router/pull/7901))

The [Multipart HTTP protocol for GraphQL Subscriptions](https://www.apollographql.com/docs/graphos/routing/operations/subscriptions/multipart-protocol) distinguishes between GraphQL-level errors and fatal transport-level errors. The router previously used a heuristic to determine if a given error was fatal or not, which could sometimes cause errors to be wrongly classified. For example, if a subgraph returned a GraphQL-level error for a subscription and then immediately ended the subscription, the router might propagate this as a fatal transport-level error.

This is now fixed. Fatal transport-level errors are tagged as such when they are constructed, so the router can reliably know how to serialize errors when sending them to the client.
2 changes: 0 additions & 2 deletions .config/nextest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,6 @@ or ( binary_id(=apollo-router::integration_tests) & test(=integration::rhai::tes
or ( binary_id(=apollo-router::integration_tests) & test(=integration::subgraph_response::test_invalid_error_locations_contains_negative_one_location) )
or ( binary_id(=apollo-router::integration_tests) & test(=integration::subgraph_response::test_valid_extensions_service_for_subgraph_error) )
or ( binary_id(=apollo-router::integration_tests) & test(=integration::subgraph_response::test_valid_extensions_service_is_preserved_for_subgraph_error) )
or ( binary_id(=apollo-router::integration_tests) & test(=integration::subscriptions::callback::test_subscription_callback_pure_error_payload) )
or ( binary_id(=apollo-router::integration_tests) & test(=integration::subscriptions::ws_passthrough::test_subscription_ws_passthrough_pure_error_payload) )
or ( binary_id(=apollo-router::integration_tests) & test(=integration::telemetry::datadog::test_basic) )
or ( binary_id(=apollo-router::integration_tests) & test(=integration::telemetry::datadog::test_priority_sampling_no_parent_propagated) )
or ( binary_id(=apollo-router::integration_tests) & test(=integration::telemetry::datadog::test_resource_mapping_default) )
Expand Down
49 changes: 31 additions & 18 deletions apollo-router/src/protocols/multipart.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use tokio_stream::once;
use tokio_stream::wrappers::IntervalStream;

use crate::graphql;
use crate::services::SUBSCRIPTION_ERROR_EXTENSION_KEY;

#[cfg(test)]
const HEARTBEAT_INTERVAL: Duration = Duration::from_millis(10);
Expand Down Expand Up @@ -115,27 +116,39 @@ impl Stream for Multipart {

match self.mode {
ProtocolMode::Subscription => {
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is where the actual change happens.
Now, we decide whether to send payload or errors in the response exclusively based on the SUBSCRIPTION_ERROR_EXTENSION_KEY being true.

The code is reordered a bit (special handling for graceful server-side close is moved earlier) so we aren't creating another special-meaning internal message (SubscriptionPayload with payload being an empty object/null...)

let resp = SubscriptionPayload {
errors: if is_still_open {
Vec::new()
} else {
response.errors.drain(..).collect()
},
payload: match response.data {
None | Some(Value::Null) if response.extensions.is_empty() => {
None
}
_ => (*response).into(),
},
};

// Gracefully closed at the server side
if !is_still_open && resp.payload.is_none() && resp.errors.is_empty() {
let is_transport_error =
response.extensions.remove(SUBSCRIPTION_ERROR_EXTENSION_KEY)
== Some(true.into());
// Magic empty response (that we create internally) means the connection was gracefully closed at the server side
if !is_still_open
&& response.data.is_none()
&& response.errors.is_empty()
&& response.extensions.is_empty()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the significance of extensions being empty? If something other than the subscriptions error key were in extensions, would that break graceful close?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess if extensions is not empty then we want to return the response

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Essentially I'm trying to mimic what the previous code does. In the previous code, resp.payload.is_none() would only be true if both data and extensions were null/none.

I'm not sure whether this is actually required, to be honest. The subscriptions graceful close response can only have extensions if a plugin or user code added them.

{
self.is_terminated = true;
return Poll::Ready(Some(Ok(Bytes::from_static(&b"--\r\n"[..]))));
} else {
serde_json::to_writer(&mut buf, &resp)?;
}

let response = if is_transport_error {
SubscriptionPayload {
errors: std::mem::take(&mut response.errors),
payload: match response.data {
None | Some(Value::Null)
if response.extensions.is_empty() =>
{
None
}
_ => (*response).into(),
},
}
} else {
SubscriptionPayload {
errors: Vec::new(),
payload: (*response).into(),
}
};

serde_json::to_writer(&mut buf, &response)?;
}
ProtocolMode::Defer => {
serde_json::to_writer(&mut buf, &response)?;
Expand Down
5 changes: 5 additions & 0 deletions apollo-router/src/query_planner/execution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,17 @@ impl QueryPlan {
&self,
context: &'a Context,
service_factory: &'a Arc<FetchServiceFactory>,
// The original supergraph request is used to populate variable values and for plugin
// features like propagating headers or subgraph telemetry based on supergraph request
// values.
supergraph_request: &'a Arc<http::Request<Request>>,
schema: &'a Arc<Schema>,
subgraph_schemas: &'a Arc<SubgraphSchemas>,
// Sender for additional responses past the first one (@defer, @stream, subscriptions)
sender: mpsc::Sender<Response>,
subscription_handle: Option<SubscriptionHandle>,
subscription_config: &'a Option<SubscriptionConfig>,
// Query plan execution builds up a JSON result value, use this as the initial data.
initial_value: Option<Value>,
) -> Response {
let root = Path::empty();
Expand Down
85 changes: 30 additions & 55 deletions apollo-router/src/services/supergraph/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,11 @@ use crate::spec::Schema;
use crate::spec::operation_limits::OperationLimits;

pub(crate) const FIRST_EVENT_CONTEXT_KEY: &str = "apollo::supergraph::first_event";
// Response extension key that `subscription_fatal_error` sets to `true`. The multipart
// protocol code removes this key from the response extensions and, when its value was
// `true`, treats the response as a transport-level error.
pub(crate) const SUBSCRIPTION_ERROR_EXTENSION_KEY: &str = "apollo::subscriptions::fatal_error";
// Error extension codes attached to the fatal errors built by `subscription_fatal_error`:
// configuration reload, schema reload, JWT expiry, and failure to execute a subscription event.
const SUBSCRIPTION_CONFIG_RELOAD_EXTENSION_CODE: &str = "SUBSCRIPTION_CONFIG_RELOAD";
const SUBSCRIPTION_SCHEMA_RELOAD_EXTENSION_CODE: &str = "SUBSCRIPTION_SCHEMA_RELOAD";
const SUBSCRIPTION_JWT_EXPIRED_EXTENSION_CODE: &str = "SUBSCRIPTION_JWT_EXPIRED";
const SUBSCRIPTION_EXECUTION_ERROR_EXTENSION_CODE: &str = "SUBSCRIPTION_EXECUTION_ERROR";

/// An [`IndexMap`] of available plugins.
pub(crate) type Plugins = IndexMap<String, Box<dyn DynPlugin>>;
Expand Down Expand Up @@ -464,6 +467,19 @@ pub struct SubscriptionTaskParams {
pub(crate) stream_rx: ReceiverStream<BoxGqlStream>,
}

fn subscription_fatal_error(message: impl Into<String>, extension_code: &str) -> Response {
Response::builder()
.subscribed(false)
.extension(SUBSCRIPTION_ERROR_EXTENSION_KEY, true)
.error(
graphql::Error::builder()
.message(message)
.extension_code(extension_code)
.build(),
)
.build()
}

#[allow(clippy::too_many_arguments)]
async fn subscription_task(
execution_service: execution::BoxCloneService,
Expand Down Expand Up @@ -498,16 +514,10 @@ async fn subscription_task(
}),
_ => {
let _ = sender
.send(
graphql::Response::builder()
.error(
graphql::Error::builder()
.message("cannot execute the subscription event")
.extension_code("SUBSCRIPTION_EXECUTION_ERROR")
.build(),
)
.build(),
)
.send(subscription_fatal_error(
"cannot execute the subscription event",
SUBSCRIPTION_EXECUTION_ERROR_EXTENSION_CODE,
))
.await;
return;
}
Expand Down Expand Up @@ -564,31 +574,22 @@ async fn subscription_task(
break;
}
_ = &mut timeout => {
let response = Response::builder()
.subscribed(false)
.error(
crate::error::Error::builder()
.message("subscription closed because the JWT has expired")
.extension_code("SUBSCRIPTION_JWT_EXPIRED")
.build(),
)
.build();
let _ = sender.send(response).await;
let _ = sender.send(subscription_fatal_error("subscription closed because the JWT has expired", SUBSCRIPTION_JWT_EXPIRED_EXTENSION_CODE)).await;
break;
},
message = receiver.next() => {
match message {
Some(mut val) => {
val.created_at = Some(Instant::now());
let res = dispatch_event(&supergraph_req, execution_service.clone(), query_plan.as_ref(), context.clone(), val, sender.clone())
let res = dispatch_subscription_event(&supergraph_req, execution_service.clone(), query_plan.as_ref(), context.clone(), val, sender.clone())
.instrument(tracing::info_span!(SUBSCRIPTION_EVENT_SPAN_NAME,
graphql.operation.name = %operation_name,
otel.kind = "INTERNAL",
apollo_private.operation_signature = %operation_signature,
apollo_private.duration_ns = field::Empty,)
).await;
if let Err(err) = res {
tracing::error!("cannot send the subscription to the client: {err:?}");
tracing::error!("cannot send the subscription to the client: {err:?}");
break;
}
}
Expand All @@ -597,32 +598,12 @@ async fn subscription_task(
}
Some(_new_configuration) = configuration_updated_rx.next() => {
let _ = sender
.send(
Response::builder()
.subscribed(false)
.error(
graphql::Error::builder()
.message("subscription has been closed due to a configuration reload")
.extension_code(SUBSCRIPTION_CONFIG_RELOAD_EXTENSION_CODE)
.build(),
)
.build(),
)
.send(subscription_fatal_error("subscription has been closed due to a configuration reload", SUBSCRIPTION_CONFIG_RELOAD_EXTENSION_CODE))
.await;
}
Some(_new_schema) = schema_updated_rx.next() => {
let _ = sender
.send(
Response::builder()
.subscribed(false)
.error(
graphql::Error::builder()
.message("subscription has been closed due to a schema reload")
.extension_code(SUBSCRIPTION_SCHEMA_RELOAD_EXTENSION_CODE)
.build(),
)
.build(),
)
.send(subscription_fatal_error("subscription has been closed due to a schema reload", SUBSCRIPTION_SCHEMA_RELOAD_EXTENSION_CODE))
.await;

break;
Expand All @@ -636,7 +617,7 @@ async fn subscription_task(
}
}

async fn dispatch_event(
async fn dispatch_subscription_event(
supergraph_req: &SupergraphRequest,
execution_service: execution::BoxCloneService,
query_plan: Option<&Arc<QueryPlan>>,
Expand Down Expand Up @@ -666,16 +647,10 @@ async fn dispatch_event(
Err(err) => {
tracing::error!("cannot execute the subscription event: {err:?}");
let _ = sender
.send(
graphql::Response::builder()
.error(
graphql::Error::builder()
.message("cannot execute the subscription event")
.extension_code("SUBSCRIPTION_EXECUTION_ERROR")
.build(),
)
.build(),
)
.send(subscription_fatal_error(
"cannot execute the subscription event",
SUBSCRIPTION_EXECUTION_ERROR_EXTENSION_CODE,
))
.await;
return Ok(());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
---
source: apollo-router/src/services/supergraph/tests.rs
expression: "tokio::time::timeout(Duration::from_secs(1),\nstream.next_response()).await.unwrap().unwrap()"
snapshot_kind: text
---
{
"data": null,
Expand All @@ -11,5 +12,8 @@ expression: "tokio::time::timeout(Duration::from_secs(1),\nstream.next_response(
"code": "SUBSCRIPTION_SCHEMA_RELOAD"
}
}
]
],
"extensions": {
"apollo::subscriptions::fatal_error": true
}
}
Loading