diff --git a/.changesets/fix_rreg_fix_entities_errors_missing_service.md b/.changesets/fix_rreg_fix_entities_errors_missing_service.md new file mode 100644 index 0000000000..50c6b2abc6 --- /dev/null +++ b/.changesets/fix_rreg_fix_entities_errors_missing_service.md @@ -0,0 +1,7 @@ +### Fix _entities Apollo Error Metrics Missing Service Attribute ([PR #8153](https://github.com/apollographql/router/pull/8153)) + +The error counting changes in https://github.com/apollographql/router/pull/7712 introduced a bug where `_entities` errors from a subgraph fetch no longer reported a service (subgraph or connector) attribute. As a result, the Studio UI erroneously attributed these errors to the Router rather than to their originating service. + +The attribute has been re-added, fixing this issue. + +By [@rregitsky](https://github.com/rregitsky) in https://github.com/apollographql/router/pull/8153 diff --git a/apollo-router/src/services/subgraph_service.rs b/apollo-router/src/services/subgraph_service.rs index bd1b6b8394..3144002684 100644 --- a/apollo-router/src/services/subgraph_service.rs +++ b/apollo-router/src/services/subgraph_service.rs @@ -27,6 +27,8 @@ use opentelemetry::Key; use opentelemetry::KeyValue; use rustls::RootCertStore; use serde::Serialize; +use serde_json_bytes::Entry; +use serde_json_bytes::json; use tokio::sync::oneshot; use tokio_tungstenite::connect_async; use tokio_tungstenite::connect_async_tls_with_config; @@ -785,6 +787,14 @@ fn http_response_to_graphql_response( } }; + // Any errors directly parsed from the response likely won't yet have the service name set, + // but we need it for telemetry error counting + for err in &mut graphql_response.errors { + if let Entry::Vacant(v) = err.extensions.entry("service") { + v.insert(json!(service_name)); + } + } + // Add an error for response codes that are not 2xx if !parts.status.is_success() { let status = parts.status; @@ -3261,6 +3271,7 @@ mod tests { let error = graphql::Error::builder() .message("error was encountered 
for test") .extension_code("SOME_EXTENSION") + .extension("service", "test_service") .build(); let mut json = serde_json::json!({ "data": { @@ -3295,6 +3306,7 @@ mod tests { let error = graphql::Error::builder() .message("error was encountered for test") .extension_code("SOME_EXTENSION") + .extension("service", "test_service") .build(); let mut json = serde_json::json!({ "data": { diff --git a/apollo-router/tests/integration/telemetry/apollo_otel_metrics.rs b/apollo-router/tests/integration/telemetry/apollo_otel_metrics.rs index c18a29eef2..e9f0f483a1 100644 --- a/apollo-router/tests/integration/telemetry/apollo_otel_metrics.rs +++ b/apollo-router/tests/integration/telemetry/apollo_otel_metrics.rs @@ -222,6 +222,83 @@ async fn test_subgraph_layer_error_emits_metric() { router.graceful_shutdown().await; } +#[tokio::test(flavor = "multi_thread")] +async fn test_subgraph_layer_entities_error_emits_metric() { + if !graph_os_enabled() { + return; + } + let expected_service = "products"; + let expected_error_code = "SUBGRAPH_CODE"; + let expected_client_name = "CLIENT_NAME"; + let expected_client_version = "v0.14"; + let expected_path = "/_entities/0/name"; + + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { endpoint: None }) + .config( + r#" + telemetry: + apollo: + experimental_otlp_metrics_protocol: http + batch_processor: + scheduled_delay: 10ms + errors: + preview_extended_error_metrics: enabled + "#, + ) + .responder( + ResponseTemplate::new(200).set_body_json( + graphql::Response::builder() + .data(json!({"data": {"_entities": [{"name": null}]}})) + .errors(vec![ + graphql::Error::builder() + .message("error in subgraph layer") + // Explicitly exclude setting service as it should get populated by subgraph_service + .extension_code(expected_error_code) + // Path must not have leading slash to match expected + .path("_entities/0/name") + .build(), + ]) + .build(), + ), + ) + .build() + .await; + + router.start().await; + 
router.assert_started().await; + + router + .execute_query( + Query::builder() + .header("apollographql-client-name", expected_client_name) + .header("apollographql-client-version", expected_client_version) + .build(), + ) + .await; + + let metrics = router + .wait_for_emitted_otel_metrics(Duration::from_millis(20)) + .await; + + assert!(!metrics.is_empty()); + assert_metrics_contain( + &metrics, + Metric::builder() + .name("apollo.router.operations.error".to_string()) + .attribute("graphql.operation.name", "ExampleQuery") + .attribute("graphql.operation.type", "query") + .attribute("apollo.client.name", expected_client_name) + .attribute("apollo.client.version", expected_client_version) + .attribute("graphql.error.extensions.code", expected_error_code) + .attribute("apollo.router.error.service", expected_service) + .attribute("graphql.error.path", expected_path) + .value(1) + .build(), + ); + router.graceful_shutdown().await; +} + #[tokio::test(flavor = "multi_thread")] async fn test_include_subgraph_error_disabled_does_not_redact_error_metrics() { if !graph_os_enabled() {