diff --git a/.changesets/feat_subgraph_fetch_histogram.md b/.changesets/feat_subgraph_fetch_histogram.md new file mode 100644 index 0000000000..8b7a73c941 --- /dev/null +++ b/.changesets/feat_subgraph_fetch_histogram.md @@ -0,0 +1,30 @@ +### [Subgraph Insights] Experimental Apollo Subgraph Fetch Histogram ([PR #7960](https://github.com/apollographql/router/pull/7960)) + + + + +--- +This change adds a new, experimental histogram to capture subgraph fetch duration, +`apollo.router.operations.fetch.duration` with the following attributes: +- client.name +- client.version +- has.errors +- operation.name +- operation.id +- subgraph.name + +This can be controlled using a new boolean config flag: +```yaml +telemetry: + instrumentation: + instruments: + apollo: + subgraph: + experimental_subgraph_fetch_duration: true +``` +The metric is currently only sent to GraphOS and is not available in 3rd-party OTel export targets. It is not currently +user customizable. + +Users who need a customizable alternative can use the existing `http.client.request.duration` subgraph instrument. + +By [@rregitsky](https://github.com/rregitsky) in https://github.com/apollographql/router/pull/7960 diff --git a/apollo-router/src/metrics/filter.rs b/apollo-router/src/metrics/filter.rs index 1b46ee8135..dd9115648e 100644 --- a/apollo-router/src/metrics/filter.rs +++ b/apollo-router/src/metrics/filter.rs @@ -89,7 +89,8 @@ impl FilterMeterProvider { } fn get_private_realtime_regex() -> Regex { - Regex::new(r"apollo\.router\.operations\.error").expect("regex should have been valid") + Regex::new(r"apollo\.router\.operations\.(?:error|fetch\.duration)") + .expect("regex should have been valid") } pub(crate) fn private_realtime>(delegate: T) -> Self { diff --git a/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/metrics.snap b/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/metrics.snap new file mode 100644 index 0000000000..89a32937c8 --- /dev/null +++ 
b/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/metrics.snap @@ -0,0 +1,27 @@ +--- +source: apollo-router/src/plugins/telemetry/config_new/instruments.rs +description: Custom histogram duration +expression: "&metrics.all()" +info: + telemetry: + instrumentation: + instruments: + subgraph: + apollo: + experimental_subgraph_fetch_duration: true + http.client.request.duration: false +snapshot_kind: text +--- +- name: apollo.router.operations.fetch.duration + description: Duration of a subgraph fetch. + unit: s + data: + datapoints: + - sum: 0.1 + count: 1 + attributes: + client.name: myClient + client.version: v0.1.0 + has.errors: false + operation.name: Test + subgraph.name: products diff --git a/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/metrics.snap.new b/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/metrics.snap.new new file mode 100644 index 0000000000..3b66a647da --- /dev/null +++ b/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/metrics.snap.new @@ -0,0 +1,29 @@ +--- +source: apollo-router/src/plugins/telemetry/config_new/instruments.rs +assertion_line: 3310 +description: Custom histogram duration +expression: "&metrics.all()" +info: + telemetry: + instrumentation: + instruments: + apollo: + subgraph: + experimental_subgraph_fetch_duration: true + subgraph: + http.client.request.duration: false +--- +- name: apollo.router.operations.fetch.duration + description: Duration of a subgraph fetch. 
+ unit: s + data: + datapoints: + - sum: 0.1 + count: 1 + attributes: + client.name: myClient + client.version: v0.1.0 + has.errors: false + operation.id: myOperationID + operation.name: Test + subgraph.name: products diff --git a/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/router.yaml b/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/router.yaml new file mode 100644 index 0000000000..27ba45ede7 --- /dev/null +++ b/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/router.yaml @@ -0,0 +1,8 @@ +telemetry: + instrumentation: + instruments: + apollo: + subgraph: + experimental_subgraph_fetch_duration: true + subgraph: + http.client.request.duration: false \ No newline at end of file diff --git a/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/test.yaml b/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/test.yaml new file mode 100644 index 0000000000..e987f6f0fd --- /dev/null +++ b/apollo-router/src/plugins/telemetry/config_new/fixtures/subgraph/apollo_histogram_duration/test.yaml @@ -0,0 +1,17 @@ +description: Custom histogram duration +events: + - - context: + map: + "apollo::supergraph::operation_name": "Test" + "apollo::supergraph::operation_id": "myOperationID" + "apollo::telemetry::client_name": "myClient" + "apollo::telemetry::client_version": "v0.1.0" + - subgraph_request: + query: "query { hello }" + operation_name: "Products" + operation_kind: query + subgraph_name: "products" + - subgraph_response: + status: 200 + data: + hello: "world" \ No newline at end of file diff --git a/apollo-router/src/plugins/telemetry/config_new/instruments.rs b/apollo-router/src/plugins/telemetry/config_new/instruments.rs index 2f0e6201f5..45f8417cce 100644 --- a/apollo-router/src/plugins/telemetry/config_new/instruments.rs +++ 
b/apollo-router/src/plugins/telemetry/config_new/instruments.rs @@ -34,7 +34,7 @@ use super::http_server::attributes::HttpServerAttributes; use super::router::instruments::RouterInstruments; use super::router::instruments::RouterInstrumentsConfig; use super::selectors::CacheKind; -use super::subgraph::instruments::SubgraphInstruments; +use super::subgraph::instruments::{ApolloSubgraphInstrumentsConfig, SubgraphInstruments}; use super::subgraph::instruments::SubgraphInstrumentsConfig; use super::supergraph::instruments::SupergraphCustomInstruments; use super::supergraph::instruments::SupergraphInstrumentsConfig; @@ -43,6 +43,9 @@ use crate::axum_factory::connection_handle::ConnectionState; use crate::axum_factory::connection_handle::OPEN_CONNECTIONS_METRIC; use crate::metrics; use crate::metrics::meter_provider; +use crate::plugins::telemetry::CLIENT_NAME; +use crate::plugins::telemetry::CLIENT_VERSION; +use crate::query_planner::APOLLO_OPERATION_ID; use crate::plugins::telemetry::config_new::Selectors; use crate::plugins::telemetry::config_new::attributes::DefaultAttributeRequirementLevel; use crate::plugins::telemetry::config_new::conditions::Condition; @@ -110,6 +113,15 @@ pub(crate) struct InstrumentsConfig { CacheInstrumentsConfig, Instrument, >, + + /// Apollo instruments + pub(crate) apollo: ApolloInstrumentsConfig, +} + +#[derive(Clone, Deserialize, JsonSchema, Debug, Default)] +#[serde(deny_unknown_fields, default)] +pub(crate) struct ApolloInstrumentsConfig { + pub(crate) subgraph: ApolloSubgraphInstrumentsConfig, } const HTTP_SERVER_REQUEST_DURATION_METRIC: &str = "http.server.request.duration"; @@ -120,6 +132,8 @@ const HTTP_SERVER_ACTIVE_REQUESTS: &str = "http.server.active_requests"; pub(super) const HTTP_CLIENT_REQUEST_DURATION_METRIC: &str = "http.client.request.duration"; pub(super) const HTTP_CLIENT_REQUEST_BODY_SIZE_METRIC: &str = "http.client.request.body.size"; pub(super) const HTTP_CLIENT_RESPONSE_BODY_SIZE_METRIC: &str = 
"http.client.response.body.size"; +pub(super) const APOLLO_ROUTER_OPERATIONS_FETCH_DURATION: &str = + "apollo.router.operations.fetch.duration"; impl InstrumentsConfig { pub(crate) fn validate(&self) -> Result<(), String> { @@ -497,6 +511,19 @@ impl InstrumentsConfig { let meter = metrics::meter_provider().meter(METER_NAME); let mut static_instruments = HashMap::with_capacity(self.subgraph.custom.len()); + // Built-in Apollo instruments. Not currently user configurable. + static_instruments.insert( + APOLLO_ROUTER_OPERATIONS_FETCH_DURATION.to_string(), + StaticInstrument::Histogram( + meter + .f64_histogram(APOLLO_ROUTER_OPERATIONS_FETCH_DURATION) + .with_unit("s") + .with_description("Duration of a subgraph fetch.") + .init(), + ), + ); + + // Built-in user customizable instruments if self .subgraph .attributes @@ -551,6 +578,7 @@ impl InstrumentsConfig { ); } + // Custom user instruments for (instrument_name, instrument) in &self.subgraph.custom { match instrument.ty { InstrumentType::Counter => { @@ -709,10 +737,66 @@ impl InstrumentsConfig { }), } }); + + // Apollo instruments. Not currently user configurable + let apollo_router_operation_fetch_duration = self.apollo + .subgraph + .experimental_subgraph_fetch_duration + .then(|| { + CustomHistogram { + inner: Mutex::new(CustomHistogramInner { + increment: Increment::Duration(Instant::now()), + condition: Condition::True, + histogram: Some(static_instruments + .get(APOLLO_ROUTER_OPERATIONS_FETCH_DURATION) + .expect( + "cannot get static instrument for subgraph; this should not happen", + ) + .as_histogram() + .cloned() + .expect( + "cannot convert instrument to histogram for subgraph; this should not happen", + ) + ), + attributes: Vec::with_capacity(5), + selector: None, + // Hardcode yaml config as this is currently the only way to build attributes + // and selectors. 
+ selectors: Some( + Arc::new( + serde_yaml::from_str::>( + &format!( + r#" + subgraph.name: true + operation.name: + supergraph_operation_name: string + operation.id: + request_context: {operation_id_key} + client.name: + request_context: {client_name_key} + client.version: + request_context: {client_version_key} + has.errors: + subgraph_on_graphql_error: true + "#, + operation_id_key = APOLLO_OPERATION_ID, + client_name_key = CLIENT_NAME, + client_version_key = CLIENT_VERSION + ) + ).unwrap() + ) + ), + updated: false, + _phantom: PhantomData, + }) + } + }); + SubgraphInstruments { http_client_request_duration, http_client_request_body_size, http_client_response_body_size, + apollo_router_operation_fetch_duration, custom: CustomInstruments::new(&self.subgraph.custom, static_instruments), } } diff --git a/apollo-router/src/plugins/telemetry/config_new/subgraph/instruments.rs b/apollo-router/src/plugins/telemetry/config_new/subgraph/instruments.rs index 65383020ca..e2180629d5 100644 --- a/apollo-router/src/plugins/telemetry/config_new/subgraph/instruments.rs +++ b/apollo-router/src/plugins/telemetry/config_new/subgraph/instruments.rs @@ -38,6 +38,14 @@ pub(crate) struct SubgraphInstrumentsConfig { DefaultedStandardInstrument>, } +#[derive(Clone, Deserialize, JsonSchema, Debug, Default)] +#[serde(deny_unknown_fields, default)] +pub(crate) struct ApolloSubgraphInstrumentsConfig { + /// Send OTLP subgraph fetch duration histogram to Apollo Studio with select dimensions [`client.name`, `client.version`, `has.errors`, `operation.id`, `operation.name`, `subgraph.name`]. 
+ #[serde(default)] + pub(crate) experimental_subgraph_fetch_duration: bool, +} + impl DefaultForLevel for SubgraphInstrumentsConfig { fn defaults_for_level( &mut self, @@ -107,6 +115,15 @@ pub(crate) struct SubgraphInstruments { SubgraphSelector, >, >, + pub(crate) apollo_router_operation_fetch_duration: Option< + CustomHistogram< + subgraph::Request, + subgraph::Response, + (), + SubgraphAttributes, + SubgraphSelector, + >, + >, pub(crate) custom: SubgraphCustomInstruments, } @@ -125,6 +142,11 @@ impl Instrumented for SubgraphInstruments { if let Some(http_client_response_body_size) = &self.http_client_response_body_size { http_client_response_body_size.on_request(request); } + if let Some(apollo_router_operation_fetch_duration) = + &self.apollo_router_operation_fetch_duration + { + apollo_router_operation_fetch_duration.on_request(request); + } self.custom.on_request(request); } @@ -138,6 +160,11 @@ impl Instrumented for SubgraphInstruments { if let Some(http_client_response_body_size) = &self.http_client_response_body_size { http_client_response_body_size.on_response(response); } + if let Some(apollo_router_operation_fetch_duration) = + &self.apollo_router_operation_fetch_duration + { + apollo_router_operation_fetch_duration.on_response(response); + } self.custom.on_response(response); } @@ -151,6 +178,11 @@ impl Instrumented for SubgraphInstruments { if let Some(http_client_response_body_size) = &self.http_client_response_body_size { http_client_response_body_size.on_error(error, ctx); } + if let Some(apollo_router_operation_fetch_duration) = + &self.apollo_router_operation_fetch_duration + { + apollo_router_operation_fetch_duration.on_error(error, ctx); + } self.custom.on_error(error, ctx); } } diff --git a/apollo-router/src/plugins/telemetry/metrics/apollo/mod.rs b/apollo-router/src/plugins/telemetry/metrics/apollo/mod.rs index aa58d874ba..67db40cde1 100644 --- a/apollo-router/src/plugins/telemetry/metrics/apollo/mod.rs +++ 
b/apollo-router/src/plugins/telemetry/metrics/apollo/mod.rs @@ -9,6 +9,7 @@ use opentelemetry_otlp::WithExportConfig; use opentelemetry_sdk::Resource; use opentelemetry_sdk::metrics::PeriodicReader; use opentelemetry_sdk::runtime; +use prometheus::exponential_buckets; use sys_info::hostname; use tonic::metadata::MetadataMap; use tonic::transport::ClientTlsConfig; @@ -187,7 +188,10 @@ impl Config { )), Box::new( CustomAggregationSelector::builder() - .boundaries(default_buckets()) + .boundaries( + // Returns [~1.4ms ... ~5min] + exponential_buckets(0.001399084909, 1.1, 129).unwrap(), + ) .build(), ), )?; diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 4b01f6dd15..aafc497833 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -266,6 +266,7 @@ struct BuiltinInstruments { graphql_custom_instruments: Arc>, router_custom_instruments: Arc>, supergraph_custom_instruments: Arc>, + // TODO should I separate out a non-custom instruments? subgraph_custom_instruments: Arc>, connector_custom_instruments: Arc>, cache_custom_instruments: Arc>, @@ -878,6 +879,7 @@ impl PluginPrivate for Telemetry { .subgraph .attributes .on_request(sub_request); + // TODO apollo instruments instead of throwing it in with the custom ones? let custom_instruments = config .instrumentation .instruments