-
Notifications
You must be signed in to change notification settings - Fork 330
[Subgraph Insights] Add Apollo Subgraph Fetch Histogram to Telemetry Plugin #8013
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5465548
69f0746
2a1de23
237734e
ab71c4d
400fe10
9183ce7
1b10139
542c416
023a3b8
00001ba
98608c5
a3cd208
a099e94
8c55ff7
6b1f4c5
6b69a7e
7a506e7
a006a47
d949a6f
676bad7
b1df41d
60c3355
ca705d4
fd42caf
267ab2b
b6e0d50
e219496
bfdb7aa
89dcb73
8fb12eb
59ccd04
55f16f0
ebc709e
3e3e15f
023d6f1
aacbfbf
e59af96
df897ad
c018dda
6ed7aff
c7e49b2
e2f00d9
13f5be7
6950160
4253731
7dc2787
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| ### [Subgraph Insights] Experimental Apollo Subgraph Fetch Histogram ([PR #8013](https://github.com/apollographql/router/pull/8013)) | ||
|
|
||
| This change adds a new, experimental histogram to capture subgraph fetch duration for GraphOS. This will | ||
| eventually be used to power subgraph-level insights in Apollo Studio. | ||
|
|
||
| This can be toggled on using a new boolean config flag: | ||
|
|
||
| ```yaml | ||
| telemetry: | ||
| apollo: | ||
| experimental_subgraph_metrics: true | ||
| ``` | ||
|
|
||
| The new instrument is only sent to GraphOS and is not available in 3rd-party OTel export targets. It is not currently | ||
| customizable. Users requiring a customizable alternative can use the existing `http.client.request.duration` | ||
| instrument, which measures the same value. | ||
|
|
||
| By [@rregitsky](https://github.com/rregitsky) in https://github.com/apollographql/router/pull/8013 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,176 @@ | ||
| use std::collections::HashMap; | ||
| use std::sync::Arc; | ||
|
|
||
| use opentelemetry::metrics::MeterProvider; | ||
| use tokio::time::Instant; | ||
| use tower::BoxError; | ||
|
|
||
| use crate::Context; | ||
| use crate::metrics; | ||
| use crate::plugins::telemetry::APOLLO_CLIENT_NAME_ATTRIBUTE; | ||
| use crate::plugins::telemetry::APOLLO_CLIENT_VERSION_ATTRIBUTE; | ||
| use crate::plugins::telemetry::APOLLO_HAS_ERRORS_ATTRIBUTE; | ||
| use crate::plugins::telemetry::APOLLO_OPERATION_ID_ATTRIBUTE; | ||
| use crate::plugins::telemetry::CLIENT_NAME; | ||
| use crate::plugins::telemetry::CLIENT_VERSION; | ||
| use crate::plugins::telemetry::GRAPHQL_OPERATION_NAME_ATTRIBUTE; | ||
| use crate::plugins::telemetry::GRAPHQL_OPERATION_TYPE_ATTRIBUTE; | ||
| use crate::plugins::telemetry::apollo::Config; | ||
| use crate::plugins::telemetry::config_new::attributes::StandardAttribute; | ||
| use crate::plugins::telemetry::config_new::extendable::Extendable; | ||
| use crate::plugins::telemetry::config_new::instruments::APOLLO_ROUTER_OPERATIONS_FETCH_DURATION; | ||
| use crate::plugins::telemetry::config_new::instruments::CustomHistogram; | ||
| use crate::plugins::telemetry::config_new::instruments::Increment; | ||
| use crate::plugins::telemetry::config_new::instruments::Instrumented; | ||
| use crate::plugins::telemetry::config_new::instruments::METER_NAME; | ||
| use crate::plugins::telemetry::config_new::instruments::StaticInstrument; | ||
| use crate::plugins::telemetry::config_new::selectors::OperationKind; | ||
| use crate::plugins::telemetry::config_new::selectors::OperationName; | ||
| use crate::plugins::telemetry::config_new::subgraph::attributes::SubgraphAttributes; | ||
| use crate::plugins::telemetry::config_new::subgraph::selectors::SubgraphSelector; | ||
| use crate::query_planner::APOLLO_OPERATION_ID; | ||
| use crate::services::subgraph; | ||
|
|
||
| /// Apollo-specific instruments recorded around subgraph requests. | ||
| /// | ||
| /// Currently wraps a single optional histogram for subgraph fetch duration. | ||
| /// The field is `None` when `experimental_subgraph_metrics` is disabled in the | ||
| /// Apollo telemetry configuration, in which case every hook is a no-op. | ||
| pub(crate) struct ApolloSubgraphInstruments { | ||
| /// Fetch duration histogram, keyed in the static instrument table by | ||
| /// `APOLLO_ROUTER_OPERATIONS_FETCH_DURATION`. | ||
| pub(crate) apollo_router_operations_fetch_duration: Option< | ||
| CustomHistogram< | ||
| subgraph::Request, | ||
| subgraph::Response, | ||
| (), | ||
| SubgraphAttributes, | ||
| SubgraphSelector, | ||
| >, | ||
| >, | ||
| } | ||
|
|
||
| impl ApolloSubgraphInstruments { | ||
| /// Creates the Apollo subgraph instruments from the shared static instruments | ||
| /// and the Apollo telemetry configuration. | ||
| /// | ||
| /// The fetch duration histogram is only built when | ||
| /// `apollo_config.experimental_subgraph_metrics` is `true`; otherwise the field | ||
| /// is left as `None`. | ||
| /// | ||
| /// # Panics | ||
| /// | ||
| /// Panics when the flag is enabled and `static_instruments` has no entry for | ||
| /// `APOLLO_ROUTER_OPERATIONS_FETCH_DURATION`, or that entry is not a histogram. | ||
| pub(crate) fn new( | ||
|     static_instruments: Arc<HashMap<String, StaticInstrument>>, | ||
|     apollo_config: Config, | ||
| ) -> Self { | ||
|     // Standard attribute: always record the subgraph name. | ||
|     let attributes = SubgraphAttributes::builder() | ||
|         .subgraph_name(StandardAttribute::Bool(true)) | ||
|         .build(); | ||
|
|
||
|     // Custom attributes, each paired with the selector that produces its value. | ||
|     let client_name = SubgraphSelector::ResponseContext { | ||
|         response_context: CLIENT_NAME.to_string(), | ||
|         redact: None, | ||
|         default: None, | ||
|     }; | ||
|     let client_version = SubgraphSelector::ResponseContext { | ||
|         response_context: CLIENT_VERSION.to_string(), | ||
|         redact: None, | ||
|         default: None, | ||
|     }; | ||
|     let operation_name = SubgraphSelector::SupergraphOperationName { | ||
|         supergraph_operation_name: OperationName::String, | ||
|         redact: None, | ||
|         default: None, | ||
|     }; | ||
|     let operation_type = SubgraphSelector::SupergraphOperationKind { | ||
|         supergraph_operation_kind: OperationKind::String, | ||
|     }; | ||
|     let operation_id = SubgraphSelector::ResponseContext { | ||
|         response_context: APOLLO_OPERATION_ID.to_string(), | ||
|         redact: None, | ||
|         default: None, | ||
|     }; | ||
|     let has_errors = SubgraphSelector::OnGraphQLError { | ||
|         subgraph_on_graphql_error: true, | ||
|     }; | ||
|
|
||
|     let custom = HashMap::from([ | ||
|         (APOLLO_CLIENT_NAME_ATTRIBUTE.to_string(), client_name), | ||
|         (APOLLO_CLIENT_VERSION_ATTRIBUTE.to_string(), client_version), | ||
|         (GRAPHQL_OPERATION_NAME_ATTRIBUTE.to_string(), operation_name), | ||
|         (GRAPHQL_OPERATION_TYPE_ATTRIBUTE.to_string(), operation_type), | ||
|         (APOLLO_OPERATION_ID_ATTRIBUTE.to_string(), operation_id), | ||
|         (APOLLO_HAS_ERRORS_ATTRIBUTE.to_string(), has_errors), | ||
|     ]); | ||
|
|
||
|     // One extra slot for `subgraph_name`, which lives on `attributes`, not `custom`. | ||
|     let attribute_count = custom.len() + 1; | ||
|     let fetch_selectors = Extendable { attributes, custom }; | ||
|
|
||
|     let apollo_router_operations_fetch_duration = | ||
|         if apollo_config.experimental_subgraph_metrics { | ||
|             // Capture the start time before doing the instrument lookup. | ||
|             let started_at = Instant::now(); | ||
|             let histogram = static_instruments | ||
|                 .get(APOLLO_ROUTER_OPERATIONS_FETCH_DURATION) | ||
|                 .expect("cannot get apollo static instrument for subgraph; this should not happen") | ||
|                 .as_histogram() | ||
|                 .cloned() | ||
|                 .expect( | ||
|                     "cannot convert apollo instrument to histogram for subgraph; this should not happen", | ||
|                 ); | ||
|
|
||
|             Some( | ||
|                 CustomHistogram::builder() | ||
|                     .increment(Increment::Duration(started_at)) | ||
|                     .attributes(Vec::with_capacity(attribute_count)) | ||
|                     .selectors(Arc::new(fetch_selectors)) | ||
|                     .histogram(histogram) | ||
|                     .build(), | ||
|             ) | ||
|         } else { | ||
|             None | ||
|         }; | ||
|
|
||
|     Self { | ||
|         apollo_router_operations_fetch_duration, | ||
|     } | ||
| } | ||
|
|
||
| pub(crate) fn new_builtin() -> HashMap<String, StaticInstrument> { | ||
| let meter = metrics::meter_provider().meter(METER_NAME); | ||
| let mut static_instruments = HashMap::with_capacity(1); | ||
|
|
||
| static_instruments.insert( | ||
| APOLLO_ROUTER_OPERATIONS_FETCH_DURATION.to_string(), | ||
| StaticInstrument::Histogram( | ||
| meter | ||
| .f64_histogram(APOLLO_ROUTER_OPERATIONS_FETCH_DURATION) | ||
| .with_unit("s") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess seconds is the standard unit to use? Milliseconds or nanoseconds make more sense to me, but again I'm new to this code.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We actually don't have control over this for `Some(instant.elapsed().as_secs_f64())`. In fact, all histogram refs I could find in the router were using
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suppose if it was possible for this to be milliseconds maybe we would be able to use a 32 bit value and save some space over the wire. I'd need to also handle this on the ingestion side since we don't respect the
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As far as I understand it is indeed the standard unit to use for all time measurements, but anyone displaying it would convert the unit to a most-appropriate scale at that point
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @timbotnik as I mentioned,
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMO there's not a great reason to change this now (an argument about precision would be the only reason, but the numbers we're dealing with are precise enough). Let's make a tech debt ticket which would include updating the ingestion side to fail gracefully on different units (or support them!)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| .with_description("Duration of a subgraph fetch.") | ||
| .init(), | ||
| ), | ||
| ); | ||
|
|
||
| static_instruments | ||
| } | ||
| } | ||
|
|
||
| /// Forwards each lifecycle hook to the fetch duration histogram when it is | ||
| /// present; every hook does nothing when the histogram is `None`. | ||
| impl Instrumented for ApolloSubgraphInstruments { | ||
|     type Request = subgraph::Request; | ||
|     type Response = subgraph::Response; | ||
|     type EventResponse = (); | ||
|
|
||
|     /// Passes the outgoing subgraph request to the histogram. | ||
|     fn on_request(&self, request: &Self::Request) { | ||
|         if let Some(histogram) = self.apollo_router_operations_fetch_duration.as_ref() { | ||
|             histogram.on_request(request); | ||
|         } | ||
|     } | ||
|
|
||
|     /// Passes the subgraph response to the histogram. | ||
|     fn on_response(&self, response: &Self::Response) { | ||
|         if let Some(histogram) = self.apollo_router_operations_fetch_duration.as_ref() { | ||
|             histogram.on_response(response); | ||
|         } | ||
|     } | ||
|
|
||
|     /// Passes a failed subgraph call, with its context, to the histogram. | ||
|     fn on_error(&self, error: &BoxError, ctx: &Context) { | ||
|         if let Some(histogram) = self.apollo_router_operations_fetch_duration.as_ref() { | ||
|             histogram.on_error(error, ctx); | ||
|         } | ||
|     } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| pub(crate) mod instruments; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| --- | ||
| source: apollo-router/src/plugins/telemetry/config_new/instruments.rs | ||
| description: Apollo subgraph fetch duration histogram | ||
| expression: "&metrics.all()" | ||
| info: | ||
| telemetry: | ||
| apollo: | ||
| experimental_subgraph_metrics: true | ||
| instrumentation: | ||
| instruments: | ||
| subgraph: | ||
| http.client.request.duration: false | ||
| snapshot_kind: text | ||
| --- | ||
| - name: apollo.router.operations.fetch.duration | ||
| description: Duration of a subgraph fetch. | ||
| unit: s | ||
| data: | ||
| datapoints: | ||
| - sum: 0.1 | ||
| count: 1 | ||
| attributes: | ||
| apollo.client.name: myClient | ||
| apollo.client.version: v0.1.0 | ||
| apollo.operation.id: myOperationID | ||
| graphql.operation.name: Test | ||
| graphql.operation.type: query | ||
| has_errors: false | ||
| subgraph.name: products |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| telemetry: | ||
| apollo: | ||
| experimental_subgraph_metrics: true | ||
| instrumentation: | ||
| instruments: | ||
| subgraph: | ||
| http.client.request.duration: false |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| description: Apollo subgraph fetch duration histogram | ||
| events: | ||
| - - context: | ||
| map: | ||
| "apollo::supergraph::operation_name": "Test" | ||
| "apollo::supergraph::operation_id": "myOperationID" | ||
| "apollo::supergraph::operation_kind": "query" | ||
| "apollo::telemetry::client_name": "myClient" | ||
| "apollo::telemetry::client_version": "v0.1.0" | ||
| - subgraph_request: | ||
| query: "query { hello }" | ||
| operation_name: "Products" | ||
| operation_kind: query | ||
| subgraph_name: "products" | ||
| - subgraph_response: | ||
| status: 200 | ||
| data: | ||
| hello: "world" |
Uh oh!
There was an error while loading. Please reload this page.