Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
5465548
create histogram
rregitsky Jul 30, 2025
69f0746
enable metric export and exp buckets
rregitsky Jul 30, 2025
2a1de23
Wrap with config
rregitsky Jul 30, 2025
237734e
add op kind
rregitsky Jul 30, 2025
ab71c4d
Make apollo instruments
rregitsky Jul 30, 2025
400fe10
Fix operation name
rregitsky Jul 30, 2025
9183ce7
changeset
rregitsky Jul 30, 2025
1b10139
Add instrument test
rregitsky Jul 30, 2025
542c416
lint
rregitsky Jul 30, 2025
023a3b8
move instrument construction to apollo/instruments
rregitsky Jul 30, 2025
00001ba
lint
rregitsky Jul 30, 2025
98608c5
WIP integration test
rregitsky Jul 30, 2025
a3cd208
PR fixes. Fix naming and op kind source
rregitsky Jul 31, 2025
a099e94
Fix integration test
rregitsky Jul 31, 2025
8c55ff7
lint
rregitsky Jul 31, 2025
6b1f4c5
fix changeset
rregitsky Jul 31, 2025
6b69a7e
remove unused imports
rregitsky Jul 31, 2025
7a506e7
Merge remote-tracking branch 'origin/dev' into rreg/PULSR-1673/top-le…
rregitsky Jul 31, 2025
a006a47
lint + snapshot updates
rregitsky Jul 31, 2025
d949a6f
PR fixes
rregitsky Aug 1, 2025
676bad7
lint
rregitsky Aug 1, 2025
b1df41d
Merge remote-tracking branch 'origin/dev' into rreg/PULSR-1673/top-le…
rregitsky Aug 1, 2025
60c3355
fix attr names
rregitsky Aug 4, 2025
ca705d4
create histogram
rregitsky Jul 30, 2025
fd42caf
enable metric export and exp buckets
rregitsky Jul 30, 2025
267ab2b
Wrap with config
rregitsky Jul 30, 2025
b6e0d50
add op kind
rregitsky Jul 30, 2025
e219496
Make apollo instruments
rregitsky Jul 30, 2025
bfdb7aa
Fix operation name
rregitsky Jul 30, 2025
89dcb73
changeset
rregitsky Jul 30, 2025
8fb12eb
Add instrument test
rregitsky Jul 30, 2025
59ccd04
lint
rregitsky Jul 30, 2025
55f16f0
move instrument construction to apollo/instruments
rregitsky Jul 30, 2025
ebc709e
lint
rregitsky Jul 30, 2025
3e3e15f
WIP integration test
rregitsky Jul 30, 2025
023d6f1
PR fixes. Fix naming and op kind source
rregitsky Jul 31, 2025
aacbfbf
Fix integration test
rregitsky Jul 31, 2025
e59af96
lint
rregitsky Jul 31, 2025
df897ad
fix changeset
rregitsky Jul 31, 2025
c018dda
remove unused imports
rregitsky Jul 31, 2025
6ed7aff
lint + snapshot updates
rregitsky Jul 31, 2025
c7e49b2
PR fixes
rregitsky Aug 1, 2025
e2f00d9
lint
rregitsky Aug 1, 2025
13f5be7
fix attr names
rregitsky Aug 4, 2025
6950160
Merge remote-tracking branch 'origin/dev' into rreg/PULSR-1673/top-le…
rregitsky Aug 5, 2025
4253731
Fix client attr names. Make attr consts
rregitsky Aug 5, 2025
7dc2787
Merge remote-tracking branch 'origin/rreg/PULSR-1673/top-level-subgra…
rregitsky Aug 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .changesets/feat_rreg_experimental_subgraph_metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
### [Subgraph Insights] Experimental Apollo Subgraph Fetch Histogram ([PR #8013](https://github.com/apollographql/router/pull/8013))

This change adds a new, experimental histogram to capture subgraph fetch duration for GraphOS. This will
eventually be used to power subgraph-level insights in Apollo Studio.

This can be toggled on using a new boolean config flag:

```yaml
telemetry:
  apollo:
    experimental_subgraph_metrics: true
```

The new instrument is only sent to GraphOS and is not available in 3rd-party OTel export targets. It is not currently
customizable. Users requiring a customizable alternative can use the existing `http.client.request.duration`
instrument, which measures the same value.

By [@rregitsky](https://github.com/rregitsky) in https://github.com/apollographql/router/pull/8013
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
---
source: apollo-router/src/configuration/tests.rs
expression: "&schema"
snapshot_kind: text
---
{
"$schema": "http://json-schema.org/draft-07/schema#",
Expand Down Expand Up @@ -1496,6 +1497,11 @@ expression: "&schema"
"$ref": "#/definitions/Protocol",
"description": "#/definitions/Protocol"
},
"experimental_subgraph_metrics": {
"default": false,
"description": "Enable sending additional subgraph metrics to Apollo Studio via OTLP",
"type": "boolean"
},
"field_level_instrumentation_sampler": {
"$ref": "#/definitions/SamplerOption",
"description": "#/definitions/SamplerOption"
Expand Down
3 changes: 2 additions & 1 deletion apollo-router/src/metrics/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ impl FilterMeterProvider {
}

/// Builds the regex used to recognise private realtime metric names.
///
/// The pattern is unanchored, so it matches any name that contains
/// `apollo.router.operations.error` or `apollo.router.operations.fetch.duration`.
///
/// # Panics
///
/// Panics only if the hard-coded pattern fails to compile, which would be a programming error.
// NOTE(review): presumably consumed by `private_realtime` below to decide which instruments
// are forwarded — confirm against the rest of `FilterMeterProvider`.
fn get_private_realtime_regex() -> Regex {
    // Non-capturing group: only a yes/no match is needed, so no capture is recorded.
    Regex::new(r"apollo\.router\.operations\.(?:error|fetch\.duration)")
        .expect("regex should have been valid")
}

pub(crate) fn private_realtime<T: Into<MeterProvider>>(delegate: T) -> Self {
Expand Down
4 changes: 4 additions & 0 deletions apollo-router/src/plugins/telemetry/apollo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ pub(crate) struct Config {

/// Enable field metrics that are generated without FTV1 to be sent to Apollo Studio.
pub(crate) experimental_local_field_metrics: bool,

/// Enable sending additional subgraph metrics to Apollo Studio via OTLP
pub(crate) experimental_subgraph_metrics: bool,
}

#[derive(Debug, Clone, Deserialize, JsonSchema, Default)]
Expand Down Expand Up @@ -253,6 +256,7 @@ impl Default for Config {
signature_normalization_algorithm: ApolloSignatureNormalizationAlgorithm::default(),
experimental_local_field_metrics: false,
metrics_reference_mode: ApolloMetricsReferenceMode::default(),
experimental_subgraph_metrics: false,
}
}
}
Expand Down
176 changes: 176 additions & 0 deletions apollo-router/src/plugins/telemetry/config_new/apollo/instruments.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
use std::collections::HashMap;
use std::sync::Arc;

use opentelemetry::metrics::MeterProvider;
use tokio::time::Instant;
use tower::BoxError;

use crate::Context;
use crate::metrics;
use crate::plugins::telemetry::APOLLO_CLIENT_NAME_ATTRIBUTE;
use crate::plugins::telemetry::APOLLO_CLIENT_VERSION_ATTRIBUTE;
use crate::plugins::telemetry::APOLLO_HAS_ERRORS_ATTRIBUTE;
use crate::plugins::telemetry::APOLLO_OPERATION_ID_ATTRIBUTE;
use crate::plugins::telemetry::CLIENT_NAME;
use crate::plugins::telemetry::CLIENT_VERSION;
use crate::plugins::telemetry::GRAPHQL_OPERATION_NAME_ATTRIBUTE;
use crate::plugins::telemetry::GRAPHQL_OPERATION_TYPE_ATTRIBUTE;
use crate::plugins::telemetry::apollo::Config;
use crate::plugins::telemetry::config_new::attributes::StandardAttribute;
use crate::plugins::telemetry::config_new::extendable::Extendable;
use crate::plugins::telemetry::config_new::instruments::APOLLO_ROUTER_OPERATIONS_FETCH_DURATION;
use crate::plugins::telemetry::config_new::instruments::CustomHistogram;
use crate::plugins::telemetry::config_new::instruments::Increment;
use crate::plugins::telemetry::config_new::instruments::Instrumented;
use crate::plugins::telemetry::config_new::instruments::METER_NAME;
use crate::plugins::telemetry::config_new::instruments::StaticInstrument;
use crate::plugins::telemetry::config_new::selectors::OperationKind;
use crate::plugins::telemetry::config_new::selectors::OperationName;
use crate::plugins::telemetry::config_new::subgraph::attributes::SubgraphAttributes;
use crate::plugins::telemetry::config_new::subgraph::selectors::SubgraphSelector;
use crate::query_planner::APOLLO_OPERATION_ID;
use crate::services::subgraph;

/// Apollo-specific instruments attached to subgraph requests and responses.
///
/// Built by `ApolloSubgraphInstruments::new`; the `Instrumented` implementation forwards
/// request, response and error hooks to whichever instruments are present.
pub(crate) struct ApolloSubgraphInstruments {
/// Histogram backed by the static instrument registered under
/// `APOLLO_ROUTER_OPERATIONS_FETCH_DURATION`.
///
/// `None` when `experimental_subgraph_metrics` is disabled in the Apollo telemetry config.
pub(crate) apollo_router_operations_fetch_duration: Option<
CustomHistogram<
subgraph::Request,
subgraph::Response,
(),
SubgraphAttributes,
SubgraphSelector,
>,
>,
}

impl ApolloSubgraphInstruments {
/// Builds the Apollo subgraph instruments for the given Apollo telemetry configuration.
///
/// The fetch-duration histogram is created only when
/// `apollo_config.experimental_subgraph_metrics` is `true`. When the flag is off the
/// function returns immediately with the histogram set to `None`, without allocating the
/// selector map.
///
/// # Panics
///
/// When the flag is on, panics if `static_instruments` has no entry for
/// `APOLLO_ROUTER_OPERATIONS_FETCH_DURATION` or if that entry is not a histogram.
pub(crate) fn new(
    static_instruments: Arc<HashMap<String, StaticInstrument>>,
    apollo_config: Config,
) -> Self {
    // Check the flag first: nothing below is used when the metric is disabled, so there is
    // no point allocating the selector keys just to drop them.
    if !apollo_config.experimental_subgraph_metrics {
        return Self {
            apollo_router_operations_fetch_duration: None,
        };
    }

    let selectors = Extendable {
        // Turn on the standard subgraph name attribute.
        attributes: SubgraphAttributes::builder()
            .subgraph_name(StandardAttribute::Bool(true))
            .build(),
        // Fixed set of Apollo attributes; each key maps to the selector that supplies its value.
        custom: HashMap::from([
            (
                APOLLO_CLIENT_NAME_ATTRIBUTE.to_string(),
                SubgraphSelector::ResponseContext {
                    response_context: CLIENT_NAME.to_string(),
                    redact: None,
                    default: None,
                },
            ),
            (
                APOLLO_CLIENT_VERSION_ATTRIBUTE.to_string(),
                SubgraphSelector::ResponseContext {
                    response_context: CLIENT_VERSION.to_string(),
                    redact: None,
                    default: None,
                },
            ),
            (
                GRAPHQL_OPERATION_NAME_ATTRIBUTE.to_string(),
                SubgraphSelector::SupergraphOperationName {
                    supergraph_operation_name: OperationName::String,
                    redact: None,
                    default: None,
                },
            ),
            (
                GRAPHQL_OPERATION_TYPE_ATTRIBUTE.to_string(),
                SubgraphSelector::SupergraphOperationKind {
                    supergraph_operation_kind: OperationKind::String,
                },
            ),
            (
                APOLLO_OPERATION_ID_ATTRIBUTE.to_string(),
                SubgraphSelector::ResponseContext {
                    response_context: APOLLO_OPERATION_ID.to_string(),
                    redact: None,
                    default: None,
                },
            ),
            (
                APOLLO_HAS_ERRORS_ATTRIBUTE.to_string(),
                SubgraphSelector::OnGraphQLError {
                    subgraph_on_graphql_error: true,
                },
            ),
        ]),
    };
    let attribute_count = selectors.custom.len() + 1; // 1 for subgraph_name on attributes

    let apollo_router_operations_fetch_duration = CustomHistogram::builder()
        .increment(Increment::Duration(Instant::now()))
        // Pre-size the attribute buffer so filling it never reallocates.
        .attributes(Vec::with_capacity(attribute_count))
        .selectors(Arc::new(selectors))
        .histogram(
            static_instruments
                .get(APOLLO_ROUTER_OPERATIONS_FETCH_DURATION)
                .expect("cannot get apollo static instrument for subgraph; this should not happen")
                .as_histogram()
                .cloned()
                .expect(
                    "cannot convert apollo instrument to histogram for subgraph; this should not happen",
                ),
        )
        .build();

    Self {
        apollo_router_operations_fetch_duration: Some(apollo_router_operations_fetch_duration),
    }
}

pub(crate) fn new_builtin() -> HashMap<String, StaticInstrument> {
let meter = metrics::meter_provider().meter(METER_NAME);
let mut static_instruments = HashMap::with_capacity(1);

static_instruments.insert(
APOLLO_ROUTER_OPERATIONS_FETCH_DURATION.to_string(),
StaticInstrument::Histogram(
meter
.f64_histogram(APOLLO_ROUTER_OPERATIONS_FETCH_DURATION)
.with_unit("s")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess seconds is the standard unit to use? Milliseconds or nanoseconds make more sense to me, but again I'm new to this code.

Copy link
Copy Markdown
Contributor Author

@rregitsky rregitsky Jul 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We actually don't have control over this for CustomHistogram.

Some(instant.elapsed().as_secs_f64())

In fact, all the histogram refs I could find in the router were using f64 to measure seconds.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose if it was possible for this to be milliseconds maybe we would be able to use a 32 bit value and save some space over the wire. I'd need to also handle this on the ingestion side since we don't respect the unit yet and just assume that everything is in seconds. https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/metrics/v1/metrics.proto#L198 - not sure that this would work since the protobuf specifies a double type for the bounds.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I understand it is indeed the standard unit to use for all time measurements, but anyone displaying it would convert the unit to a most-appropriate scale at that point

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@timbotnik as I mentioned, CustomHistogram controls the unit recorded to the underlying histogram. We could likely add a way to specify a unit in its constructor, but if that's the route we want I'd prefer to add it as a follow-up to this PR.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO there's not a great reason to change this now (an argument about precision would be the only reason, but the numbers we're dealing with are precise enough). Let's make a tech debt ticket which would include updating the ingestion side to fail gracefully on different units (or support them!)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.with_description("Duration of a subgraph fetch.")
.init(),
),
);

static_instruments
}
}

/// Forwards every lifecycle hook to the fetch-duration histogram when one is configured.
/// Each hook is a no-op while the histogram is `None`.
impl Instrumented for ApolloSubgraphInstruments {
    type Request = subgraph::Request;
    type Response = subgraph::Response;
    type EventResponse = ();

    /// Passes the outgoing subgraph request to the histogram, if present.
    fn on_request(&self, request: &Self::Request) {
        let Some(fetch_duration) = self.apollo_router_operations_fetch_duration.as_ref() else {
            return;
        };
        fetch_duration.on_request(request);
    }

    /// Passes the subgraph response to the histogram, if present.
    fn on_response(&self, response: &Self::Response) {
        let Some(fetch_duration) = self.apollo_router_operations_fetch_duration.as_ref() else {
            return;
        };
        fetch_duration.on_response(response);
    }

    /// Passes a subgraph error and its request context to the histogram, if present.
    fn on_error(&self, error: &BoxError, ctx: &Context) {
        let Some(fetch_duration) = self.apollo_router_operations_fetch_duration.as_ref() else {
            return;
        };
        fetch_duration.on_error(error, ctx);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub(crate) mod instruments;
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
source: apollo-router/src/plugins/telemetry/config_new/instruments.rs
description: Apollo subgraph fetch duration histogram
expression: "&metrics.all()"
info:
telemetry:
apollo:
experimental_subgraph_metrics: true
instrumentation:
instruments:
subgraph:
http.client.request.duration: false
snapshot_kind: text
---
- name: apollo.router.operations.fetch.duration
description: Duration of a subgraph fetch.
unit: s
data:
datapoints:
- sum: 0.1
count: 1
attributes:
apollo.client.name: myClient
apollo.client.version: v0.1.0
apollo.operation.id: myOperationID
graphql.operation.name: Test
graphql.operation.type: query
has_errors: false
subgraph.name: products
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
telemetry:
  apollo:
    experimental_subgraph_metrics: true
  instrumentation:
    instruments:
      subgraph:
        http.client.request.duration: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
description: Apollo subgraph fetch duration histogram
events:
  - - context:
        map:
          "apollo::supergraph::operation_name": "Test"
          "apollo::supergraph::operation_id": "myOperationID"
          "apollo::supergraph::operation_kind": "query"
          "apollo::telemetry::client_name": "myClient"
          "apollo::telemetry::client_version": "v0.1.0"
    - subgraph_request:
        query: "query { hello }"
        operation_name: "Products"
        operation_kind: query
        subgraph_name: "products"
    - subgraph_response:
        status: 200
        data:
          hello: "world"
Loading