Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .changesets/feat_subgraph_fetch_histogram.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
### [Subgraph Insights] Experimental Apollo Subgraph Fetch Histogram ([PR #7960](https://github.com/apollographql/router/pull/7960))

<!-- start metadata -->

<!-- [PULSR-1673] -->
---
This change adds a new experimental histogram, `apollo.router.operations.fetch.duration`,
which captures subgraph fetch duration with the following attributes:
- client.name
- client.version
- has.errors
- operation.name
- operation.id
- subgraph.name

This can be controlled using a new boolean config flag:
```yaml
telemetry:
  instrumentation:
    instruments:
      apollo:
        subgraph:
          experimental_subgraph_fetch_duration: true
```
The metric is currently sent only to GraphOS and is not available in third-party OTel export targets. It is not currently
user-customizable.

The metric `http.`

By [@rregitsky](https://github.com/rregitsky) in https://github.com/apollographql/router/pull/7960
3 changes: 2 additions & 1 deletion apollo-router/src/metrics/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ impl FilterMeterProvider {
}

fn get_private_realtime_regex() -> Regex {
Regex::new(r"apollo\.router\.operations\.error").expect("regex should have been valid")
Regex::new(r"apollo\.router\.operations\.(?:error|fetch\.duration)")
.expect("regex should have been valid")
}

pub(crate) fn private_realtime<T: Into<MeterProvider>>(delegate: T) -> Self {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
source: apollo-router/src/plugins/telemetry/config_new/instruments.rs
description: Custom histogram duration
expression: "&metrics.all()"
info:
telemetry:
instrumentation:
instruments:
subgraph:
apollo:
experimental_subgraph_fetch_duration: true
http.client.request.duration: false
snapshot_kind: text
---
- name: apollo.router.operations.fetch.duration
description: Duration of a subgraph fetch.
unit: s
data:
datapoints:
- sum: 0.1
count: 1
attributes:
client.name: myClient
client.version: v0.1.0
has.errors: false
operation.name: Test
subgraph.name: products
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
source: apollo-router/src/plugins/telemetry/config_new/instruments.rs
assertion_line: 3310
description: Custom histogram duration
expression: "&metrics.all()"
info:
telemetry:
instrumentation:
instruments:
apollo:
subgraph:
experimental_subgraph_fetch_duration: true
subgraph:
http.client.request.duration: false
---
- name: apollo.router.operations.fetch.duration
description: Duration of a subgraph fetch.
unit: s
data:
datapoints:
- sum: 0.1
count: 1
attributes:
client.name: myClient
client.version: v0.1.0
has.errors: false
operation.id: myOperationID
operation.name: Test
subgraph.name: products
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
telemetry:
instrumentation:
instruments:
apollo:
subgraph:
experimental_subgraph_fetch_duration: true
subgraph:
http.client.request.duration: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
description: Custom histogram duration
events:
- - context:
map:
"apollo::supergraph::operation_name": "Test"
"apollo::supergraph::operation_id": "myOperationID"
"apollo::telemetry::client_name": "myClient"
"apollo::telemetry::client_version": "v0.1.0"
- subgraph_request:
query: "query { hello }"
operation_name: "Products"
operation_kind: query
subgraph_name: "products"
- subgraph_response:
status: 200
data:
hello: "world"
86 changes: 85 additions & 1 deletion apollo-router/src/plugins/telemetry/config_new/instruments.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ use super::http_server::attributes::HttpServerAttributes;
use super::router::instruments::RouterInstruments;
use super::router::instruments::RouterInstrumentsConfig;
use super::selectors::CacheKind;
use super::subgraph::instruments::SubgraphInstruments;
use super::subgraph::instruments::{ApolloSubgraphInstrumentsConfig, SubgraphInstruments};
use super::subgraph::instruments::SubgraphInstrumentsConfig;
use super::supergraph::instruments::SupergraphCustomInstruments;
use super::supergraph::instruments::SupergraphInstrumentsConfig;
Expand All @@ -43,6 +43,9 @@ use crate::axum_factory::connection_handle::ConnectionState;
use crate::axum_factory::connection_handle::OPEN_CONNECTIONS_METRIC;
use crate::metrics;
use crate::metrics::meter_provider;
use crate::plugins::telemetry::CLIENT_NAME;
use crate::plugins::telemetry::CLIENT_VERSION;
use crate::query_planner::APOLLO_OPERATION_ID;
use crate::plugins::telemetry::config_new::Selectors;
use crate::plugins::telemetry::config_new::attributes::DefaultAttributeRequirementLevel;
use crate::plugins::telemetry::config_new::conditions::Condition;
Expand Down Expand Up @@ -110,6 +113,15 @@ pub(crate) struct InstrumentsConfig {
CacheInstrumentsConfig,
Instrument<CacheAttributes, SubgraphSelector, SubgraphValue>,
>,

/// Apollo instruments
pub (crate) apollo: ApolloInstrumentsConfig
}

// Configuration for Apollo-specific instruments.
//
// Unknown keys are rejected (`deny_unknown_fields`) and any omitted field is filled in
// from the struct's `Default` implementation (`default`).
//
// NOTE(review): written as plain `//` comments on the assumption that `///` docs on a
// `JsonSchema` type would surface as descriptions in the generated configuration
// schema — confirm before promoting them to doc comments.
#[derive(Clone, Deserialize, JsonSchema, Debug, Default)]
#[serde(deny_unknown_fields, default)]
pub(crate) struct ApolloInstrumentsConfig {
    // Apollo instruments recorded for subgraph requests.
    pub(crate) subgraph: ApolloSubgraphInstrumentsConfig,
}

const HTTP_SERVER_REQUEST_DURATION_METRIC: &str = "http.server.request.duration";
Expand All @@ -120,6 +132,8 @@ const HTTP_SERVER_ACTIVE_REQUESTS: &str = "http.server.active_requests";
pub(super) const HTTP_CLIENT_REQUEST_DURATION_METRIC: &str = "http.client.request.duration";
pub(super) const HTTP_CLIENT_REQUEST_BODY_SIZE_METRIC: &str = "http.client.request.body.size";
pub(super) const HTTP_CLIENT_RESPONSE_BODY_SIZE_METRIC: &str = "http.client.response.body.size";
/// Name of the Apollo histogram that records the duration of a subgraph fetch
/// (registered with unit `s`). Also used as the key under which the instrument is
/// stored in the subgraph static-instruments map.
pub(super) const APOLLO_ROUTER_OPERATIONS_FETCH_DURATION: &str =
"apollo.router.operations.fetch.duration";

impl InstrumentsConfig {
pub(crate) fn validate(&self) -> Result<(), String> {
Expand Down Expand Up @@ -497,6 +511,19 @@ impl InstrumentsConfig {
let meter = metrics::meter_provider().meter(METER_NAME);
let mut static_instruments = HashMap::with_capacity(self.subgraph.custom.len());

// Built-in Apollo instruments. Not currently user configurable.
static_instruments.insert(
APOLLO_ROUTER_OPERATIONS_FETCH_DURATION.to_string(),
StaticInstrument::Histogram(
meter
.f64_histogram(APOLLO_ROUTER_OPERATIONS_FETCH_DURATION)
.with_unit("s")
.with_description("Duration of a subgraph fetch.")
.init(),
),
);

// Built-in user customizable instruments
if self
.subgraph
.attributes
Expand Down Expand Up @@ -551,6 +578,7 @@ impl InstrumentsConfig {
);
}

// Custom user instruments
for (instrument_name, instrument) in &self.subgraph.custom {
match instrument.ty {
InstrumentType::Counter => {
Expand Down Expand Up @@ -709,10 +737,66 @@ impl InstrumentsConfig {
}),
}
});

// Apollo instruments. Not currently user configurable.
//
// `bool::then` yields `Some(histogram)` only when
// `telemetry.instrumentation.instruments.apollo.subgraph.experimental_subgraph_fetch_duration`
// is enabled; otherwise the instrument is `None` and nothing below runs.
let apollo_router_operation_fetch_duration = self
    .apollo
    .subgraph
    .experimental_subgraph_fetch_duration
    .then(|| {
        // Capture the start instant before the remaining setup work.
        let started_at = Instant::now();

        // The static histogram is registered under this name when the static subgraph
        // instruments are built, so a missing entry is a programming error.
        let histogram = static_instruments
            .get(APOLLO_ROUTER_OPERATIONS_FETCH_DURATION)
            .expect("cannot get static instrument for subgraph; this should not happen")
            .as_histogram()
            .cloned()
            .expect(
                "cannot convert instrument to histogram for subgraph; this should not happen",
            );

        // Hardcode yaml config as this is currently the only way to build attributes
        // and selectors. Each selector must stay nested (indented) under its attribute
        // name, otherwise it would be parsed as a sibling top-level key.
        // NOTE(review): this parses YAML every time the instrument is built; if that
        // happens per subgraph request, consider caching the parsed selectors.
        let selectors =
            serde_yaml::from_str::<Extendable<SubgraphAttributes, SubgraphSelector>>(
                &format!(
                    r#"
                    subgraph.name: true
                    operation.name:
                      supergraph_operation_name: string
                    operation.id:
                      request_context: {operation_id_key}
                    client.name:
                      request_context: {client_name_key}
                    client.version:
                      request_context: {client_version_key}
                    has.errors:
                      subgraph_on_graphql_error: true
                    "#,
                    operation_id_key = APOLLO_OPERATION_ID,
                    client_name_key = CLIENT_NAME,
                    client_version_key = CLIENT_VERSION
                ),
            )
            .expect(
                "hardcoded selectors for the apollo fetch duration histogram should be valid; this should not happen",
            );

        CustomHistogram {
            inner: Mutex::new(CustomHistogramInner {
                increment: Increment::Duration(started_at),
                condition: Condition::True,
                histogram: Some(histogram),
                // One slot per attribute declared in the YAML above (six in total).
                attributes: Vec::with_capacity(6),
                selector: None,
                selectors: Some(Arc::new(selectors)),
                updated: false,
                _phantom: PhantomData,
            }),
        }
    });

SubgraphInstruments {
http_client_request_duration,
http_client_request_body_size,
http_client_response_body_size,
apollo_router_operation_fetch_duration,
custom: CustomInstruments::new(&self.subgraph.custom, static_instruments),
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ pub(crate) struct SubgraphInstrumentsConfig {
DefaultedStandardInstrument<Extendable<SubgraphAttributes, SubgraphSelector>>,
}

// Switches for Apollo-specific subgraph instruments.
//
// Unknown keys are rejected (`deny_unknown_fields`); an omitted flag deserializes to
// `false` via `Default`.
#[derive(Clone, Deserialize, JsonSchema, Debug, Default)]
#[serde(deny_unknown_fields, default)]
pub(crate) struct ApolloSubgraphInstrumentsConfig {
    /// Send OTLP subgraph fetch duration histogram to Apollo Studio with select dimensions [`client.name`, `client.version`, `has.errors`, `operation.id`, `operation.name`, `subgraph.name`].
    #[serde(default)]
    pub(crate) experimental_subgraph_fetch_duration: bool,
}

impl DefaultForLevel for SubgraphInstrumentsConfig {
fn defaults_for_level(
&mut self,
Expand Down Expand Up @@ -107,6 +115,15 @@ pub(crate) struct SubgraphInstruments {
SubgraphSelector,
>,
>,
pub(crate) apollo_router_operation_fetch_duration: Option<
CustomHistogram<
subgraph::Request,
subgraph::Response,
(),
SubgraphAttributes,
SubgraphSelector,
>,
>,
pub(crate) custom: SubgraphCustomInstruments,
}

Expand All @@ -125,6 +142,11 @@ impl Instrumented for SubgraphInstruments {
if let Some(http_client_response_body_size) = &self.http_client_response_body_size {
http_client_response_body_size.on_request(request);
}
if let Some(apollo_router_operation_fetch_duration) =
&self.apollo_router_operation_fetch_duration
{
apollo_router_operation_fetch_duration.on_request(request);
}
self.custom.on_request(request);
}

Expand All @@ -138,6 +160,11 @@ impl Instrumented for SubgraphInstruments {
if let Some(http_client_response_body_size) = &self.http_client_response_body_size {
http_client_response_body_size.on_response(response);
}
if let Some(apollo_router_operation_fetch_duration) =
&self.apollo_router_operation_fetch_duration
{
apollo_router_operation_fetch_duration.on_response(response);
}
self.custom.on_response(response);
}

Expand All @@ -151,6 +178,11 @@ impl Instrumented for SubgraphInstruments {
if let Some(http_client_response_body_size) = &self.http_client_response_body_size {
http_client_response_body_size.on_error(error, ctx);
}
if let Some(apollo_router_operation_fetch_duration) =
&self.apollo_router_operation_fetch_duration
{
apollo_router_operation_fetch_duration.on_error(error, ctx);
}
self.custom.on_error(error, ctx);
}
}
Expand Down
6 changes: 5 additions & 1 deletion apollo-router/src/plugins/telemetry/metrics/apollo/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use opentelemetry_otlp::WithExportConfig;
use opentelemetry_sdk::Resource;
use opentelemetry_sdk::metrics::PeriodicReader;
use opentelemetry_sdk::runtime;
use prometheus::exponential_buckets;
use sys_info::hostname;
use tonic::metadata::MetadataMap;
use tonic::transport::ClientTlsConfig;
Expand Down Expand Up @@ -187,7 +188,10 @@ impl Config {
)),
Box::new(
CustomAggregationSelector::builder()
.boundaries(default_buckets())
.boundaries(
// Returns [~1.4ms ... ~5min]
exponential_buckets(0.001399084909, 1.1, 129).unwrap(),
)
.build(),
),
)?;
Expand Down
2 changes: 2 additions & 0 deletions apollo-router/src/plugins/telemetry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ struct BuiltinInstruments {
graphql_custom_instruments: Arc<HashMap<String, StaticInstrument>>,
router_custom_instruments: Arc<HashMap<String, StaticInstrument>>,
supergraph_custom_instruments: Arc<HashMap<String, StaticInstrument>>,
// TODO should I separate out a non-custom instruments?
subgraph_custom_instruments: Arc<HashMap<String, StaticInstrument>>,
connector_custom_instruments: Arc<HashMap<String, StaticInstrument>>,
cache_custom_instruments: Arc<HashMap<String, StaticInstrument>>,
Expand Down Expand Up @@ -878,6 +879,7 @@ impl PluginPrivate for Telemetry {
.subgraph
.attributes
.on_request(sub_request);
// TODO apollo instruments instead of throwing it in with the custom ones?
let custom_instruments = config
.instrumentation
.instruments
Expand Down