diff --git a/.changesets/maint_renee_migrate_metrics_values.md b/.changesets/maint_renee_migrate_metrics_values.md new file mode 100644 index 0000000000..2bb0ee2e23 --- /dev/null +++ b/.changesets/maint_renee_migrate_metrics_values.md @@ -0,0 +1,5 @@ +### Migrate gauge metrics to OTel instruments ([PR #6476](https://github.com/apollographql/router/pull/6476)) + +Updates gauge metrics using the legacy `tracing::info!(value.*)` syntax to OTel instruments. + +By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6476 \ No newline at end of file diff --git a/apollo-router/src/axum_factory/axum_http_server_factory.rs b/apollo-router/src/axum_factory/axum_http_server_factory.rs index 95404382f4..bf53143815 100644 --- a/apollo-router/src/axum_factory/axum_http_server_factory.rs +++ b/apollo-router/src/axum_factory/axum_http_server_factory.rs @@ -29,12 +29,15 @@ use hyper::server::conn::Http; use hyper::Body; use itertools::Itertools; use multimap::MultiMap; +use opentelemetry_api::metrics::MeterProvider as _; +use opentelemetry_api::metrics::ObservableGauge; use serde::Serialize; use serde_json::json; #[cfg(unix)] use tokio::net::UnixListener; use tokio::sync::mpsc; use tokio_rustls::TlsAcceptor; +use tower::layer::layer_fn; use tower::service_fn; use tower::BoxError; use tower::ServiceBuilder; @@ -60,6 +63,7 @@ use crate::graphql; use crate::http_server_factory::HttpServerFactory; use crate::http_server_factory::HttpServerHandle; use crate::http_server_factory::Listener; +use crate::metrics::meter_provider; use crate::plugins::telemetry::SpanMode; use crate::router::ApolloRouterError; use crate::router_factory::Endpoint; @@ -73,20 +77,29 @@ use crate::Context; static ACTIVE_SESSION_COUNT: AtomicU64 = AtomicU64::new(0); +fn session_count_instrument() -> ObservableGauge { + let meter = meter_provider().meter("apollo/router"); + meter + .u64_observable_gauge("apollo_router_session_count_active") + .with_description("Amount of in-flight sessions") + .with_callback(|gauge| { + gauge.observe(ACTIVE_SESSION_COUNT.load(Ordering::Relaxed), &[]); + }) + .init() +} + struct SessionCountGuard; impl SessionCountGuard { fn start() -> Self { - let session_count = ACTIVE_SESSION_COUNT.fetch_add(1, Ordering::Acquire) + 1; - tracing::info!(value.apollo_router_session_count_active = session_count,); + ACTIVE_SESSION_COUNT.fetch_add(1, Ordering::Acquire); Self } } impl Drop for SessionCountGuard { fn drop(&mut self) { - let session_count = ACTIVE_SESSION_COUNT.fetch_sub(1, Ordering::Acquire) - 1; - tracing::info!(value.apollo_router_session_count_active = session_count,); + ACTIVE_SESSION_COUNT.fetch_sub(1, Ordering::Acquire); } } @@ -625,6 +638,14 @@ where ); } + // Tie the lifetime of the session count instrument to the lifetime of the router + // by moving it into a no-op layer. + let instrument = session_count_instrument(); + router = router.layer(layer_fn(move |service| { + let _ = &instrument; + service + })); + router } diff --git a/apollo-router/src/configuration/metrics.rs b/apollo-router/src/configuration/metrics.rs index 3033233462..f285c18b93 100644 --- a/apollo-router/src/configuration/metrics.rs +++ b/apollo-router/src/configuration/metrics.rs @@ -120,19 +120,20 @@ impl InstrumentData { } pub(crate) fn populate_config_instruments(&mut self, yaml: &serde_json::Value) { - // This macro will query the config json for a primary metric and optionally metric attributes. - - // The reason we use jsonpath_rust is that jsonpath_lib has correctness issues and looks abandoned. - // We should consider converting the rest of the codebase to use jsonpath_rust. - - // Example usage: - // populate_usage_instrument!( - // value.apollo.router.config.authorization, // The metric name - // "$.authorization", // The path into the config - // opt.require_authentication, // The name of the attribute - // "$[?(@.require_authentication == true)]" // The path for the attribute relative to the metric - // ); - + /// This macro will query the config json for a primary metric and optionally metric attributes. + /// + /// The reason we use jsonpath_rust is that jsonpath_lib has correctness issues and looks abandoned. + /// We should consider converting the rest of the codebase to use jsonpath_rust. + /// + /// Example usage: + /// ```rust,ignore + /// populate_config_instrument!( + /// apollo.router.config.authorization, // The metric name + /// "$.authorization", // The path into the config + /// opt.require_authentication, // The name of the attribute + /// "$[?(@.require_authentication == true)]" // The path for the attribute relative to the metric + /// ); + /// ``` macro_rules! populate_config_instrument { ($($metric:ident).+, $path:literal) => { let instrument_name = stringify!($($metric).+).to_string(); diff --git a/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs b/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs index 123cfbc8a2..1178cde63e 100644 --- a/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs +++ b/apollo-router/src/plugins/telemetry/tracing/apollo_telemetry.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::collections::HashSet; use std::io::Cursor; use std::num::NonZeroUsize; +use std::sync::atomic::AtomicU64; use std::sync::Arc; use std::time::SystemTime; use std::time::SystemTimeError; @@ -28,12 +29,15 @@ use opentelemetry::trace::TraceId; use opentelemetry::Key; use opentelemetry::KeyValue; use opentelemetry::Value; +use opentelemetry_api::metrics::MeterProvider as _; +use opentelemetry_api::metrics::ObservableGauge; use prost::Message; use rand::Rng; use serde::de::DeserializeOwned; use thiserror::Error; use url::Url; +use crate::metrics::meter_provider; use crate::plugins::telemetry; use crate::plugins::telemetry::apollo::ErrorConfiguration; use crate::plugins::telemetry::apollo::ErrorsConfiguration; @@ -245,6 +249,47 @@ impl LightSpanData { } } +/// An externally updateable gauge for "apollo_router_span_lru_size". +/// +/// When observed, it reports the most recently stored value (give or take atomicity looseness). +/// +/// This *could* be generalised to any kind of gauge, but we should ideally have gauges that can just +/// observe their accurate value whenever requested. The externally updateable approach is kind of +/// a hack that happens to work here because we only have one place where the value can change, and +/// otherwise we might have to use an inconvenient Mutex or RwLock around the entire LRU cache. +#[derive(Debug)] +struct SpanLruSizeInstrument { + value: Arc, + _gauge: ObservableGauge, +} + +impl SpanLruSizeInstrument { + fn new() -> Self { + let value = Arc::new(AtomicU64::new(0)); + + let meter = meter_provider().meter("apollo/router"); + let gauge = meter + .u64_observable_gauge("apollo_router_span_lru_size") + .with_callback({ + let value = Arc::clone(&value); + move |gauge| { + gauge.observe(value.load(std::sync::atomic::Ordering::Relaxed), &[]); + } + }) + .init(); + + Self { + value, + _gauge: gauge, + } + } + + fn update(&self, value: u64) { + self.value + .store(value, std::sync::atomic::Ordering::Relaxed); + } +} + /// A [`SpanExporter`] that writes to [`Reporter`]. /// /// [`SpanExporter`]: super::SpanExporter @@ -253,6 +298,7 @@ impl LightSpanData { #[derivative(Debug)] pub(crate) struct Exporter { spans_by_parent_id: LruCache>, + span_lru_size_instrument: SpanLruSizeInstrument, #[derivative(Debug = "ignore")] report_exporter: Option>, #[derivative(Debug = "ignore")] @@ -325,8 +371,12 @@ impl Exporter { Sampler::AlwaysOff => 0f64, }, }; + + let span_lru_size_instrument = SpanLruSizeInstrument::new(); + Ok(Self { spans_by_parent_id: LruCache::new(buffer_size), + span_lru_size_instrument, report_exporter: if otlp_tracing_ratio < 1f64 { Some(Arc::new(ApolloExporter::new( endpoint, @@ -1082,7 +1132,11 @@ impl SpanExporter for Exporter { ); } } - tracing::info!(value.apollo_router_span_lru_size = self.spans_by_parent_id.len() as u64,); + + // Note this won't be correct anymore if there is any way outside of `.export()` + // to affect the size of the cache. + self.span_lru_size_instrument + .update(self.spans_by_parent_id.len() as u64); #[allow(clippy::manual_map)] // https://github.com/rust-lang/rust-clippy/issues/8346 let report_exporter = match self.report_exporter.as_ref() {