Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changesets/maint_renee_migrate_metrics_values.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
### Migrate gauge metrics to OTel instruments ([PR #6476](https://github.com/apollographql/router/pull/6476))

Migrates gauge metrics that were emitted with the legacy `tracing::info!(value.*)` syntax to OTel instruments.

By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6476
29 changes: 25 additions & 4 deletions apollo-router/src/axum_factory/axum_http_server_factory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,15 @@ use hyper::server::conn::Http;
use hyper::Body;
use itertools::Itertools;
use multimap::MultiMap;
use opentelemetry_api::metrics::MeterProvider as _;
use opentelemetry_api::metrics::ObservableGauge;
use serde::Serialize;
use serde_json::json;
#[cfg(unix)]
use tokio::net::UnixListener;
use tokio::sync::mpsc;
use tokio_rustls::TlsAcceptor;
use tower::layer::layer_fn;
use tower::service_fn;
use tower::BoxError;
use tower::ServiceBuilder;
Expand All @@ -60,6 +63,7 @@ use crate::graphql;
use crate::http_server_factory::HttpServerFactory;
use crate::http_server_factory::HttpServerHandle;
use crate::http_server_factory::Listener;
use crate::metrics::meter_provider;
use crate::plugins::telemetry::SpanMode;
use crate::router::ApolloRouterError;
use crate::router_factory::Endpoint;
Expand All @@ -73,20 +77,29 @@ use crate::Context;

static ACTIVE_SESSION_COUNT: AtomicU64 = AtomicU64::new(0);

/// Creates the `apollo_router_session_count_active` observable gauge.
///
/// Each time the gauge is observed, its callback loads the current value of
/// [`ACTIVE_SESSION_COUNT`] and reports it without any attributes. The instrument is returned to
/// the caller.
fn session_count_instrument() -> ObservableGauge<u64> {
    meter_provider()
        .meter("apollo/router")
        .u64_observable_gauge("apollo_router_session_count_active")
        .with_description("Amount of in-flight sessions")
        .with_callback(|observer| {
            let active_sessions = ACTIVE_SESSION_COUNT.load(Ordering::Relaxed);
            observer.observe(active_sessions, &[]);
        })
        .init()
}

struct SessionCountGuard;

impl SessionCountGuard {
fn start() -> Self {
let session_count = ACTIVE_SESSION_COUNT.fetch_add(1, Ordering::Acquire) + 1;
tracing::info!(value.apollo_router_session_count_active = session_count,);
ACTIVE_SESSION_COUNT.fetch_add(1, Ordering::Acquire);
Self
}
}

impl Drop for SessionCountGuard {
fn drop(&mut self) {
let session_count = ACTIVE_SESSION_COUNT.fetch_sub(1, Ordering::Acquire) - 1;
tracing::info!(value.apollo_router_session_count_active = session_count,);
ACTIVE_SESSION_COUNT.fetch_sub(1, Ordering::Acquire);
}
}

Expand Down Expand Up @@ -625,6 +638,14 @@ where
);
}

// Tie the lifetime of the session count instrument to the lifetime of the router
// by moving it into a no-op layer.
let instrument = session_count_instrument();
router = router.layer(layer_fn(move |service| {
let _ = &instrument;
service
}));
Comment on lines +641 to +647
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The session_count_active instrument is "stored" in an axum::Router instance by moving it into a no-op layer. Another option would be to have a single global instrument for the entire lifetime of the program, but I'm not sure that would work everywhere we need it to (e.g., does it do the right thing across reloads, tests, etc.?).


router
}

Expand Down
27 changes: 14 additions & 13 deletions apollo-router/src/configuration/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,19 +120,20 @@ impl InstrumentData {
}

pub(crate) fn populate_config_instruments(&mut self, yaml: &serde_json::Value) {
// This macro will query the config json for a primary metric and optionally metric attributes.

// The reason we use jsonpath_rust is that jsonpath_lib has correctness issues and looks abandoned.
// We should consider converting the rest of the codebase to use jsonpath_rust.

// Example usage:
// populate_usage_instrument!(
// value.apollo.router.config.authorization, // The metric name
// "$.authorization", // The path into the config
// opt.require_authentication, // The name of the attribute
// "$[?(@.require_authentication == true)]" // The path for the attribute relative to the metric
// );

/// This macro will query the config json for a primary metric and optionally metric attributes.
///
/// The reason we use jsonpath_rust is that jsonpath_lib has correctness issues and looks abandoned.
/// We should consider converting the rest of the codebase to use jsonpath_rust.
///
/// Example usage:
/// ```rust,ignore
/// populate_config_instrument!(
/// apollo.router.config.authorization, // The metric name
/// "$.authorization", // The path into the config
/// opt.require_authentication, // The name of the attribute
/// "$[?(@.require_authentication == true)]" // The path for the attribute relative to the metric
/// );
/// ```
macro_rules! populate_config_instrument {
($($metric:ident).+, $path:literal) => {
let instrument_name = stringify!($($metric).+).to_string();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Cursor;
use std::num::NonZeroUsize;
use std::sync::atomic::AtomicU64;
use std::sync::Arc;
use std::time::SystemTime;
use std::time::SystemTimeError;
Expand All @@ -28,12 +29,15 @@ use opentelemetry::trace::TraceId;
use opentelemetry::Key;
use opentelemetry::KeyValue;
use opentelemetry::Value;
use opentelemetry_api::metrics::MeterProvider as _;
use opentelemetry_api::metrics::ObservableGauge;
use prost::Message;
use rand::Rng;
use serde::de::DeserializeOwned;
use thiserror::Error;
use url::Url;

use crate::metrics::meter_provider;
use crate::plugins::telemetry;
use crate::plugins::telemetry::apollo::ErrorConfiguration;
use crate::plugins::telemetry::apollo::ErrorsConfiguration;
Expand Down Expand Up @@ -245,6 +249,47 @@ impl LightSpanData {
}
}

/// An externally updatable gauge for "apollo_router_span_lru_size".
///
/// When observed, it reports the most recently stored value. The value is written and read with
/// relaxed atomic operations, so an observation may briefly lag behind a concurrent update.
///
/// This *could* be generalised to any kind of gauge, but we should ideally have gauges that can just
/// observe their accurate value whenever requested. The externally updatable approach is kind of
/// a hack that happens to work here because we only have one place where the value can change, and
/// otherwise we might have to use an inconvenient Mutex or RwLock around the entire LRU cache.
#[derive(Debug)]
struct SpanLruSizeInstrument {
/// Most recently stored size. Written by `update` and read by the gauge callback, which holds
/// its own clone of this `Arc`.
value: Arc<AtomicU64>,
/// The registered observable gauge. It is never read through this field; storing it here ties
/// the gauge's lifetime to this struct.
_gauge: ObservableGauge<u64>,
}
Comment on lines +260 to +264
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As mentioned in the doc comment, this is a hack that more or less mimics what tracing::info!(value.) was doing (storing the value whenever it is updated, and reading the stored value when requested).

I'm not sure about this metric, should we keep it at all in 2.0?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should keep it. It's really helpful for debugging and could help us find the root cause of a leak, for example.


impl SpanLruSizeInstrument {
    /// Registers the `apollo_router_span_lru_size` observable gauge with an initial value of 0.
    ///
    /// The gauge callback and the returned instrument share one atomic counter: whenever the
    /// gauge is observed, the callback reports whatever was last written through
    /// [`Self::update`].
    fn new() -> Self {
        use std::sync::atomic::Ordering;

        let current = Arc::new(AtomicU64::new(0));
        // This handle is moved into the callback; `current` stays with the returned struct.
        let observed = Arc::clone(&current);

        let instrument = meter_provider()
            .meter("apollo/router")
            .u64_observable_gauge("apollo_router_span_lru_size")
            .with_callback(move |observer| {
                observer.observe(observed.load(Ordering::Relaxed), &[]);
            })
            .init();

        Self {
            value: current,
            _gauge: instrument,
        }
    }

    /// Stores `value` as the size to report the next time the gauge is observed.
    fn update(&self, value: u64) {
        use std::sync::atomic::Ordering;

        self.value.store(value, Ordering::Relaxed);
    }
}

/// A [`SpanExporter`] that writes to [`Reporter`].
///
/// [`SpanExporter`]: super::SpanExporter
Expand All @@ -253,6 +298,7 @@ impl LightSpanData {
#[derivative(Debug)]
pub(crate) struct Exporter {
spans_by_parent_id: LruCache<SpanId, LruCache<usize, LightSpanData>>,
span_lru_size_instrument: SpanLruSizeInstrument,
#[derivative(Debug = "ignore")]
report_exporter: Option<Arc<ApolloExporter>>,
#[derivative(Debug = "ignore")]
Expand Down Expand Up @@ -325,8 +371,12 @@ impl Exporter {
Sampler::AlwaysOff => 0f64,
},
};

let span_lru_size_instrument = SpanLruSizeInstrument::new();

Ok(Self {
spans_by_parent_id: LruCache::new(buffer_size),
span_lru_size_instrument,
report_exporter: if otlp_tracing_ratio < 1f64 {
Some(Arc::new(ApolloExporter::new(
endpoint,
Expand Down Expand Up @@ -1082,7 +1132,11 @@ impl SpanExporter for Exporter {
);
}
}
tracing::info!(value.apollo_router_span_lru_size = self.spans_by_parent_id.len() as u64,);

// Note this won't be correct anymore if there is any way outside of `.export()`
// to affect the size of the cache.
self.span_lru_size_instrument
.update(self.spans_by_parent_id.len() as u64);

#[allow(clippy::manual_map)] // https://github.com/rust-lang/rust-clippy/issues/8346
let report_exporter = match self.report_exporter.as_ref() {
Expand Down