-
Notifications
You must be signed in to change notification settings - Fork 331
Migrate value.* metrics to gauges
#6476
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| ### Migrate gauge metrics to OTel instruments ([PR #6476](https://github.com/apollographql/router/pull/6476)) | ||
|
|
||
| Updates gauge metrics using the legacy `tracing::info!(value.*)` syntax to OTel instruments. | ||
|
|
||
| By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6476 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ use std::collections::HashMap; | |
| use std::collections::HashSet; | ||
| use std::io::Cursor; | ||
| use std::num::NonZeroUsize; | ||
| use std::sync::atomic::AtomicU64; | ||
| use std::sync::Arc; | ||
| use std::time::SystemTime; | ||
| use std::time::SystemTimeError; | ||
|
|
@@ -28,12 +29,15 @@ use opentelemetry::trace::TraceId; | |
| use opentelemetry::Key; | ||
| use opentelemetry::KeyValue; | ||
| use opentelemetry::Value; | ||
| use opentelemetry_api::metrics::MeterProvider as _; | ||
| use opentelemetry_api::metrics::ObservableGauge; | ||
| use prost::Message; | ||
| use rand::Rng; | ||
| use serde::de::DeserializeOwned; | ||
| use thiserror::Error; | ||
| use url::Url; | ||
|
|
||
| use crate::metrics::meter_provider; | ||
| use crate::plugins::telemetry; | ||
| use crate::plugins::telemetry::apollo::ErrorConfiguration; | ||
| use crate::plugins::telemetry::apollo::ErrorsConfiguration; | ||
|
|
@@ -245,6 +249,47 @@ impl LightSpanData { | |
| } | ||
| } | ||
|
|
||
| /// An externally updateable gauge for "apollo_router_span_lru_size". | ||
| /// | ||
| /// When observed, it reports the most recently stored value (give or take atomicity looseness). | ||
| /// | ||
| /// This *could* be generalised to any kind of gauge, but we should ideally have gauges that can just | ||
| /// observe their accurate value whenever requested. The externally updateable approach is kind of | ||
| /// a hack that happens to work here because we only have one place where the value can change, and | ||
| /// otherwise we might have to use an inconvenient Mutex or RwLock around the entire LRU cache. | ||
| #[derive(Debug)] | ||
| struct SpanLruSizeInstrument { | ||
| value: Arc<AtomicU64>, | ||
| _gauge: ObservableGauge<u64>, | ||
| } | ||
|
Comment on lines
+260
to
+264
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As mentioned in the doc comment, this is a hack that kind of mimicks what I'm not sure about this metric, should we keep it at all in 2.0?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should keep it. It's really helpful to debug and could potentially find the root cause of a leak or so. |
||
|
|
||
| impl SpanLruSizeInstrument { | ||
| fn new() -> Self { | ||
| let value = Arc::new(AtomicU64::new(0)); | ||
|
|
||
| let meter = meter_provider().meter("apollo/router"); | ||
| let gauge = meter | ||
| .u64_observable_gauge("apollo_router_span_lru_size") | ||
| .with_callback({ | ||
| let value = Arc::clone(&value); | ||
| move |gauge| { | ||
| gauge.observe(value.load(std::sync::atomic::Ordering::Relaxed), &[]); | ||
| } | ||
| }) | ||
| .init(); | ||
|
|
||
| Self { | ||
| value, | ||
| _gauge: gauge, | ||
| } | ||
| } | ||
|
|
||
| fn update(&self, value: u64) { | ||
| self.value | ||
| .store(value, std::sync::atomic::Ordering::Relaxed); | ||
| } | ||
| } | ||
|
|
||
| /// A [`SpanExporter`] that writes to [`Reporter`]. | ||
| /// | ||
| /// [`SpanExporter`]: super::SpanExporter | ||
|
|
@@ -253,6 +298,7 @@ impl LightSpanData { | |
| #[derivative(Debug)] | ||
| pub(crate) struct Exporter { | ||
| spans_by_parent_id: LruCache<SpanId, LruCache<usize, LightSpanData>>, | ||
| span_lru_size_instrument: SpanLruSizeInstrument, | ||
| #[derivative(Debug = "ignore")] | ||
| report_exporter: Option<Arc<ApolloExporter>>, | ||
| #[derivative(Debug = "ignore")] | ||
|
|
@@ -325,8 +371,12 @@ impl Exporter { | |
| Sampler::AlwaysOff => 0f64, | ||
| }, | ||
| }; | ||
|
|
||
| let span_lru_size_instrument = SpanLruSizeInstrument::new(); | ||
|
|
||
| Ok(Self { | ||
| spans_by_parent_id: LruCache::new(buffer_size), | ||
| span_lru_size_instrument, | ||
| report_exporter: if otlp_tracing_ratio < 1f64 { | ||
| Some(Arc::new(ApolloExporter::new( | ||
| endpoint, | ||
|
|
@@ -1082,7 +1132,11 @@ impl SpanExporter for Exporter { | |
| ); | ||
| } | ||
| } | ||
| tracing::info!(value.apollo_router_span_lru_size = self.spans_by_parent_id.len() as u64,); | ||
|
|
||
| // Note this won't be correct anymore if there is any way outside of `.export()` | ||
| // to affect the size of the cache. | ||
| self.span_lru_size_instrument | ||
| .update(self.spans_by_parent_id.len() as u64); | ||
|
|
||
| #[allow(clippy::manual_map)] // https://github.com/rust-lang/rust-clippy/issues/8346 | ||
| let report_exporter = match self.report_exporter.as_ref() { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
session_count_activeinstrument is "stored" in anaxum::Routerinstance by moving it into a no-op layer. Another option would be to just have one global instrument for the entire lifetime of the program, but I'm not sure if that would work everywhere we need it to (eg. does it do the right thing across reloads, tests, etc?)