From 639c7d3426753c12182b56184f902f7f26d7743e Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 2 Aug 2024 12:48:41 +0100 Subject: [PATCH 01/22] Add cache storage metrics. These are: * currently unpopulated. * needs unit tests. * needs to be propagated when a new in memory cache is spawned they are correct. --- apollo-router/src/cache/mod.rs | 4 +- apollo-router/src/cache/storage.rs | 115 +++++++++++++++++++++++------ 2 files changed, 95 insertions(+), 24 deletions(-) diff --git a/apollo-router/src/cache/mod.rs b/apollo-router/src/cache/mod.rs index 80daa1d8a0..62c9462d5f 100644 --- a/apollo-router/src/cache/mod.rs +++ b/apollo-router/src/cache/mod.rs @@ -37,7 +37,7 @@ where pub(crate) async fn with_capacity( capacity: NonZeroUsize, redis: Option, - caller: &str, + caller: &'static str, ) -> Result { Ok(Self { wait_map: Arc::new(Mutex::new(HashMap::new())), @@ -47,7 +47,7 @@ where pub(crate) async fn from_configuration( config: &crate::configuration::Cache, - caller: &str, + caller: &'static str, ) -> Result { Self::with_capacity(config.in_memory.limit, config.redis.clone(), caller).await } diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index b72ad9d378..b4bbf37ee4 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -1,18 +1,23 @@ +use lru::LruCache; +use serde::de::DeserializeOwned; +use serde::Serialize; use std::fmt::Display; use std::fmt::{self}; use std::hash::Hash; use std::num::NonZeroUsize; +use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::Arc; - -use lru::LruCache; -use serde::de::DeserializeOwned; -use serde::Serialize; use tokio::sync::Mutex; use tokio::time::Instant; use tower::BoxError; use super::redis::*; use crate::configuration::RedisCache; +use crate::metrics; +use crate::plugins::telemetry::config_new::instruments::METER_NAME; +use opentelemetry::metrics::MeterProvider; +use opentelemetry_api::metrics::{Meter, ObservableGauge}; +use opentelemetry_api::KeyValue; pub(crate) trait KeyType: Clone + fmt::Debug + fmt::Display + Hash + Eq + Send + Sync @@ -21,6 +26,10 @@ pub(crate) trait KeyType: pub(crate) trait ValueType: Clone + fmt::Debug + Send + Sync + Serialize + DeserializeOwned { + /// Returns an estimated size of the cache entry in bytes. + fn estimated_size(&self) -> Option { + Some(1) + } } // Blanket implementation which satisfies the compiler @@ -52,6 +61,10 @@ pub(crate) struct CacheStorage { caller: String, inner: Arc>>, redis: Option, + cache_size: Arc, + cache_estimated_storage: Arc, + _cache_size_gauge: ObservableGauge, + _cache_estimated_storage_gauge: ObservableGauge, } impl CacheStorage @@ -62,9 +75,19 @@ where pub(crate) async fn new( max_capacity: NonZeroUsize, config: Option, - caller: &str, + caller: &'static str, ) -> Result { + // Because calculating the cache size is expensive we do this as we go rather than iterating. This means storing the values for the gauges + let meter: opentelemetry::metrics::Meter = metrics::meter_provider().meter(METER_NAME); + let (cache_size, cache_size_gauge) = Self::create_cache_size_gauge(&meter, caller); + let (cache_estimated_storage, cache_estimated_storage_gauge) = + Self::create_cache_estimated_storage_size_gauge(&meter, caller); + Ok(Self { + _cache_size_gauge: cache_size_gauge, + _cache_estimated_storage_gauge: cache_estimated_storage_gauge, + cache_size, + cache_estimated_storage, caller: caller.to_string(), inner: Arc::new(Mutex::new(LruCache::new(max_capacity))), redis: if let Some(config) = config { @@ -89,6 +112,55 @@ where }) } + fn create_cache_size_gauge( + meter: &Meter, + caller: &'static str, + ) -> (Arc, ObservableGauge) { + let current_cache_size = Arc::new(AtomicI64::new(0)); + let current_cache_size_for_gauge = current_cache_size.clone(); + let cache_size_gauge = meter + // TODO move to dot naming convention + .i64_observable_gauge("apollo_router_cache_size") + .with_description("Cache size") + .with_callback(move |i| { + i.observe( + current_cache_size_for_gauge.load(Ordering::SeqCst), + &[ + KeyValue::new("kind", caller), + KeyValue::new("type", "memory"), + ], + ) + }) + .init(); + (current_cache_size, cache_size_gauge) + } + + fn create_cache_estimated_storage_size_gauge( + meter: &Meter, + caller: &'static str, + ) -> (Arc, ObservableGauge) { + let cache_estimated_storage = Arc::new(AtomicI64::new(0)); + let cache_estimated_storage_for_gauge = cache_estimated_storage.clone(); + let cache_estimated_storage_gauge = meter + .i64_observable_gauge("apollo.router.cache.estimated.storage.size") + .with_description("Estimated cache storage") + .with_callback(move |i| { + // If there's no storage then don't bother updating the gauge + let value = cache_estimated_storage_for_gauge.load(Ordering::SeqCst); + if value > 0 { + i.observe( + cache_estimated_storage_for_gauge.load(Ordering::SeqCst), + &[ + KeyValue::new("kind", caller), + KeyValue::new("type", "memory"), + ], + ) + } + }) + .init(); + (cache_estimated_storage, cache_estimated_storage_gauge) + } + /// `init_from_redis` is called with values newly deserialized from Redis cache /// if an error is returned, the value is ignored and considered a cache miss. pub(crate) async fn get( @@ -143,7 +215,7 @@ where }); match redis_value { Some(v) => { - self.inner.lock().await.put(key.clone(), v.0.clone()); + self.insert_in_memory(key.clone(), v.0.clone()).await; tracing::info!( monotonic_counter.apollo_router_cache_hit_count = 1u64, @@ -187,25 +259,24 @@ where .await; } - let mut in_memory = self.inner.lock().await; - in_memory.put(key, value); - let size = in_memory.len() as u64; - tracing::info!( - value.apollo_router_cache_size = size, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Memory), - ); + self.insert_in_memory(key, value).await; } - pub(crate) async fn insert_in_memory(&self, key: K, value: V) { + pub(crate) async fn insert_in_memory(&self, key: K, value: V) + where + V: ValueType, + { + // Update the cache size and estimated storage size + // This is cheaper than trying to estimate the cache storage size by iterating over the cache + self.cache_estimated_storage + .fetch_add(value.estimated_size().unwrap_or(0) as i64, Ordering::SeqCst); let mut in_memory = self.inner.lock().await; - in_memory.put(key, value); - let size = in_memory.len() as u64; - tracing::info!( - value.apollo_router_cache_size = size, - kind = %self.caller, - storage = &tracing::field::display(CacheStorageName::Memory), - ); + if let Some((_, v)) = in_memory.push(key.clone(), value.clone()) { + self.cache_estimated_storage + .fetch_sub(v.estimated_size().unwrap_or(0) as i64, Ordering::SeqCst); + } + self.cache_size + .store(in_memory.len() as i64, Ordering::SeqCst); } pub(crate) fn in_memory_cache(&self) -> InMemoryCache { From 289021d3a35c846173081bd46adb282e1d4fbfdc Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 2 Aug 2024 12:56:10 +0100 Subject: [PATCH 02/22] Set estimated storage size for ValueType to None --- apollo-router/src/cache/storage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index b4bbf37ee4..bff0881b40 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -28,7 +28,7 @@ pub(crate) trait ValueType: { /// Returns an estimated size of the cache entry in bytes. fn estimated_size(&self) -> Option { - Some(1) + None } } From 6f9779cf295558f5240f5403be9c20c6fa9af146 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 2 Aug 2024 13:19:31 +0100 Subject: [PATCH 03/22] Add most implementations of estimated_size --- apollo-router/src/cache/redis.rs | 6 ++++ apollo-router/src/cache/storage.rs | 29 +++++++++++++------ apollo-router/src/plugins/cache/entity.rs | 7 +++++ .../query_planner/caching_query_planner.rs | 8 ++++- apollo-router/src/services/query_planner.rs | 10 +++---- 5 files changed, 45 insertions(+), 15 deletions(-) diff --git a/apollo-router/src/cache/redis.rs b/apollo-router/src/cache/redis.rs index f0973551f4..92b6bf127c 100644 --- a/apollo-router/src/cache/redis.rs +++ b/apollo-router/src/cache/redis.rs @@ -591,6 +591,7 @@ impl RedisCacheStorage { mod test { use std::time::SystemTime; + use crate::cache::storage::ValueType; use url::Url; #[test] @@ -599,6 +600,11 @@ mod test { struct Stuff { time: SystemTime, } + impl ValueType for Stuff { + fn estimated_size(&self) -> Option { + None + } + } let invalid_json_payload = super::RedisValue(Stuff { // this systemtime is invalid, serialization will fail diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index bff0881b40..b792ef604e 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -13,8 +13,10 @@ use tower::BoxError; use super::redis::*; use crate::configuration::RedisCache; +use crate::error::QueryPlannerError; use crate::metrics; use crate::plugins::telemetry::config_new::instruments::METER_NAME; +use crate::services::QueryPlannerContent; use opentelemetry::metrics::MeterProvider; use opentelemetry_api::metrics::{Meter, ObservableGauge}; use opentelemetry_api::KeyValue; @@ -41,15 +43,6 @@ where // It has the functions it needs already } -// Blanket implementation which satisfies the compiler -impl ValueType for V -where - V: Clone + fmt::Debug + Send + Sync + Serialize + DeserializeOwned, -{ - // Nothing to implement, since V already supports the other traits. - // It has the functions it needs already -} - pub(crate) type InMemoryCache = Arc>>; // placeholder storage module @@ -302,3 +295,21 @@ impl Display for CacheStorageName { } } } + +impl ValueType for String { + fn estimated_size(&self) -> Option { + Some(self.len()) + } +} + +impl ValueType for crate::graphql::Response { + fn estimated_size(&self) -> Option { + None + } +} + +impl ValueType for usize { + fn estimated_size(&self) -> Option { + Some(std::mem::size_of::()) + } +} diff --git a/apollo-router/src/plugins/cache/entity.rs b/apollo-router/src/plugins/cache/entity.rs index 643b27b31b..4e36f1fc7c 100644 --- a/apollo-router/src/plugins/cache/entity.rs +++ b/apollo-router/src/plugins/cache/entity.rs @@ -32,6 +32,7 @@ use crate::batching::BatchQuery; use crate::cache::redis::RedisCacheStorage; use crate::cache::redis::RedisKey; use crate::cache::redis::RedisValue; +use crate::cache::storage::ValueType; use crate::configuration::subgraph::SubgraphConfiguration; use crate::configuration::RedisCache; use crate::error::FetchError; @@ -738,6 +739,12 @@ struct CacheEntry { data: Value, } +impl ValueType for CacheEntry { + fn estimated_size(&self) -> Option { + None + } +} + async fn cache_store_root_from_response( cache: RedisCacheStorage, subgraph_ttl: Option, diff --git a/apollo-router/src/query_planner/caching_query_planner.rs b/apollo-router/src/query_planner/caching_query_planner.rs index db4923f17c..52d85ac757 100644 --- a/apollo-router/src/query_planner/caching_query_planner.rs +++ b/apollo-router/src/query_planner/caching_query_planner.rs @@ -24,7 +24,7 @@ use tower_service::Service; use tracing::Instrument; use super::fetch::QueryHash; -use crate::cache::storage::InMemoryCache; +use crate::cache::storage::{InMemoryCache, ValueType}; use crate::cache::DeduplicatingCache; use crate::error::CacheResolverError; use crate::error::QueryPlannerError; @@ -684,6 +684,12 @@ pub(crate) struct WarmUpCachingQueryKey { pub(crate) introspection: bool, } +impl ValueType for Result> { + fn estimated_size(&self) -> Option { + todo!() + } +} + #[cfg(test)] mod tests { use mockall::mock; diff --git a/apollo-router/src/services/query_planner.rs b/apollo-router/src/services/query_planner.rs index 2fda97b60c..8a567bbc22 100644 --- a/apollo-router/src/services/query_planner.rs +++ b/apollo-router/src/services/query_planner.rs @@ -2,17 +2,17 @@ use std::sync::Arc; +use crate::cache::storage::ValueType; +use crate::error::QueryPlannerError; +use crate::graphql; +use crate::query_planner::QueryPlan; +use crate::Context; use async_trait::async_trait; use derivative::Derivative; use serde::Deserialize; use serde::Serialize; use static_assertions::assert_impl_all; -use crate::error::QueryPlannerError; -use crate::graphql; -use crate::query_planner::QueryPlan; -use crate::Context; - assert_impl_all!(Request: Send); /// [`Context`] for the request. #[derive(Derivative)] From 8ce69c3f9f9abe45a59bd9c55805288e98ed8126 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 2 Aug 2024 13:24:25 +0100 Subject: [PATCH 04/22] Lint --- apollo-router/src/cache/redis.rs | 3 ++- apollo-router/src/cache/storage.rs | 19 ++++++++++--------- .../query_planner/caching_query_planner.rs | 3 ++- apollo-router/src/services/query_planner.rs | 10 +++++----- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/apollo-router/src/cache/redis.rs b/apollo-router/src/cache/redis.rs index 92b6bf127c..ae0697b3cf 100644 --- a/apollo-router/src/cache/redis.rs +++ b/apollo-router/src/cache/redis.rs @@ -591,9 +591,10 @@ impl RedisCacheStorage { mod test { use std::time::SystemTime; - use crate::cache::storage::ValueType; use url::Url; + use crate::cache::storage::ValueType; + #[test] fn ensure_invalid_payload_serialization_doesnt_fail() { #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index b792ef604e..81d756c51b 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -1,25 +1,26 @@ -use lru::LruCache; -use serde::de::DeserializeOwned; -use serde::Serialize; use std::fmt::Display; use std::fmt::{self}; use std::hash::Hash; use std::num::NonZeroUsize; -use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::atomic::AtomicI64; +use std::sync::atomic::Ordering; use std::sync::Arc; + +use lru::LruCache; +use opentelemetry::metrics::MeterProvider; +use opentelemetry_api::metrics::Meter; +use opentelemetry_api::metrics::ObservableGauge; +use opentelemetry_api::KeyValue; +use serde::de::DeserializeOwned; +use serde::Serialize; use tokio::sync::Mutex; use tokio::time::Instant; use tower::BoxError; use super::redis::*; use crate::configuration::RedisCache; -use crate::error::QueryPlannerError; use crate::metrics; use crate::plugins::telemetry::config_new::instruments::METER_NAME; -use crate::services::QueryPlannerContent; -use opentelemetry::metrics::MeterProvider; -use opentelemetry_api::metrics::{Meter, ObservableGauge}; -use opentelemetry_api::KeyValue; pub(crate) trait KeyType: Clone + fmt::Debug + fmt::Display + Hash + Eq + Send + Sync diff --git a/apollo-router/src/query_planner/caching_query_planner.rs b/apollo-router/src/query_planner/caching_query_planner.rs index 52d85ac757..ef76f3a8b9 100644 --- a/apollo-router/src/query_planner/caching_query_planner.rs +++ b/apollo-router/src/query_planner/caching_query_planner.rs @@ -24,7 +24,8 @@ use tower_service::Service; use tracing::Instrument; use super::fetch::QueryHash; -use crate::cache::storage::{InMemoryCache, ValueType}; +use crate::cache::storage::InMemoryCache; +use crate::cache::storage::ValueType; use crate::cache::DeduplicatingCache; use crate::error::CacheResolverError; use crate::error::QueryPlannerError; diff --git a/apollo-router/src/services/query_planner.rs b/apollo-router/src/services/query_planner.rs index 8a567bbc22..2fda97b60c 100644 --- a/apollo-router/src/services/query_planner.rs +++ b/apollo-router/src/services/query_planner.rs @@ -2,17 +2,17 @@ use std::sync::Arc; -use crate::cache::storage::ValueType; -use crate::error::QueryPlannerError; -use crate::graphql; -use crate::query_planner::QueryPlan; -use crate::Context; use async_trait::async_trait; use derivative::Derivative; use serde::Deserialize; use serde::Serialize; use static_assertions::assert_impl_all; +use crate::error::QueryPlannerError; +use crate::graphql; +use crate::query_planner::QueryPlan; +use crate::Context; + assert_impl_all!(Request: Send); /// [`Context`] for the request. #[derive(Derivative)] From 81d428a7adb12672cadfc7ae9e701893e42d6b70 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 2 Aug 2024 15:23:38 +0100 Subject: [PATCH 05/22] Add estimated cost logic --- apollo-router/src/cache/mod.rs | 2 + apollo-router/src/cache/size_estimation.rs | 409 ++++++++++++++++++ apollo-router/src/cache/storage.rs | 2 + .../query_planner/caching_query_planner.rs | 8 +- 4 files changed, 420 insertions(+), 1 deletion(-) create mode 100644 apollo-router/src/cache/size_estimation.rs diff --git a/apollo-router/src/cache/mod.rs b/apollo-router/src/cache/mod.rs index 62c9462d5f..6e1ef01cb8 100644 --- a/apollo-router/src/cache/mod.rs +++ b/apollo-router/src/cache/mod.rs @@ -14,7 +14,9 @@ use self::storage::ValueType; use crate::configuration::RedisCache; pub(crate) mod redis; +mod size_estimation; pub(crate) mod storage; +pub(crate) use size_estimation::estimate_size; type WaitMap = Arc>>>; pub(crate) const DEFAULT_CACHE_CAPACITY: NonZeroUsize = match NonZeroUsize::new(512) { diff --git a/apollo-router/src/cache/size_estimation.rs b/apollo-router/src/cache/size_estimation.rs new file mode 100644 index 0000000000..93db9ab95b --- /dev/null +++ b/apollo-router/src/cache/size_estimation.rs @@ -0,0 +1,409 @@ +use std::fmt::Debug; +use std::fmt::Display; +use std::fmt::Formatter; + +use serde::ser; +use serde::ser::SerializeMap; +use serde::ser::SerializeSeq; +use serde::ser::SerializeStruct; +use serde::ser::SerializeStructVariant; +use serde::ser::SerializeTuple; +use serde::ser::SerializeTupleStruct; +use serde::ser::SerializeTupleVariant; +use serde::Serialize; + +pub(crate) fn estimate_size(s: &T) -> usize { + let ser = s + .serialize(CountingSerializer::default()) + .expect("mut be able to serialize"); + ser.count +} + +pub(crate) struct Error; + +impl Debug for Error { + fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result { + unreachable!() + } +} + +impl Display for Error { + fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result { + unreachable!() + } +} + +impl std::error::Error for Error {} + +impl ser::Error for Error { + fn custom(_msg: T) -> Self { + unreachable!() + } +} + +/// This is a special serializer that doesn't store the serialized data, instead it counts the bytes +/// Yes, it's inaccurate, but we're looking for something that is relatively cheap to compute. +/// It doesn't take into account shared datastructures occurring multiple times and will give the +/// full estimated serialized cost. +#[derive(Default, Debug)] +struct CountingSerializer { + count: usize, +} + +impl ser::Serializer for CountingSerializer { + type Ok = Self; + type Error = Error; + type SerializeSeq = Self; + type SerializeTuple = Self; + type SerializeTupleStruct = Self; + type SerializeTupleVariant = Self; + type SerializeMap = Self; + type SerializeStruct = Self; + type SerializeStructVariant = Self; + + fn serialize_bool(mut self, _v: bool) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_i8(mut self, _v: i8) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_i16(mut self, _v: i16) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_i32(mut self, _v: i32) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_i64(mut self, _v: i64) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_u8(mut self, _v: u8) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_u16(mut self, _v: u16) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_u32(mut self, _v: u32) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_u64(mut self, _v: u64) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_f32(mut self, _v: f32) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_f64(mut self, _v: f64) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_char(mut self, _v: char) -> Result { + self.count += std::mem::size_of::(); + Ok(self) + } + + fn serialize_str(mut self, v: &str) -> Result { + //ptr + 8 bytes length + 8 bytes capacity + self.count += 24 + v.len(); + Ok(self) + } + + fn serialize_bytes(mut self, v: &[u8]) -> Result { + self.count += v.len(); + Ok(self) + } + + fn serialize_none(self) -> Result { + Ok(self) + } + + fn serialize_some(self, value: &T) -> Result + where + T: ?Sized + Serialize, + { + Ok(value.serialize(self).expect("failed to serialize")) + } + + fn serialize_unit(self) -> Result { + Ok(self) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Ok(self) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + ) -> Result { + Ok(self) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T, + ) -> Result + where + T: ?Sized + Serialize, + { + Ok(value.serialize(self).expect("failed to serialize")) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + value: &T, + ) -> Result + where + T: ?Sized + Serialize, + { + Ok(value.serialize(self).expect("failed to serialize")) + } + + fn serialize_seq(self, _len: Option) -> Result { + Ok(self) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Ok(self) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize, + ) -> Result { + Ok(self) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize, + ) -> Result { + Ok(self) + } + + fn serialize_map(self, _len: Option) -> Result { + Ok(self) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize, + ) -> Result { + Ok(self) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize, + ) -> Result { + Ok(self) + } +} +impl SerializeStructVariant for CountingSerializer { + type Ok = Self; + type Error = Error; + + fn serialize_field(&mut self, _key: &'static str, value: &T) -> Result<(), Self::Error> + where + T: ?Sized + Serialize, + { + let ser = value + .serialize(CountingSerializer::default()) + .expect("must be able to serialize"); + self.count += ser.count; + Ok(()) + } + + fn end(self) -> Result { + Ok(self) + } +} +impl SerializeSeq for CountingSerializer { + type Ok = Self; + type Error = Error; + + fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> + where + T: ?Sized + Serialize, + { + let ser = value + .serialize(CountingSerializer::default()) + .expect("must be able to serialize"); + self.count += ser.count; + Ok(()) + } + + fn end(self) -> Result { + Ok(self) + } +} +impl SerializeTuple for CountingSerializer { + type Ok = Self; + type Error = Error; + + fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> + where + T: ?Sized + Serialize, + { + let ser = value + .serialize(CountingSerializer::default()) + .expect("must be able to serialize"); + self.count += ser.count; + Ok(()) + } + + fn end(self) -> Result { + Ok(self) + } +} + +impl SerializeStruct for CountingSerializer { + type Ok = Self; + type Error = Error; + + fn serialize_field(&mut self, _key: &'static str, value: &T) -> Result<(), Self::Error> + where + T: ?Sized + Serialize, + { + let ser = value + .serialize(CountingSerializer::default()) + .expect("must be able to serialize"); + self.count += ser.count; + Ok(()) + } + + fn end(self) -> Result { + Ok(self) + } +} + +impl SerializeMap for CountingSerializer { + type Ok = Self; + type Error = Error; + + fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> + where + T: ?Sized + Serialize, + { + let ser = key + .serialize(CountingSerializer::default()) + .expect("must be able to serialize"); + self.count += ser.count; + Ok(()) + } + + fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> + where + T: ?Sized + Serialize, + { + let ser = value + .serialize(CountingSerializer::default()) + .expect("must be able to serialize"); + self.count += ser.count; + Ok(()) + } + + fn end(self) -> Result { + Ok(self) + } +} + +impl SerializeTupleVariant for CountingSerializer { + type Ok = Self; + type Error = Error; + + fn serialize_field(&mut self, value: &T) -> Result<(), Self::Error> + where + T: ?Sized + Serialize, + { + let ser = value + .serialize(CountingSerializer::default()) + .expect("must be able to serialize"); + self.count += ser.count; + Ok(()) + } + + fn end(self) -> Result { + Ok(self) + } +} + +impl SerializeTupleStruct for CountingSerializer { + type Ok = Self; + type Error = Error; + + fn serialize_field(&mut self, value: &T) -> Result<(), Self::Error> + where + T: ?Sized + Serialize, + { + let ser = value + .serialize(CountingSerializer::default()) + .expect("must be able to serialize"); + self.count += ser.count; + Ok(()) + } + + fn end(self) -> Result { + Ok(self) + } +} + +#[cfg(test)] +mod test { + use serde::Serialize; + + use crate::cache::estimate_size; + + #[test] + fn test_estimate_size() { + #[derive(Serialize)] + struct Test { + string: String, + u8: u8, + } + + let s = estimate_size(&Test { + string: "".to_string(), + u8: 0, + }); + assert_eq!(s, 25); + let s = estimate_size(&Test { + string: "test".to_string(), + u8: 0, + }); + assert_eq!(s, 29); + } +} diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index 81d756c51b..21706512a6 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -10,6 +10,7 @@ use lru::LruCache; use opentelemetry::metrics::MeterProvider; use opentelemetry_api::metrics::Meter; use opentelemetry_api::metrics::ObservableGauge; +use opentelemetry_api::metrics::Unit; use opentelemetry_api::KeyValue; use serde::de::DeserializeOwned; use serde::Serialize; @@ -138,6 +139,7 @@ where let cache_estimated_storage_gauge = meter .i64_observable_gauge("apollo.router.cache.estimated.storage.size") .with_description("Estimated cache storage") + .with_unit(Unit::new("bytes")) .with_callback(move |i| { // If there's no storage then don't bother updating the gauge let value = cache_estimated_storage_for_gauge.load(Ordering::SeqCst); diff --git a/apollo-router/src/query_planner/caching_query_planner.rs b/apollo-router/src/query_planner/caching_query_planner.rs index ef76f3a8b9..297cda962c 100644 --- a/apollo-router/src/query_planner/caching_query_planner.rs +++ b/apollo-router/src/query_planner/caching_query_planner.rs @@ -24,6 +24,7 @@ use tower_service::Service; use tracing::Instrument; use super::fetch::QueryHash; +use crate::cache::estimate_size; use crate::cache::storage::InMemoryCache; use crate::cache::storage::ValueType; use crate::cache::DeduplicatingCache; @@ -687,7 +688,12 @@ pub(crate) struct WarmUpCachingQueryKey { impl ValueType for Result> { fn estimated_size(&self) -> Option { - todo!() + match self { + Ok(QueryPlannerContent::Plan { plan }) => Some(estimate_size(plan)), + Ok(QueryPlannerContent::Response { response }) => Some(estimate_size(response)), + Ok(QueryPlannerContent::IntrospectionDisabled) => None, + Err(e) => Some(estimate_size(e)), + } } } From 4ef2bc914093003abcf3534648577e41689a94c1 Mon Sep 17 00:00:00 2001 From: bryn Date: Mon, 5 Aug 2024 11:16:41 +0100 Subject: [PATCH 06/22] Add metrics to docs and rename metric to apollo.router.cache.storage.estimated_size --- apollo-router/src/cache/storage.rs | 2 +- docs/source/configuration/in-memory-caching.mdx | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index 21706512a6..ce48d79939 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -137,7 +137,7 @@ where let cache_estimated_storage = Arc::new(AtomicI64::new(0)); let cache_estimated_storage_for_gauge = cache_estimated_storage.clone(); let cache_estimated_storage_gauge = meter - .i64_observable_gauge("apollo.router.cache.estimated.storage.size") + .i64_observable_gauge("apollo.router.cache.storage.estimated_size") .with_description("Estimated cache storage") .with_unit(Unit::new("bytes")) .with_callback(move |i| { diff --git a/docs/source/configuration/in-memory-caching.mdx b/docs/source/configuration/in-memory-caching.mdx index 4e483906a6..17aca096f9 100644 --- a/docs/source/configuration/in-memory-caching.mdx +++ b/docs/source/configuration/in-memory-caching.mdx @@ -72,15 +72,18 @@ supergraph: To get more information on the planning and warm-up process use the following metrics (where `` can be `redis` for distributed cache or `memory`): * counters: - * `apollo_router_cache_size{kind="query planner", storage="}`: current size of the cache (only for in-memory cache) - * `apollo_router_cache_hit_count{kind="query planner", storage="}` - * `apollo_router_cache_miss_count{kind="query planner", storage="}` + * `apollo_router_cache_hit_count{kind="query planner", storage=""}` + * `apollo_router_cache_miss_count{kind="query planner", storage=""}` * histograms: * `apollo.router.query_planning.plan.duration`: time spent planning queries * `apollo_router_schema_loading_time`: time spent loading a schema - * `apollo_router_cache_hit_time{kind="query planner", storage="}`: time to get a value from the cache - * `apollo_router_cache_miss_time{kind="query planner", storage="}` + * `apollo_router_cache_hit_time{kind="query planner", storage=""}`: time to get a value from the cache + * `apollo_router_cache_miss_time{kind="query planner", storage=""}` + +* gauges + * `apollo_router_cache_size{kind="query planner", storage="memory"}`: current size of the cache (only for in-memory cache) + * `apollo.router.cache.storage.estimated_size{kind="query planner", storage="memory"}`: estimated storage size of the cache (only for in-memory query planner cache) Typically, we would look at `apollo_router_cache_size` and the cache hit rate to define the right size of the in memory cache, then look at `apollo_router_schema_loading_time` and `apollo.router.query_planning.plan.duration` to decide how much time we want to spend warming up queries. From 08973434e126de4cc2f40b62b514173390522b5f Mon Sep 17 00:00:00 2001 From: bryn Date: Mon, 5 Aug 2024 11:48:27 +0100 Subject: [PATCH 07/22] Add tests --- apollo-router/src/cache/size_estimation.rs | 33 +++++++++++++++-- apollo-router/src/cache/storage.rs | 41 ++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/apollo-router/src/cache/size_estimation.rs b/apollo-router/src/cache/size_estimation.rs index 93db9ab95b..885e8d5c13 100644 --- a/apollo-router/src/cache/size_estimation.rs +++ b/apollo-router/src/cache/size_estimation.rs @@ -393,17 +393,46 @@ mod test { struct Test { string: String, u8: u8, + embedded: TestEmbedded, } + #[derive(Serialize)] + struct TestEmbedded { + string: String, + u8: u8, + } + + // Baseline let s = estimate_size(&Test { string: "".to_string(), u8: 0, + embedded: TestEmbedded { + string: "".to_string(), + u8: 0, + }, }); - assert_eq!(s, 25); + assert_eq!(s, 50); + + // Test modifying the root struct let s = estimate_size(&Test { string: "test".to_string(), u8: 0, + embedded: TestEmbedded { + string: "".to_string(), + u8: 0, + }, + }); + assert_eq!(s, 54); + + // Test modifying the embedded struct + let s = estimate_size(&Test { + string: "".to_string(), + u8: 0, + embedded: TestEmbedded { + string: "test".to_string(), + u8: 0, + }, }); - assert_eq!(s, 29); + assert_eq!(s, 54); } } diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index ce48d79939..8360215672 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -316,3 +316,44 @@ impl ValueType for usize { Some(std::mem::size_of::()) } } + +#[cfg(test)] +mod test { + use crate::cache::storage::{CacheStorage, ValueType}; + use crate::metrics::FutureMetricsExt; + use std::num::NonZeroUsize; + + #[tokio::test] + async fn test_metrics() { + #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] + struct Stuff {} + impl ValueType for Stuff { + fn estimated_size(&self) -> Option { + Some(1) + } + } + + async { + let cache: CacheStorage = + CacheStorage::new(NonZeroUsize::new(10).unwrap(), None, "test") + .await + .unwrap(); + + cache.insert("test".to_string(), Stuff {}).await; + assert_gauge!( + "apollo.router.cache.storage.estimated_size", + 1, + "kind" = "test", + "type" = "memory" + ); + assert_gauge!( + "apollo_router_cache_size", + 1, + "kind" = "test", + "type" = "memory" + ); + } + .with_metrics() + .await; + } +} From 6ebd9f7d9ea7ecdfa462652702610219515a14ed Mon Sep 17 00:00:00 2001 From: bryn Date: Mon, 5 Aug 2024 11:59:20 +0100 Subject: [PATCH 08/22] Add changelogs --- .changesets/feat_enhanced_observability.md | 5 +++++ .changesets/fix_missing_cache_gauge.md | 12 ++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 .changesets/feat_enhanced_observability.md create mode 100644 .changesets/fix_missing_cache_gauge.md diff --git a/.changesets/feat_enhanced_observability.md b/.changesets/feat_enhanced_observability.md new file mode 100644 index 0000000000..1bc91aeb78 --- /dev/null +++ b/.changesets/feat_enhanced_observability.md @@ -0,0 +1,5 @@ +### `apollo_router_cache_size` gauge sometimes missing ([PR #5770](https://github.com/apollographql/router/pull/5770)) + +This PR fixes the issue where the `apollo_router_cache_size` metric disappeared if the in memory cache was not mutated. + +By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/5770 diff --git a/.changesets/fix_missing_cache_gauge.md b/.changesets/fix_missing_cache_gauge.md new file mode 100644 index 0000000000..07cf993a90 --- /dev/null +++ b/.changesets/fix_missing_cache_gauge.md @@ -0,0 +1,12 @@ +### Missing cache gauge ([PR #5770](https://github.com/apollographql/router/pull/5770)) + +Query planner cache entries may use an significant amount of memory in the Router. +To help users understand and monitor this the Router now exposes a new metric `apollo.router.cache.storage.estimated_size`. + +This metric has the following attributes: +- `kind`: `query planner`. +- `storage`: `memory`. + +As the size is only an estimation, users should check for correlation with pod memory usage to determine if cache needs to be updated. + +By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/5770 From e9bc8f514831003550d7f1413bb92a236aa7959d Mon Sep 17 00:00:00 2001 From: bryn Date: Mon, 5 Aug 2024 12:16:07 +0100 Subject: [PATCH 09/22] Add test for metric absence where no estimated size given --- apollo-router/src/cache/storage.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index 8360215672..5ca3629213 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -356,4 +356,34 @@ mod test { .with_metrics() .await; } + + #[tokio::test] + #[should_panic] + async fn test_metrics_not_emitted_where_no_estimated_size() { + #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] + struct Stuff {} + impl ValueType for Stuff { + fn estimated_size(&self) -> Option { + None + } + } + + async { + let cache: CacheStorage = + CacheStorage::new(NonZeroUsize::new(10).unwrap(), None, "test") + .await + .unwrap(); + + cache.insert("test".to_string(), Stuff {}).await; + // This metric won't exist + assert_gauge!( + "apollo_router_cache_size", + 0, + "kind" = "test", + "type" = "memory" + ); + } + .with_metrics() + .await; + } } From 0d318064486f77a40702900ba863c82b11390abc Mon Sep 17 00:00:00 2001 From: bryn Date: Mon, 5 Aug 2024 12:20:47 +0100 Subject: [PATCH 10/22] Update docs --- .../telemetry/instrumentation/standard-instruments.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/configuration/telemetry/instrumentation/standard-instruments.mdx b/docs/source/configuration/telemetry/instrumentation/standard-instruments.mdx index 8accb624a8..62e71df7ba 100644 --- a/docs/source/configuration/telemetry/instrumentation/standard-instruments.mdx +++ b/docs/source/configuration/telemetry/instrumentation/standard-instruments.mdx @@ -38,6 +38,7 @@ These instruments can be consumed by configuring a [metrics exporter](../exporte - `apollo_router_cache_miss_count` - Number of cache misses - `apollo_router_cache_hit_time` - Time to hit the cache in seconds - `apollo_router_cache_miss_time` - Time to miss the cache in seconds +- `apollo.router.cache.storage.estimated_size` - The estimated storage size of the cache in bytes (query planner in memory only). All cache metrics listed above have the following attributes: From 3faddbed51f5ce1b19023cf524858487cbd7018c Mon Sep 17 00:00:00 2001 From: bryn Date: Mon, 5 Aug 2024 13:23:00 +0100 Subject: [PATCH 11/22] lint --- apollo-router/src/cache/storage.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index 5ca3629213..cc0e865cc2 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -319,10 +319,12 @@ impl ValueType for usize { #[cfg(test)] mod test { - use crate::cache::storage::{CacheStorage, ValueType}; - use crate::metrics::FutureMetricsExt; use std::num::NonZeroUsize; + use crate::cache::storage::CacheStorage; + use crate::cache::storage::ValueType; + use crate::metrics::FutureMetricsExt; + #[tokio::test] async fn test_metrics() { #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] From 2dc9bca743689314a21168c31067149de4385205 Mon Sep 17 00:00:00 2001 From: bryn Date: Mon, 5 Aug 2024 13:30:22 +0100 Subject: [PATCH 12/22] Remove uneeded clones --- apollo-router/src/cache/storage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index cc0e865cc2..22b8cfb2e4 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -267,7 +267,7 @@ where self.cache_estimated_storage .fetch_add(value.estimated_size().unwrap_or(0) as i64, Ordering::SeqCst); let mut in_memory = self.inner.lock().await; - if let Some((_, v)) = in_memory.push(key.clone(), value.clone()) { + if let Some((_, v)) = in_memory.push(key, value) { self.cache_estimated_storage .fetch_sub(v.estimated_size().unwrap_or(0) as i64, Ordering::SeqCst); } From e37c87ebc97cb588dfc1de831760df1d0ce1d68f Mon Sep 17 00:00:00 2001 From: bryn Date: Mon, 5 Aug 2024 13:50:20 +0100 Subject: [PATCH 13/22] Reduce lock scope and add more tests --- apollo-router/src/cache/storage.rs | 118 ++++++++++++++++++++++++++--- 1 file changed, 108 insertions(+), 10 deletions(-) diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index 22b8cfb2e4..dc509e4799 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -264,15 +264,24 @@ where { // Update the cache size and estimated storage size // This is cheaper than trying to estimate the cache storage size by iterating over the cache + let new_value_size = value.estimated_size().unwrap_or(0) as i64; + + let (old_value, length) = { + let mut in_memory = self.inner.lock().await; + (in_memory.push(key, value), in_memory.len()) + }; + + let size_delta = match old_value { + Some((_, old_value)) => { + let old_value_size = old_value.estimated_size().unwrap_or(0) as i64; + new_value_size - old_value_size + } + None => new_value_size, + }; self.cache_estimated_storage - .fetch_add(value.estimated_size().unwrap_or(0) as i64, Ordering::SeqCst); - let mut in_memory = self.inner.lock().await; - if let Some((_, v)) = in_memory.push(key, value) { - self.cache_estimated_storage - .fetch_sub(v.estimated_size().unwrap_or(0) as i64, Ordering::SeqCst); - } - self.cache_size - .store(in_memory.len() as i64, Ordering::SeqCst); + .fetch_add(size_delta, Ordering::SeqCst); + + self.cache_size.store(length as i64, Ordering::SeqCst); } pub(crate) fn in_memory_cache(&self) -> InMemoryCache { @@ -319,11 +328,11 @@ impl ValueType for usize { #[cfg(test)] mod test { - use std::num::NonZeroUsize; - + use crate::cache::estimate_size; use crate::cache::storage::CacheStorage; use crate::cache::storage::ValueType; use crate::metrics::FutureMetricsExt; + use std::num::NonZeroUsize; #[tokio::test] async fn test_metrics() { @@ -388,4 +397,93 @@ mod test { .with_metrics() .await; } + + #[tokio::test] + async fn test_metrics_eviction() { + #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] + struct Stuff { + test: String, + } + impl ValueType for Stuff { + fn estimated_size(&self) -> Option { + Some(estimate_size(self)) + } + } + + async { + // note that the cache size is 1 + // so the second insert will always evict + let cache: CacheStorage = + CacheStorage::new(NonZeroUsize::new(1).unwrap(), None, "test") + .await + .unwrap(); + + cache + .insert( + "test".to_string(), + Stuff { + test: "test".to_string(), + }, + ) + .await; + assert_gauge!( + "apollo.router.cache.storage.estimated_size", + 28, + "kind" = "test", + "type" = "memory" + ); + assert_gauge!( + "apollo_router_cache_size", + 1, + "kind" = "test", + "type" = "memory" + ); + + // Insert something slightly larger + cache + .insert( + "test".to_string(), + Stuff { + test: "test_extended".to_string(), + }, + ) + .await; + assert_gauge!( + "apollo.router.cache.storage.estimated_size", + 37, + "kind" = "test", + "type" = "memory" + ); + assert_gauge!( + "apollo_router_cache_size", + 1, + "kind" = "test", + "type" = "memory" + ); + + // Even though this is a new cache entry, we should get back to where we initially were + cache + .insert( + "test2".to_string(), + Stuff { + test: "test".to_string(), + }, + ) + .await; + assert_gauge!( + "apollo.router.cache.storage.estimated_size", + 28, + "kind" = "test", + "type" = "memory" + ); + assert_gauge!( + "apollo_router_cache_size", + 1, + "kind" = "test", + "type" = "memory" + ); + } + .with_metrics() + .await; + } } From d929c980ee14d78f7dcf5e83a7824d33de03bed2 Mon Sep 17 00:00:00 2001 From: bryn Date: Mon, 5 Aug 2024 13:54:28 +0100 Subject: [PATCH 14/22] Lint --- apollo-router/src/cache/storage.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apollo-router/src/cache/storage.rs b/apollo-router/src/cache/storage.rs index dc509e4799..7cfa37a0ad 100644 --- a/apollo-router/src/cache/storage.rs +++ b/apollo-router/src/cache/storage.rs @@ -328,11 +328,12 @@ impl ValueType for usize { #[cfg(test)] mod test { + use std::num::NonZeroUsize; + use crate::cache::estimate_size; use crate::cache::storage::CacheStorage; use crate::cache::storage::ValueType; use crate::metrics::FutureMetricsExt; - use std::num::NonZeroUsize; #[tokio::test] async fn test_metrics() { From b82c59dc45b7c4e2b8ced758cbcab785e80d8bdc Mon Sep 17 00:00:00 2001 From: bryn Date: Tue, 6 Aug 2024 10:42:10 +0100 Subject: [PATCH 15/22] Cache the size of query plans --- .../file_uploads/rearrange_query_plan.rs | 1 + .../src/query_planner/bridge_query_planner.rs | 1 + .../query_planner/caching_query_planner.rs | 3 +- apollo-router/src/query_planner/plan.rs | 30 +++++++++++++++++++ apollo-router/src/query_planner/tests.rs | 9 +++++- .../src/services/supergraph/service.rs | 1 + 6 files changed, 43 insertions(+), 2 deletions(-) diff --git a/apollo-router/src/plugins/file_uploads/rearrange_query_plan.rs b/apollo-router/src/plugins/file_uploads/rearrange_query_plan.rs index c7bfdc1ec4..22bcf3fdb6 100644 --- a/apollo-router/src/plugins/file_uploads/rearrange_query_plan.rs +++ b/apollo-router/src/plugins/file_uploads/rearrange_query_plan.rs @@ -45,6 +45,7 @@ pub(super) fn rearrange_query_plan( formatted_query_plan: query_plan.formatted_query_plan.clone(), query: query_plan.query.clone(), query_metrics: query_plan.query_metrics, + estimated_size: Default::default(), }) } diff --git a/apollo-router/src/query_planner/bridge_query_planner.rs b/apollo-router/src/query_planner/bridge_query_planner.rs index cdd1d710ef..4ccd23c287 100644 --- a/apollo-router/src/query_planner/bridge_query_planner.rs +++ b/apollo-router/src/query_planner/bridge_query_planner.rs @@ -606,6 +606,7 @@ impl BridgeQueryPlanner { formatted_query_plan, query: Arc::new(selections), query_metrics, + estimated_size: Default::default(), }), }) } diff --git a/apollo-router/src/query_planner/caching_query_planner.rs b/apollo-router/src/query_planner/caching_query_planner.rs index 297cda962c..9bf0877bd0 100644 --- a/apollo-router/src/query_planner/caching_query_planner.rs +++ b/apollo-router/src/query_planner/caching_query_planner.rs @@ -689,7 +689,7 @@ pub(crate) struct WarmUpCachingQueryKey { impl ValueType for Result> { fn estimated_size(&self) -> Option { match self { - Ok(QueryPlannerContent::Plan { plan }) => Some(estimate_size(plan)), + Ok(QueryPlannerContent::Plan { plan }) => Some(plan.estimated_size()), Ok(QueryPlannerContent::Response { response }) => Some(estimate_size(response)), Ok(QueryPlannerContent::IntrospectionDisabled) => None, Err(e) => Some(estimate_size(e)), @@ -848,6 +848,7 @@ mod tests { .into(), query: Arc::new(Query::empty()), query_metrics: Default::default(), + estimated_size: Default::default(), }; let qp_content = QueryPlannerContent::Plan { plan: Arc::new(query_plan), diff --git a/apollo-router/src/query_planner/plan.rs b/apollo-router/src/query_planner/plan.rs index bf4471e23b..447adb7ba7 100644 --- a/apollo-router/src/query_planner/plan.rs +++ b/apollo-router/src/query_planner/plan.rs @@ -1,3 +1,5 @@ +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering; use std::sync::Arc; use apollo_compiler::validation::Valid; @@ -9,6 +11,7 @@ use serde::Serialize; pub(crate) use self::fetch::OperationKind; use super::fetch; use super::subscription::SubscriptionNode; +use crate::cache::estimate_size; use crate::configuration::Batching; use crate::error::CacheResolverError; use crate::error::ValidationErrors; @@ -42,6 +45,10 @@ pub struct QueryPlan { pub(crate) formatted_query_plan: Option>, pub(crate) query: Arc, pub(crate) query_metrics: OperationLimits, + + /// The estimated size in bytes of the query plan + #[serde(default)] + pub(crate) estimated_size: Arc, } /// This default impl is useful for test users @@ -64,6 +71,7 @@ impl QueryPlan { formatted_query_plan: Default::default(), query: Arc::new(Query::empty()), query_metrics: Default::default(), + estimated_size: Default::default(), } } } @@ -89,6 +97,14 @@ impl QueryPlan { self.root .query_hashes(batching_config, operation, variables, &self.query) } + + pub(crate) fn estimated_size(&self) -> usize { + if self.estimated_size.load(Ordering::SeqCst) == 0 { + self.estimated_size + .store(estimate_size(self), Ordering::SeqCst); + } + self.estimated_size.load(Ordering::SeqCst) + } } /// Query plans are composed of a set of nodes. @@ -607,3 +623,17 @@ pub(crate) struct DeferredNode { pub(crate) struct Depends { pub(crate) id: String, } + +#[cfg(test)] +mod test { + use crate::query_planner::QueryPlan; + + #[test] + fn test_estimated_size() { + let query_plan = QueryPlan::fake_builder().build(); + let size1 = query_plan.estimated_size(); + let size2 = query_plan.estimated_size(); + assert!(size1 > 0); + assert_eq!(size1, size2); + } +} diff --git a/apollo-router/src/query_planner/tests.rs b/apollo-router/src/query_planner/tests.rs index fd7fb6d8b6..cfd44d6d08 100644 --- a/apollo-router/src/query_planner/tests.rs +++ b/apollo-router/src/query_planner/tests.rs @@ -87,6 +87,7 @@ async fn mock_subgraph_service_withf_panics_should_be_reported_as_service_closed referenced_fields_by_type: Default::default(), } .into(), + estimated_size: Default::default(), }; let mut mock_products_service = plugin::test::MockSubgraphService::new(); @@ -142,6 +143,7 @@ async fn fetch_includes_operation_name() { .into(), query: Arc::new(Query::empty()), query_metrics: Default::default(), + estimated_size: Default::default(), }; let succeeded: Arc = Default::default(); @@ -202,6 +204,7 @@ async fn fetch_makes_post_requests() { .into(), query: Arc::new(Query::empty()), query_metrics: Default::default(), + estimated_size: Default::default(), }; let succeeded: Arc = Default::default(); @@ -329,7 +332,8 @@ async fn defer() { referenced_fields_by_type: Default::default(), }.into(), query: Arc::new(Query::empty()), - query_metrics: Default::default() + query_metrics: Default::default(), + estimated_size: Default::default(), }; let mut mock_x_service = plugin::test::MockSubgraphService::new(); @@ -460,6 +464,7 @@ async fn defer_if_condition() { ), formatted_query_plan: None, query_metrics: Default::default(), + estimated_size: Default::default(), }; let mocked_accounts = MockSubgraph::builder() @@ -642,6 +647,7 @@ async fn dependent_mutations() { .into(), query: Arc::new(Query::empty()), query_metrics: Default::default(), + estimated_size: Default::default(), }; let mut mock_a_service = plugin::test::MockSubgraphService::new(); @@ -1826,6 +1832,7 @@ fn broken_plan_does_not_panic() { .into(), query: Arc::new(Query::empty()), query_metrics: Default::default(), + estimated_size: Default::default(), }; let subgraph_schema = apollo_compiler::Schema::parse_and_validate(subgraph_schema, "").unwrap(); let mut subgraph_schemas = HashMap::new(); diff --git a/apollo-router/src/services/supergraph/service.rs b/apollo-router/src/services/supergraph/service.rs index a5ea8403d5..dec84074f8 100644 --- a/apollo-router/src/services/supergraph/service.rs +++ b/apollo-router/src/services/supergraph/service.rs @@ -454,6 +454,7 @@ async fn subscription_task( formatted_query_plan: query_plan.formatted_query_plan.clone(), query: query_plan.query.clone(), query_metrics: query_plan.query_metrics, + estimated_size: Default::default(), }) }), _ => { From 4f26dedd68c3b6402768c2606da7f0b0052b65d1 Mon Sep 17 00:00:00 2001 From: bryn Date: Tue, 6 Aug 2024 11:39:05 +0100 Subject: [PATCH 16/22] Changelogs --- .changesets/feat_enhanced_observability.md | 28 +++++++++++++++++++--- .changesets/fix_missing_cache_gauge.md | 11 ++------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/.changesets/feat_enhanced_observability.md b/.changesets/feat_enhanced_observability.md index 1bc91aeb78..40fbe29823 100644 --- a/.changesets/feat_enhanced_observability.md +++ b/.changesets/feat_enhanced_observability.md @@ -1,5 +1,27 @@ -### `apollo_router_cache_size` gauge sometimes missing ([PR #5770](https://github.com/apollographql/router/pull/5770)) +### `apollo.router.cache.storage.estimated_size` gauge ([PR #5770](https://github.com/apollographql/router/pull/5770)) -This PR fixes the issue where the `apollo_router_cache_size` metric disappeared if the in memory cache was not mutated. +Query planner cache entries may use an significant amount of memory in the Router. +To help users understand and monitor this the Router now exposes a new metric `apollo.router.cache.storage.estimated_size`. -By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/5770 +This metric give an estimated size in bytes for the cache entry and has the following attributes: +- `kind`: `query planner`. +- `storage`: `memory`. + +As the size is only an estimation, users should check for correlation with pod memory usage to determine if cache needs to be updated. + +Usage scenario: +* Your pods are being terminated due to memory pressure. +* Add the following metrics to your monitoring system to track: + * `apollo.router.cache.storage.estimated_size`. + * `apollo_router_cache_size`. + * ratio of `apollo_router_cache_hits` - `apollo_router_cache_misses`. + +* Observe the `apollo.router.cache.storage.estimated_size` to see if it grows over time and correlates with pod memory usage. +* Observe the ratio of cache hits to misses to determine if the cache is being effective. + +Remediation: +* Adjust the cache size to lower if the cache reaches near 100% hit rate but the cache size is still growing. +* Increase the pod memory to higher if cache hit rate is low and cache size is still growing. +* Adjust the cache size to lower if the latency of query planning cache misses is acceptable and memory availability is limited. + +By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/5770 \ No newline at end of file diff --git a/.changesets/fix_missing_cache_gauge.md b/.changesets/fix_missing_cache_gauge.md index 07cf993a90..1bc91aeb78 100644 --- a/.changesets/fix_missing_cache_gauge.md +++ b/.changesets/fix_missing_cache_gauge.md @@ -1,12 +1,5 @@ -### Missing cache gauge ([PR #5770](https://github.com/apollographql/router/pull/5770)) +### `apollo_router_cache_size` gauge sometimes missing ([PR #5770](https://github.com/apollographql/router/pull/5770)) -Query planner cache entries may use an significant amount of memory in the Router. -To help users understand and monitor this the Router now exposes a new metric `apollo.router.cache.storage.estimated_size`. - -This metric has the following attributes: -- `kind`: `query planner`. -- `storage`: `memory`. - -As the size is only an estimation, users should check for correlation with pod memory usage to determine if cache needs to be updated. +This PR fixes the issue where the `apollo_router_cache_size` metric disappeared if the in memory cache was not mutated. By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/5770 From 0280873b7f911dc0789c16c3b390f25b700edb31 Mon Sep 17 00:00:00 2001 From: Edward Huang Date: Tue, 6 Aug 2024 20:51:19 -0700 Subject: [PATCH 17/22] add memory troubleshooting guide to kubernetes page --- docs/source/containerization/kubernetes.mdx | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/source/containerization/kubernetes.mdx b/docs/source/containerization/kubernetes.mdx index a2393c5225..591e30db0d 100644 --- a/docs/source/containerization/kubernetes.mdx +++ b/docs/source/containerization/kubernetes.mdx @@ -285,3 +285,26 @@ The gateway propagates subgraph errors to clients, but the router doesn't by def include_subgraph_errors: all: true ``` + +## Troubleshooting + +### Pods terminating due to memory pressure + +If your deployment of routers is terminating due to memory pressure, you can add router cache metrics to monitor and remediate your system: + +1. Add and track the following metrics to your monitoring system: + + * `apollo.router.cache.storage.estimated_size` + * `apollo_router_cache_size` + * ratio of `apollo_router_cache_hits` to `apollo_router_cache_misses` + +2. Observe and monitor the metrics: + + * Observe the `apollo.router.cache.storage.estimated_size` to see if it grows over time and correlates with pod memory usage. + * Observe the ratio of cache hits to misses to determine if the cache is being effective. + +3. Based on your observations, try some remediating adjustments: + + * Lower the cache size if the cache reaches near 100% hit-rate but the cache size is still growing. + * Increase the pod memory if the cache hit rate is low and the cache size is still growing. + * Lower the cache size if the latency of query planning cache misses is acceptable and memory availability is limited. From 01f64ce3ceec666461497a7a8a43e7fc4760633c Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Wed, 7 Aug 2024 14:06:03 +0100 Subject: [PATCH 18/22] Update .changesets/fix_missing_cache_gauge.md Co-authored-by: Edward Huang --- .changesets/fix_missing_cache_gauge.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.changesets/fix_missing_cache_gauge.md b/.changesets/fix_missing_cache_gauge.md index 1bc91aeb78..1b71523210 100644 --- a/.changesets/fix_missing_cache_gauge.md +++ b/.changesets/fix_missing_cache_gauge.md @@ -1,5 +1,5 @@ -### `apollo_router_cache_size` gauge sometimes missing ([PR #5770](https://github.com/apollographql/router/pull/5770)) +### Fix missing `apollo_router_cache_size` metric ([PR #5770](https://github.com/apollographql/router/pull/5770)) -This PR fixes the issue where the `apollo_router_cache_size` metric disappeared if the in memory cache was not mutated. +Previously, if the in-memory cache wasn't mutated, the `apollo_router_cache_size` metric wouldn't be available. This has been fixed in this release. By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/5770 From 6fe6dbbb477d5f084decaf519b6878852cf51f70 Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Wed, 7 Aug 2024 14:06:15 +0100 Subject: [PATCH 19/22] Update .changesets/feat_enhanced_observability.md Co-authored-by: Edward Huang --- .changesets/feat_enhanced_observability.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changesets/feat_enhanced_observability.md b/.changesets/feat_enhanced_observability.md index 40fbe29823..0485b796e2 100644 --- a/.changesets/feat_enhanced_observability.md +++ b/.changesets/feat_enhanced_observability.md @@ -7,7 +7,7 @@ This metric give an estimated size in bytes for the cache entry and has the foll - `kind`: `query planner`. - `storage`: `memory`. -As the size is only an estimation, users should check for correlation with pod memory usage to determine if cache needs to be updated. +Before using the estimate to decide whether to update the cache, users should validate that the estimate correlates with their pod's memory usage. Usage scenario: * Your pods are being terminated due to memory pressure. From 1bdaf8ff93039d5576728b0ec16a93b000192912 Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Wed, 7 Aug 2024 14:06:31 +0100 Subject: [PATCH 20/22] Update .changesets/feat_enhanced_observability.md Co-authored-by: Edward Huang --- .changesets/feat_enhanced_observability.md | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/.changesets/feat_enhanced_observability.md b/.changesets/feat_enhanced_observability.md index 0485b796e2..cae72b707d 100644 --- a/.changesets/feat_enhanced_observability.md +++ b/.changesets/feat_enhanced_observability.md @@ -10,18 +10,6 @@ This metric give an estimated size in bytes for the cache entry and has the foll Before using the estimate to decide whether to update the cache, users should validate that the estimate correlates with their pod's memory usage. Usage scenario: -* Your pods are being terminated due to memory pressure. -* Add the following metrics to your monitoring system to track: - * `apollo.router.cache.storage.estimated_size`. - * `apollo_router_cache_size`. - * ratio of `apollo_router_cache_hits` - `apollo_router_cache_misses`. - -* Observe the `apollo.router.cache.storage.estimated_size` to see if it grows over time and correlates with pod memory usage. -* Observe the ratio of cache hits to misses to determine if the cache is being effective. - -Remediation: -* Adjust the cache size to lower if the cache reaches near 100% hit rate but the cache size is still growing. -* Increase the pod memory to higher if cache hit rate is low and cache size is still growing. -* Adjust the cache size to lower if the latency of query planning cache misses is acceptable and memory availability is limited. +To learn how to troubleshoot with this metric, see the [Pods terminating due to memory pressure](https://www.apollographql.com/docs/router/containerization/kubernetes#pods-terminating-due-to-memory-pressure) guide in docs. By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/5770 \ No newline at end of file From 32954ef03422664a138d2d15ac7db2349c24f960 Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Wed, 7 Aug 2024 14:06:49 +0100 Subject: [PATCH 21/22] Update .changesets/feat_enhanced_observability.md Co-authored-by: Edward Huang --- .changesets/feat_enhanced_observability.md | 1 - 1 file changed, 1 deletion(-) diff --git a/.changesets/feat_enhanced_observability.md b/.changesets/feat_enhanced_observability.md index cae72b707d..5e6123e2d2 100644 --- a/.changesets/feat_enhanced_observability.md +++ b/.changesets/feat_enhanced_observability.md @@ -9,7 +9,6 @@ This metric give an estimated size in bytes for the cache entry and has the foll Before using the estimate to decide whether to update the cache, users should validate that the estimate correlates with their pod's memory usage. -Usage scenario: To learn how to troubleshoot with this metric, see the [Pods terminating due to memory pressure](https://www.apollographql.com/docs/router/containerization/kubernetes#pods-terminating-due-to-memory-pressure) guide in docs. By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/5770 \ No newline at end of file From 059f77e16cda928f2102cdd6ace29333585bc6b5 Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Wed, 7 Aug 2024 14:06:57 +0100 Subject: [PATCH 22/22] Update .changesets/feat_enhanced_observability.md Co-authored-by: Edward Huang --- .changesets/feat_enhanced_observability.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.changesets/feat_enhanced_observability.md b/.changesets/feat_enhanced_observability.md index 5e6123e2d2..703e0a6918 100644 --- a/.changesets/feat_enhanced_observability.md +++ b/.changesets/feat_enhanced_observability.md @@ -1,9 +1,8 @@ -### `apollo.router.cache.storage.estimated_size` gauge ([PR #5770](https://github.com/apollographql/router/pull/5770)) +### New `apollo.router.cache.storage.estimated_size` gauge ([PR #5770](https://github.com/apollographql/router/pull/5770)) -Query planner cache entries may use an significant amount of memory in the Router. -To help users understand and monitor this the Router now exposes a new metric `apollo.router.cache.storage.estimated_size`. +The router supports the new metric `apollo.router.cache.storage.estimated_size` that helps users understand and monitor the amount of memory that query planner cache entries consume. -This metric give an estimated size in bytes for the cache entry and has the following attributes: +The `apollo.router.cache.storage.estimated_size` metric gives an estimated size in bytes of a cache entry. It has the following attributes: - `kind`: `query planner`. - `storage`: `memory`.