diff --git a/.changesets/maint_renee_migrate_metrics_histograms.md b/.changesets/maint_renee_migrate_metrics_histograms.md deleted file mode 100644 index b78b99d872..0000000000 --- a/.changesets/maint_renee_migrate_metrics_histograms.md +++ /dev/null @@ -1,5 +0,0 @@ -### Migrate histogram metrics to `{f,u}64_histogram!` ([PR #6356](https://github.com/apollographql/router/pull/6356)) - -Updates histogram metrics using the legacy `tracing::info!(histogram.*)` syntax to the new metrics macros. - -By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6356 \ No newline at end of file diff --git a/.changesets/maint_renee_migrate_metrics_values.md b/.changesets/maint_renee_migrate_metrics_values.md index 2bb0ee2e23..19ab1e9815 100644 --- a/.changesets/maint_renee_migrate_metrics_values.md +++ b/.changesets/maint_renee_migrate_metrics_values.md @@ -1,5 +1,5 @@ -### Migrate gauge metrics to OTel instruments ([PR #6476](https://github.com/apollographql/router/pull/6476)) +### Migrate various metrics to OTel instruments ([PR #6476](https://github.com/apollographql/router/pull/6476), [PR #6356](https://github.com/apollographql/router/pull/6356), [PR #6539](https://github.com/apollographql/router/pull/6539)) -Updates gauge metrics using the legacy `tracing::info!(value.*)` syntax to OTel instruments. +Various metrics using our legacy mechanism based on the `tracing` crate are migrated to OTel instruments. 
-By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6476 \ No newline at end of file +By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6476, https://github.com/apollographql/router/pull/6356, https://github.com/apollographql/router/pull/6539 diff --git a/apollo-router/src/axum_factory/listeners.rs b/apollo-router/src/axum_factory/listeners.rs index 17881d47bb..d691d84c85 100644 --- a/apollo-router/src/axum_factory/listeners.rs +++ b/apollo-router/src/axum_factory/listeners.rs @@ -224,6 +224,19 @@ pub(super) fn serve_router_on_listen_addr( all_connections_stopped_sender: mpsc::Sender<()>, ) -> (impl Future, oneshot::Sender<()>) { let (shutdown_sender, shutdown_receiver) = oneshot::channel::<()>(); + + let meter = meter_provider().meter("apollo/router"); + let total_session_count_instrument = meter + .u64_observable_gauge("apollo_router_session_count_total") + .with_description("Number of currently connected clients") + .with_callback(move |gauge| { + gauge.observe( + TOTAL_SESSION_COUNT.load(Ordering::Relaxed), + &[KeyValue::new("listener", address.to_string())], + ); + }) + .init(); + // this server reproduces most of hyper::server::Server's behaviour // we select over the stop_listen_receiver channel and the listener's // accept future. 
If the channel received something or the sender @@ -232,18 +245,6 @@ pub(super) fn serve_router_on_listen_addr( let server = async move { tokio::pin!(shutdown_receiver); - let _total_session_count_instrument = meter_provider() - .meter("apollo/router") - .u64_observable_gauge("apollo_router_session_count_total") - .with_description("Number of currently connected clients") - .with_callback(move |gauge| { - gauge.observe( - TOTAL_SESSION_COUNT.load(Ordering::Relaxed), - &[KeyValue::new("listener", address.to_string())], - ); - }) - .init(); - let connection_shutdown = Arc::new(Notify::new()); loop { @@ -263,13 +264,20 @@ pub(super) fn serve_router_on_listen_addr( MAX_FILE_HANDLES_WARN.store(false, Ordering::SeqCst); } - // We only want to recognise sessions if we are the main graphql port. - let _guard = main_graphql_port.then(TotalSessionCountGuard::start); + // The session count instrument must be kept alive as long as any + // request is in flight. So its lifetime is not related to the server + // itself. The simplest way to do this is to hold onto a reference for + // the duration of every request. + let session_count_instrument = total_session_count_instrument.clone(); + // We only want to count sessions if we are the main graphql port. 
+ let session_count_guard = main_graphql_port.then(TotalSessionCountGuard::start); let mut http_config = http_config.clone(); tokio::task::spawn(async move { // this sender must be moved into the session to track that it is still running let _connection_stop_signal = connection_stop_signal; + let _session_count_instrument = session_count_instrument; + let _session_count_guard = session_count_guard; match res { NetworkStream::Tcp(stream) => { diff --git a/apollo-router/src/axum_factory/tests.rs b/apollo-router/src/axum_factory/tests.rs index 7c9c585663..9fe6f489cb 100644 --- a/apollo-router/src/axum_factory/tests.rs +++ b/apollo-router/src/axum_factory/tests.rs @@ -61,6 +61,7 @@ use crate::graphql; use crate::http_server_factory::HttpServerFactory; use crate::http_server_factory::HttpServerHandle; use crate::json_ext::Path; +use crate::metrics::FutureMetricsExt as _; use crate::plugin::test::MockSubgraph; use crate::query_planner::QueryPlannerService; use crate::router_factory::create_plugins; @@ -2351,3 +2352,95 @@ async fn test_supergraph_timeout() { }) ); } + +/// There are two session count gauges: +/// - apollo_router_session_count_total, the number of open client connections +/// - apollo_router_session_count_active, the number of in-flight HTTP requests +/// +/// To test them, we use two hyper clients. Each client has its own connection pool, so by manually +/// sending requests and completing responses, and creating and dropping clients, we can control +/// the number of open connections and the number of in-flight requests. +/// +/// XXX(@goto-bus-stop): this only tests the `session_count_total` metric right now. The +/// `session_count_active` metric is reported from inside an axum router, so its lifetime is +/// actually a little shorter than the full request/response cycle, in a way that is not easy to
To test it we could use a custom inner service (passed to +/// `init_with_config`) where we can control the progress the inner service makes. +/// For now, I've tested the `session_count_active` metric manually and confirmed its value makes +/// sense... +#[tokio::test] +async fn it_reports_session_count_metric() { + let configuration = Configuration::fake_builder().build().unwrap(); + + async { + let (server, _client) = init_with_config( + router::service::empty().await, + Arc::new(configuration), + MultiMap::new(), + ) + .await + .unwrap(); + + let url = server + .graphql_listen_address() + .as_ref() + .unwrap() + .to_string(); + + let make_request = || { + http::Request::builder() + .uri(&url) + .body(hyper::Body::from(r#"{ "query": "{ me }" }"#)) + .unwrap() + }; + + let client = hyper::Client::new(); + // Create a second client that does not reuse the same connection pool. + let second_client = hyper::Client::new(); + + let first_response = client.request(make_request()).await.unwrap(); + + assert_gauge!( + "apollo_router_session_count_total", + 1, + "listener" = url.clone() + ); + + let second_response = second_client.request(make_request()).await.unwrap(); + + // Both requests are in-flight + assert_gauge!( + "apollo_router_session_count_total", + 2, + "listener" = url.clone() + ); + + _ = hyper::body::to_bytes(first_response.into_body()).await; + + // Connection is still open in the pool even though the request is complete. + assert_gauge!( + "apollo_router_session_count_total", + 2, + "listener" = url.clone() + ); + + _ = hyper::body::to_bytes(second_response.into_body()).await; + + drop(client); + drop(second_client); + + // XXX(@goto-bus-stop): Not ideal, but we would probably have to drop down to very + // low-level hyper primitives to control the shutdown of connections to the required + // extent. 100ms is a long time so I hope it's not flaky. 
+ tokio::time::sleep(Duration::from_millis(100)).await; + + // All connections are closed + assert_gauge!( + "apollo_router_session_count_total", + 0, + "listener" = url.clone() + ); + } + .with_metrics() + .await; +}