Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .changesets/maint_renee_migrate_metrics_histograms.md

This file was deleted.

6 changes: 3 additions & 3 deletions .changesets/maint_renee_migrate_metrics_values.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
### Migrate gauge metrics to OTel instruments ([PR #6476](https://github.com/apollographql/router/pull/6476))
### Migrate various metrics to OTel instruments ([PR #6476](https://github.com/apollographql/router/pull/6476), [PR #6356](https://github.com/apollographql/router/pull/6356), [PR #6539](https://github.com/apollographql/router/pull/6539))

Updates gauge metrics using the legacy `tracing::info!(value.*)` syntax to OTel instruments.
Various metrics using our legacy mechanism based on the `tracing` crate are migrated to OTel instruments.

By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6476
By [@goto-bus-stop](https://github.com/goto-bus-stop) in https://github.com/apollographql/router/pull/6476, https://github.com/apollographql/router/pull/6356, https://github.com/apollographql/router/pull/6539
36 changes: 22 additions & 14 deletions apollo-router/src/axum_factory/listeners.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,19 @@ pub(super) fn serve_router_on_listen_addr(
all_connections_stopped_sender: mpsc::Sender<()>,
) -> (impl Future<Output = Listener>, oneshot::Sender<()>) {
let (shutdown_sender, shutdown_receiver) = oneshot::channel::<()>();

let meter = meter_provider().meter("apollo/router");
let total_session_count_instrument = meter
.u64_observable_gauge("apollo_router_session_count_total")
.with_description("Number of currently connected clients")
.with_callback(move |gauge| {
gauge.observe(
TOTAL_SESSION_COUNT.load(Ordering::Relaxed),
&[KeyValue::new("listener", address.to_string())],
);
})
.init();

// this server reproduces most of hyper::server::Server's behaviour
// we select over the stop_listen_receiver channel and the listener's
// accept future. If the channel received something or the sender
Expand All @@ -232,18 +245,6 @@ pub(super) fn serve_router_on_listen_addr(
let server = async move {
tokio::pin!(shutdown_receiver);

let _total_session_count_instrument = meter_provider()
.meter("apollo/router")
.u64_observable_gauge("apollo_router_session_count_total")
.with_description("Number of currently connected clients")
.with_callback(move |gauge| {
gauge.observe(
TOTAL_SESSION_COUNT.load(Ordering::Relaxed),
&[KeyValue::new("listener", address.to_string())],
);
})
.init();

let connection_shutdown = Arc::new(Notify::new());

loop {
Expand All @@ -263,13 +264,20 @@ pub(super) fn serve_router_on_listen_addr(
MAX_FILE_HANDLES_WARN.store(false, Ordering::SeqCst);
}

// We only want to recognise sessions if we are the main graphql port.
let _guard = main_graphql_port.then(TotalSessionCountGuard::start);
// The session count instrument must be kept alive as long as any
// request is in flight. So its lifetime is not related to the server
// itself. The simplest way to do this is to hold onto a reference for
// the duration of every request.
let session_count_instrument = total_session_count_instrument.clone();
// We only want to count sessions if we are the main graphql port.
let session_count_guard = main_graphql_port.then(TotalSessionCountGuard::start);

let mut http_config = http_config.clone();
tokio::task::spawn(async move {
// this sender must be moved into the session to track that it is still running
let _connection_stop_signal = connection_stop_signal;
let _session_count_instrument = session_count_instrument;
let _session_count_guard = session_count_guard;

match res {
NetworkStream::Tcp(stream) => {
Expand Down
93 changes: 93 additions & 0 deletions apollo-router/src/axum_factory/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ use crate::graphql;
use crate::http_server_factory::HttpServerFactory;
use crate::http_server_factory::HttpServerHandle;
use crate::json_ext::Path;
use crate::metrics::FutureMetricsExt as _;
use crate::plugin::test::MockSubgraph;
use crate::query_planner::QueryPlannerService;
use crate::router_factory::create_plugins;
Expand Down Expand Up @@ -2351,3 +2352,95 @@ async fn test_supergraph_timeout() {
})
);
}

/// The router exposes two session-count gauges:
/// - apollo_router_session_count_total: the number of open client connections
/// - apollo_router_session_count_active: the number of in-flight HTTP requests
///
/// We exercise them with two separate hyper clients. Each client owns its own
/// connection pool, so by issuing requests, draining responses, and dropping clients
/// at chosen points we control both how many connections are open and how many
/// requests are in flight.
///
/// XXX(@goto-bus-stop): only the `session_count_total` metric is covered here. The
/// `session_count_active` metric is reported from inside an axum router, so its
/// lifetime is a little shorter than the full request/response cycle, which makes it
/// hard to observe from the outside. Testing it would need a custom inner service
/// (passed to `init_with_config`) whose progress we can control. For now,
/// `session_count_active` has been checked manually and its value makes sense...
#[tokio::test]
async fn it_reports_session_count_metric() {
    let configuration = Configuration::fake_builder().build().unwrap();

    async {
        let (server, _client) = init_with_config(
            router::service::empty().await,
            Arc::new(configuration),
            MultiMap::new(),
        )
        .await
        .unwrap();

        let listen_address = server
            .graphql_listen_address()
            .as_ref()
            .unwrap()
            .to_string();

        let build_request = || {
            http::Request::builder()
                .uri(&listen_address)
                .body(hyper::Body::from(r#"{ "query": "{ me }" }"#))
                .unwrap()
        };

        // Two clients, so two independent connection pools.
        let pool_one = hyper::Client::new();
        let pool_two = hyper::Client::new();

        let response_one = pool_one.request(build_request()).await.unwrap();

        assert_gauge!(
            "apollo_router_session_count_total",
            1,
            "listener" = listen_address.clone()
        );

        let response_two = pool_two.request(build_request()).await.unwrap();

        // Both requests are in-flight, each on its own connection.
        assert_gauge!(
            "apollo_router_session_count_total",
            2,
            "listener" = listen_address.clone()
        );

        _ = hyper::body::to_bytes(response_one.into_body()).await;

        // Completing the first request does not close its pooled connection.
        assert_gauge!(
            "apollo_router_session_count_total",
            2,
            "listener" = listen_address.clone()
        );

        _ = hyper::body::to_bytes(response_two.into_body()).await;

        // Dropping the clients tears down both connection pools.
        drop(pool_one);
        drop(pool_two);

        // XXX(@goto-bus-stop): not ideal, but controlling connection shutdown to the
        // required extent would mean dropping down to very low-level hyper
        // primitives. 100ms is a long time so I hope it's not flaky.
        tokio::time::sleep(Duration::from_millis(100)).await;

        // Every connection is now closed.
        assert_gauge!(
            "apollo_router_session_count_total",
            0,
            "listener" = listen_address.clone()
        );
    }
    .with_metrics()
    .await;
}