From c141a07695a8b70f5e4db74ce931d01f29ca0958 Mon Sep 17 00:00:00 2001 From: Gary Pennington Date: Fri, 16 Aug 2024 09:46:25 +0100 Subject: [PATCH 1/4] Fix session counting and the reporting of file handle shortage Session counting incorrectly included connections to the health check or other non-graphql connections. This is now corrected so that only connections to the main graphql port are counted. Warnings about file handle shortages are now handled correctly as a global resource. --- .../axum_factory/axum_http_server_factory.rs | 2 + apollo-router/src/axum_factory/listeners.rs | 78 ++++++++++++------- 2 files changed, 52 insertions(+), 28 deletions(-) diff --git a/apollo-router/src/axum_factory/axum_http_server_factory.rs b/apollo-router/src/axum_factory/axum_http_server_factory.rs index f687440f4c..08df933dc6 100644 --- a/apollo-router/src/axum_factory/axum_http_server_factory.rs +++ b/apollo-router/src/axum_factory/axum_http_server_factory.rs @@ -303,6 +303,7 @@ impl HttpServerFactory for AxumHttpServerFactory { main_listener, actual_main_listen_address.clone(), all_routers.main.1, + true, all_connections_stopped_sender.clone(), ); @@ -341,6 +342,7 @@ impl HttpServerFactory for AxumHttpServerFactory { listener, listen_addr.clone(), router, + false, all_connections_stopped_sender.clone(), ); ( diff --git a/apollo-router/src/axum_factory/listeners.rs b/apollo-router/src/axum_factory/listeners.rs index 6be06acdbd..01796217d1 100644 --- a/apollo-router/src/axum_factory/listeners.rs +++ b/apollo-router/src/axum_factory/listeners.rs @@ -15,6 +15,7 @@ use futures::channel::oneshot; use futures::prelude::*; use hyper::server::conn::Http; use multimap::MultiMap; +use parking_lot::Mutex; #[cfg(unix)] use tokio::net::UnixListener; use tokio::sync::mpsc; @@ -31,7 +32,8 @@ use crate::router::ApolloRouterError; use crate::router_factory::Endpoint; use crate::ListenAddr; -pub(crate) static SESSION_COUNT: AtomicU64 = AtomicU64::new(0); +static MAX_OPEN_FILE_WARNING: Mutex> = Mutex::new(None); +static SESSION_COUNT: AtomicU64 = AtomicU64::new(0); #[derive(Clone, Debug)] pub(crate) struct ListenAddrAndRouter(pub(crate) ListenAddr, pub(crate) Router); @@ -197,10 +199,34 @@ pub(super) async fn get_extra_listeners( Ok(listeners_and_routers) } +async fn check_open_files() { + { + let mut max_open_file_warning = MAX_OPEN_FILE_WARNING.lock(); + match *max_open_file_warning { + None => { + tracing::error!( + "reached the max open file limit, cannot accept any new connection" + ); + *max_open_file_warning = Some(Instant::now()); + } + Some(last) => { + if Instant::now() - last > Duration::from_secs(60) { + tracing::error!( + "still at the max open file limit, cannot accept any new connection" + ); + *max_open_file_warning = Some(Instant::now()); + } + } + } + } + tokio::time::sleep(Duration::from_millis(1)).await; +} + pub(super) fn serve_router_on_listen_addr( mut listener: Listener, address: ListenAddr, router: axum::Router, + main_graphql_port: bool, all_connections_stopped_sender: mpsc::Sender<()>, ) -> (impl Future, oneshot::Sender<()>) { let (shutdown_sender, shutdown_receiver) = oneshot::channel::<()>(); @@ -213,7 +239,6 @@ pub(super) fn serve_router_on_listen_addr( tokio::pin!(shutdown_receiver); let connection_shutdown = Arc::new(Notify::new()); - let mut max_open_file_warning = None; let address = address.to_string(); @@ -229,16 +254,21 @@ pub(super) fn serve_router_on_listen_addr( match res { Ok(res) => { - if max_open_file_warning.is_some(){ - tracing::info!("can accept connections again"); - max_open_file_warning = None; + { + let mut max_open_file_warning = MAX_OPEN_FILE_WARNING.lock(); + if max_open_file_warning.is_some() { + tracing::info!("can accept connections again"); + *max_open_file_warning = None; + } + } + // We only want to recognise sessions if we are the main graphql port. + if main_graphql_port { + let session_count = SESSION_COUNT.fetch_add(1, Ordering::Acquire)+1; + tracing::info!( + value.apollo_router_session_count_total = session_count, + listener = &address + ); } - - let session_count = SESSION_COUNT.fetch_add(1, Ordering::Acquire)+1; - tracing::info!( - value.apollo_router_session_count_total = session_count, - listener = &address - ); let address = address.clone(); tokio::task::spawn(async move { @@ -356,12 +386,14 @@ pub(super) fn serve_router_on_listen_addr( } } - let session_count = SESSION_COUNT.fetch_sub(1, Ordering::Acquire)-1; - tracing::info!( - value.apollo_router_session_count_total = session_count, - listener = &address - ); - + // We only want to recognise sessions if we are the main graphql port. + if main_graphql_port { + let session_count = SESSION_COUNT.fetch_sub(1, Ordering::Acquire)-1; + tracing::info!( + value.apollo_router_session_count_total = session_count, + listener = &address + ); + } }); } @@ -419,17 +451,7 @@ pub(super) fn serve_router_on_listen_addr( _ => { match e.raw_os_error() { Some(libc::EMFILE) | Some(libc::ENFILE) => { - match max_open_file_warning { - None => { - tracing::error!("reached the max open file limit, cannot accept any new connection"); - max_open_file_warning = Some(Instant::now()); - } - Some(last) => if Instant::now() - last > Duration::from_secs(60) { - tracing::error!("still at the max open file limit, cannot accept any new connection"); - max_open_file_warning = Some(Instant::now()); - } - } - tokio::time::sleep(Duration::from_millis(1)).await; + check_open_files().await; } _ => {} } From 6d20709b5eadca8687a5101ec7d42f6183cd09e3 Mon Sep 17 00:00:00 2001 From: Gary Pennington Date: Fri, 16 Aug 2024 09:58:58 +0100 Subject: [PATCH 2/4] add a changeset --- .../fix_garypen_fix_sessions_and_handle_reporting.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .changesets/fix_garypen_fix_sessions_and_handle_reporting.md diff --git a/.changesets/fix_garypen_fix_sessions_and_handle_reporting.md b/.changesets/fix_garypen_fix_sessions_and_handle_reporting.md new file mode 100644 index 0000000000..f9527320e6 --- /dev/null +++ b/.changesets/fix_garypen_fix_sessions_and_handle_reporting.md @@ -0,0 +1,7 @@ +### Fix session counting and the reporting of file handle shortage ([PR #5834](https://github.com/apollographql/router/pull/5834)) + +Session counting incorrectly included connections to the health check or other non-graphql connections. This is now corrected so that only connections to the main graphql port are counted. + +Warnings about file handle shortages are now handled correctly as a global resource. + +By [@garypen](https://github.com/garypen) in https://github.com/apollographql/router/pull/5834 \ No newline at end of file From 853b70c6db165c90aad52bf1bd2f185d632b988f Mon Sep 17 00:00:00 2001 From: Gary Pennington Date: Fri, 16 Aug 2024 12:19:21 +0100 Subject: [PATCH 3/4] Remove custom log rate limiting from listener The listener had its own custom rate limiting for log messages. The router comes with a standard mechanism for rate limiting log messages. Remove the custom rate limiting mechanism and advised in the changelog that standard rate limiting configuration should be used. --- apollo-router/src/axum_factory/listeners.rs | 42 +++++---------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/apollo-router/src/axum_factory/listeners.rs b/apollo-router/src/axum_factory/listeners.rs index 01796217d1..dad439317c 100644 --- a/apollo-router/src/axum_factory/listeners.rs +++ b/apollo-router/src/axum_factory/listeners.rs @@ -7,7 +7,6 @@ use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; use std::time::Duration; -use std::time::Instant; use axum::response::*; use axum::Router; @@ -15,7 +14,6 @@ use futures::channel::oneshot; use futures::prelude::*; use hyper::server::conn::Http; use multimap::MultiMap; -use parking_lot::Mutex; #[cfg(unix)] use tokio::net::UnixListener; use tokio::sync::mpsc; @@ -32,8 +30,8 @@ use crate::router::ApolloRouterError; use crate::router_factory::Endpoint; use crate::ListenAddr; -static MAX_OPEN_FILE_WARNING: Mutex> = Mutex::new(None); static SESSION_COUNT: AtomicU64 = AtomicU64::new(0); +static MAX_FILE_HANDLES_WARN: AtomicBool = AtomicBool::new(false); #[derive(Clone, Debug)] pub(crate) struct ListenAddrAndRouter(pub(crate) ListenAddr, pub(crate) Router); @@ -199,29 +197,6 @@ pub(super) async fn get_extra_listeners( Ok(listeners_and_routers) } -async fn check_open_files() { - { - let mut max_open_file_warning = MAX_OPEN_FILE_WARNING.lock(); - match *max_open_file_warning { - None => { - tracing::error!( - "reached the max open file limit, cannot accept any new connection" - ); - *max_open_file_warning = Some(Instant::now()); - } - Some(last) => { - if Instant::now() - last > Duration::from_secs(60) { - tracing::error!( - "still at the max open file limit, cannot accept any new connection" - ); - *max_open_file_warning = Some(Instant::now()); - } - } - } - } - tokio::time::sleep(Duration::from_millis(1)).await; -} - pub(super) fn serve_router_on_listen_addr( mut listener: Listener, address: ListenAddr, @@ -254,12 +229,9 @@ pub(super) fn serve_router_on_listen_addr( match res { Ok(res) => { - { - let mut max_open_file_warning = MAX_OPEN_FILE_WARNING.lock(); - if max_open_file_warning.is_some() { - tracing::info!("can accept connections again"); - *max_open_file_warning = None; - } + if MAX_FILE_HANDLES_WARN.load(Ordering::SeqCst) { + tracing::info!("can accept connections again"); + MAX_FILE_HANDLES_WARN.store(false, Ordering::SeqCst); } // We only want to recognise sessions if we are the main graphql port. if main_graphql_port { @@ -451,7 +423,11 @@ pub(super) fn serve_router_on_listen_addr( _ => { match e.raw_os_error() { Some(libc::EMFILE) | Some(libc::ENFILE) => { - check_open_files().await; + tracing::error!( + "reached the max open file limit, cannot accept any new connection" + ); + MAX_FILE_HANDLES_WARN.store(true, Ordering::SeqCst); + tokio::time::sleep(Duration::from_millis(1)).await; } _ => {} } From d6114d81aef2f94e6b1bb728fa44286cbd7058b1 Mon Sep 17 00:00:00 2001 From: Gary Pennington Date: Fri, 16 Aug 2024 12:22:42 +0100 Subject: [PATCH 4/4] Update changeset message to reflect changes --- .changesets/fix_garypen_fix_sessions_and_handle_reporting.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.changesets/fix_garypen_fix_sessions_and_handle_reporting.md b/.changesets/fix_garypen_fix_sessions_and_handle_reporting.md index f9527320e6..cb77b5aa99 100644 --- a/.changesets/fix_garypen_fix_sessions_and_handle_reporting.md +++ b/.changesets/fix_garypen_fix_sessions_and_handle_reporting.md @@ -4,4 +4,6 @@ Session counting incorrectly included connections to the health check or other n Warnings about file handle shortages are now handled correctly as a global resource. -By [@garypen](https://github.com/garypen) in https://github.com/apollographql/router/pull/5834 \ No newline at end of file +The listening logic had its own custom rate limiting notifications. This has been removed and log notification is now controlled by the [standard router log rate limiting configuration](https://www.apollographql.com/docs/router/configuration/telemetry/exporters/logging/stdout/#rate_limit) + +By [@garypen](https://github.com/garypen) in https://github.com/apollographql/router/pull/5834