refactor: move metrics parsing options and rename HTTP queue metric

keivenchang · keivenchang · commit 1244cd35488e · 2025-09-10T01:44:51.000Z
- Move get_parsing_options from grpc service to http metrics module
- Rename HTTP_QUEUE to HTTP_QUEUED_REQUESTS for clarity; the metric name string changes from "http_queue" to "http_queued_requests"
- Update the metric's doc comment to specify 'first response' timing and drop the note about engine pool exhaustion

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
diff --git a/lib/llm/src/grpc/service/kserve.rs b/lib/llm/src/grpc/service/kserve.rs
@@ -21,7 +21,8 @@ use tokio::task::JoinHandle;
 use tokio_stream::{Stream, StreamExt};
 use tokio_util::sync::CancellationToken;
 
-use crate::grpc::service::openai::{completion_response_stream, get_parsing_options};
+use crate::grpc::service::openai::completion_response_stream;
+use crate::http::service::metrics::get_parsing_options;
 use tonic::{Request, Response, Status, transport::Server};
 
 use crate::protocols::openai::completions::{
diff --git a/lib/runtime/src/metrics/prometheus_names.rs b/lib/runtime/src/metrics/prometheus_names.rs
@@ -45,9 +45,8 @@ pub mod frontend_service {
     /// Total number of LLM requests processed
     pub const REQUESTS_TOTAL: &str = "requests_total";
 
-    /// Number of requests waiting in HTTP queue before receiving a response.
-    /// This can measure the engine pool exhaustion if the engine is not able to process requests fast enough.
-    pub const HTTP_QUEUE: &str = "http_queue";
+    /// Number of requests waiting in HTTP queue before receiving the first response.
+    pub const HTTP_QUEUED_REQUESTS: &str = "http_queued_requests";
 
     /// Number of inflight requests going to the engine (vLLM, SGLang, ...)
     pub const INFLIGHT_REQUESTS: &str = "inflight_requests";