diff --git a/sgl-model-gateway/src/core/worker_registry.rs b/sgl-model-gateway/src/core/worker_registry.rs
index be79597719f5..f78663427c59 100644
--- a/sgl-model-gateway/src/core/worker_registry.rs
+++ b/sgl-model-gateway/src/core/worker_registry.rs
@@ -8,6 +8,7 @@ use dashmap::DashMap;
 use uuid::Uuid;
 
 use crate::core::{ConnectionMode, RuntimeType, Worker, WorkerType};
+use crate::observability::metrics::RouterMetrics;
 
 /// Unique identifier for a worker
 #[derive(Debug, Clone, Hash, Eq, PartialEq)]
@@ -138,8 +139,8 @@ impl WorkerRegistry {
                 conn_workers.retain(|id| id != worker_id);
             }
 
-            // TODO we may even remove it from Prometheus exports
             worker.set_healthy(false);
+            RouterMetrics::remove_worker_metrics(worker.url());
 
             Some(worker)
         } else {
diff --git a/sgl-model-gateway/src/observability/metrics.rs b/sgl-model-gateway/src/observability/metrics.rs
index 2fa4fcb6641b..c8d8ba11b1a2 100644
--- a/sgl-model-gateway/src/observability/metrics.rs
+++ b/sgl-model-gateway/src/observability/metrics.rs
@@ -536,6 +536,20 @@ impl RouterMetrics {
             .increment(1);
     }
 
+    /// Sets every per-worker gauge labelled `worker => worker_url` to `0.0`.
+    ///
+    /// Affects `sgl_router_cb_state`, `sgl_router_worker_health`,
+    /// `sgl_router_worker_load`, `sgl_router_running_requests` and
+    /// `sgl_router_tree_size`. The gauges are reset, not deleted.
+    // TODO delete the metrics (instead of setting them to zero)
+    pub fn remove_worker_metrics(worker_url: &str) {
+        gauge!("sgl_router_cb_state", "worker" => worker_url.to_string()).set(0.0);
+        gauge!("sgl_router_worker_health", "worker" => worker_url.to_string()).set(0.0);
+        gauge!("sgl_router_worker_load", "worker" => worker_url.to_string()).set(0.0);
+        gauge!("sgl_router_running_requests", "worker" => worker_url.to_string()).set(0.0);
+        gauge!("sgl_router_tree_size", "worker" => worker_url.to_string()).set(0.0);
+    }
+
     pub fn set_job_queue_depth(depth: usize) {
         gauge!("sgl_router_job_queue_depth").set(depth as f64);
     }