diff --git a/Cargo.lock b/Cargo.lock
index 83994db565..2aca376ba2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4642,25 +4642,6 @@ dependencies = [
"paste",
]
-[[package]]
-name = "metrics"
-version = "0.5.1"
-dependencies = [
- "axum 0.8.4",
- "clap 4.5.48",
- "dynamo-llm",
- "dynamo-runtime",
- "futures",
- "prometheus",
- "rand 0.9.2",
- "reqwest 0.12.23",
- "serde",
- "serde_json",
- "thiserror 2.0.16",
- "tokio",
- "tracing",
-]
-
[[package]]
name = "mime"
version = "0.3.17"
diff --git a/Cargo.toml b/Cargo.toml
index 3512e76bc1..48c07b1f96 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,6 @@
[workspace]
members = [
- "components/metrics",
"launch/dynamo-run",
"lib/llm",
"lib/runtime",
@@ -18,7 +17,6 @@ members = [
# - launch/dynamo-run
# - lib/engines/*
default-members = [
- "components/metrics",
"lib/llm",
"lib/runtime",
"lib/tokens",
diff --git a/components/metrics/Cargo.toml b/components/metrics/Cargo.toml
deleted file mode 100644
index 7167a3ee37..0000000000
--- a/components/metrics/Cargo.toml
+++ /dev/null
@@ -1,40 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-[package]
-name = "metrics"
-version.workspace = true
-edition.workspace = true
-authors.workspace = true
-license.workspace = true
-homepage.workspace = true
-repository.workspace = true
-
-[dependencies]
-dynamo-llm = { workspace = true }
-dynamo-runtime = { workspace = true }
-
-futures = { workspace = true }
-prometheus = { workspace = true }
-rand = { workspace = true }
-serde = { workspace = true }
-serde_json = { workspace = true }
-thiserror = { workspace = true }
-tokio = { workspace = true }
-tracing = { workspace = true }
-
-axum = { version = "0.8" }
-clap = { version = "4.5", features = ["derive", "env"] }
-reqwest = { version = "0.12.22", default-features = false, features = ["json", "rustls-tls"] }
diff --git a/components/metrics/README.md b/components/metrics/README.md
deleted file mode 100644
index 7261e387e3..0000000000
--- a/components/metrics/README.md
+++ /dev/null
@@ -1,191 +0,0 @@
-# Metrics
-
-⚠️ **DEPRECATION NOTICE** ⚠️
-
-**This `metrics` component is unmaintained and being deprecated.**
-
-The deprecated `metrics` component is being replaced by the **`MetricsRegistry`** built-in functionality that is now available directly in the `DistributedRuntime` framework. The `MetricsRegistry` provides:
-
-**For new projects and existing deployments, please migrate to using `MetricsRegistry` instead of this component.**
-
-This component may be migrated to the MetricsRegistry in the future.
-
-**📖 See the [Dynamo MetricsRegistry Guide](../../docs/guides/metrics.md) for detailed information on using the new metrics system.**
-
----
-
-The deprecated `metrics` component is a utility for collecting, aggregating, and publishing metrics from a Dynamo deployment, but it is unmaintained and being deprecated in favor of `MetricsRegistry`.
-
-**Note**: This is a demo implementation. The deprecated `metrics` component is no longer under active development.
-- In this demo the metrics names use the prefix "llm", but in production they will be prefixed with "dynamo" (e.g., the HTTP `/metrics` endpoint will serve metrics with "dynamo" prefixes)
-
-
-

-
-
-## Quickstart
-
-To start the deprecated `metrics` component, simply point it at the `namespace/component/endpoint`
-trio for the Dynamo workers that you're interested in monitoring metrics on.
-
-This will:
-1. Collect statistics from workers associated with that `namespace/component/endpoint`
-2. Postprocess and aggregate those statistics across the workers
-3. Publish them on a Prometheus-compatible metrics endpoint
-
-For example:
-```bash
-# Default namespace is "dynamo", but can be configured with --namespace
-# For more detailed output, try setting the env var: DYN_LOG=debug
-metrics --component MyComponent --endpoint my_endpoint
-
-# 2025-03-17T00:07:05.202558Z INFO metrics: Scraping endpoint dynamo/MyComponent/my_endpoint for stats
-# 2025-03-17T00:07:05.202955Z INFO metrics: Prometheus metrics server started at 0.0.0.0:9091/metrics
-# ...
-```
-
-With no matching endpoints running to collect stats from, you should see warnings in the logs:
-```bash
-2025-03-17T00:07:06.204756Z WARN metrics: No endpoints found matching dynamo/MyComponent/my_endpoint
-```
-
-After a worker with a matching endpoint gets started, the endpoint
-will get automatically discovered and the warnings will stop.
-
-## Workers
-
-The deprecated `metrics` component needs running workers to gather metrics from,
-so below are some examples of workers and how they can be monitored.
-
-### Mock Worker
-
-To try out how the deprecated `metrics` component works, there is a demo Rust-based
-[mock worker](src/bin/mock_worker.rs) that provides sample data through two mechanisms:
-1. Exposes a stats handler at `dynamo/MyComponent/my_endpoint` that responds to polling requests (from the deprecated `metrics` component) with randomly generated `ForwardPassMetrics` data
-2. Publishes mock `KVHitRateEvent` data every second to demonstrate event-based metrics
-
-Step 1: Launch a mock workers via the following command (if already built):
-```bash
-# or build/run from source: DYN_LOG=DEBUG cargo run --bin mock_worker
-mock_worker
-
-# 2025-03-16T23:49:28.101668Z INFO mock_worker: Starting Mock Worker on Endpoint: dynamo/MyComponent/my_endpoint
-```
-
-Step 2: Monitor the metrics of these mock workers, and prepare its own Prometheus endpoint at
-port 9091 (a default, when --port is not specified) on /metrics:
-```bash
-metrics --component MyComponent --endpoint my_endpoint
-```
-
-### Real Worker
-
-To run a more realistic deployment to gather metrics:
-
-```bash
-python -m dynamo.frontend &
-python -m dynamo.vllm --model-path
-```
-
-Then, to monitor the metrics of these VllmWorkers, run:
-```bash
-metrics --component backend --endpoint load_metrics
-```
-
-**NOTE**: `load_metrics` is currently a
-[hard-coded](https://github.com/ai-dynamo/dynamo/blob/d5220c7b1151372ba3d2a061c7d0a7ed72724789/lib/llm/src/kv_router/publisher.rs#L108)
-endpoint name used for python-based workers that register a `WorkerMetricsPublisher`.
-
-## Visualization
-
-To visualize the metrics being exposed on the Prometheus endpoint,
-see the Prometheus and Grafana configurations in
-[deploy/metrics](../../deploy/metrics):
-```bash
-docker compose -f deploy/docker-compose.yml --profile metrics up -d
-```
-
-## Metrics Collection Modes
-
-The deprecated `metrics` component supports two modes for exposing metrics in a Prometheus format:
-
-### Pull Mode (Default)
-
-When running in pull mode (the default), the deprecated `metrics` component will expose a
-Prometheus metrics endpoint on the specified host and port that a
-Prometheus server or curl client can pull from:
-
-```bash
-# Start metrics server on default host (0.0.0.0) and port (9091)
-metrics --component MyComponent --endpoint my_endpoint
-
-# Or specify a custom port
-metrics --component MyComponent --endpoint my_endpoint --port 9092
-```
-
-In pull mode:
-- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1")
-- The `--port` parameter specifies which port the HTTP server will listen on
-
-You can then query the metrics using:
-```bash
-curl localhost:9091/metrics
-
-# # HELP llm_kv_blocks_active Active KV cache blocks
-# # TYPE llm_kv_blocks_active gauge
-# llm_kv_blocks_active{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033398"} 40
-# llm_kv_blocks_active{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033401"} 2
-# # HELP llm_kv_blocks_total Total KV cache blocks
-# # TYPE llm_kv_blocks_total gauge
-# llm_kv_blocks_total{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033398"} 100
-# llm_kv_blocks_total{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033401"} 100
-```
-
-### Push Mode
-
-For ephemeral or batch jobs, or when metrics need to be pushed through a firewall,
-you can use Push mode. In this mode, the deprecated `metrics` component will periodically push
-metrics to an externally hosted
-[Prometheus PushGateway](https://prometheus.io/docs/instrumenting/pushing/):
-
-Start a prometheus push gateway service via docker:
-```bash
-docker run --rm -d -p 9091:9091 --name pushgateway prom/pushgateway
-```
-
-Start the deprecated `metrics` component in `--push` mode, specifying the host and port of your PushGateway:
-```bash
-# Push metrics to a Prometheus PushGateway every --push-interval seconds
-metrics \
- --component MyComponent \
- --endpoint my_endpoint \
- --host 127.0.0.1 \
- --port 9091 \
- --push
-```
-
-When using Push mode:
-- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1")
- that the Prometheus PushGateway is running on
-- The `--port` parameter specifies the port of the Prometheus PushGateway
-- The push interval can be configured with `--push-interval` (default: 2 seconds)
-- A default job name of "dynamo_metrics" is used for the Prometheus job label
-- Metrics persist in the PushGateway until explicitly deleted
-- Prometheus should be configured to scrape the PushGateway with `honor_labels: true`
-
-To view the metrics hosted on the PushGateway:
-```bash
-# View all metrics
-# curl http://:/metrics
-curl 127.0.0.1:9091/metrics
-```
-## Building/Running from Source
-
-For easy iteration while making edits to the deprecated `metrics` component, you can use `cargo run`
-to build and run with your local changes:
-
-```bash
-cargo run --bin metrics -- --component MyComponent --endpoint my_endpoint
-```
-
-
diff --git a/components/metrics/images/dynamo_metrics_grafana.png b/components/metrics/images/dynamo_metrics_grafana.png
deleted file mode 100644
index a63915f940..0000000000
Binary files a/components/metrics/images/dynamo_metrics_grafana.png and /dev/null differ
diff --git a/components/metrics/src/bin/mock_worker.rs b/components/metrics/src/bin/mock_worker.rs
deleted file mode 100644
index 19881c983b..0000000000
--- a/components/metrics/src/bin/mock_worker.rs
+++ /dev/null
@@ -1,161 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-use dynamo_llm::kv_router::{
- KV_HIT_RATE_SUBJECT,
- protocols::{ForwardPassMetrics, KvStats, WorkerStats},
- scheduler::KVHitRateEvent,
-};
-use dynamo_runtime::{
- DistributedRuntime, Result, Runtime, Worker,
- component::{Namespace, service::EndpointStats},
- logging,
- pipeline::{
- AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, ResponseStream, SingleIn,
- async_trait, network::Ingress,
- },
- protocols::annotated::Annotated,
- stream,
- traits::events::EventPublisher,
-};
-use rand::Rng;
-use std::sync::Arc;
-use tokio::time::{Duration, interval};
-
-fn main() -> Result<()> {
- logging::init();
- let worker = Worker::from_settings()?;
- worker.execute(app)
-}
-
-async fn app(runtime: Runtime) -> Result<()> {
- let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;
- backend(distributed).await
-}
-
-struct MockRequestHandler {}
-
-impl MockRequestHandler {
- fn new() -> Arc {
- Arc::new(Self {})
- }
-}
-
-#[async_trait]
-impl AsyncEngine, ManyOut>, Error> for MockRequestHandler {
- async fn generate(&self, input: SingleIn) -> Result>> {
- let (data, ctx) = input.into_parts();
-
- let chars = data
- .chars()
- .map(|c| Annotated::from_data(c.to_string()))
- .collect::>();
-
- let stream = stream::iter(chars);
-
- Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
- }
-}
-
-// FIXME: These events are just for testing and may not currently be used.
-/// Spawns a background task that periodically publishes mock KV hit rate events
-async fn mock_event_publisher(namespace: Namespace) {
- // NOTE: These events are just for testing, and shouldn't be interpreted
- // in correlation with the stats handler's data:
- // 1. The worker ID associated with the events here won't match the
- // worker ID of the endpoint's service stats handler.
- // 2. These events aren't coming through the KV Router, so the metrics won't
- // be reflective of the KV Router's performance.
- // 3. The data in these events aren't in sync with the stats handler's
- // ForwardPassMetrics data, so they may not correlate well.
- let worker_id = rand::rng().random_range(1..=1000);
-
- let mut interval = interval(Duration::from_secs(1));
- loop {
- interval.tick().await;
-
- // Generate random KV hit rate event using a new thread_rng each time
- let isl_blocks = rand::rng().random_range(0..=100);
- let overlap_blocks = rand::rng().random_range(0..=isl_blocks);
-
- let event = KVHitRateEvent {
- worker_id,
- isl_blocks,
- overlap_blocks: overlap_blocks as u32,
- };
-
- if let Err(e) = namespace.publish(KV_HIT_RATE_SUBJECT, &event).await {
- tracing::warn!("Failed to publish KV hit rate event: {e}");
- } else {
- tracing::debug!(
- "Published KV hit rate event: worker_id={worker_id}, isl_blocks={isl_blocks}, overlap_blocks={overlap_blocks}, hit_rate={:.2}%",
- (overlap_blocks as f64 / isl_blocks as f64) * 100.0
- );
- }
- }
-}
-
-/// Generates mock forward pass metrics for stats handler
-fn mock_stats_handler(_stats: EndpointStats) -> serde_json::Value {
- let request_total_slots = 100;
- let request_active_slots = rand::rng().random_range(0..=request_total_slots);
- let kv_total_blocks = 100;
- let kv_active_blocks = rand::rng().random_range(0..=kv_total_blocks);
- let num_requests_waiting = rand::rng().random_range(0..=100);
- let gpu_cache_usage_perc = rand::rng().random_range(0.0..=1.0);
- let gpu_prefix_cache_hit_rate = rand::rng().random_range(0.0..=1.0);
-
- let worker_stats = WorkerStats {
- data_parallel_rank: None, // Default for backwards compatibility
- request_active_slots,
- request_total_slots,
- num_requests_waiting,
- };
-
- let kv_stats = KvStats {
- kv_active_blocks,
- kv_total_blocks,
- gpu_cache_usage_perc,
- gpu_prefix_cache_hit_rate,
- };
-
- let spec_decode_stats = None;
-
- let stats = ForwardPassMetrics {
- worker_stats,
- kv_stats,
- spec_decode_stats,
- };
- tracing::info!("Stats: {stats:?}");
- serde_json::to_value(stats).unwrap()
-}
-
-async fn backend(runtime: DistributedRuntime) -> Result<()> {
- let namespace = runtime.namespace("dynamo")?;
- // we must first create a service, then we can attach one more more endpoints
- let component = namespace
- .component("MyComponent")?
- .service_builder()
- .create()
- .await?;
- let endpoint = component.endpoint("my_endpoint");
- tracing::info!("Starting Mock Worker on Endpoint: {}", endpoint.path());
-
- // Spawn background task for publishing KV hit rate events
- let namespace_clone = namespace.clone();
- tokio::spawn(async move {
- mock_event_publisher(namespace_clone).await;
- });
-
- // Attach an ingress to the engine
- let ingress = Ingress::for_engine(MockRequestHandler::new())?;
-
- // Make the ingress discoverable via a component service
- endpoint
- .endpoint_builder()
- // Dummy stats handler to demonstrate how to attach a custom stats handler
- .stats_handler(mock_stats_handler)
- .handler(ingress)
- .start()
- .await
-}
diff --git a/components/metrics/src/lib.rs b/components/metrics/src/lib.rs
deleted file mode 100644
index c18b671dfb..0000000000
--- a/components/metrics/src/lib.rs
+++ /dev/null
@@ -1,595 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-//! Library functions for the metrics application.
-//!
-//! This library provides functionality to expose Prometheus metrics either through a local HTTP server
-//! or by pushing to a Prometheus PushGateway.
-//!
-//! # Examples
-//!
-//! ## Using the metrics pull mode
-//! ```no_run
-//! use metrics::{PrometheusMetricsCollector, MetricsMode};
-//!
-//! #[tokio::main]
-//! async fn main() -> Result<(), Box> {
-//! let mut collector = PrometheusMetricsCollector::new()?;
-//!
-//! // Start a metrics server with default values
-//! collector.start(MetricsMode::default())?;
-//!
-//! // Or explicitly specify values
-//! collector.start(MetricsMode::Pull {
-//! host: "127.0.0.1".to_string(),
-//! port: 9090,
-//! })?;
-//!
-//! // Or use the convenience constructor
-//! collector.start(MetricsMode::new_pull())?;
-//!
-//! // Your application code here
-//! tokio::signal::ctrl_c().await?;
-//!
-//! // Stop the metrics server gracefully
-//! collector.stop();
-//! Ok(())
-//! }
-//! ```
-//!
-//! ## Using the Push mode
-//! ```no_run
-//! use metrics::{PrometheusMetricsCollector, MetricsMode};
-//!
-//! #[tokio::main]
-//! async fn main() -> Result<(), Box> {
-//! let mut collector = PrometheusMetricsCollector::new()?;
-//!
-//! // Start pushing metrics to a Prometheus PushGateway with default values
-//! collector.start(MetricsMode::new_push())?;
-//!
-//! // Or explicitly specify values
-//! collector.start(MetricsMode::Push {
-//! host: "127.0.0.1".to_string(),
-//! port: 9091,
-//! job: "custom_job".to_string(),
-//! interval: 5, // Push every 5 seconds
-//! })?;
-//!
-//! // Your application code here
-//! tokio::signal::ctrl_c().await?;
-//!
-//! // Stop pushing metrics gracefully
-//! collector.stop();
-//! Ok(())
-//! }
-
-use axum::{Router, routing::get};
-use prometheus::{Encoder, TextEncoder, register_counter_vec, register_gauge_vec};
-use reqwest::Client;
-use serde::{Deserialize, Serialize};
-use std::net::SocketAddr;
-use std::time::Duration as StdDuration;
-
-use dynamo_llm::kv_router::protocols::{ForwardPassMetrics, LoadMetrics};
-use dynamo_llm::kv_router::scoring::Endpoint;
-use dynamo_llm::kv_router::scoring::ProcessedEndpoints;
-
-use dynamo_runtime::{
- Result, distributed::Component, error, service::EndpointInfo, utils::Duration,
-};
-
-/// Configuration for metrics collection mode
-#[derive(Debug, Clone)]
-pub enum MetricsMode {
- /// Host a Prometheus metrics server for pull-based collection
- Pull {
- /// Host to listen on (e.g. "0.0.0.0")
- host: String,
- /// Port to listen on (e.g. 9091)
- port: u16,
- },
- /// Push to a Prometheus PushGateway
- Push {
- /// PushGateway host (e.g. "http://localhost")
- host: String,
- /// PushGateway port (e.g. 9091)
- port: u16,
- /// Job name for the metrics
- job: String,
- /// Push interval in seconds
- interval: u64,
- },
-}
-
-impl Default for MetricsMode {
- fn default() -> Self {
- Self::new_pull()
- }
-}
-
-impl MetricsMode {
- /// Create a new Pull mode with default values
- pub fn new_pull() -> Self {
- Self::Pull {
- host: "0.0.0.0".to_string(),
- port: 9091,
- }
- }
-
- /// Create a new Push mode with default values
- pub fn new_push() -> Self {
- Self::Push {
- host: "127.0.0.1".to_string(),
- port: 9091,
- job: "dynamo_metrics".to_string(),
- interval: 2,
- }
- }
-}
-
-/// Configuration for LLM worker load capacity metrics
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct LLMWorkerLoadCapacityConfig {
- pub component_name: String,
- pub endpoint_name: String,
- pub model_name: Option,
-}
-
-/// Metrics collector for exposing metrics to prometheus/grafana
-pub struct PrometheusMetricsCollector {
- metrics: PrometheusMetrics,
- mode: Option,
- shutdown_tx: Option>,
-}
-
-impl PrometheusMetricsCollector {
- pub fn new() -> Result {
- Ok(Self {
- metrics: PrometheusMetrics::new()?,
- mode: None,
- shutdown_tx: None,
- })
- }
-
- /// Start metrics collection with the specified mode
- pub fn start(&mut self, mode: MetricsMode) -> Result<()> {
- // Store the mode
- self.mode = Some(mode.clone());
-
- match mode {
- MetricsMode::Pull { host, port } => self.start_pull_mode(host, port),
- MetricsMode::Push {
- host,
- port,
- job,
- interval,
- } => self.start_push_mode(host, port, job, interval),
- }
- }
-
- /// Stop metrics collection
- pub fn stop(&mut self) {
- if let Some(tx) = self.shutdown_tx.take() {
- let _ = tx.send(());
- }
- }
-
- /// Start a metrics server for pull-based collection on the specified port
- fn start_pull_mode(&mut self, host: String, port: u16) -> Result<()> {
- // Create an axum router with a metrics endpoint
- let app = Router::new().route(
- "/metrics",
- get(|| async {
- // Gather and encode metrics
- let encoder = TextEncoder::new();
- let mut buffer = Vec::new();
- encoder.encode(&prometheus::gather(), &mut buffer).unwrap();
- String::from_utf8(buffer).unwrap()
- }),
- );
-
- // Create a socket address to listen on
- let ip_addr = host.parse().map_err(|e| {
- error!("Failed to parse host '{}' as IP address: {}. Use a valid IPv4 or IPv6 address (e.g. '0.0.0.0' or '127.0.0.1')", host, e)
- })?;
- let addr = SocketAddr::new(ip_addr, port);
-
- // Create shutdown channel
- let (tx, rx) = tokio::sync::oneshot::channel();
- self.shutdown_tx = Some(tx);
-
- // Spawn the server in a background task
- tokio::spawn(async move {
- let listener = tokio::net::TcpListener::bind(addr)
- .await
- .unwrap_or_else(|_| panic!("could not bind to address: {addr}"));
- let server = axum::serve(listener, app);
-
- // Create a future that completes when shutdown signal is received
- let shutdown_future = async {
- rx.await.ok();
- };
-
- // Run the server with graceful shutdown
- tokio::select! {
- result = server => {
- if let Err(e) = result {
- tracing::error!("Metrics server error: {}", e);
- }
- },
- _ = shutdown_future => {
- tracing::info!("Metrics server shutting down gracefully");
- },
- }
- });
-
- tracing::info!("Prometheus metrics server started at {addr}/metrics");
- Ok(())
- }
-
- /// Start pushing metrics to a Prometheus PushGateway
- fn start_push_mode(
- &mut self,
- host: String,
- port: u16,
- job: String,
- interval: u64,
- ) -> Result<()> {
- // Create shutdown channel
- let (tx, mut rx) = tokio::sync::oneshot::channel();
- self.shutdown_tx = Some(tx);
-
- // Create HTTP client
- let client = Client::new();
- let url = format!("http://{host}:{port}/metrics/job/{job}");
- let url_clone = url.clone();
- let interval_duration = StdDuration::from_secs(interval);
-
- // Spawn background task to periodically push metrics
- tokio::spawn(async move {
- let mut interval = tokio::time::interval(interval_duration);
-
- loop {
- tokio::select! {
- _ = interval.tick() => {
- // Gather and encode metrics
- let encoder = TextEncoder::new();
- let mut buffer = Vec::new();
- if let Err(e) = encoder.encode(&prometheus::gather(), &mut buffer) {
- tracing::error!("Failed to encode metrics: {}", e);
- continue;
- }
-
- // Push metrics to the gateway
- match client.post(&url)
- .header("Content-Type", encoder.format_type())
- .body(buffer)
- .send()
- .await
- {
- Ok(response) => {
- if response.status().is_success() {
- tracing::debug!("Successfully pushed metrics to PushGateway");
- } else {
- tracing::error!(
- "Failed to push metrics to PushGateway. Status: {}, Error: {:?}",
- response.status(),
- response.text().await
- );
- }
- }
- Err(e) => {
- tracing::error!("Failed to push metrics to PushGateway: {}", e);
- }
- }
- }
- _ = &mut rx => {
- tracing::info!("Stopping metrics push task");
- break;
- }
- }
- }
- });
-
- tracing::info!(
- "Started pushing metrics to PushGateway at '{url_clone}' with job name '{job}'"
- );
- Ok(())
- }
-
- /// Update metrics with current values
- pub fn update(&mut self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) {
- self.metrics.update(config, processed);
- }
-
- /// Update KV hit rate metrics
- pub fn update_kv_hit_rate(
- &mut self,
- config: &LLMWorkerLoadCapacityConfig,
- worker_id: i64,
- isl_blocks: usize,
- overlap_blocks: usize,
- ) {
- self.metrics
- .update_kv_hit_rate(config, worker_id, isl_blocks, overlap_blocks);
- }
-}
-
-/// Prometheus metrics collection
-pub struct PrometheusMetrics {
- kv_blocks_active: prometheus::GaugeVec,
- kv_blocks_total: prometheus::GaugeVec,
- requests_active: prometheus::GaugeVec,
- requests_total: prometheus::GaugeVec,
- load_avg: prometheus::GaugeVec,
- load_std: prometheus::GaugeVec,
- // KV hit rate metrics
- kv_hit_rate_percent: prometheus::GaugeVec,
- // FIXME: These are currently unused outside of mock_worker
- kv_hit_rate_isl_blocks: prometheus::CounterVec,
- kv_hit_rate_overlap_blocks: prometheus::CounterVec,
-}
-
-impl PrometheusMetrics {
- /// Initialize all metrics
- fn new() -> Result {
- Ok(Self {
- kv_blocks_active: register_gauge_vec!(
- "llm_kv_blocks_active",
- "Active KV cache blocks",
- &["component", "endpoint", "worker_id"]
- )?,
- kv_blocks_total: register_gauge_vec!(
- "llm_kv_blocks_total",
- "Total KV cache blocks",
- &["component", "endpoint", "worker_id"]
- )?,
- requests_active: register_gauge_vec!(
- "llm_requests_active_slots",
- "Active request slots",
- &["component", "endpoint", "worker_id"]
- )?,
- requests_total: register_gauge_vec!(
- "llm_requests_total_slots",
- "Total request slots",
- &["component", "endpoint", "worker_id"]
- )?,
- load_avg: register_gauge_vec!(
- "llm_load_avg",
- "Average load across workers",
- &["component", "endpoint"]
- )?,
- load_std: register_gauge_vec!(
- "llm_load_std",
- "Load standard deviation across workers",
- &["component", "endpoint"]
- )?,
- // KV hit rate (ForwardPassMetrics)
- kv_hit_rate_percent: register_gauge_vec!(
- "llm_kv_hit_rate_percent",
- "KV hit rate percentage per worker",
- &["component", "endpoint", "worker_id"]
- )?,
- // FIXME: Cleanup/remove event based metrics after finalizaing
- // metrics collection approach with vllm/trtllm workers.
- // Event-based KV hit rate metrics (not currently used outside mock worker)
- kv_hit_rate_isl_blocks: register_counter_vec!(
- "llm_kv_hit_rate_isl_blocks",
- "Cumulative count of ISL blocks in KV hit rate events",
- &["component", "endpoint", "worker_id"]
- )?,
- kv_hit_rate_overlap_blocks: register_counter_vec!(
- "llm_kv_hit_rate_overlap_blocks",
- "Cumulative count of overlapping blocks in KV hit rate events",
- &["component", "endpoint", "worker_id"]
- )?,
- })
- }
-
- /// Helper method to set a gauge with worker-specific labels (3 labels)
- fn set_worker_gauge(
- &self,
- gauge: &prometheus::GaugeVec,
- config: &LLMWorkerLoadCapacityConfig,
- worker_id: &String,
- value: f64,
- ) {
- gauge
- .with_label_values(&[&config.component_name, &config.endpoint_name, worker_id])
- .set(value);
- }
-
- /// Helper method to increment a counter with worker-specific labels (3 labels)
- fn increment_worker_counter(
- &self,
- counter: &prometheus::CounterVec,
- config: &LLMWorkerLoadCapacityConfig,
- worker_id: &String,
- value: f64,
- ) {
- counter
- .with_label_values(&[&config.component_name, &config.endpoint_name, worker_id])
- .inc_by(value);
- }
-
- /// Helper method to set a gauge with component/endpoint labels only (2 labels)
- fn set_endpoint_gauge(
- &self,
- gauge: &prometheus::GaugeVec,
- config: &LLMWorkerLoadCapacityConfig,
- value: f64,
- ) {
- gauge
- .with_label_values(&[&config.component_name, &config.endpoint_name])
- .set(value);
- }
-
- /// Update metrics with current values
- fn update(&self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) {
- // Update per-worker metrics
- for (worker_id, endpoint) in processed.endpoints.iter() {
- let worker_id = worker_id.to_string();
- let load_metrics = endpoint.data.clone();
- let LoadMetrics::EngineLoadMetrics(metrics) = load_metrics else {
- panic!("Can only update with ForwardPassMetrics");
- };
-
- self.set_worker_gauge(
- &self.kv_blocks_active,
- config,
- &worker_id,
- metrics.kv_stats.kv_active_blocks as f64,
- );
- self.set_worker_gauge(
- &self.kv_blocks_total,
- config,
- &worker_id,
- metrics.kv_stats.kv_total_blocks as f64,
- );
- self.set_worker_gauge(
- &self.requests_active,
- config,
- &worker_id,
- metrics.worker_stats.request_active_slots as f64,
- );
- self.set_worker_gauge(
- &self.requests_total,
- config,
- &worker_id,
- metrics.worker_stats.request_total_slots as f64,
- );
- self.set_worker_gauge(
- &self.kv_hit_rate_percent,
- config,
- &worker_id,
- metrics.kv_stats.gpu_prefix_cache_hit_rate as f64,
- );
- }
-
- // Update aggregate metrics
- self.set_endpoint_gauge(&self.load_avg, config, processed.load_avg);
- self.set_endpoint_gauge(&self.load_std, config, processed.load_std);
- }
-
- /// Update KV hit rate metrics
- pub fn update_kv_hit_rate(
- &self,
- config: &LLMWorkerLoadCapacityConfig,
- worker_id: i64,
- isl_blocks: usize,
- overlap_blocks: usize,
- ) {
- let worker_id_str = worker_id.to_string();
-
- // Increment the ISL blocks and overlap blocks counters
- self.increment_worker_counter(
- &self.kv_hit_rate_isl_blocks,
- config,
- &worker_id_str,
- isl_blocks as f64,
- );
-
- self.increment_worker_counter(
- &self.kv_hit_rate_overlap_blocks,
- config,
- &worker_id_str,
- overlap_blocks as f64,
- );
-
- // TODO: The cumulative hit rate percentage can probably be computed by consumers
- // of Prometheus metrics like Grafana instead, but we'll compute it here for now
- // for convenient debugging/logging.
- // Calculate and set the cumulative hit rate percentage
- let cumulative_isl = self
- .kv_hit_rate_isl_blocks
- .with_label_values(&[
- &config.component_name,
- &config.endpoint_name,
- &worker_id_str,
- ])
- .get();
-
- let cumulative_overlap = self
- .kv_hit_rate_overlap_blocks
- .with_label_values(&[
- &config.component_name,
- &config.endpoint_name,
- &worker_id_str,
- ])
- .get();
-
- if cumulative_isl > 0.0 {
- let cumulative_hit_rate = (cumulative_overlap / cumulative_isl) * 100.0;
- tracing::debug!(
- "Estimated Cumulative KV hit rate: {cumulative_hit_rate:.2}% (Overlap: {cumulative_overlap} / ISL: {cumulative_isl})"
- );
- }
- }
-}
-
-/// Collect endpoints from a component
-pub async fn collect_endpoints(
- component: &Component,
- subject: &str,
- timeout: Duration,
-) -> Result> {
- // Collect stats from each backend
- let stream = component.scrape_stats(timeout).await?;
-
- // Filter the stats by the service subject
- let endpoints = stream
- .into_endpoints()
- .filter(|e| e.subject.starts_with(subject))
- .collect::>();
- tracing::debug!("Endpoints: {endpoints:?}");
- Ok(endpoints)
-}
-
-/// Extract metrics from endpoints
-pub fn extract_metrics(endpoints: &[EndpointInfo]) -> Vec {
- let endpoint_data = endpoints.iter().map(|e| e.data.clone()).collect::>();
-
- // Extract ForwardPassMetrics objects from endpoint services
- let metrics: Vec = endpoint_data
- .iter()
- .filter_map(|e| {
- let metrics_data = e.as_ref()?;
-
- match metrics_data.clone().decode::() {
- Ok(stats) => Some(stats),
- Err(err) => {
- tracing::error!(
- "Failed to decode ForwardPassMetrics data: {}. Raw data: {:?}",
- err,
- metrics_data
- );
- None
- }
- }
- })
- .collect();
- tracing::debug!("Metrics: {metrics:?}");
-
- metrics
-}
-
-/// Create ProcessedEndpoints from metrics and endpoints
-pub fn postprocess_metrics(
- metrics: &[ForwardPassMetrics],
- endpoints: &[EndpointInfo],
-) -> ProcessedEndpoints {
- let processed_endpoints: Vec = metrics
- .iter()
- .zip(endpoints.iter())
- .filter_map(|(m, e)| {
- e.id().ok().map(|id| Endpoint {
- name: format!("worker-{id}"),
- subject: e.subject.clone(),
- data: LoadMetrics::EngineLoadMetrics(m.clone()),
- })
- })
- .collect();
-
- ProcessedEndpoints::new(processed_endpoints)
-}
diff --git a/components/metrics/src/main.rs b/components/metrics/src/main.rs
deleted file mode 100644
index 873a08a721..0000000000
--- a/components/metrics/src/main.rs
+++ /dev/null
@@ -1,268 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-//! Metrics is a metrics aggregator designed to operate within a namespace and collect
-//! metrics from all workers.
-//!
-//! Metrics will collect for now:
-//!
-//! - LLM Worker Load:Capacity
-//! - These metrics will be scraped by the LLM NATS Service API's stats request
-//! - Request Slots: [Active, Total]
-//! - KV Cache Blocks: [Active, Total]
-//! - KV Hit Rate:
-//! - These metrics will be collected from KV hit rate events published by the KV router
-//! - ISL Blocks: Cumulative count of total blocks in all KV hit rate events
-//! - Overlap Blocks: Cumulative count of blocks that were already in the KV cache
-use clap::Parser;
-use dynamo_llm::kv_router::KV_HIT_RATE_SUBJECT;
-use dynamo_llm::kv_router::scheduler::KVHitRateEvent;
-use dynamo_runtime::{
- DistributedRuntime, ErrorContext, Result, Runtime, Worker, error, logging,
- traits::events::{EventPublisher, EventSubscriber},
- utils::{Duration, Instant},
-};
-use futures::stream::StreamExt;
-use std::sync::Arc;
-
-// Import from our library
-use metrics::{
- LLMWorkerLoadCapacityConfig, MetricsMode, PrometheusMetricsCollector, collect_endpoints,
- extract_metrics, postprocess_metrics,
-};
-
-/// CLI arguments for the metrics application
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
- /// Namespace to operate in and subscribe to events on
- #[arg(long, env = "DYN_NAMESPACE", default_value = "dynamo")]
- namespace: String,
-
- /// Component to scrape metrics from
- #[arg(long)]
- component: String,
-
- /// Endpoint to scrape metrics from
- #[arg(long)]
- endpoint: String,
-
- /// Model name for the target component (optional)
- #[arg(long)]
- model_name: Option,
-
- /// Polling interval in seconds for scraping dynamo endpoint stats (minimum 1 second)
- #[arg(long, default_value = "1")]
- poll_interval: u64,
-
- /// Host for serving or pushing prometheus metrics (default: 0.0.0.0)
- #[arg(
- long,
- default_value = "0.0.0.0",
- help_heading = "Prometheus Metrics Config"
- )]
- host: String,
-
- /// Port to run the Prometheus metrics server on (default: 9091)
- #[arg(
- long,
- default_value = "9091",
- help_heading = "Prometheus Metrics Config"
- )]
- port: u16,
-
- /// Push metrics to an external Prometheus Pushgateway instead of hosting them in-process
- #[arg(long, help_heading = "Prometheus Metrics Config")]
- push: bool,
-
- /// Push interval in seconds, when using push mode (minimum 1 second, default: 2)
- #[arg(long, default_value = "2", help_heading = "Prometheus Metrics Config")]
- push_interval: u64,
-}
-
-fn get_config(args: &Args) -> Result {
- if args.component.is_empty() {
- return Err(error!("Component name cannot be empty"));
- }
-
- if args.endpoint.is_empty() {
- return Err(error!("Endpoint name cannot be empty"));
- }
-
- if args.poll_interval < 1 {
- return Err(error!("Polling interval must be at least 1 second"));
- }
-
- if args.push && args.push_interval < 1 {
- return Err(error!("Push interval must be at least 1 second"));
- }
-
- Ok(LLMWorkerLoadCapacityConfig {
- component_name: args.component.clone(),
- endpoint_name: args.endpoint.clone(),
- model_name: args.model_name.clone(),
- })
-}
-
-async fn app(runtime: Runtime) -> Result<()> {
- let args = Args::parse();
- let config = get_config(&args)?;
- tracing::debug!("Config: {config:?}");
-
- let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
-
- let namespace = drt.namespace(args.namespace)?;
- let component = namespace.component("count")?;
-
- // Create unique instance of Count
- let key = format!("{}/instance", component.etcd_root());
- tracing::debug!("Creating unique instance of Count at {key}");
- drt.etcd_client()
- .expect("Unreachable because of DistributedRuntime::from_settings above")
- .kv_create(&key, serde_json::to_vec_pretty(&config)?, None)
- .await
- .context("Unable to create unique instance of Count; possibly one already exists")?;
-
- let target_component = namespace.component(&config.component_name)?;
- let target_endpoint = target_component.endpoint(&config.endpoint_name);
-
- let service_path = target_endpoint.path();
- let service_subject = target_endpoint.subject();
- tracing::info!("Scraping endpoint {service_path} for stats");
-
- // Safety: DistributedRuntime::from_settings ensures this is Some
- let token = drt.primary_lease().unwrap().child_token();
- let event_name = format!("l2c.{}.{}", config.component_name, config.endpoint_name);
-
- // Initialize Prometheus metrics with the selected mode
- let metrics_collector = PrometheusMetricsCollector::new()?;
- let metrics_collector = Arc::new(tokio::sync::Mutex::new(metrics_collector));
-
- // Start metrics collection in the selected mode
- let metrics_mode = if args.push {
- MetricsMode::Push {
- host: args.host,
- port: args.port,
- job: "dynamo_push_metrics".to_string(),
- interval: args.push_interval,
- }
- } else {
- MetricsMode::Pull {
- host: args.host,
- port: args.port,
- }
- };
-
- metrics_collector.lock().await.start(metrics_mode)?;
-
- // TODO: Consider removing event subscription until metrics are more standardized
- // Subscribe to KV hit rate events
- let kv_hit_rate_subject = KV_HIT_RATE_SUBJECT;
- tracing::debug!("Subscribing to KV hit rate events on subject: {kv_hit_rate_subject}");
-
- // Clone fields for the event subscription task
- let config_clone = config.clone();
- let namespace_clone = namespace.clone();
- let metrics_collector_clone = metrics_collector.clone();
-
- // Note: Subscribing to KVHitRateEvent for illustration purposes. They're not used in production.
- // Spawn a task to handle KV hit rate events
- tokio::spawn(async move {
- match namespace_clone.subscribe(kv_hit_rate_subject).await {
- Ok(mut subscriber) => {
- tracing::debug!("Successfully subscribed to KV hit rate events");
-
- while let Some(msg) = subscriber.next().await {
- match serde_json::from_slice::(&msg.payload) {
- Ok(event) => {
- // TODO: Lower to debug
- let cache_hit_pct =
- (event.overlap_blocks as f64 / event.isl_blocks as f64) * 100.0;
- tracing::debug!(
- "Received KV hit rate event: worker_id={}, isl_blocks={}, overlap_blocks={}, cache_hit_pct={:.2}%",
- event.worker_id,
- event.isl_blocks,
- event.overlap_blocks,
- cache_hit_pct
- );
-
- // Update metrics with the event data
- let mut metrics = metrics_collector_clone.lock().await;
- metrics.update_kv_hit_rate(
- &config_clone,
- event.worker_id,
- event.isl_blocks,
- event.overlap_blocks as usize,
- );
- }
- Err(e) => {
- tracing::warn!("Failed to deserialize KV hit rate event: {e}");
- }
- }
- }
-
- tracing::warn!("KV hit rate event subscription stream ended");
- }
- Err(e) => {
- tracing::error!("Failed to subscribe to KV hit rate events: {:?}", e);
- }
- }
- });
-
- loop {
- let next = Instant::now() + Duration::from_secs(args.poll_interval);
-
- // Collect and process metrics
- let scrape_timeout = Duration::from_secs(1);
- let endpoints =
- collect_endpoints(&target_component, &service_subject, scrape_timeout).await?;
- if endpoints.is_empty() {
- tracing::warn!("No endpoints found matching {service_path}");
- continue;
- }
-
- let metrics = extract_metrics(&endpoints);
- let processed = postprocess_metrics(&metrics, &endpoints);
- if processed.endpoints.is_empty() {
- tracing::warn!("No metrics found matching {service_path}");
- } else {
- tracing::info!("Aggregated metrics: {processed:?}");
- }
-
- // Update Prometheus metrics
- metrics_collector.lock().await.update(&config, &processed);
-
- // TODO: Enable KV Routers to subscribe to metrics events published here
- // for a single view of the aggregated metrics, as opposed to the current
- // approach where each KV Router computes and published its own metrics.
- // Publish metrics event
- namespace.publish(&event_name, &processed).await?;
-
- // Wait until cancelled or the next tick
- match tokio::time::timeout_at(next, token.cancelled()).await {
- Ok(_) => break,
- Err(_) => continue,
- }
- }
-
- Ok(())
-}
-
-fn main() -> Result<()> {
- logging::init();
- let worker = Worker::from_settings()?;
- worker.execute(app)
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use std::env;
-
- #[test]
- fn test_namespace_from_env() {
- unsafe { env::set_var("DYN_NAMESPACE", "test-namespace") };
- let args = Args::parse_from(["count", "--component", "comp", "--endpoint", "end"]);
- assert_eq!(args.namespace, "test-namespace");
- }
-}
diff --git a/deploy/metrics/README.md b/deploy/metrics/README.md
index 474d9ea4f6..be6d823ce2 100644
--- a/deploy/metrics/README.md
+++ b/deploy/metrics/README.md
@@ -174,7 +174,7 @@ The following configuration files should be present in this directory:
- [grafana_dashboards/grafana-dashboard-providers.yml](./grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration
- [grafana_dashboards/grafana-dynamo-dashboard.json](./grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics.
- [grafana_dashboards/grafana-dcgm-metrics.json](./grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics
-- [grafana_dashboards/grafana-llm-metrics.json](./grafana_dashboards/grafana-llm-metrics.json): This file, which is being phased out, contains the Grafana dashboard configuration for LLM-specific metrics. It requires an additional `metrics` component to operate concurrently. A new version is under development.
+- [grafana_dashboards/grafana-kvbm-dashboard.json](./grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics
### Metric Name Constants
@@ -237,8 +237,6 @@ This centralized approach ensures all Dynamo components use consistent, valid Pr
- DCGM Exporter: `http://localhost:9401/metrics`
- - Start the [components/metrics](../../components/metrics/README.md) application to begin monitoring for metric events from dynamo workers and aggregating them on a Prometheus metrics endpoint: `http://localhost:9091/metrics`.
- - Uncomment the appropriate lines in prometheus.yml to poll port 9091.
- Start worker(s) that publishes KV Cache metrics: [lib/runtime/examples/service_metrics/README.md](../../lib/runtime/examples/service_metrics/README.md) can populate dummy KV Cache metrics.
### Configuration
@@ -275,7 +273,7 @@ Grafana is pre-configured with:
docker compose logs grafana
```
-3. For issues with the legacy metrics component (being phased out), see [components/metrics/README.md](../../components/metrics/README.md) for details on the exposed metrics and troubleshooting steps.
+3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection.
## Developer Guide
@@ -477,21 +475,6 @@ let requests_total = namespace.create_counter(
)?;
```
-## Running the deprecated `components/metrics` program
-
-⚠️ **DEPRECATION NOTICE** ⚠️
-
-When you run the example [components/metrics](../../components/metrics/README.md) program, it exposes a Prometheus /metrics endpoint with the following metrics (defined in [components/metrics/src/lib.rs](../../components/metrics/src/lib.rs)):
-
-**⚠️ The following `llm_kv_*` metrics are deprecated:**
-
-- `llm_requests_active_slots`: Active request slots per worker
-- `llm_requests_total_slots`: Total available request slots per worker
-- `llm_kv_blocks_active`: Active KV blocks per worker
-- `llm_kv_blocks_total`: Total KV blocks available per worker
-- `llm_kv_hit_rate_percent`: KV Cache hit percent per worker
-- `llm_load_avg`: Average load across workers
-- `llm_load_std`: Load standard deviation across workers
## Troubleshooting
@@ -506,4 +489,4 @@ When you run the example [components/metrics](../../components/metrics/README.md
docker compose logs grafana
```
-3. For issues with the legacy metrics component (being phased out), see [components/metrics/README.md](../../components/metrics/README.md) for details on the exposed metrics and troubleshooting steps.
+3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection.
diff --git a/deploy/metrics/grafana_dashboards/grafana-llm-metrics.json b/deploy/metrics/grafana_dashboards/grafana-llm-metrics.json
deleted file mode 100644
index 775d305471..0000000000
--- a/deploy/metrics/grafana_dashboards/grafana-llm-metrics.json
+++ /dev/null
@@ -1,900 +0,0 @@
-{
- "annotations": {
- "list": [
- {
- "builtIn": 1,
- "datasource": {
- "type": "grafana",
- "uid": "-- Grafana --"
- },
- "enable": true,
- "hide": true,
- "iconColor": "rgba(0, 211, 255, 1)",
- "name": "Annotations & Alerts",
- "type": "dashboard"
- }
- ]
- },
- "copyright": [
- "SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.",
- "SPDX-License-Identifier: Apache-2.0",
- "Licensed under the Apache License, Version 2.0 (the \"License\");",
- "you may not use this file except in compliance with the License.",
- "You may obtain a copy of the License at",
- "http://www.apache.org/licenses/LICENSE-2.0",
- "Unless required by applicable law or agreed to in writing, software",
- "distributed under the License is distributed on an \"AS IS\" BASIS,",
- "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.",
- "See the License for the specific language governing permissions and",
- "limitations under the License.",
- "",
- "DEPRECATION NOTICE:",
- "This dashboard uses deprecated llm_kv_* metrics (llm_kv_blocks_active, llm_kv_blocks_total, llm_kv_hit_rate_percent)",
- "that are part of the deprecated metrics aggregation service. These metrics will be removed in a future release.",
- "Please migrate to the new MetricsRegistry system which provides dynamo_* metrics instead.",
- "See docs/guides/metrics.md for migration guidance."
- ],
- "editable": true,
- "fiscalYearStartMonth": 0,
- "graphTooltip": 0,
- "id": 1,
- "links": [],
- "liveNow": false,
- "panels": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 20,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "percent"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 0
- },
- "id": 1,
- "options": {
- "legend": {
- "calcs": [
- "mean",
- "max"
- ],
- "displayMode": "table",
- "placement": "right",
- "showLegend": true
- },
- "tooltip": {
- "mode": "multi",
- "sort": "none"
- }
- },
- "title": "KV Cache Utilization by Worker",
- "type": "timeseries",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "100 * llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"} / llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"}",
- "legendFormat": "Worker {{worker_id}}",
- "range": true,
- "refId": "A"
- }
- ]
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 20,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "percent"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 0
- },
- "id": 2,
- "options": {
- "legend": {
- "calcs": [
- "mean",
- "max"
- ],
- "displayMode": "table",
- "placement": "right",
- "showLegend": true
- },
- "tooltip": {
- "mode": "multi",
- "sort": "none"
- }
- },
- "title": "Request Slot Utilization by Worker",
- "type": "timeseries",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "100 * llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"} / llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"}",
- "legendFormat": "Worker {{worker_id}}",
- "range": true,
- "refId": "A"
- }
- ]
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "thresholds"
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "yellow",
- "value": 50
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "percent"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 4,
- "x": 0,
- "y": 8
- },
- "id": 3,
- "options": {
- "orientation": "auto",
- "reduceOptions": {
- "calcs": [
- "lastNotNull"
- ],
- "fields": "",
- "values": false
- },
- "showThresholdLabels": false,
- "showThresholdMarkers": true
- },
- "pluginVersion": "10.0.0",
- "title": "Average KV Cache Utilization",
- "type": "gauge",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "100 * avg(llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"}) / avg(llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"})",
- "legendFormat": "__auto",
- "range": true,
- "refId": "A"
- }
- ]
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "thresholds"
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- },
- {
- "color": "yellow",
- "value": 50
- },
- {
- "color": "red",
- "value": 80
- }
- ]
- },
- "unit": "percent"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 4,
- "x": 4,
- "y": 8
- },
- "id": 4,
- "options": {
- "orientation": "auto",
- "reduceOptions": {
- "calcs": [
- "lastNotNull"
- ],
- "fields": "",
- "values": false
- },
- "showThresholdLabels": false,
- "showThresholdMarkers": true
- },
- "pluginVersion": "10.0.0",
- "title": "Average Request Slot Utilization",
- "type": "gauge",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "100 * avg(llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"}) / avg(llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"})",
- "legendFormat": "__auto",
- "range": true,
- "refId": "A"
- }
- ]
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "thresholds"
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- }
- ]
- },
- "unit": "percent"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 4,
- "x": 8,
- "y": 8
- },
- "id": 7,
- "options": {
- "orientation": "auto",
- "reduceOptions": {
- "calcs": [
- "lastNotNull"
- ],
- "fields": "",
- "values": false
- },
- "showThresholdLabels": false,
- "showThresholdMarkers": true
- },
- "pluginVersion": "10.0.0",
- "title": "Average KV Cache Hit Rate",
- "type": "gauge",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "100 * avg(llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"})",
- "legendFormat": "__auto",
- "range": true,
- "refId": "A"
- }
- ]
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 20,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- }
- ]
- },
- "unit": "none"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 8
- },
- "id": 5,
- "options": {
- "legend": {
- "calcs": [
- "mean",
- "max"
- ],
- "displayMode": "table",
- "placement": "right",
- "showLegend": true
- },
- "tooltip": {
- "mode": "multi",
- "sort": "none"
- }
- },
- "title": "Load Average & Standard Deviation",
- "type": "timeseries",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "llm_load_avg{component=\"$component\", endpoint=\"$endpoint\"}",
- "legendFormat": "Average",
- "range": true,
- "refId": "A"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "llm_load_std{component=\"$component\", endpoint=\"$endpoint\"}",
- "hide": false,
- "legendFormat": "StdDev",
- "range": true,
- "refId": "B"
- }
- ]
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 20,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- }
- ]
- },
- "unit": "percent"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 16
- },
- "id": 8,
- "options": {
- "legend": {
- "calcs": [
- "mean",
- "max"
- ],
- "displayMode": "table",
- "placement": "right",
- "showLegend": true
- },
- "tooltip": {
- "mode": "multi",
- "sort": "none"
- }
- },
- "title": "KV Cache Hit Rate by Worker",
- "type": "timeseries",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "100 * llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"}",
- "legendFormat": "Worker {{worker_id}}",
- "range": true,
- "refId": "A"
- }
- ]
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 20,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- }
- ]
- },
- "unit": "percent"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 16
- },
- "id": 9,
- "options": {
- "legend": {
- "calcs": [
- "mean",
- "max"
- ],
- "displayMode": "table",
- "placement": "right",
- "showLegend": true
- },
- "tooltip": {
- "mode": "multi",
- "sort": "none"
- }
- },
- "title": "Average KV Cache Hit Rate",
- "type": "timeseries",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "avg(100 * llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"})",
- "legendFormat": "Average Hit Rate",
- "range": true,
- "refId": "A"
- }
- ]
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "palette-classic"
- },
- "custom": {
- "axisCenteredZero": false,
- "axisColorMode": "text",
- "axisLabel": "",
- "axisPlacement": "auto",
- "barAlignment": 0,
- "drawStyle": "line",
- "fillOpacity": 20,
- "gradientMode": "none",
- "hideFrom": {
- "legend": false,
- "tooltip": false,
- "viz": false
- },
- "lineInterpolation": "smooth",
- "lineWidth": 2,
- "pointSize": 5,
- "scaleDistribution": {
- "type": "linear"
- },
- "showPoints": "never",
- "spanNulls": false,
- "stacking": {
- "group": "A",
- "mode": "none"
- },
- "thresholdsStyle": {
- "mode": "off"
- }
- },
- "mappings": [],
- "thresholds": {
- "mode": "absolute",
- "steps": [
- {
- "color": "green",
- "value": null
- }
- ]
- },
- "unit": "none"
- },
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 24,
- "x": 0,
- "y": 24
- },
- "id": 6,
- "options": {
- "legend": {
- "calcs": [
- "mean",
- "max"
- ],
- "displayMode": "table",
- "placement": "right",
- "showLegend": true
- },
- "tooltip": {
- "mode": "multi",
- "sort": "none"
- }
- },
- "title": "Available Resources",
- "type": "timeseries",
- "targets": [
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "sum(llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"} - llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"})",
- "legendFormat": "Available KV Blocks",
- "range": true,
- "refId": "A"
- },
- {
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "editorMode": "code",
- "expr": "sum(llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"} - llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"})",
- "hide": false,
- "legendFormat": "Available Request Slots",
- "range": true,
- "refId": "B"
- }
- ]
- }
- ],
- "refresh": "2s",
- "schemaVersion": 38,
- "style": "dark",
- "tags": [
- "llm",
- "metrics"
- ],
- "templating": {
- "list": [
- {
- "current": {
- "selected": false,
- "text": "component",
- "value": "vllm"
- },
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "definition": "label_values(llm_kv_blocks_active, component)",
- "hide": 0,
- "includeAll": false,
- "label": "Component",
- "multi": false,
- "name": "component",
- "options": [],
- "query": {
- "query": "label_values(llm_kv_blocks_active, component)",
- "refId": "StandardVariableQuery"
- },
- "refresh": 1,
- "regex": "",
- "skipUrlSync": false,
- "sort": 0,
- "type": "query"
- },
- {
- "current": {
- "selected": false,
- "text": "endpoint",
- "value": "load_metrics"
- },
- "datasource": {
- "type": "prometheus",
- "uid": "prometheus"
- },
- "definition": "label_values(llm_kv_blocks_active{component=\"$component\"}, endpoint)",
- "hide": 0,
- "includeAll": false,
- "label": "Endpoint",
- "multi": false,
- "name": "endpoint",
- "options": [],
- "query": {
- "query": "label_values(llm_kv_blocks_active{component=\"$component\"}, endpoint)",
- "refId": "StandardVariableQuery"
- },
- "refresh": 1,
- "regex": "",
- "skipUrlSync": false,
- "sort": 0,
- "type": "query"
- }
- ]
- },
- "time": {
- "from": "now-5m",
- "to": "now"
- },
- "timepicker": {},
- "timezone": "",
- "title": "LLM Worker Metrics",
- "uid": "llm-worker-metrics",
- "version": 1,
- "weekStart": ""
-}
\ No newline at end of file
diff --git a/deploy/metrics/prometheus.yml b/deploy/metrics/prometheus.yml
index 52b2c9a44b..74600e6b37 100644
--- a/deploy/metrics/prometheus.yml
+++ b/deploy/metrics/prometheus.yml
@@ -33,13 +33,13 @@ scrape_configs:
static_configs:
- targets: ['dcgm-exporter:9401'] # on the "monitoring" network
- # This is a demo service that needs to be launched manually. See components/metrics/README.md
- # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 8080/tcp
- # You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8080
+ # This is a demo service that needs to be launched manually
+ # Note that you may need to open this port in your host's firewall. On Ubuntu: sudo ufw allow 8000/tcp
+ # You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8000
- job_name: 'dynamo-frontend'
scrape_interval: 10s
static_configs:
- - targets: ['host.docker.internal:8080'] # on the "monitoring" network
+ - targets: ['host.docker.internal:8000'] # on the "monitoring" network
# Launch via: DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 dynamo. ...
# If you want to update the scrape_interval, you may want to also update component.rs's MAX_DELAY
@@ -48,15 +48,6 @@ scrape_configs:
static_configs:
- targets: ['host.docker.internal:8081']
- # DEPRECATED: This metrics aggregation service is being deprecated in favor of MetricsRegistry
- # The new system uses the 'dynamo-backend' job above instead of this separate service
- # This is another demo aggregator that needs to be launched manually. See components/metrics/README.md
- # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 9091/tcp
- - job_name: 'metrics-aggregation-service'
- scrape_interval: 2s
- static_configs:
- # - targets: ['localhost:9091'] # metrics aggregation service on host
- - targets: ['host.docker.internal:9091'] # metrics aggregation service on host
# KVBM leader related metrics
- job_name: 'kvbm-leader-metrics'