diff --git a/Cargo.lock b/Cargo.lock index 83994db565..2aca376ba2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4642,25 +4642,6 @@ dependencies = [ "paste", ] -[[package]] -name = "metrics" -version = "0.5.1" -dependencies = [ - "axum 0.8.4", - "clap 4.5.48", - "dynamo-llm", - "dynamo-runtime", - "futures", - "prometheus", - "rand 0.9.2", - "reqwest 0.12.23", - "serde", - "serde_json", - "thiserror 2.0.16", - "tokio", - "tracing", -] - [[package]] name = "mime" version = "0.3.17" diff --git a/Cargo.toml b/Cargo.toml index 3512e76bc1..48c07b1f96 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,6 @@ [workspace] members = [ - "components/metrics", "launch/dynamo-run", "lib/llm", "lib/runtime", @@ -18,7 +17,6 @@ members = [ # - launch/dynamo-run # - lib/engines/* default-members = [ - "components/metrics", "lib/llm", "lib/runtime", "lib/tokens", diff --git a/components/metrics/Cargo.toml b/components/metrics/Cargo.toml deleted file mode 100644 index 7167a3ee37..0000000000 --- a/components/metrics/Cargo.toml +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -[package] -name = "metrics" -version.workspace = true -edition.workspace = true -authors.workspace = true -license.workspace = true -homepage.workspace = true -repository.workspace = true - -[dependencies] -dynamo-llm = { workspace = true } -dynamo-runtime = { workspace = true } - -futures = { workspace = true } -prometheus = { workspace = true } -rand = { workspace = true } -serde = { workspace = true } -serde_json = { workspace = true } -thiserror = { workspace = true } -tokio = { workspace = true } -tracing = { workspace = true } - -axum = { version = "0.8" } -clap = { version = "4.5", features = ["derive", "env"] } -reqwest = { version = "0.12.22", default-features = false, features = ["json", "rustls-tls"] } diff --git a/components/metrics/README.md b/components/metrics/README.md deleted file mode 100644 index 7261e387e3..0000000000 --- a/components/metrics/README.md +++ /dev/null @@ -1,191 +0,0 @@ -# Metrics - -⚠️ **DEPRECATION NOTICE** ⚠️ - -**This `metrics` component is unmaintained and being deprecated.** - -The deprecated `metrics` component is being replaced by the **`MetricsRegistry`** built-in functionality that is now available directly in the `DistributedRuntime` framework. The `MetricsRegistry` provides: - -**For new projects and existing deployments, please migrate to using `MetricsRegistry` instead of this component.** - -This component may be migrated to the MetricsRegistry in the future. - -**📖 See the [Dynamo MetricsRegistry Guide](../../docs/guides/metrics.md) for detailed information on using the new metrics system.** - ---- - -The deprecated `metrics` component is a utility for collecting, aggregating, and publishing metrics from a Dynamo deployment, but it is unmaintained and being deprecated in favor of `MetricsRegistry`. - -**Note**: This is a demo implementation. The deprecated `metrics` component is no longer under active development. -- In this demo the metrics names use the prefix "llm", but in production they will be prefixed with "dynamo" (e.g., the HTTP `/metrics` endpoint will serve metrics with "dynamo" prefixes) - -
- Dynamo Metrics Dashboard -
- -## Quickstart - -To start the deprecated `metrics` component, simply point it at the `namespace/component/endpoint` -trio for the Dynamo workers that you're interested in monitoring metrics on. - -This will: -1. Collect statistics from workers associated with that `namespace/component/endpoint` -2. Postprocess and aggregate those statistics across the workers -3. Publish them on a Prometheus-compatible metrics endpoint - -For example: -```bash -# Default namespace is "dynamo", but can be configured with --namespace -# For more detailed output, try setting the env var: DYN_LOG=debug -metrics --component MyComponent --endpoint my_endpoint - -# 2025-03-17T00:07:05.202558Z INFO metrics: Scraping endpoint dynamo/MyComponent/my_endpoint for stats -# 2025-03-17T00:07:05.202955Z INFO metrics: Prometheus metrics server started at 0.0.0.0:9091/metrics -# ... -``` - -With no matching endpoints running to collect stats from, you should see warnings in the logs: -```bash -2025-03-17T00:07:06.204756Z WARN metrics: No endpoints found matching dynamo/MyComponent/my_endpoint -``` - -After a worker with a matching endpoint gets started, the endpoint -will get automatically discovered and the warnings will stop. - -## Workers - -The deprecated `metrics` component needs running workers to gather metrics from, -so below are some examples of workers and how they can be monitored. - -### Mock Worker - -To try out how the deprecated `metrics` component works, there is a demo Rust-based -[mock worker](src/bin/mock_worker.rs) that provides sample data through two mechanisms: -1. Exposes a stats handler at `dynamo/MyComponent/my_endpoint` that responds to polling requests (from the deprecated `metrics` component) with randomly generated `ForwardPassMetrics` data -2. Publishes mock `KVHitRateEvent` data every second to demonstrate event-based metrics - -Step 1: Launch a mock workers via the following command (if already built): -```bash -# or build/run from source: DYN_LOG=DEBUG cargo run --bin mock_worker -mock_worker - -# 2025-03-16T23:49:28.101668Z INFO mock_worker: Starting Mock Worker on Endpoint: dynamo/MyComponent/my_endpoint -``` - -Step 2: Monitor the metrics of these mock workers, and prepare its own Prometheus endpoint at -port 9091 (a default, when --port is not specified) on /metrics: -```bash -metrics --component MyComponent --endpoint my_endpoint -``` - -### Real Worker - -To run a more realistic deployment to gather metrics: - -```bash -python -m dynamo.frontend & -python -m dynamo.vllm --model-path -``` - -Then, to monitor the metrics of these VllmWorkers, run: -```bash -metrics --component backend --endpoint load_metrics -``` - -**NOTE**: `load_metrics` is currently a -[hard-coded](https://github.com/ai-dynamo/dynamo/blob/d5220c7b1151372ba3d2a061c7d0a7ed72724789/lib/llm/src/kv_router/publisher.rs#L108) -endpoint name used for python-based workers that register a `WorkerMetricsPublisher`. - -## Visualization - -To visualize the metrics being exposed on the Prometheus endpoint, -see the Prometheus and Grafana configurations in -[deploy/metrics](../../deploy/metrics): -```bash -docker compose -f deploy/docker-compose.yml --profile metrics up -d -``` - -## Metrics Collection Modes - -The deprecated `metrics` component supports two modes for exposing metrics in a Prometheus format: - -### Pull Mode (Default) - -When running in pull mode (the default), the deprecated `metrics` component will expose a -Prometheus metrics endpoint on the specified host and port that a -Prometheus server or curl client can pull from: - -```bash -# Start metrics server on default host (0.0.0.0) and port (9091) -metrics --component MyComponent --endpoint my_endpoint - -# Or specify a custom port -metrics --component MyComponent --endpoint my_endpoint --port 9092 -``` - -In pull mode: -- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1") -- The `--port` parameter specifies which port the HTTP server will listen on - -You can then query the metrics using: -```bash -curl localhost:9091/metrics - -# # HELP llm_kv_blocks_active Active KV cache blocks -# # TYPE llm_kv_blocks_active gauge -# llm_kv_blocks_active{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033398"} 40 -# llm_kv_blocks_active{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033401"} 2 -# # HELP llm_kv_blocks_total Total KV cache blocks -# # TYPE llm_kv_blocks_total gauge -# llm_kv_blocks_total{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033398"} 100 -# llm_kv_blocks_total{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033401"} 100 -``` - -### Push Mode - -For ephemeral or batch jobs, or when metrics need to be pushed through a firewall, -you can use Push mode. In this mode, the deprecated `metrics` component will periodically push -metrics to an externally hosted -[Prometheus PushGateway](https://prometheus.io/docs/instrumenting/pushing/): - -Start a prometheus push gateway service via docker: -```bash -docker run --rm -d -p 9091:9091 --name pushgateway prom/pushgateway -``` - -Start the deprecated `metrics` component in `--push` mode, specifying the host and port of your PushGateway: -```bash -# Push metrics to a Prometheus PushGateway every --push-interval seconds -metrics \ - --component MyComponent \ - --endpoint my_endpoint \ - --host 127.0.0.1 \ - --port 9091 \ - --push -``` - -When using Push mode: -- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1") - that the Prometheus PushGateway is running on -- The `--port` parameter specifies the port of the Prometheus PushGateway -- The push interval can be configured with `--push-interval` (default: 2 seconds) -- A default job name of "dynamo_metrics" is used for the Prometheus job label -- Metrics persist in the PushGateway until explicitly deleted -- Prometheus should be configured to scrape the PushGateway with `honor_labels: true` - -To view the metrics hosted on the PushGateway: -```bash -# View all metrics -# curl http://:/metrics -curl 127.0.0.1:9091/metrics -``` -## Building/Running from Source - -For easy iteration while making edits to the deprecated `metrics` component, you can use `cargo run` -to build and run with your local changes: - -```bash -cargo run --bin metrics -- --component MyComponent --endpoint my_endpoint -``` - - diff --git a/components/metrics/images/dynamo_metrics_grafana.png b/components/metrics/images/dynamo_metrics_grafana.png deleted file mode 100644 index a63915f940..0000000000 Binary files a/components/metrics/images/dynamo_metrics_grafana.png and /dev/null differ diff --git a/components/metrics/src/bin/mock_worker.rs b/components/metrics/src/bin/mock_worker.rs deleted file mode 100644 index 19881c983b..0000000000 --- a/components/metrics/src/bin/mock_worker.rs +++ /dev/null @@ -1,161 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -use dynamo_llm::kv_router::{ - KV_HIT_RATE_SUBJECT, - protocols::{ForwardPassMetrics, KvStats, WorkerStats}, - scheduler::KVHitRateEvent, -}; -use dynamo_runtime::{ - DistributedRuntime, Result, Runtime, Worker, - component::{Namespace, service::EndpointStats}, - logging, - pipeline::{ - AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, ResponseStream, SingleIn, - async_trait, network::Ingress, - }, - protocols::annotated::Annotated, - stream, - traits::events::EventPublisher, -}; -use rand::Rng; -use std::sync::Arc; -use tokio::time::{Duration, interval}; - -fn main() -> Result<()> { - logging::init(); - let worker = Worker::from_settings()?; - worker.execute(app) -} - -async fn app(runtime: Runtime) -> Result<()> { - let distributed = DistributedRuntime::from_settings(runtime.clone()).await?; - backend(distributed).await -} - -struct MockRequestHandler {} - -impl MockRequestHandler { - fn new() -> Arc { - Arc::new(Self {}) - } -} - -#[async_trait] -impl AsyncEngine, ManyOut>, Error> for MockRequestHandler { - async fn generate(&self, input: SingleIn) -> Result>> { - let (data, ctx) = input.into_parts(); - - let chars = data - .chars() - .map(|c| Annotated::from_data(c.to_string())) - .collect::>(); - - let stream = stream::iter(chars); - - Ok(ResponseStream::new(Box::pin(stream), ctx.context())) - } -} - -// FIXME: These events are just for testing and may not currently be used. -/// Spawns a background task that periodically publishes mock KV hit rate events -async fn mock_event_publisher(namespace: Namespace) { - // NOTE: These events are just for testing, and shouldn't be interpreted - // in correlation with the stats handler's data: - // 1. The worker ID associated with the events here won't match the - // worker ID of the endpoint's service stats handler. - // 2. These events aren't coming through the KV Router, so the metrics won't - // be reflective of the KV Router's performance. - // 3. The data in these events aren't in sync with the stats handler's - // ForwardPassMetrics data, so they may not correlate well. - let worker_id = rand::rng().random_range(1..=1000); - - let mut interval = interval(Duration::from_secs(1)); - loop { - interval.tick().await; - - // Generate random KV hit rate event using a new thread_rng each time - let isl_blocks = rand::rng().random_range(0..=100); - let overlap_blocks = rand::rng().random_range(0..=isl_blocks); - - let event = KVHitRateEvent { - worker_id, - isl_blocks, - overlap_blocks: overlap_blocks as u32, - }; - - if let Err(e) = namespace.publish(KV_HIT_RATE_SUBJECT, &event).await { - tracing::warn!("Failed to publish KV hit rate event: {e}"); - } else { - tracing::debug!( - "Published KV hit rate event: worker_id={worker_id}, isl_blocks={isl_blocks}, overlap_blocks={overlap_blocks}, hit_rate={:.2}%", - (overlap_blocks as f64 / isl_blocks as f64) * 100.0 - ); - } - } -} - -/// Generates mock forward pass metrics for stats handler -fn mock_stats_handler(_stats: EndpointStats) -> serde_json::Value { - let request_total_slots = 100; - let request_active_slots = rand::rng().random_range(0..=request_total_slots); - let kv_total_blocks = 100; - let kv_active_blocks = rand::rng().random_range(0..=kv_total_blocks); - let num_requests_waiting = rand::rng().random_range(0..=100); - let gpu_cache_usage_perc = rand::rng().random_range(0.0..=1.0); - let gpu_prefix_cache_hit_rate = rand::rng().random_range(0.0..=1.0); - - let worker_stats = WorkerStats { - data_parallel_rank: None, // Default for backwards compatibility - request_active_slots, - request_total_slots, - num_requests_waiting, - }; - - let kv_stats = KvStats { - kv_active_blocks, - kv_total_blocks, - gpu_cache_usage_perc, - gpu_prefix_cache_hit_rate, - }; - - let spec_decode_stats = None; - - let stats = ForwardPassMetrics { - worker_stats, - kv_stats, - spec_decode_stats, - }; - tracing::info!("Stats: {stats:?}"); - serde_json::to_value(stats).unwrap() -} - -async fn backend(runtime: DistributedRuntime) -> Result<()> { - let namespace = runtime.namespace("dynamo")?; - // we must first create a service, then we can attach one more more endpoints - let component = namespace - .component("MyComponent")? - .service_builder() - .create() - .await?; - let endpoint = component.endpoint("my_endpoint"); - tracing::info!("Starting Mock Worker on Endpoint: {}", endpoint.path()); - - // Spawn background task for publishing KV hit rate events - let namespace_clone = namespace.clone(); - tokio::spawn(async move { - mock_event_publisher(namespace_clone).await; - }); - - // Attach an ingress to the engine - let ingress = Ingress::for_engine(MockRequestHandler::new())?; - - // Make the ingress discoverable via a component service - endpoint - .endpoint_builder() - // Dummy stats handler to demonstrate how to attach a custom stats handler - .stats_handler(mock_stats_handler) - .handler(ingress) - .start() - .await -} diff --git a/components/metrics/src/lib.rs b/components/metrics/src/lib.rs deleted file mode 100644 index c18b671dfb..0000000000 --- a/components/metrics/src/lib.rs +++ /dev/null @@ -1,595 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Library functions for the metrics application. -//! -//! This library provides functionality to expose Prometheus metrics either through a local HTTP server -//! or by pushing to a Prometheus PushGateway. -//! -//! # Examples -//! -//! ## Using the metrics pull mode -//! ```no_run -//! use metrics::{PrometheusMetricsCollector, MetricsMode}; -//! -//! #[tokio::main] -//! async fn main() -> Result<(), Box> { -//! let mut collector = PrometheusMetricsCollector::new()?; -//! -//! // Start a metrics server with default values -//! collector.start(MetricsMode::default())?; -//! -//! // Or explicitly specify values -//! collector.start(MetricsMode::Pull { -//! host: "127.0.0.1".to_string(), -//! port: 9090, -//! })?; -//! -//! // Or use the convenience constructor -//! collector.start(MetricsMode::new_pull())?; -//! -//! // Your application code here -//! tokio::signal::ctrl_c().await?; -//! -//! // Stop the metrics server gracefully -//! collector.stop(); -//! Ok(()) -//! } -//! ``` -//! -//! ## Using the Push mode -//! ```no_run -//! use metrics::{PrometheusMetricsCollector, MetricsMode}; -//! -//! #[tokio::main] -//! async fn main() -> Result<(), Box> { -//! let mut collector = PrometheusMetricsCollector::new()?; -//! -//! // Start pushing metrics to a Prometheus PushGateway with default values -//! collector.start(MetricsMode::new_push())?; -//! -//! // Or explicitly specify values -//! collector.start(MetricsMode::Push { -//! host: "127.0.0.1".to_string(), -//! port: 9091, -//! job: "custom_job".to_string(), -//! interval: 5, // Push every 5 seconds -//! })?; -//! -//! // Your application code here -//! tokio::signal::ctrl_c().await?; -//! -//! // Stop pushing metrics gracefully -//! collector.stop(); -//! Ok(()) -//! } - -use axum::{Router, routing::get}; -use prometheus::{Encoder, TextEncoder, register_counter_vec, register_gauge_vec}; -use reqwest::Client; -use serde::{Deserialize, Serialize}; -use std::net::SocketAddr; -use std::time::Duration as StdDuration; - -use dynamo_llm::kv_router::protocols::{ForwardPassMetrics, LoadMetrics}; -use dynamo_llm::kv_router::scoring::Endpoint; -use dynamo_llm::kv_router::scoring::ProcessedEndpoints; - -use dynamo_runtime::{ - Result, distributed::Component, error, service::EndpointInfo, utils::Duration, -}; - -/// Configuration for metrics collection mode -#[derive(Debug, Clone)] -pub enum MetricsMode { - /// Host a Prometheus metrics server for pull-based collection - Pull { - /// Host to listen on (e.g. "0.0.0.0") - host: String, - /// Port to listen on (e.g. 9091) - port: u16, - }, - /// Push to a Prometheus PushGateway - Push { - /// PushGateway host (e.g. "http://localhost") - host: String, - /// PushGateway port (e.g. 9091) - port: u16, - /// Job name for the metrics - job: String, - /// Push interval in seconds - interval: u64, - }, -} - -impl Default for MetricsMode { - fn default() -> Self { - Self::new_pull() - } -} - -impl MetricsMode { - /// Create a new Pull mode with default values - pub fn new_pull() -> Self { - Self::Pull { - host: "0.0.0.0".to_string(), - port: 9091, - } - } - - /// Create a new Push mode with default values - pub fn new_push() -> Self { - Self::Push { - host: "127.0.0.1".to_string(), - port: 9091, - job: "dynamo_metrics".to_string(), - interval: 2, - } - } -} - -/// Configuration for LLM worker load capacity metrics -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LLMWorkerLoadCapacityConfig { - pub component_name: String, - pub endpoint_name: String, - pub model_name: Option, -} - -/// Metrics collector for exposing metrics to prometheus/grafana -pub struct PrometheusMetricsCollector { - metrics: PrometheusMetrics, - mode: Option, - shutdown_tx: Option>, -} - -impl PrometheusMetricsCollector { - pub fn new() -> Result { - Ok(Self { - metrics: PrometheusMetrics::new()?, - mode: None, - shutdown_tx: None, - }) - } - - /// Start metrics collection with the specified mode - pub fn start(&mut self, mode: MetricsMode) -> Result<()> { - // Store the mode - self.mode = Some(mode.clone()); - - match mode { - MetricsMode::Pull { host, port } => self.start_pull_mode(host, port), - MetricsMode::Push { - host, - port, - job, - interval, - } => self.start_push_mode(host, port, job, interval), - } - } - - /// Stop metrics collection - pub fn stop(&mut self) { - if let Some(tx) = self.shutdown_tx.take() { - let _ = tx.send(()); - } - } - - /// Start a metrics server for pull-based collection on the specified port - fn start_pull_mode(&mut self, host: String, port: u16) -> Result<()> { - // Create an axum router with a metrics endpoint - let app = Router::new().route( - "/metrics", - get(|| async { - // Gather and encode metrics - let encoder = TextEncoder::new(); - let mut buffer = Vec::new(); - encoder.encode(&prometheus::gather(), &mut buffer).unwrap(); - String::from_utf8(buffer).unwrap() - }), - ); - - // Create a socket address to listen on - let ip_addr = host.parse().map_err(|e| { - error!("Failed to parse host '{}' as IP address: {}. Use a valid IPv4 or IPv6 address (e.g. '0.0.0.0' or '127.0.0.1')", host, e) - })?; - let addr = SocketAddr::new(ip_addr, port); - - // Create shutdown channel - let (tx, rx) = tokio::sync::oneshot::channel(); - self.shutdown_tx = Some(tx); - - // Spawn the server in a background task - tokio::spawn(async move { - let listener = tokio::net::TcpListener::bind(addr) - .await - .unwrap_or_else(|_| panic!("could not bind to address: {addr}")); - let server = axum::serve(listener, app); - - // Create a future that completes when shutdown signal is received - let shutdown_future = async { - rx.await.ok(); - }; - - // Run the server with graceful shutdown - tokio::select! { - result = server => { - if let Err(e) = result { - tracing::error!("Metrics server error: {}", e); - } - }, - _ = shutdown_future => { - tracing::info!("Metrics server shutting down gracefully"); - }, - } - }); - - tracing::info!("Prometheus metrics server started at {addr}/metrics"); - Ok(()) - } - - /// Start pushing metrics to a Prometheus PushGateway - fn start_push_mode( - &mut self, - host: String, - port: u16, - job: String, - interval: u64, - ) -> Result<()> { - // Create shutdown channel - let (tx, mut rx) = tokio::sync::oneshot::channel(); - self.shutdown_tx = Some(tx); - - // Create HTTP client - let client = Client::new(); - let url = format!("http://{host}:{port}/metrics/job/{job}"); - let url_clone = url.clone(); - let interval_duration = StdDuration::from_secs(interval); - - // Spawn background task to periodically push metrics - tokio::spawn(async move { - let mut interval = tokio::time::interval(interval_duration); - - loop { - tokio::select! { - _ = interval.tick() => { - // Gather and encode metrics - let encoder = TextEncoder::new(); - let mut buffer = Vec::new(); - if let Err(e) = encoder.encode(&prometheus::gather(), &mut buffer) { - tracing::error!("Failed to encode metrics: {}", e); - continue; - } - - // Push metrics to the gateway - match client.post(&url) - .header("Content-Type", encoder.format_type()) - .body(buffer) - .send() - .await - { - Ok(response) => { - if response.status().is_success() { - tracing::debug!("Successfully pushed metrics to PushGateway"); - } else { - tracing::error!( - "Failed to push metrics to PushGateway. Status: {}, Error: {:?}", - response.status(), - response.text().await - ); - } - } - Err(e) => { - tracing::error!("Failed to push metrics to PushGateway: {}", e); - } - } - } - _ = &mut rx => { - tracing::info!("Stopping metrics push task"); - break; - } - } - } - }); - - tracing::info!( - "Started pushing metrics to PushGateway at '{url_clone}' with job name '{job}'" - ); - Ok(()) - } - - /// Update metrics with current values - pub fn update(&mut self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) { - self.metrics.update(config, processed); - } - - /// Update KV hit rate metrics - pub fn update_kv_hit_rate( - &mut self, - config: &LLMWorkerLoadCapacityConfig, - worker_id: i64, - isl_blocks: usize, - overlap_blocks: usize, - ) { - self.metrics - .update_kv_hit_rate(config, worker_id, isl_blocks, overlap_blocks); - } -} - -/// Prometheus metrics collection -pub struct PrometheusMetrics { - kv_blocks_active: prometheus::GaugeVec, - kv_blocks_total: prometheus::GaugeVec, - requests_active: prometheus::GaugeVec, - requests_total: prometheus::GaugeVec, - load_avg: prometheus::GaugeVec, - load_std: prometheus::GaugeVec, - // KV hit rate metrics - kv_hit_rate_percent: prometheus::GaugeVec, - // FIXME: These are currently unused outside of mock_worker - kv_hit_rate_isl_blocks: prometheus::CounterVec, - kv_hit_rate_overlap_blocks: prometheus::CounterVec, -} - -impl PrometheusMetrics { - /// Initialize all metrics - fn new() -> Result { - Ok(Self { - kv_blocks_active: register_gauge_vec!( - "llm_kv_blocks_active", - "Active KV cache blocks", - &["component", "endpoint", "worker_id"] - )?, - kv_blocks_total: register_gauge_vec!( - "llm_kv_blocks_total", - "Total KV cache blocks", - &["component", "endpoint", "worker_id"] - )?, - requests_active: register_gauge_vec!( - "llm_requests_active_slots", - "Active request slots", - &["component", "endpoint", "worker_id"] - )?, - requests_total: register_gauge_vec!( - "llm_requests_total_slots", - "Total request slots", - &["component", "endpoint", "worker_id"] - )?, - load_avg: register_gauge_vec!( - "llm_load_avg", - "Average load across workers", - &["component", "endpoint"] - )?, - load_std: register_gauge_vec!( - "llm_load_std", - "Load standard deviation across workers", - &["component", "endpoint"] - )?, - // KV hit rate (ForwardPassMetrics) - kv_hit_rate_percent: register_gauge_vec!( - "llm_kv_hit_rate_percent", - "KV hit rate percentage per worker", - &["component", "endpoint", "worker_id"] - )?, - // FIXME: Cleanup/remove event based metrics after finalizaing - // metrics collection approach with vllm/trtllm workers. - // Event-based KV hit rate metrics (not currently used outside mock worker) - kv_hit_rate_isl_blocks: register_counter_vec!( - "llm_kv_hit_rate_isl_blocks", - "Cumulative count of ISL blocks in KV hit rate events", - &["component", "endpoint", "worker_id"] - )?, - kv_hit_rate_overlap_blocks: register_counter_vec!( - "llm_kv_hit_rate_overlap_blocks", - "Cumulative count of overlapping blocks in KV hit rate events", - &["component", "endpoint", "worker_id"] - )?, - }) - } - - /// Helper method to set a gauge with worker-specific labels (3 labels) - fn set_worker_gauge( - &self, - gauge: &prometheus::GaugeVec, - config: &LLMWorkerLoadCapacityConfig, - worker_id: &String, - value: f64, - ) { - gauge - .with_label_values(&[&config.component_name, &config.endpoint_name, worker_id]) - .set(value); - } - - /// Helper method to increment a counter with worker-specific labels (3 labels) - fn increment_worker_counter( - &self, - counter: &prometheus::CounterVec, - config: &LLMWorkerLoadCapacityConfig, - worker_id: &String, - value: f64, - ) { - counter - .with_label_values(&[&config.component_name, &config.endpoint_name, worker_id]) - .inc_by(value); - } - - /// Helper method to set a gauge with component/endpoint labels only (2 labels) - fn set_endpoint_gauge( - &self, - gauge: &prometheus::GaugeVec, - config: &LLMWorkerLoadCapacityConfig, - value: f64, - ) { - gauge - .with_label_values(&[&config.component_name, &config.endpoint_name]) - .set(value); - } - - /// Update metrics with current values - fn update(&self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) { - // Update per-worker metrics - for (worker_id, endpoint) in processed.endpoints.iter() { - let worker_id = worker_id.to_string(); - let load_metrics = endpoint.data.clone(); - let LoadMetrics::EngineLoadMetrics(metrics) = load_metrics else { - panic!("Can only update with ForwardPassMetrics"); - }; - - self.set_worker_gauge( - &self.kv_blocks_active, - config, - &worker_id, - metrics.kv_stats.kv_active_blocks as f64, - ); - self.set_worker_gauge( - &self.kv_blocks_total, - config, - &worker_id, - metrics.kv_stats.kv_total_blocks as f64, - ); - self.set_worker_gauge( - &self.requests_active, - config, - &worker_id, - metrics.worker_stats.request_active_slots as f64, - ); - self.set_worker_gauge( - &self.requests_total, - config, - &worker_id, - metrics.worker_stats.request_total_slots as f64, - ); - self.set_worker_gauge( - &self.kv_hit_rate_percent, - config, - &worker_id, - metrics.kv_stats.gpu_prefix_cache_hit_rate as f64, - ); - } - - // Update aggregate metrics - self.set_endpoint_gauge(&self.load_avg, config, processed.load_avg); - self.set_endpoint_gauge(&self.load_std, config, processed.load_std); - } - - /// Update KV hit rate metrics - pub fn update_kv_hit_rate( - &self, - config: &LLMWorkerLoadCapacityConfig, - worker_id: i64, - isl_blocks: usize, - overlap_blocks: usize, - ) { - let worker_id_str = worker_id.to_string(); - - // Increment the ISL blocks and overlap blocks counters - self.increment_worker_counter( - &self.kv_hit_rate_isl_blocks, - config, - &worker_id_str, - isl_blocks as f64, - ); - - self.increment_worker_counter( - &self.kv_hit_rate_overlap_blocks, - config, - &worker_id_str, - overlap_blocks as f64, - ); - - // TODO: The cumulative hit rate percentage can probably be computed by consumers - // of Prometheus metrics like Grafana instead, but we'll compute it here for now - // for convenient debugging/logging. - // Calculate and set the cumulative hit rate percentage - let cumulative_isl = self - .kv_hit_rate_isl_blocks - .with_label_values(&[ - &config.component_name, - &config.endpoint_name, - &worker_id_str, - ]) - .get(); - - let cumulative_overlap = self - .kv_hit_rate_overlap_blocks - .with_label_values(&[ - &config.component_name, - &config.endpoint_name, - &worker_id_str, - ]) - .get(); - - if cumulative_isl > 0.0 { - let cumulative_hit_rate = (cumulative_overlap / cumulative_isl) * 100.0; - tracing::debug!( - "Estimated Cumulative KV hit rate: {cumulative_hit_rate:.2}% (Overlap: {cumulative_overlap} / ISL: {cumulative_isl})" - ); - } - } -} - -/// Collect endpoints from a component -pub async fn collect_endpoints( - component: &Component, - subject: &str, - timeout: Duration, -) -> Result> { - // Collect stats from each backend - let stream = component.scrape_stats(timeout).await?; - - // Filter the stats by the service subject - let endpoints = stream - .into_endpoints() - .filter(|e| e.subject.starts_with(subject)) - .collect::>(); - tracing::debug!("Endpoints: {endpoints:?}"); - Ok(endpoints) -} - -/// Extract metrics from endpoints -pub fn extract_metrics(endpoints: &[EndpointInfo]) -> Vec { - let endpoint_data = endpoints.iter().map(|e| e.data.clone()).collect::>(); - - // Extract ForwardPassMetrics objects from endpoint services - let metrics: Vec = endpoint_data - .iter() - .filter_map(|e| { - let metrics_data = e.as_ref()?; - - match metrics_data.clone().decode::() { - Ok(stats) => Some(stats), - Err(err) => { - tracing::error!( - "Failed to decode ForwardPassMetrics data: {}. Raw data: {:?}", - err, - metrics_data - ); - None - } - } - }) - .collect(); - tracing::debug!("Metrics: {metrics:?}"); - - metrics -} - -/// Create ProcessedEndpoints from metrics and endpoints -pub fn postprocess_metrics( - metrics: &[ForwardPassMetrics], - endpoints: &[EndpointInfo], -) -> ProcessedEndpoints { - let processed_endpoints: Vec = metrics - .iter() - .zip(endpoints.iter()) - .filter_map(|(m, e)| { - e.id().ok().map(|id| Endpoint { - name: format!("worker-{id}"), - subject: e.subject.clone(), - data: LoadMetrics::EngineLoadMetrics(m.clone()), - }) - }) - .collect(); - - ProcessedEndpoints::new(processed_endpoints) -} diff --git a/components/metrics/src/main.rs b/components/metrics/src/main.rs deleted file mode 100644 index 873a08a721..0000000000 --- a/components/metrics/src/main.rs +++ /dev/null @@ -1,268 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Metrics is a metrics aggregator designed to operate within a namespace and collect -//! metrics from all workers. -//! -//! Metrics will collect for now: -//! -//! - LLM Worker Load:Capacity -//! - These metrics will be scraped by the LLM NATS Service API's stats request -//! - Request Slots: [Active, Total] -//! - KV Cache Blocks: [Active, Total] -//! - KV Hit Rate: -//! - These metrics will be collected from KV hit rate events published by the KV router -//! - ISL Blocks: Cumulative count of total blocks in all KV hit rate events -//! - Overlap Blocks: Cumulative count of blocks that were already in the KV cache -use clap::Parser; -use dynamo_llm::kv_router::KV_HIT_RATE_SUBJECT; -use dynamo_llm::kv_router::scheduler::KVHitRateEvent; -use dynamo_runtime::{ - DistributedRuntime, ErrorContext, Result, Runtime, Worker, error, logging, - traits::events::{EventPublisher, EventSubscriber}, - utils::{Duration, Instant}, -}; -use futures::stream::StreamExt; -use std::sync::Arc; - -// Import from our library -use metrics::{ - LLMWorkerLoadCapacityConfig, MetricsMode, PrometheusMetricsCollector, collect_endpoints, - extract_metrics, postprocess_metrics, -}; - -/// CLI arguments for the metrics application -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -struct Args { - /// Namespace to operate in and subscribe to events on - #[arg(long, env = "DYN_NAMESPACE", default_value = "dynamo")] - namespace: String, - - /// Component to scrape metrics from - #[arg(long)] - component: String, - - /// Endpoint to scrape metrics from - #[arg(long)] - endpoint: String, - - /// Model name for the target component (optional) - #[arg(long)] - model_name: Option, - - /// Polling interval in seconds for scraping dynamo endpoint stats (minimum 1 second) - #[arg(long, default_value = "1")] - poll_interval: u64, - - /// Host for serving or pushing prometheus metrics (default: 0.0.0.0) - #[arg( - long, - default_value = "0.0.0.0", - help_heading = "Prometheus Metrics Config" - )] - host: String, - - /// Port to run the Prometheus metrics server on (default: 9091) - #[arg( - long, - default_value = "9091", - help_heading = "Prometheus Metrics Config" - )] - port: u16, - - /// Push metrics to an external Prometheus Pushgateway instead of hosting them in-process - #[arg(long, help_heading = "Prometheus Metrics Config")] - push: bool, - - /// Push interval in seconds, when using push mode (minimum 1 second, default: 2) - #[arg(long, default_value = "2", help_heading = "Prometheus Metrics Config")] - push_interval: u64, -} - -fn get_config(args: &Args) -> Result { - if args.component.is_empty() { - return Err(error!("Component name cannot be empty")); - } - - if args.endpoint.is_empty() { - return Err(error!("Endpoint name cannot be empty")); - } - - if args.poll_interval < 1 { - return Err(error!("Polling interval must be at least 1 second")); - } - - if args.push && args.push_interval < 1 { - return Err(error!("Push interval must be at least 1 second")); - } - - Ok(LLMWorkerLoadCapacityConfig { - component_name: args.component.clone(), - endpoint_name: args.endpoint.clone(), - model_name: args.model_name.clone(), - }) -} - -async fn app(runtime: Runtime) -> Result<()> { - let args = Args::parse(); - let config = get_config(&args)?; - tracing::debug!("Config: {config:?}"); - - let drt = DistributedRuntime::from_settings(runtime.clone()).await?; - - let namespace = drt.namespace(args.namespace)?; - let component = namespace.component("count")?; - - // Create unique instance of Count - let key = format!("{}/instance", component.etcd_root()); - tracing::debug!("Creating unique instance of Count at {key}"); - drt.etcd_client() - .expect("Unreachable because of DistributedRuntime::from_settings above") - .kv_create(&key, serde_json::to_vec_pretty(&config)?, None) - .await - .context("Unable to create unique instance of Count; possibly one already exists")?; - - let target_component = namespace.component(&config.component_name)?; - let target_endpoint = target_component.endpoint(&config.endpoint_name); - - let service_path = target_endpoint.path(); - let service_subject = target_endpoint.subject(); - tracing::info!("Scraping endpoint {service_path} for stats"); - - // Safety: DistributedRuntime::from_settings ensures this is Some - let token = drt.primary_lease().unwrap().child_token(); - let event_name = format!("l2c.{}.{}", config.component_name, config.endpoint_name); - - // Initialize Prometheus metrics with the selected mode - let metrics_collector = PrometheusMetricsCollector::new()?; - let metrics_collector = Arc::new(tokio::sync::Mutex::new(metrics_collector)); - - // Start metrics collection in the selected mode - let metrics_mode = if args.push { - MetricsMode::Push { - host: args.host, - port: args.port, - job: "dynamo_push_metrics".to_string(), - interval: args.push_interval, - } - } else { - MetricsMode::Pull { - host: args.host, - port: args.port, - } - }; - - metrics_collector.lock().await.start(metrics_mode)?; - - // TODO: Consider removing event subscription until metrics are more standardized - // Subscribe to KV hit rate events - let kv_hit_rate_subject = KV_HIT_RATE_SUBJECT; - tracing::debug!("Subscribing to KV hit rate events on subject: {kv_hit_rate_subject}"); - - // Clone fields for the event subscription task - let config_clone = config.clone(); - let namespace_clone = namespace.clone(); - let metrics_collector_clone = metrics_collector.clone(); - - // Note: Subscribing to KVHitRateEvent for illustration purposes. They're not used in production. - // Spawn a task to handle KV hit rate events - tokio::spawn(async move { - match namespace_clone.subscribe(kv_hit_rate_subject).await { - Ok(mut subscriber) => { - tracing::debug!("Successfully subscribed to KV hit rate events"); - - while let Some(msg) = subscriber.next().await { - match serde_json::from_slice::(&msg.payload) { - Ok(event) => { - // TODO: Lower to debug - let cache_hit_pct = - (event.overlap_blocks as f64 / event.isl_blocks as f64) * 100.0; - tracing::debug!( - "Received KV hit rate event: worker_id={}, isl_blocks={}, overlap_blocks={}, cache_hit_pct={:.2}%", - event.worker_id, - event.isl_blocks, - event.overlap_blocks, - cache_hit_pct - ); - - // Update metrics with the event data - let mut metrics = metrics_collector_clone.lock().await; - metrics.update_kv_hit_rate( - &config_clone, - event.worker_id, - event.isl_blocks, - event.overlap_blocks as usize, - ); - } - Err(e) => { - tracing::warn!("Failed to deserialize KV hit rate event: {e}"); - } - } - } - - tracing::warn!("KV hit rate event subscription stream ended"); - } - Err(e) => { - tracing::error!("Failed to subscribe to KV hit rate events: {:?}", e); - } - } - }); - - loop { - let next = Instant::now() + Duration::from_secs(args.poll_interval); - - // Collect and process metrics - let scrape_timeout = Duration::from_secs(1); - let endpoints = - collect_endpoints(&target_component, &service_subject, scrape_timeout).await?; - if endpoints.is_empty() { - tracing::warn!("No endpoints found matching {service_path}"); - continue; - } - - let metrics = extract_metrics(&endpoints); - let processed = postprocess_metrics(&metrics, &endpoints); - if processed.endpoints.is_empty() { - tracing::warn!("No metrics found matching {service_path}"); - } else { - tracing::info!("Aggregated metrics: {processed:?}"); - } - - // Update Prometheus metrics - metrics_collector.lock().await.update(&config, &processed); - - // TODO: Enable KV Routers to subscribe to metrics events published here - // for a single view of the aggregated metrics, as opposed to the current - // approach where each KV Router computes and published its own metrics. - // Publish metrics event - namespace.publish(&event_name, &processed).await?; - - // Wait until cancelled or the next tick - match tokio::time::timeout_at(next, token.cancelled()).await { - Ok(_) => break, - Err(_) => continue, - } - } - - Ok(()) -} - -fn main() -> Result<()> { - logging::init(); - let worker = Worker::from_settings()?; - worker.execute(app) -} - -#[cfg(test)] -mod tests { - use super::*; - use std::env; - - #[test] - fn test_namespace_from_env() { - unsafe { env::set_var("DYN_NAMESPACE", "test-namespace") }; - let args = Args::parse_from(["count", "--component", "comp", "--endpoint", "end"]); - assert_eq!(args.namespace, "test-namespace"); - } -} diff --git a/deploy/metrics/README.md b/deploy/metrics/README.md index 474d9ea4f6..be6d823ce2 100644 --- a/deploy/metrics/README.md +++ b/deploy/metrics/README.md @@ -174,7 +174,7 @@ The following configuration files should be present in this directory: - [grafana_dashboards/grafana-dashboard-providers.yml](./grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration - [grafana_dashboards/grafana-dynamo-dashboard.json](./grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics. - [grafana_dashboards/grafana-dcgm-metrics.json](./grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics -- [grafana_dashboards/grafana-llm-metrics.json](./grafana_dashboards/grafana-llm-metrics.json): This file, which is being phased out, contains the Grafana dashboard configuration for LLM-specific metrics. It requires an additional `metrics` component to operate concurrently. A new version is under development. +- [grafana_dashboards/grafana-kvbm-dashboard.json](./grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics ### Metric Name Constants @@ -237,8 +237,6 @@ This centralized approach ensures all Dynamo components use consistent, valid Pr - DCGM Exporter: `http://localhost:9401/metrics` - - Start the [components/metrics](../../components/metrics/README.md) application to begin monitoring for metric events from dynamo workers and aggregating them on a Prometheus metrics endpoint: `http://localhost:9091/metrics`. - - Uncomment the appropriate lines in prometheus.yml to poll port 9091. - Start worker(s) that publishes KV Cache metrics: [lib/runtime/examples/service_metrics/README.md](../../lib/runtime/examples/service_metrics/README.md) can populate dummy KV Cache metrics. ### Configuration @@ -275,7 +273,7 @@ Grafana is pre-configured with: docker compose logs grafana ``` -3. For issues with the legacy metrics component (being phased out), see [components/metrics/README.md](../../components/metrics/README.md) for details on the exposed metrics and troubleshooting steps. +3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection. ## Developer Guide @@ -477,21 +475,6 @@ let requests_total = namespace.create_counter( )?; ``` -## Running the deprecated `components/metrics` program - -⚠️ **DEPRECATION NOTICE** ⚠️ - -When you run the example [components/metrics](../../components/metrics/README.md) program, it exposes a Prometheus /metrics endpoint with the following metrics (defined in [components/metrics/src/lib.rs](../../components/metrics/src/lib.rs)): - -**⚠️ The following `llm_kv_*` metrics are deprecated:** - -- `llm_requests_active_slots`: Active request slots per worker -- `llm_requests_total_slots`: Total available request slots per worker -- `llm_kv_blocks_active`: Active KV blocks per worker -- `llm_kv_blocks_total`: Total KV blocks available per worker -- `llm_kv_hit_rate_percent`: KV Cache hit percent per worker -- `llm_load_avg`: Average load across workers -- `llm_load_std`: Load standard deviation across workers ## Troubleshooting @@ -506,4 +489,4 @@ When you run the example [components/metrics](../../components/metrics/README.md docker compose logs grafana ``` -3. For issues with the legacy metrics component (being phased out), see [components/metrics/README.md](../../components/metrics/README.md) for details on the exposed metrics and troubleshooting steps. +3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection. diff --git a/deploy/metrics/grafana_dashboards/grafana-llm-metrics.json b/deploy/metrics/grafana_dashboards/grafana-llm-metrics.json deleted file mode 100644 index 775d305471..0000000000 --- a/deploy/metrics/grafana_dashboards/grafana-llm-metrics.json +++ /dev/null @@ -1,900 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "copyright": [ - "SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.", - "SPDX-License-Identifier: Apache-2.0", - "Licensed under the Apache License, Version 2.0 (the \"License\");", - "you may not use this file except in compliance with the License.", - "You may obtain a copy of the License at", - "http://www.apache.org/licenses/LICENSE-2.0", - "Unless required by applicable law or agreed to in writing, software", - "distributed under the License is distributed on an \"AS IS\" BASIS,", - "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.", - "See the License for the specific language governing permissions and", - "limitations under the License.", - "", - "DEPRECATION NOTICE:", - "This dashboard uses deprecated llm_kv_* metrics (llm_kv_blocks_active, llm_kv_blocks_total, llm_kv_hit_rate_percent)", - "that are part of the deprecated metrics aggregation service. These metrics will be removed in a future release.", - "Please migrate to the new MetricsRegistry system which provides dynamo_* metrics instead.", - "See docs/guides/metrics.md for migration guidance." - ], - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": 1, - "links": [], - "liveNow": false, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 1, - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "title": "KV Cache Utilization by Worker", - "type": "timeseries", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "100 * llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"} / llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"}", - "legendFormat": "Worker {{worker_id}}", - "range": true, - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 2, - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "title": "Request Slot Utilization by Worker", - "type": "timeseries", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "100 * llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"} / llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"}", - "legendFormat": "Worker {{worker_id}}", - "range": true, - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 4, - "x": 0, - "y": 8 - }, - "id": 3, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "10.0.0", - "title": "Average KV Cache Utilization", - "type": "gauge", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "100 * avg(llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"}) / avg(llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"})", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 4, - "x": 4, - "y": 8 - }, - "id": 4, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "10.0.0", - "title": "Average Request Slot Utilization", - "type": "gauge", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "100 * avg(llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"}) / avg(llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"})", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 4, - "x": 8, - "y": 8 - }, - "id": 7, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "10.0.0", - "title": "Average KV Cache Hit Rate", - "type": "gauge", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "100 * avg(llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"})", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "id": 5, - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "title": "Load Average & Standard Deviation", - "type": "timeseries", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "llm_load_avg{component=\"$component\", endpoint=\"$endpoint\"}", - "legendFormat": "Average", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "llm_load_std{component=\"$component\", endpoint=\"$endpoint\"}", - "hide": false, - "legendFormat": "StdDev", - "range": true, - "refId": "B" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 - }, - "id": 8, - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "title": "KV Cache Hit Rate by Worker", - "type": "timeseries", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "100 * llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"}", - "legendFormat": "Worker {{worker_id}}", - "range": true, - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "id": 9, - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "title": "Average KV Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "avg(100 * llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"})", - "legendFormat": "Average Hit Rate", - "range": true, - "refId": "A" - } - ] - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 24 - }, - "id": 6, - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "title": "Available Resources", - "type": "timeseries", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"} - llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"})", - "legendFormat": "Available KV Blocks", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "editorMode": "code", - "expr": "sum(llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"} - llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"})", - "hide": false, - "legendFormat": "Available Request Slots", - "range": true, - "refId": "B" - } - ] - } - ], - "refresh": "2s", - "schemaVersion": 38, - "style": "dark", - "tags": [ - "llm", - "metrics" - ], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "component", - "value": "vllm" - }, - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "definition": "label_values(llm_kv_blocks_active, component)", - "hide": 0, - "includeAll": false, - "label": "Component", - "multi": false, - "name": "component", - "options": [], - "query": { - "query": "label_values(llm_kv_blocks_active, component)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "current": { - "selected": false, - "text": "endpoint", - "value": "load_metrics" - }, - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, - "definition": "label_values(llm_kv_blocks_active{component=\"$component\"}, endpoint)", - "hide": 0, - "includeAll": false, - "label": "Endpoint", - "multi": false, - "name": "endpoint", - "options": [], - "query": { - "query": "label_values(llm_kv_blocks_active{component=\"$component\"}, endpoint)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - } - ] - }, - "time": { - "from": "now-5m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "LLM Worker Metrics", - "uid": "llm-worker-metrics", - "version": 1, - "weekStart": "" -} \ No newline at end of file diff --git a/deploy/metrics/prometheus.yml b/deploy/metrics/prometheus.yml index 52b2c9a44b..74600e6b37 100644 --- a/deploy/metrics/prometheus.yml +++ b/deploy/metrics/prometheus.yml @@ -33,13 +33,13 @@ scrape_configs: static_configs: - targets: ['dcgm-exporter:9401'] # on the "monitoring" network - # This is a demo service that needs to be launched manually. See components/metrics/README.md - # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 8080/tcp - # You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8080 + # This is a demo service that needs to be launched manually + # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 8000/tcp + # You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8000 - job_name: 'dynamo-frontend' scrape_interval: 10s static_configs: - - targets: ['host.docker.internal:8080'] # on the "monitoring" network + - targets: ['host.docker.internal:8000'] # on the "monitoring" network # Launch via: DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 dynamo. ... # If you want to update the scrape_interval, you may want to also update component.rs's MAX_DELAY @@ -48,15 +48,6 @@ scrape_configs: static_configs: - targets: ['host.docker.internal:8081'] - # DEPRECATED: This metrics aggregation service is being deprecated in favor of MetricsRegistry - # The new system uses the 'dynamo-backend' job above instead of this separate service - # This is another demo aggregator that needs to be launched manually. See components/metrics/README.md - # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 9091/tcp - - job_name: 'metrics-aggregation-service' - scrape_interval: 2s - static_configs: - # - targets: ['localhost:9091'] # metrics aggregation service on host - - targets: ['host.docker.internal:9091'] # metrics aggregation service on host # KVBM leader related metrics - job_name: 'kvbm-leader-metrics'