From 7cfba9d9a63f2bf6a736306ebb0c8f1c798ff081 Mon Sep 17 00:00:00 2001 From: CYJiang Date: Tue, 2 Dec 2025 20:26:29 +0800 Subject: [PATCH] fix: update incorrect metrics in cache Signed-off-by: CYJiang --- pkg/cache/cache_metrics.go | 20 ++-- pkg/metrics/engine_fetcher.go | 64 +++++++++---- pkg/metrics/engine_fetcher_test.go | 22 ++--- pkg/metrics/metrics.go | 146 +++++++++++++++++------------ pkg/metrics/types.go | 25 ++++- 5 files changed, 174 insertions(+), 103 deletions(-) diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index 307905615..d1093eddd 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -353,17 +353,21 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, klog.V(4).Infof(err.Error()) engineType = defaultEngineLabelValue } - rawMetricName, ok := metric.EngineMetricsNameMapping[engineType] - if !ok { - klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName) + candidates, ok := metric.EngineMetricsNameMapping[engineType] + if !ok || len(candidates) == 0 { + klog.V(4).Infof("No metric name candidates found for engine type %v and metric %v", engineType, labelMetricName) return nil, false } - metricFamily, exists := allMetrics[rawMetricName] - if !exists { - klog.V(4).Infof("Cannot find raw metrics %v, engine type %v", rawMetricName, engineType) - return nil, false + + for _, rawMetricName := range candidates { + if metricFamily, exists := allMetrics[rawMetricName]; exists { + return metricFamily, true + } } - return metricFamily, true + + klog.V(4).Infof("None of the candidate raw metrics %v found for engine %v and metric %v", candidates, engineType, labelMetricName) + return nil, false + } // Update `PodMetrics` and `PodModelMetrics` according to the metric scope diff --git a/pkg/metrics/engine_fetcher.go b/pkg/metrics/engine_fetcher.go index b0f180a3d..3f9bd549a 100644 --- a/pkg/metrics/engine_fetcher.go +++ b/pkg/metrics/engine_fetcher.go 
@@ -100,14 +100,14 @@ func (ef *EngineMetricsFetcher) FetchTypedMetric(ctx context.Context, endpoint, return nil, fmt.Errorf("metric %s is not a raw pod metric, use FetchAllTypedMetrics for complex queries", metricName) } - // Get raw metric name for this engine - rawMetricName, exists := metricDef.EngineMetricsNameMapping[engineType] - if !exists { + // Get raw metric name candidates for this engine + candidates, exists := metricDef.EngineMetricsNameMapping[engineType] + if !exists || len(candidates) == 0 { return nil, fmt.Errorf("metric %s not supported for engine type %s", metricName, engineType) } url := fmt.Sprintf("http://%s/metrics", endpoint) - + var lastErr error // Fetch with retry logic for attempt := 0; attempt <= ef.config.MaxRetries; attempt++ { if attempt > 0 { @@ -130,21 +130,31 @@ func (ef *EngineMetricsFetcher) FetchTypedMetric(ctx context.Context, endpoint, continue } - // Parse the specific metric we need - metricValue, err := ef.parseMetricFromFamily(allMetrics, rawMetricName, metricDef) - if err != nil { - klog.V(4).InfoS("Failed to parse metric from engine endpoint", - "attempt", attempt+1, "identifier", identifier, "metric", metricName, "error", err) - continue + // Try each candidate until one exists and can be parsed + for _, rawMetricName := range candidates { + if _, ok := allMetrics[rawMetricName]; !ok { + continue // skip if not present + } + + metricValue, err := ef.parseMetricFromFamily(allMetrics, rawMetricName, metricDef) + if err != nil { + lastErr = err + klog.V(5).InfoS("Failed to parse candidate metric", "candidate", rawMetricName, "error", err) + continue + } + + klog.V(4).InfoS("Successfully fetched typed metric from engine endpoint", + "identifier", identifier, "metric", metricName, "rawMetric", rawMetricName, "value", metricValue, "attempt", attempt+1) + return metricValue, nil } - klog.V(4).InfoS("Successfully fetched typed metric from engine endpoint", - "identifier", identifier, "metric", metricName, "value", 
metricValue, "attempt", attempt+1) - return metricValue, nil + klog.V(4).InfoS("Failed to find valid metric among candidates", + "candidates", candidates, "identifier", identifier, "metric", metricName) + // Continue to next retry if any } - return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts", - metricName, identifier, ef.config.MaxRetries+1) + return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts: %w", + metricName, identifier, ef.config.MaxRetries+1, lastErr) } // FetchAllTypedMetrics fetches all available typed metrics from an engine endpoint @@ -215,10 +225,26 @@ func (ef *EngineMetricsFetcher) FetchAllTypedMetrics(ctx context.Context, endpoi continue } - // Get raw metric name for this engine - rawMetricName, exists := metricDef.EngineMetricsNameMapping[result.EngineType] - if !exists { - klog.V(5).InfoS("Metric not supported for engine type", "metric", metricName, "engine", result.EngineType) + // Get raw metric name candidates for this engine + candidates, exists := metricDef.EngineMetricsNameMapping[result.EngineType] + if !exists || len(candidates) == 0 { + klog.V(5).InfoS("No raw metric names defined for metric and engine type", + "metric", metricName, "engine", result.EngineType) + continue + } + + // Find the first candidate that exists in allMetrics + var rawMetricName string + for _, name := range candidates { + if _, ok := allMetrics[name]; ok { + rawMetricName = name + break + } + } + + if rawMetricName == "" { + klog.V(5).InfoS("None of the candidate raw metrics found in endpoint response", + "metric", metricName, "engine", result.EngineType, "candidates", candidates) continue } diff --git a/pkg/metrics/engine_fetcher_test.go b/pkg/metrics/engine_fetcher_test.go index 14f7ed253..3ead76aee 100644 --- a/pkg/metrics/engine_fetcher_test.go +++ b/pkg/metrics/engine_fetcher_test.go @@ -77,9 +77,9 @@ func setupMockMetrics() { Metrics["running_requests"] = Metric{ 
MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Gauge}, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm_num_requests_running", - "sglang": "sglang_running_requests", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm_num_requests_running"}, + "sglang": {"sglang_running_requests"}, }, Description: "Number of running requests", MetricScope: PodModelMetricScope, @@ -88,9 +88,9 @@ func setupMockMetrics() { Metrics["waiting_requests"] = Metric{ MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Gauge}, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm_num_requests_waiting", - "sglang": "sglang_waiting_requests", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm_num_requests_waiting"}, + "sglang": {"sglang_waiting_requests"}, }, Description: "Number of waiting requests", MetricScope: PodModelMetricScope, @@ -99,9 +99,9 @@ func setupMockMetrics() { Metrics["cache_usage"] = Metric{ MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Gauge}, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm_gpu_cache_usage_perc", - "sglang": "sglang_cache_usage", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm_gpu_cache_usage_perc"}, + "sglang": {"sglang_cache_usage"}, }, Description: "Cache usage percentage", MetricScope: PodMetricScope, @@ -110,8 +110,8 @@ func setupMockMetrics() { Metrics["time_to_first_token"] = Metric{ MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Histogram}, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm_time_to_first_token_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm_time_to_first_token_seconds"}, }, Description: "Time to first token histogram", MetricScope: PodModelMetricScope, diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index f672ebf3a..eb6625337 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -60,63 +60,63 @@ var ( // Metrics defines all available metrics, 
including raw and query-based metrics. Metrics = map[string]Metric{ // Counter metrics - NumRequestsRunning: { + NumRequestsSwapped: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:num_requests_running", - "sglang": "sglang:num_running_reqs", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:num_requests_swapped"}, }, - Description: "Number of running requests", + Description: "Number of swapped requests", }, - NumRequestsWaiting: { + PromptTokenTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:num_requests_waiting", - "sglang": "sglang:num_waiting_reqs", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:prompt_tokens_total"}, }, - Description: "Number of waiting requests", + Description: "Total prompt tokens", }, - NumRequestsSwapped: { + GenerationTokenTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:num_requests_swapped", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:generation_tokens_total"}, }, - Description: "Number of swapped requests", + Description: "Total generation tokens", }, // Gauge metrics - PromptTokenTotal: { + NumRequestsRunning: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:prompt_tokens_total", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:num_requests_running"}, + "sglang": {"sglang:num_running_reqs"}, }, - Description: "Total prompt tokens", + Description: "Number of running requests", }, - GenerationTokenTotal: { + NumRequestsWaiting: { MetricScope: PodModelMetricScope, MetricSource: 
PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:generation_tokens_total", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:num_requests_waiting"}, + "sglang": {"sglang:num_waiting_reqs"}, }, - Description: "Total generation tokens", + Description: "Number of waiting requests", }, AvgPromptThroughputToksPerS: { MetricScope: PodModelMetricScope, @@ -124,8 +124,13 @@ var ( MetricType: MetricType{ Raw: Gauge, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:avg_prompt_throughput_toks_per_s", + EngineMetricsNameMapping: map[string][]string{ + // vLLM deprecated "vllm:avg_prompt_throughput_toks_per_s" in v0.7.0 + // It was removed entirely in PR #12383(https://github.com/vllm-project/vllm/pull/12383). + // This metric is no longer emitted by vLLM >= v0.7.0. + // TODO: Remove the deprecated vLLM metric names from EngineMetricsNameMapping + // once we confirm no deployments rely on vLLM < v0.7.0. + "vllm": {"vllm:avg_prompt_throughput_toks_per_s"}, }, Description: "Average prompt throughput in tokens per second", }, @@ -135,9 +140,14 @@ var ( MetricType: MetricType{ Raw: Gauge, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:avg_generation_throughput_toks_per_s", - "sglang": "sglang:gen_throughput", + EngineMetricsNameMapping: map[string][]string{ + // vLLM deprecated "vllm:avg_generation_throughput_toks_per_s" in v0.7.0. + // It was removed entirely in PR #12383(https://github.com/vllm-project/vllm/pull/12383). + // This metric is no longer emitted by vLLM >= v0.7.0. + // TODO: Remove the deprecated vLLM metric names from EngineMetricsNameMapping + // once we confirm no deployments rely on vLLM < v0.7.0. 
+ "vllm": {"vllm:avg_generation_throughput_toks_per_s"}, + "sglang": {"sglang:gen_throughput"}, }, Description: "Average generation throughput in tokens per second", }, @@ -148,8 +158,8 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:iteration_tokens_total", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:iteration_tokens_total"}, }, Description: "Total iteration tokens", }, @@ -159,9 +169,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:time_to_first_token_seconds", - "sglang": "sglang:time_to_first_token_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:time_to_first_token_seconds"}, + "sglang": {"sglang:time_to_first_token_seconds"}, }, Description: "Time to first token in seconds", }, @@ -171,9 +181,18 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:time_per_output_token_seconds", - "sglang": "sglang:inter_token_latency_seconds", + EngineMetricsNameMapping: map[string][]string{ + // vLLM exposes two metrics for inter-token latency: + // - "vllm:inter_token_latency_seconds" is the current, recommended metric (since v0.11). + // - "vllm:time_per_output_token_seconds" is deprecated as of v0.11 and hidden by default. + // It can be temporarily enabled via --show-hidden-metrics-for-version=0.11, + // but will be removed in v0.13.0. + // + // We list both to maintain backward compatibility during transition. + // TODO: Remove "vllm:time_per_output_token_seconds" from this list once vLLM >= v0.13.0 + // is widely adopted and the deprecated metric is no longer in use. 
+ "vllm": {"vllm:inter_token_latency_seconds", "vllm:time_per_output_token_seconds"}, + "sglang": {"sglang:inter_token_latency_seconds"}, }, Description: "Time per output token in seconds", }, @@ -183,9 +202,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:e2e_request_latency_seconds", - "sglang": "sglang:e2e_request_latency_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:e2e_request_latency_seconds"}, + "sglang": {"sglang:e2e_request_latency_seconds"}, }, Description: "End-to-end request latency in seconds", }, @@ -195,8 +214,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:request_queue_time_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:request_queue_time_seconds"}, + "sglang": {"sglang:queue_time_seconds"}, }, Description: "Request queue time in seconds", }, @@ -206,8 +226,8 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:request_inference_time_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:request_inference_time_seconds"}, }, Description: "Request inference time in seconds", }, @@ -217,8 +237,8 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:request_decode_time_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:request_decode_time_seconds"}, }, Description: "Request decode time in seconds", }, @@ -228,8 +248,8 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:request_prefill_time_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:request_prefill_time_seconds"}, }, Description: "Request prefill time in seconds", }, @@ -303,10 +323,16 @@ var ( MetricType: MetricType{ Raw: Counter, }, - 
EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:gpu_cache_usage_perc", - "sglang": "sglang:token_usage", // Based on https://github.com/sgl-project/sglang/issues/5979 - "xllm": "kv_cache_utilization", + EngineMetricsNameMapping: map[string][]string{ + // In vLLM PR#18354 (https://github.com/vllm-project/vllm/pull/18354) (merged since v0.10.0), + // metrics with the "gpu_" prefix for non-GPU-specific data were deprecated: + // - Deprecated: vllm:gpu_cache_usage_perc + // - Official replacement: vllm:kv_cache_usage_perc (same value, clearer semantics) + // + // We keep both for backward compatibility with deployments < v0.10.0. + "vllm": {"vllm:gpu_cache_usage_perc", "vllm:kv_cache_usage_perc"}, + "sglang": {"sglang:token_usage"}, // Based on https://github.com/sgl-project/sglang/issues/5979 + "xllm": {"kv_cache_utilization"}, }, Description: "GPU cache usage percentage", }, @@ -316,8 +342,8 @@ var ( MetricType: MetricType{ Raw: Gauge, }, - EngineMetricsNameMapping: map[string]string{ - "xllm": "engine_utilization", + EngineMetricsNameMapping: map[string][]string{ + "xllm": {"engine_utilization"}, }, Description: "GPU busy time ratio", }, @@ -327,8 +353,8 @@ var ( MetricType: MetricType{ Raw: Counter, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:cpu_cache_usage_perc", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:cpu_cache_usage_perc"}, }, Description: "CPU cache usage percentage", }, @@ -375,8 +401,8 @@ var ( Query: QueryLabel, }, LabelKey: "max_lora", - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:lora_requests_info", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:lora_requests_info"}, }, Description: "Max count of Lora Adapters", }, @@ -387,8 +413,8 @@ var ( Query: QueryLabel, }, LabelKey: "running_lora_adapters", - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:lora_requests_info", + EngineMetricsNameMapping: map[string][]string{ + "vllm": 
{"vllm:lora_requests_info"}, }, Description: "Count of running Lora Adapters", }, @@ -399,8 +425,8 @@ var ( Query: QueryLabel, }, LabelKey: "waiting_lora_adapters", - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:lora_requests_info", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:lora_requests_info"}, }, Description: "Count of waiting Lora Adapters", }, diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index 8bf0ac830..9602434dc 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -77,11 +77,26 @@ const ( // Metric defines a unique metric with metadata. type Metric struct { - MetricSource MetricSource - MetricType MetricType - PromQL string // Optional: Only applicable for PromQL-based metrics - LabelKey string // Optional: Only applicable for QueryLabel-based metrics - EngineMetricsNameMapping map[string]string // Optional: Mapping from engine type to raw metric name. + MetricSource MetricSource + MetricType MetricType + PromQL string // Optional: Only applicable for PromQL-based metrics + LabelKey string // Optional: Only applicable for QueryLabel-based metrics + // EngineMetricsNameMapping defines a fallback list of raw metric names for each engine type. + // This is necessary because different versions or forks of inference engines (e.g., vLLM) + // may expose the same logical metric under different Prometheus metric names. + // + // The slice order matters: the system will try names in the given order and use the first + // one that exists in the scraped metrics. This allows graceful compatibility across engine + // versions without requiring runtime version detection. + // + // Example: + // "vllm": {"vllm:inter_token_latency_seconds", "vllm:time_per_output_token_seconds"} + // → Newer vLLM versions use "vllm:inter_token_latency_seconds", older ones use "vllm:time_per_output_token_seconds". 
+ // → By listing the new name first, deployments using newer versions will pick it up, + // while older deployments fall back to the second name if the first is absent. + // + // If no candidate name exists in the actual metrics endpoint, the metric will be skipped. + EngineMetricsNameMapping map[string][]string // Optional: Ordered fallback list of raw metric names per engine type; the first name present in the scraped metrics wins. Description string MetricScope MetricScope }