From 7cfba9d9a63f2bf6a736306ebb0c8f1c798ff081 Mon Sep 17 00:00:00 2001 From: CYJiang Date: Tue, 2 Dec 2025 20:26:29 +0800 Subject: [PATCH] fix: update incorrect metrics in cache Signed-off-by: CYJiang --- pkg/cache/cache_metrics.go | 20 ++-- pkg/metrics/engine_fetcher.go | 64 +++++++++---- pkg/metrics/engine_fetcher_test.go | 22 ++--- pkg/metrics/metrics.go | 146 +++++++++++++++++------------ pkg/metrics/types.go | 25 ++++- 5 files changed, 174 insertions(+), 103 deletions(-) diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index 307905615..d1093eddd 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -353,17 +353,21 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily, klog.V(4).Infof(err.Error()) engineType = defaultEngineLabelValue } - rawMetricName, ok := metric.EngineMetricsNameMapping[engineType] - if !ok { - klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName) + candidates, ok := metric.EngineMetricsNameMapping[engineType] + if !ok || len(candidates) == 0 { + klog.V(4).Infof("No metric name candidates found for engine type %v and metric %v", engineType, labelMetricName) return nil, false } - metricFamily, exists := allMetrics[rawMetricName] - if !exists { - klog.V(4).Infof("Cannot find raw metrics %v, engine type %v", rawMetricName, engineType) - return nil, false + + for _, rawMetricName := range candidates { + if metricFamily, exists := allMetrics[rawMetricName]; exists { + return metricFamily, true + } } - return metricFamily, true + + klog.V(4).Infof("None of the candidate raw metrics %v found for engine %v and metric %v", candidates, engineType, labelMetricName) + return nil, false + } // Update `PodMetrics` and `PodModelMetrics` according to the metric scope diff --git a/pkg/metrics/engine_fetcher.go b/pkg/metrics/engine_fetcher.go index b0f180a3d..3f9bd549a 100644 --- a/pkg/metrics/engine_fetcher.go +++ b/pkg/metrics/engine_fetcher.go 
@@ -100,14 +100,14 @@ func (ef *EngineMetricsFetcher) FetchTypedMetric(ctx context.Context, endpoint, return nil, fmt.Errorf("metric %s is not a raw pod metric, use FetchAllTypedMetrics for complex queries", metricName) } - // Get raw metric name for this engine - rawMetricName, exists := metricDef.EngineMetricsNameMapping[engineType] - if !exists { + // Get raw metric name candidates for this engine + candidates, exists := metricDef.EngineMetricsNameMapping[engineType] + if !exists || len(candidates) == 0 { return nil, fmt.Errorf("metric %s not supported for engine type %s", metricName, engineType) } url := fmt.Sprintf("http://%s/metrics", endpoint) - + var lastErr error // Fetch with retry logic for attempt := 0; attempt <= ef.config.MaxRetries; attempt++ { if attempt > 0 { @@ -130,21 +130,31 @@ func (ef *EngineMetricsFetcher) FetchTypedMetric(ctx context.Context, endpoint, continue } - // Parse the specific metric we need - metricValue, err := ef.parseMetricFromFamily(allMetrics, rawMetricName, metricDef) - if err != nil { - klog.V(4).InfoS("Failed to parse metric from engine endpoint", - "attempt", attempt+1, "identifier", identifier, "metric", metricName, "error", err) - continue + // Try each candidate until one exists and can be parsed + for _, rawMetricName := range candidates { + if _, ok := allMetrics[rawMetricName]; !ok { + continue // skip if not present + } + + metricValue, err := ef.parseMetricFromFamily(allMetrics, rawMetricName, metricDef) + if err != nil { + lastErr = err + klog.V(5).InfoS("Failed to parse candidate metric", "candidate", rawMetricName, "error", err) + continue + } + + klog.V(4).InfoS("Successfully fetched typed metric from engine endpoint", + "identifier", identifier, "metric", metricName, "rawMetric", rawMetricName, "value", metricValue, "attempt", attempt+1) + return metricValue, nil } - klog.V(4).InfoS("Successfully fetched typed metric from engine endpoint", - "identifier", identifier, "metric", metricName, "value", 
metricValue, "attempt", attempt+1) - return metricValue, nil + klog.V(4).InfoS("Failed to find valid metric among candidates", + "candidates", candidates, "identifier", identifier, "metric", metricName) + // Continue to next retry if any } - return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts", - metricName, identifier, ef.config.MaxRetries+1) + return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts: %w", + metricName, identifier, ef.config.MaxRetries+1, lastErr) } // FetchAllTypedMetrics fetches all available typed metrics from an engine endpoint @@ -215,10 +225,26 @@ func (ef *EngineMetricsFetcher) FetchAllTypedMetrics(ctx context.Context, endpoi continue } - // Get raw metric name for this engine - rawMetricName, exists := metricDef.EngineMetricsNameMapping[result.EngineType] - if !exists { - klog.V(5).InfoS("Metric not supported for engine type", "metric", metricName, "engine", result.EngineType) + // Get raw metric name candidates for this engine + candidates, exists := metricDef.EngineMetricsNameMapping[result.EngineType] + if !exists || len(candidates) == 0 { + klog.V(5).InfoS("No raw metric names defined for metric and engine type", + "metric", metricName, "engine", result.EngineType) + continue + } + + // Find the first candidate that exists in allMetrics + var rawMetricName string + for _, name := range candidates { + if _, ok := allMetrics[name]; ok { + rawMetricName = name + break + } + } + + if rawMetricName == "" { + klog.V(5).InfoS("None of the candidate raw metrics found in endpoint response", + "metric", metricName, "engine", result.EngineType, "candidates", candidates) continue } diff --git a/pkg/metrics/engine_fetcher_test.go b/pkg/metrics/engine_fetcher_test.go index 14f7ed253..3ead76aee 100644 --- a/pkg/metrics/engine_fetcher_test.go +++ b/pkg/metrics/engine_fetcher_test.go @@ -77,9 +77,9 @@ func setupMockMetrics() { Metrics["running_requests"] = Metric{ 
MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Gauge}, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm_num_requests_running", - "sglang": "sglang_running_requests", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm_num_requests_running"}, + "sglang": {"sglang_running_requests"}, }, Description: "Number of running requests", MetricScope: PodModelMetricScope, @@ -88,9 +88,9 @@ func setupMockMetrics() { Metrics["waiting_requests"] = Metric{ MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Gauge}, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm_num_requests_waiting", - "sglang": "sglang_waiting_requests", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm_num_requests_waiting"}, + "sglang": {"sglang_waiting_requests"}, }, Description: "Number of waiting requests", MetricScope: PodModelMetricScope, @@ -99,9 +99,9 @@ func setupMockMetrics() { Metrics["cache_usage"] = Metric{ MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Gauge}, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm_gpu_cache_usage_perc", - "sglang": "sglang_cache_usage", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm_gpu_cache_usage_perc"}, + "sglang": {"sglang_cache_usage"}, }, Description: "Cache usage percentage", MetricScope: PodMetricScope, @@ -110,8 +110,8 @@ func setupMockMetrics() { Metrics["time_to_first_token"] = Metric{ MetricSource: PodRawMetrics, MetricType: MetricType{Raw: Histogram}, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm_time_to_first_token_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm_time_to_first_token_seconds"}, }, Description: "Time to first token histogram", MetricScope: PodModelMetricScope, diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index f672ebf3a..eb6625337 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -60,63 +60,63 @@ var ( // Metrics defines all available metrics, 
including raw and query-based metrics. Metrics = map[string]Metric{ // Counter metrics - NumRequestsRunning: { + NumRequestsSwapped: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:num_requests_running", - "sglang": "sglang:num_running_reqs", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:num_requests_swapped"}, }, - Description: "Number of running requests", + Description: "Number of swapped requests", }, - NumRequestsWaiting: { + PromptTokenTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:num_requests_waiting", - "sglang": "sglang:num_waiting_reqs", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:prompt_tokens_total"}, }, - Description: "Number of waiting requests", + Description: "Total prompt tokens", }, - NumRequestsSwapped: { + GenerationTokenTotal: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Counter, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:num_requests_swapped", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:generation_tokens_total"}, }, - Description: "Number of swapped requests", + Description: "Total generation tokens", }, // Gauge metrics - PromptTokenTotal: { + NumRequestsRunning: { MetricScope: PodModelMetricScope, MetricSource: PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:prompt_tokens_total", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:num_requests_running"}, + "sglang": {"sglang:num_running_reqs"}, }, - Description: "Total prompt tokens", + Description: "Number of running requests", }, - GenerationTokenTotal: { + NumRequestsWaiting: { MetricScope: PodModelMetricScope, MetricSource: 
PodRawMetrics, MetricType: MetricType{ Raw: Gauge, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:generation_tokens_total", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:num_requests_waiting"}, + "sglang": {"sglang:num_waiting_reqs"}, }, - Description: "Total generation tokens", + Description: "Number of waiting requests", }, AvgPromptThroughputToksPerS: { MetricScope: PodModelMetricScope, @@ -124,8 +124,13 @@ var ( MetricType: MetricType{ Raw: Gauge, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:avg_prompt_throughput_toks_per_s", + EngineMetricsNameMapping: map[string][]string{ + // vLLM deprecated "vllm:avg_prompt_throughput_toks_per_s" in v0.7.0 + // It was removed entirely in PR #12383(https://github.com/vllm-project/vllm/pull/12383). + // This metric is no longer emitted by vLLM >= v0.7.0. + // TODO: Remove the deprecated vLLM metric names from EngineMetricsNameMapping + // once we confirm no deployments rely on vLLM < v0.7.0. + "vllm": {"vllm:avg_prompt_throughput_toks_per_s"}, }, Description: "Average prompt throughput in tokens per second", }, @@ -135,9 +140,14 @@ var ( MetricType: MetricType{ Raw: Gauge, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:avg_generation_throughput_toks_per_s", - "sglang": "sglang:gen_throughput", + EngineMetricsNameMapping: map[string][]string{ + // vLLM deprecated "vllm:avg_generation_throughput_toks_per_s" in v0.7.0. + // It was removed entirely in PR #12383(https://github.com/vllm-project/vllm/pull/12383). + // This metric is no longer emitted by vLLM >= v0.7.0. + // TODO: Remove the deprecated vLLM metric names from EngineMetricsNameMapping + // once we confirm no deployments rely on vLLM < v0.7.0. 
+ "vllm": {"vllm:avg_generation_throughput_toks_per_s"}, + "sglang": {"sglang:gen_throughput"}, }, Description: "Average generation throughput in tokens per second", }, @@ -148,8 +158,8 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:iteration_tokens_total", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:iteration_tokens_total"}, }, Description: "Total iteration tokens", }, @@ -159,9 +169,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:time_to_first_token_seconds", - "sglang": "sglang:time_to_first_token_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:time_to_first_token_seconds"}, + "sglang": {"sglang:time_to_first_token_seconds"}, }, Description: "Time to first token in seconds", }, @@ -171,9 +181,18 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:time_per_output_token_seconds", - "sglang": "sglang:inter_token_latency_seconds", + EngineMetricsNameMapping: map[string][]string{ + // vLLM exposes two metrics for inter-token latency: + // - "vllm:inter_token_latency_seconds" is the current, recommended metric (since v0.11). + // - "vllm:time_per_output_token_seconds" is deprecated as of v0.11 and hidden by default. + // It can be temporarily enabled via --show-hidden-metrics-for-version=0.11, + // but will be removed in v0.13.0. + // + // We list both to maintain backward compatibility during transition. + // TODO: Remove "vllm:time_per_output_token_seconds" from this list once vLLM >= v0.13.0 + // is widely adopted and the deprecated metric is no longer in use. 
+ "vllm": {"vllm:inter_token_latency_seconds", "vllm:time_per_output_token_seconds"}, + "sglang": {"sglang:inter_token_latency_seconds"}, }, Description: "Time per output token in seconds", }, @@ -183,9 +202,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:e2e_request_latency_seconds", - "sglang": "sglang:e2e_request_latency_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:e2e_request_latency_seconds"}, + "sglang": {"sglang:e2e_request_latency_seconds"}, }, Description: "End-to-end request latency in seconds", }, @@ -195,8 +214,9 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:request_queue_time_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:request_queue_time_seconds"}, + "sglang": {"sglang:queue_time_seconds"}, }, Description: "Request queue time in seconds", }, @@ -206,8 +226,8 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:request_inference_time_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:request_inference_time_seconds"}, }, Description: "Request inference time in seconds", }, @@ -217,8 +237,8 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:request_decode_time_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:request_decode_time_seconds"}, }, Description: "Request decode time in seconds", }, @@ -228,8 +248,8 @@ var ( MetricType: MetricType{ Raw: Histogram, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:request_prefill_time_seconds", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:request_prefill_time_seconds"}, }, Description: "Request prefill time in seconds", }, @@ -303,10 +323,16 @@ var ( MetricType: MetricType{ Raw: Counter, }, - 
EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:gpu_cache_usage_perc", - "sglang": "sglang:token_usage", // Based on https://github.com/sgl-project/sglang/issues/5979 - "xllm": "kv_cache_utilization", + EngineMetricsNameMapping: map[string][]string{ + // In vLLM PR#18354 (https://github.com/vllm-project/vllm/pull/18354) (merged since v0.10.0), + // metrics with the "gpu_" prefix for non-GPU-specific data were deprecated: + // - Deprecated: vllm:gpu_cache_usage_perc + // - Official replacement: vllm:kv_cache_usage_perc (same value, clearer semantics) + // + // We keep both for backward compatibility with deployments < v0.10.0. + "vllm": {"vllm:gpu_cache_usage_perc", "vllm:kv_cache_usage_perc"}, + "sglang": {"sglang:token_usage"}, // Based on https://github.com/sgl-project/sglang/issues/5979 + "xllm": {"kv_cache_utilization"}, }, Description: "GPU cache usage percentage", }, @@ -316,8 +342,8 @@ var ( MetricType: MetricType{ Raw: Gauge, }, - EngineMetricsNameMapping: map[string]string{ - "xllm": "engine_utilization", + EngineMetricsNameMapping: map[string][]string{ + "xllm": {"engine_utilization"}, }, Description: "GPU busy time ratio", }, @@ -327,8 +353,8 @@ var ( MetricType: MetricType{ Raw: Counter, }, - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:cpu_cache_usage_perc", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:cpu_cache_usage_perc"}, }, Description: "CPU cache usage percentage", }, @@ -375,8 +401,8 @@ var ( Query: QueryLabel, }, LabelKey: "max_lora", - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:lora_requests_info", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:lora_requests_info"}, }, Description: "Max count of Lora Adapters", }, @@ -387,8 +413,8 @@ var ( Query: QueryLabel, }, LabelKey: "running_lora_adapters", - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:lora_requests_info", + EngineMetricsNameMapping: map[string][]string{ + "vllm": 
{"vllm:lora_requests_info"}, }, Description: "Count of running Lora Adapters", }, @@ -399,8 +425,8 @@ var ( Query: QueryLabel, }, LabelKey: "waiting_lora_adapters", - EngineMetricsNameMapping: map[string]string{ - "vllm": "vllm:lora_requests_info", + EngineMetricsNameMapping: map[string][]string{ + "vllm": {"vllm:lora_requests_info"}, }, Description: "Count of waiting Lora Adapters", }, diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index 8bf0ac830..9602434dc 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -77,11 +77,26 @@ const ( // Metric defines a unique metric with metadata. type Metric struct { - MetricSource MetricSource - MetricType MetricType - PromQL string // Optional: Only applicable for PromQL-based metrics - LabelKey string // Optional: Only applicable for QueryLabel-based metrics - EngineMetricsNameMapping map[string]string // Optional: Mapping from engine type to raw metric name. + MetricSource MetricSource + MetricType MetricType + PromQL string // Optional: Only applicable for PromQL-based metrics + LabelKey string // Optional: Only applicable for QueryLabel-based metrics + // EngineMetricsNameMapping defines a fallback list of raw metric names for each engine type. + // This is necessary because different versions or forks of inference engines (e.g., vLLM) + // may expose the same logical metric under different Prometheus metric names. + // + // The slice order matters: the system will try names in the given order and use the first + // one that exists in the scraped metrics. This allows graceful compatibility across engine + // versions without requiring runtime version detection. + // + // Example: + // "vllm": {"vllm:inter_token_latency_seconds", "vllm:time_per_output_token_seconds"} + // → Newer vLLM versions use "vllm:inter_token_latency_seconds", older ones use "vllm:time_per_output_token_seconds". 
+ // → By listing the new name first, deployments using newer versions will pick it up, + // while older deployments fall back to the second name if the first is absent. + // + // If no candidate name exists in the actual metrics endpoint, the metric will be skipped. + EngineMetricsNameMapping map[string][]string // Optional: Ordered fallback list of raw metric names per engine type; the first name present in the scraped metrics wins. Description string MetricScope MetricScope }