Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions pkg/cache/cache_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,17 +353,21 @@ func (c *Store) fetchMetrics(pod *Pod, allMetrics map[string]*dto.MetricFamily,
klog.V(4).Infof(err.Error())
engineType = defaultEngineLabelValue
}
rawMetricName, ok := metric.EngineMetricsNameMapping[engineType]
if !ok {
klog.V(4).Infof("Cannot find engine type %v mapping for metrics %v", engineType, labelMetricName)
candidates, ok := metric.EngineMetricsNameMapping[engineType]
if !ok || len(candidates) == 0 {
klog.V(4).Infof("No metric name candidates found for engine type %v and metric %v", engineType, labelMetricName)
return nil, false
}
metricFamily, exists := allMetrics[rawMetricName]
if !exists {
klog.V(4).Infof("Cannot find raw metrics %v, engine type %v", rawMetricName, engineType)
return nil, false

for _, rawMetricName := range candidates {
if metricFamily, exists := allMetrics[rawMetricName]; exists {
return metricFamily, true
}
}
return metricFamily, true

klog.V(4).Infof("None of the candidate raw metrics %v found for engine %v and metric %v", candidates, engineType, labelMetricName)
return nil, false

}

// Update `PodMetrics` and `PodModelMetrics` according to the metric scope
Expand Down
64 changes: 45 additions & 19 deletions pkg/metrics/engine_fetcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,14 @@ func (ef *EngineMetricsFetcher) FetchTypedMetric(ctx context.Context, endpoint,
return nil, fmt.Errorf("metric %s is not a raw pod metric, use FetchAllTypedMetrics for complex queries", metricName)
}

// Get raw metric name for this engine
rawMetricName, exists := metricDef.EngineMetricsNameMapping[engineType]
if !exists {
// Get raw metric name candidates for this engine
candidates, exists := metricDef.EngineMetricsNameMapping[engineType]
if !exists || len(candidates) == 0 {
return nil, fmt.Errorf("metric %s not supported for engine type %s", metricName, engineType)
}

url := fmt.Sprintf("http://%s/metrics", endpoint)

var lastErr error
// Fetch with retry logic
for attempt := 0; attempt <= ef.config.MaxRetries; attempt++ {
if attempt > 0 {
Expand All @@ -130,21 +130,31 @@ func (ef *EngineMetricsFetcher) FetchTypedMetric(ctx context.Context, endpoint,
continue
}

// Parse the specific metric we need
metricValue, err := ef.parseMetricFromFamily(allMetrics, rawMetricName, metricDef)
if err != nil {
klog.V(4).InfoS("Failed to parse metric from engine endpoint",
"attempt", attempt+1, "identifier", identifier, "metric", metricName, "error", err)
continue
// Try each candidate until one exists and can be parsed
for _, rawMetricName := range candidates {
if _, ok := allMetrics[rawMetricName]; !ok {
continue // skip if not present
}

metricValue, err := ef.parseMetricFromFamily(allMetrics, rawMetricName, metricDef)
if err != nil {
lastErr = err
klog.V(5).InfoS("Failed to parse candidate metric", "candidate", rawMetricName, "error", err)
continue
}

klog.V(4).InfoS("Successfully fetched typed metric from engine endpoint",
"identifier", identifier, "metric", metricName, "rawMetric", rawMetricName, "value", metricValue, "attempt", attempt+1)
return metricValue, nil
}

klog.V(4).InfoS("Successfully fetched typed metric from engine endpoint",
"identifier", identifier, "metric", metricName, "value", metricValue, "attempt", attempt+1)
return metricValue, nil
klog.V(4).InfoS("Failed to find valid metric among candidates",
"candidates", candidates, "identifier", identifier, "metric", metricName)
// Continue to next retry if any
}

return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts",
metricName, identifier, ef.config.MaxRetries+1)
return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts: %w",
metricName, identifier, ef.config.MaxRetries+1, lastErr)
Comment on lines +156 to +157
Copy link

Copilot AI Dec 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential nil error wrapping in error message

If none of the candidate metrics are found in the endpoint response (all candidates fail the existence check at line 135), lastErr will remain nil. Using %w with a nil error will result in a confusing error message like "... after 3 attempts: %!w(<nil>)".

Suggested fix:

if lastErr != nil {
    return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts: %w",
        metricName, identifier, ef.config.MaxRetries+1, lastErr)
}
return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts: no matching candidates found",
    metricName, identifier, ef.config.MaxRetries+1)
Suggested change
return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts: %w",
metricName, identifier, ef.config.MaxRetries+1, lastErr)
if lastErr != nil {
return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts: %w",
metricName, identifier, ef.config.MaxRetries+1, lastErr)
}
return nil, fmt.Errorf("failed to fetch typed metric %s from engine endpoint %s after %d attempts: no matching candidates found",
metricName, identifier, ef.config.MaxRetries+1)

Copilot uses AI. Check for mistakes.
}

// FetchAllTypedMetrics fetches all available typed metrics from an engine endpoint
Expand Down Expand Up @@ -215,10 +225,26 @@ func (ef *EngineMetricsFetcher) FetchAllTypedMetrics(ctx context.Context, endpoi
continue
}

// Get raw metric name for this engine
rawMetricName, exists := metricDef.EngineMetricsNameMapping[result.EngineType]
if !exists {
klog.V(5).InfoS("Metric not supported for engine type", "metric", metricName, "engine", result.EngineType)
// Get raw metric name candidates for this engine
candidates, exists := metricDef.EngineMetricsNameMapping[result.EngineType]
if !exists || len(candidates) == 0 {
klog.V(5).InfoS("No raw metric names defined for metric and engine type",
"metric", metricName, "engine", result.EngineType)
continue
}

// Find the first candidate that exists in allMetrics
var rawMetricName string
for _, name := range candidates {
if _, ok := allMetrics[name]; ok {
rawMetricName = name
break
}
}

if rawMetricName == "" {
klog.V(5).InfoS("None of the candidate raw metrics found in endpoint response",
"metric", metricName, "engine", result.EngineType, "candidates", candidates)
continue
}

Expand Down
22 changes: 11 additions & 11 deletions pkg/metrics/engine_fetcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ func setupMockMetrics() {
Metrics["running_requests"] = Metric{
MetricSource: PodRawMetrics,
MetricType: MetricType{Raw: Gauge},
EngineMetricsNameMapping: map[string]string{
"vllm": "vllm_num_requests_running",
"sglang": "sglang_running_requests",
EngineMetricsNameMapping: map[string][]string{
"vllm": {"vllm_num_requests_running"},
"sglang": {"sglang_running_requests"},
},
Description: "Number of running requests",
MetricScope: PodModelMetricScope,
Expand All @@ -88,9 +88,9 @@ func setupMockMetrics() {
Metrics["waiting_requests"] = Metric{
MetricSource: PodRawMetrics,
MetricType: MetricType{Raw: Gauge},
EngineMetricsNameMapping: map[string]string{
"vllm": "vllm_num_requests_waiting",
"sglang": "sglang_waiting_requests",
EngineMetricsNameMapping: map[string][]string{
"vllm": {"vllm_num_requests_waiting"},
"sglang": {"sglang_waiting_requests"},
},
Description: "Number of waiting requests",
MetricScope: PodModelMetricScope,
Expand All @@ -99,9 +99,9 @@ func setupMockMetrics() {
Metrics["cache_usage"] = Metric{
MetricSource: PodRawMetrics,
MetricType: MetricType{Raw: Gauge},
EngineMetricsNameMapping: map[string]string{
"vllm": "vllm_gpu_cache_usage_perc",
"sglang": "sglang_cache_usage",
EngineMetricsNameMapping: map[string][]string{
"vllm": {"vllm_gpu_cache_usage_perc"},
"sglang": {"sglang_cache_usage"},
},
Description: "Cache usage percentage",
MetricScope: PodMetricScope,
Expand All @@ -110,8 +110,8 @@ func setupMockMetrics() {
Metrics["time_to_first_token"] = Metric{
MetricSource: PodRawMetrics,
MetricType: MetricType{Raw: Histogram},
EngineMetricsNameMapping: map[string]string{
"vllm": "vllm_time_to_first_token_seconds",
EngineMetricsNameMapping: map[string][]string{
"vllm": {"vllm_time_to_first_token_seconds"},
},
Description: "Time to first token histogram",
MetricScope: PodModelMetricScope,
Expand Down
Loading