From 27d0204a8e4a0d1797bac329ab8a5b0af452285a Mon Sep 17 00:00:00 2001 From: supermario_leo Date: Tue, 7 Apr 2026 15:16:06 +0800 Subject: [PATCH] proxy: preserve wall-clock duration in metrics Keep request duration from being underreported when upstream timings only cover a subset of the full request lifecycle. - compare wall-clock and upstream timing durations - keep token and throughput values from timings - add regression coverage for underreported timings fixes #602 --- proxy/metrics_monitor.go | 9 +++++++-- proxy/metrics_monitor_test.go | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/proxy/metrics_monitor.go b/proxy/metrics_monitor.go index 347099a94..661bb9eb8 100644 --- a/proxy/metrics_monitor.go +++ b/proxy/metrics_monitor.go @@ -365,6 +365,8 @@ func processStreamingResponse(modelID string, start time.Time, body []byte) (Tok } func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) (TokenMetrics, error) { + wallDurationMs := int(time.Since(start).Milliseconds()) + // default values cachedTokens := -1 // unknown or missing data outputTokens := 0 @@ -373,7 +375,7 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) // timings data tokensPerSecond := -1.0 promptPerSecond := -1.0 - durationMs := int(time.Since(start).Milliseconds()) + durationMs := wallDurationMs if usage.Exists() { if pt := usage.Get("prompt_tokens"); pt.Exists() { @@ -402,7 +404,10 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) outputTokens = int(timings.Get("predicted_n").Int()) promptPerSecond = timings.Get("prompt_per_second").Float() tokensPerSecond = timings.Get("predicted_per_second").Float() - durationMs = int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float()) + timingsDurationMs := int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float()) + if timingsDurationMs > durationMs { + durationMs = timingsDurationMs + } if cachedValue := timings.Get("cache_n"); cachedValue.Exists() { cachedTokens = int(cachedValue.Int()) diff --git a/proxy/metrics_monitor_test.go b/proxy/metrics_monitor_test.go index 3f3c9ec6e..b8c66a177 100644 --- a/proxy/metrics_monitor_test.go +++ b/proxy/metrics_monitor_test.go @@ -14,6 +14,7 @@ import ( "github.com/gin-gonic/gin" "github.com/mostlygeek/llama-swap/event" "github.com/stretchr/testify/assert" + "github.com/tidwall/gjson" ) func TestMetricsMonitor_AddMetrics(t *testing.T) { @@ -570,6 +571,27 @@ func TestMetricsMonitor_Concurrent(t *testing.T) { } func TestMetricsMonitor_ParseMetrics(t *testing.T) { + t.Run("keeps wall clock duration when timings underreport request time", func(t *testing.T) { + start := time.Now().Add(-5 * time.Second) + usage := gjson.Parse(`{"prompt_tokens": 5, "completion_tokens": 1}`) + timings := gjson.Parse(`{ + "prompt_n": 5, + "predicted_n": 1, + "prompt_per_second": 10.0, + "predicted_per_second": 2.0, + "prompt_ms": 5.0, + "predicted_ms": 15.0 + }`) + + metrics, err := parseMetrics("test-model", start, usage, timings) + assert.NoError(t, err) + assert.Equal(t, 5, metrics.InputTokens) + assert.Equal(t, 1, metrics.OutputTokens) + assert.Equal(t, 10.0, metrics.PromptPerSecond) + assert.Equal(t, 2.0, metrics.TokensPerSecond) + assert.GreaterOrEqual(t, metrics.DurationMs, 5000) + }) + t.Run("prefers timings over usage data", func(t *testing.T) { mm := newMetricsMonitor(testLogger, 10, 0)