-
Notifications
You must be signed in to change notification settings - Fork 322
proxy: fix metrics for non-llama.cpp backends (vLLM) and correct wall-clock timing #701
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ import ( | |
| "time" | ||
|
|
||
| "github.com/gin-gonic/gin" | ||
| "github.com/mostlygeek/llama-swap/proxy/config" | ||
| "github.com/klauspost/compress/zstd" | ||
| "github.com/mostlygeek/llama-swap/event" | ||
| "github.com/tidwall/gjson" | ||
|
|
@@ -95,6 +96,7 @@ func (e TokenMetricsEvent) Type() uint32 { | |
|
|
||
| // metricsMonitor parses llama-server output for token statistics | ||
| type metricsMonitor struct { | ||
| config config.Config | ||
| mu sync.RWMutex | ||
| metrics []TokenMetrics | ||
| maxMetrics int | ||
|
|
@@ -111,8 +113,9 @@ type metricsMonitor struct { | |
|
|
||
| // newMetricsMonitor creates a new metricsMonitor. captureBufferMB is the | ||
| // capture buffer size in megabytes; 0 disables captures. | ||
| func newMetricsMonitor(logger *LogMonitor, maxMetrics int, captureBufferMB int) *metricsMonitor { | ||
| func newMetricsMonitor(cfg config.Config, logger *LogMonitor, maxMetrics int, captureBufferMB int) *metricsMonitor { | ||
| return &metricsMonitor{ | ||
| config: cfg, | ||
| logger: logger, | ||
| maxMetrics: maxMetrics, | ||
| enableCaptures: captureBufferMB > 0, | ||
|
|
@@ -130,6 +133,10 @@ func (mp *metricsMonitor) addMetrics(metric TokenMetrics) int { | |
| defer mp.mu.Unlock() | ||
|
|
||
| metric.ID = mp.nextID | ||
| // Resolve modelID to display name (first alias or modelID itself) | ||
| if modelConfig, exists := mp.config.Models[metric.Model]; exists && len(modelConfig.Aliases) > 0 { | ||
| metric.Model = modelConfig.Aliases[0] | ||
| } | ||
| mp.nextID++ | ||
| mp.metrics = append(mp.metrics, metric) | ||
| if len(mp.metrics) > mp.maxMetrics { | ||
|
|
@@ -271,6 +278,9 @@ func (mp *metricsMonitor) wrapHandler( | |
| request.Header.Set("Accept-Encoding", filterAcceptEncoding(ae)) | ||
| } | ||
|
|
||
| // Capture wall clock time before proxying the request | ||
| requestStart := time.Now() | ||
|
|
||
| if err := next(modelID, recorder, request); err != nil { | ||
| return err | ||
| } | ||
|
|
@@ -287,7 +297,7 @@ func (mp *metricsMonitor) wrapHandler( | |
| tm := TokenMetrics{ | ||
| Timestamp: time.Now(), | ||
| Model: modelID, | ||
| DurationMs: int(time.Since(recorder.StartTime()).Milliseconds()), | ||
| DurationMs: int(time.Since(requestStart).Milliseconds()), | ||
| } | ||
|
|
||
| body := recorder.body.Bytes() | ||
|
|
@@ -308,7 +318,7 @@ func (mp *metricsMonitor) wrapHandler( | |
| } | ||
| } | ||
| if strings.Contains(recorder.Header().Get("Content-Type"), "text/event-stream") { | ||
| if parsed, err := processStreamingResponse(modelID, recorder.StartTime(), body); err != nil { | ||
| if parsed, err := processStreamingResponse(modelID, requestStart, body); err != nil { | ||
| mp.logger.Warnf("error processing streaming response: %v, path=%s, recording minimal metrics", err, request.URL.Path) | ||
| } else { | ||
| tm = parsed | ||
|
|
@@ -328,7 +338,7 @@ func (mp *metricsMonitor) wrapHandler( | |
| } | ||
|
|
||
| if usage.Exists() || timings.Exists() { | ||
| if parsedMetrics, err := parseMetrics(modelID, recorder.StartTime(), usage, timings); err != nil { | ||
| if parsedMetrics, err := parseMetrics(modelID, requestStart, usage, timings); err != nil { | ||
| mp.logger.Warnf("error parsing metrics: %v, path=%s, recording minimal metrics", err, request.URL.Path) | ||
| } else { | ||
| tm = parsedMetrics | ||
|
|
@@ -481,6 +491,17 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) | |
| } | ||
| } | ||
|
|
||
| // Fallback: estimate speeds from wall clock when timings unavailable (e.g., vLLM) | ||
| if !timings.Exists() && wallDurationMs > 0 { | ||
| durationSec := float64(wallDurationMs) / 1000.0 | ||
| if inputTokens > 0 { | ||
| promptPerSecond = float64(inputTokens) / durationSec | ||
| } | ||
| if outputTokens > 0 { | ||
| tokensPerSecond = float64(outputTokens) / durationSec | ||
| } | ||
| } | ||
|
Comment on lines
+494
to
+503
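There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fallback speeds conflate prompt eval with token generation. When `timings` is absent, both `promptPerSecond` and `tokensPerSecond` are computed by dividing their token counts by the same total wall-clock duration, so neither value reflects the time actually spent in its own phase.
Given the backend doesn't expose per-phase timings, this is an acceptable approximation — but worth making the caveat explicit so downstream dashboards don't treat these as directly comparable to llama.cpp's per-phase timings. 🤖 Prompt for AI Agents |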
||
|
|
||
| return TokenMetrics{ | ||
| Timestamp: time.Now(), | ||
| Model: modelID, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fix gofmt formatting failures.
CI's `gofmt -l .` step failed on this file. Two likely culprits in the changed lines:
1. The import block (`config` inserted between `gin-gonic/gin` and `klauspost/compress/zstd`) is ordered in a way that appears to disagree with the existing formatting of this file.
2. The `config` field added to `metricsMonitor` at line 99 and the `config:` key at line 118 don't match the tab alignment used for the surrounding fields (`mu`, `metrics`, `logger`, etc.), which `gofmt` will re-flow.
Please run `gofmt -w proxy/metrics_monitor.go` (and double-check with `gofmt -l .`) before pushing.
As per coding guidelines: "Run `gofmt -l .` before committing to verify formatting. Fix any reported files with `gofmt -w <file>`."
🔧 Verification
🤖 Prompt for AI Agents