Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions misc/simple-responder/simple-responder.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,14 @@ func main() {
"prompt_tokens": 25,
"total_tokens": 35,
},
// add timings to simulate llama.cpp
"timings": gin.H{
"prompt_n": 25,
"prompt_ms": 13,
"predicted_n": 10,
"predicted_ms": 17,
"predicted_per_second": 10,
},
}
c.SSEvent("message", finalData)
c.Writer.Flush()
Expand All @@ -102,6 +110,13 @@ func main() {
"prompt_tokens": 25,
"total_tokens": 35,
},
"timings": gin.H{
"prompt_n": 25,
"prompt_ms": 13,
"predicted_n": 10,
"predicted_ms": 17,
"predicted_per_second": 10,
},
})
}
})
Expand Down
75 changes: 45 additions & 30 deletions proxy/metrics_middleware.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,51 +67,66 @@ func (rec *MetricsRecorder) processBody(body []byte) {
}
}

func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) {
if !jsonData.Get("usage").Exists() {
return
func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
usage := jsonData.Get("usage")
if !usage.Exists() {
return false
}

// default values
outputTokens := int(jsonData.Get("usage.completion_tokens").Int())
inputTokens := int(jsonData.Get("usage.prompt_tokens").Int())
tokensPerSecond := -1.0
durationMs := int(time.Since(rec.startTime).Milliseconds())

if outputTokens > 0 {
duration := time.Since(rec.startTime)
tokensPerSecond := float64(inputTokens+outputTokens) / duration.Seconds()

metrics := TokenMetrics{
Timestamp: time.Now(),
Model: rec.realModelName,
InputTokens: inputTokens,
OutputTokens: outputTokens,
TokensPerSecond: tokensPerSecond,
DurationMs: int(duration.Milliseconds()),
}
rec.metricsMonitor.addMetrics(metrics)
// use llama-server's timing data for tok/sec and duration as it is more accurate
if timings := jsonData.Get("timings"); timings.Exists() {
tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
}

rec.metricsMonitor.addMetrics(TokenMetrics{
Timestamp: time.Now(),
Model: rec.realModelName,
InputTokens: inputTokens,
OutputTokens: outputTokens,
TokensPerSecond: tokensPerSecond,
DurationMs: durationMs,
})

return true
}

func (rec *MetricsRecorder) processStreamingResponse(body []byte) {
// Iterate **backwards** through the lines looking for the data payload with
// usage data
lines := bytes.Split(body, []byte("\n"))
for _, line := range lines {
line = bytes.TrimSpace(line)

for i := len(lines) - 1; i >= 0; i-- {
line := bytes.TrimSpace(lines[i])
if len(line) == 0 {
continue
}

// Check for SSE data prefix
if data, found := bytes.CutPrefix(line, []byte("data:")); found {
data = bytes.TrimSpace(data)
if len(data) == 0 {
continue
}
if bytes.Equal(data, []byte("[DONE]")) {
break
}
// SSE payload always follows "data:"
prefix := []byte("data:")
if !bytes.HasPrefix(line, prefix) {
continue
}
data := bytes.TrimSpace(line[len(prefix):])

if len(data) == 0 {
continue
}

if bytes.Equal(data, []byte("[DONE]")) {
// [DONE] line itself contains nothing of interest.
continue
}

// Parse JSON to look for usage data
if gjson.ValidBytes(data) {
rec.parseAndRecordMetrics(gjson.ParseBytes(data))
if gjson.ValidBytes(data) {
if rec.parseAndRecordMetrics(gjson.ParseBytes(data)) {
return // short circuit if a metric was recorded
}
}
}
Expand Down
8 changes: 6 additions & 2 deletions proxy/proxymanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,9 @@ func TestProxyManager_MiddlewareWritesMetrics_NonStreaming(t *testing.T) {

// Check that metrics were recorded
metrics := proxy.metricsMonitor.GetMetrics()
assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request")
if !assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request") {
return
}

// Verify the last metric has the correct model
lastMetric := metrics[len(metrics)-1]
Expand Down Expand Up @@ -741,7 +743,9 @@ func TestProxyManager_MiddlewareWritesMetrics_Streaming(t *testing.T) {

// Check that metrics were recorded
metrics := proxy.metricsMonitor.GetMetrics()
assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request")
if !assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request") {
return
}

// Verify the last metric has the correct model
lastMetric := metrics[len(metrics)-1]
Expand Down
26 changes: 13 additions & 13 deletions ui/src/pages/Activity.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
import { useState, useEffect } from "react";
import { useAPI } from "../contexts/APIProvider";

const formatTimestamp = (timestamp: string): string => {
return new Date(timestamp).toLocaleString();
};

const formatSpeed = (speed: number): string => {
return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
};

const formatDuration = (ms: number): string => {
return (ms / 1000).toFixed(2) + "s";
};

const ActivityPage = () => {
const { metrics } = useAPI();
const [error, setError] = useState<string | null>(null);
Expand All @@ -11,18 +23,6 @@ const ActivityPage = () => {
}
}, [metrics]);

const formatTimestamp = (timestamp: string) => {
return new Date(timestamp).toLocaleString();
};

const formatSpeed = (speed: number) => {
return speed.toFixed(2) + " t/s";
};

const formatDuration = (ms: number) => {
return (ms / 1000).toFixed(2) + "s";
};

if (error) {
return (
<div className="p-6">
Expand Down Expand Up @@ -51,7 +51,7 @@ const ActivityPage = () => {
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Processing Speed</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
</tr>
</thead>
Expand Down