Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 44 additions & 6 deletions studio/frontend/src/components/assistant-ui/message-timing.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,24 @@ export const MessageTiming: FC<{

if (timing?.totalStreamTime === undefined) return null;

const serverTimings = (
const custom = (
message.metadata as Record<string, unknown> | undefined
)?.custom as { serverTimings?: Record<string, number> } | undefined;
const st = serverTimings?.serverTimings;
)?.custom as
| {
serverTimings?: Record<string, number>;
contextUsage?: {
cachedTokens?: number;
cacheWriteTokens?: number;
};
}
| undefined;
const st = custom?.serverTimings;
// `??` (not `||`) so an explicit cache_n=0 isn't replaced by a stale
// contextUsage.cachedTokens from a prior turn.
const cacheHits =
st?.cache_n ?? custom?.contextUsage?.cachedTokens ?? 0;
// Anthropic-only cache-write count.
const cacheWrites = custom?.contextUsage?.cacheWriteTokens ?? 0;

// Guard unphysical tok/s: llama.cpp emits predicted_ms=0 on no-op
// turns, blowing the rate up to Infinity. Require >=1 token AND a
Expand Down Expand Up @@ -122,11 +136,19 @@ export const MessageTiming: FC<{
</span>
</div>
)}
{(st?.cache_n ?? 0) > 0 && (
{cacheHits > 0 && (
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">Cache hits</span>
<span className="font-mono tabular-nums">
{formatNumber(st!.cache_n)}
{formatNumber(cacheHits)}
</span>
</div>
)}
{cacheWrites > 0 && (
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">Cache writes</span>
<span className="font-mono tabular-nums">
{formatNumber(cacheWrites)}
</span>
</div>
)}
Expand All @@ -146,7 +168,7 @@ export const MessageTiming: FC<{
</>
) : (
<>
{/* Client-side metrics (safetensors fallback) */}
{/* Client-side metrics (safetensors + external provider fallback) */}
{timing.firstTokenTime !== undefined && (
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">First token</span>
Expand All @@ -155,6 +177,22 @@ export const MessageTiming: FC<{
</span>
</div>
)}
{cacheHits > 0 && (
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">Cache hits</span>
<span className="font-mono tabular-nums">
{formatNumber(cacheHits)}
</span>
</div>
)}
{cacheWrites > 0 && (
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">Cache writes</span>
<span className="font-mono tabular-nums">
{formatNumber(cacheWrites)}
</span>
</div>
)}
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">Total</span>
<span className="font-mono tabular-nums">
Expand Down
29 changes: 25 additions & 4 deletions studio/frontend/src/features/chat/api/chat-adapter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@ interface ServerUsage {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
// External prompt-cache fields (see _build_usage_chunk in
// external_provider.py). cache_creation is Anthropic-only.
prompt_tokens_details?: {
cached_tokens?: number;
};
cache_creation_input_tokens?: number;
cache_read_input_tokens?: number;
}

/** Server-side timing data from llama-server's timings object. */
Expand Down Expand Up @@ -1881,18 +1888,31 @@ export function createOpenAIStreamAdapter(): ChatModelAdapter {
const finalTokPerSec = meta?.timings?.predicted_per_second;
const serverPromptEvalTime = meta?.timings?.prompt_ms;

// Update context usage in store if we got valid server data
// Prefer llama-server timings; fall back to provider usage envelope.
const cachedTokens =
meta?.timings?.cache_n ??
meta?.usage?.prompt_tokens_details?.cached_tokens ??
meta?.usage?.cache_read_input_tokens ??
0;
// Anthropic-only (billed at the write premium).
const cacheWriteTokens = meta?.usage?.cache_creation_input_tokens ?? 0;

// Gate on the captured checkpoint still being active so a late
// completion from provider A doesn't populate the bar after the
// user switched to provider B mid-stream.
if (
meta?.usage &&
typeof meta.usage.prompt_tokens === "number" &&
typeof meta.usage.completion_tokens === "number" &&
typeof meta.usage.total_tokens === "number"
typeof meta.usage.total_tokens === "number" &&
useChatRuntimeStore.getState().params.checkpoint === params.checkpoint
) {
useChatRuntimeStore.getState().setContextUsage({
promptTokens: meta.usage.prompt_tokens,
completionTokens: meta.usage.completion_tokens,
totalTokens: meta.usage.total_tokens,
cachedTokens: meta.timings?.cache_n ?? 0,
cachedTokens,
cacheWriteTokens,
});
}

Expand Down Expand Up @@ -1922,7 +1942,8 @@ export function createOpenAIStreamAdapter(): ChatModelAdapter {
promptTokens: meta.usage.prompt_tokens,
completionTokens: meta.usage.completion_tokens,
totalTokens: meta.usage.total_tokens,
cachedTokens: meta.timings?.cache_n ?? 0,
cachedTokens,
cacheWriteTokens,
modelId: params.checkpoint,
}
: undefined,
Expand Down
36 changes: 33 additions & 3 deletions studio/frontend/src/features/chat/chat-page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1037,6 +1037,10 @@ export function ChatPage(): ReactElement {
ggufMaxContextLength: null,
ggufNativeContextLength: null,
activeNativePathToken: null,
// Clear previous-model counters; the relaxed external-provider
// render gate would otherwise show stale stats until the next
// completion overwrites them.
contextUsage: null,
supportsReasoning: reasoningCaps.supportsReasoning,
reasoningAlwaysOn: reasoningCaps.reasoningAlwaysOn,
reasoningStyle: reasoningCaps.reasoningStyle,
Expand Down Expand Up @@ -1161,7 +1165,9 @@ export function ChatPage(): ReactElement {
if (!saved) return;
viewBeforeCompareRef.current = null;
navigate({ to: "/chat", search: saved });
// Restore context usage from the active thread's last assistant message.
// Restore usage from the last assistant message, but only if it
// matches the currently active checkpoint. Without this guard the
// relaxed render gate would show stale stats from another model.
const threadId =
saved.thread ?? useChatRuntimeStore.getState().activeThreadId;
if (threadId) {
Expand All @@ -1175,7 +1181,29 @@ export function ChatPage(): ReactElement {
const usage = metadata?.contextUsage as ReturnType<
typeof useChatRuntimeStore.getState
>["contextUsage"];
if (usage) useChatRuntimeStore.getState().setContextUsage(usage);
if (!usage) return;
const store = useChatRuntimeStore.getState();
const activeCheckpoint = store.params.checkpoint;
const usageModelId =
(usage as { modelId?: unknown }).modelId;
// Scope by modelId when present; reject if no active checkpoint
// (model-scoped usage cannot be attributed to "nothing").
if (typeof usageModelId === "string" && usageModelId) {
if (!activeCheckpoint || usageModelId !== activeCheckpoint) {
return;
}
}
// For local turns, also require the restored count to fit in
// the active window. Skip when unknown (external provider).
const limit = store.ggufContextLength;
if (
typeof limit === "number" &&
limit > 0 &&
(usage.totalTokens ?? 0) > limit
) {
return;
}
store.setContextUsage(usage);
})
.catch((error) => {
if (!isExpectedBackgroundChatStorageError(error)) {
Expand Down Expand Up @@ -1491,11 +1519,13 @@ export function ChatPage(): ReactElement {
) : null}
</div>
<div className="ml-auto flex items-center gap-2">
{view.mode === "single" && ggufContextLength && contextUsage ? (
{view.mode === "single" && contextUsage ? (
<ContextUsageBar
used={contextUsage.totalTokens}
// null on external providers; the bar handles that.
total={ggufContextLength}
cached={contextUsage.cachedTokens}
cacheWrites={contextUsage.cacheWriteTokens}
promptTokens={contextUsage.promptTokens}
completionTokens={contextUsage.completionTokens}
className="h-[34px]"
Expand Down
89 changes: 66 additions & 23 deletions studio/frontend/src/features/chat/components/context-usage-bar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -28,37 +28,66 @@ function getSeverityColor(percent: number): {

export const ContextUsageBar: FC<{
used: number;
total: number;
// null on external providers (no known window); bar then hides the ratio.
total?: number | null;
cached?: number;
// Anthropic-only (billed at the write premium).
cacheWrites?: number;
promptTokens?: number;
completionTokens?: number;
className?: string;
}> = ({ used, total, cached, promptTokens, completionTokens, className }) => {
if (total <= 0) return null;
}> = ({
used,
total,
cached,
cacheWrites,
promptTokens,
completionTokens,
className,
}) => {
const hasKnownLimit = typeof total === "number" && total > 0;
const hasUsageDetails =
promptTokens !== undefined ||
completionTokens !== undefined ||
(cached !== undefined && cached > 0) ||
(cacheWrites !== undefined && cacheWrites > 0);

const percent = Math.min((used / total) * 100, 100);
const severity = getSeverityColor(percent);
// Nothing to show: no limit and no per-turn counters.
if (!hasKnownLimit && used <= 0 && !hasUsageDetails) return null;

const percent = hasKnownLimit
? Math.min((used / (total as number)) * 100, 100)
: null;
const severity = getSeverityColor(percent ?? 0);

return (
<Tooltip>
<TooltipTrigger asChild>
<button
type="button"
aria-label={`Context usage: ${formatTokenCount(used)} of ${formatTokenCount(total)} tokens`}
aria-label={
hasKnownLimit
? `Context usage: ${formatTokenCount(used)} of ${formatTokenCount(total as number)} tokens`
: `Token usage: ${formatTokenCount(used)} tokens`
}
className={cn(
"flex items-center gap-2 rounded-[10px] px-2.5 py-1 font-mono text-chat-icon-fg text-[13px] tabular-nums transition-colors hover:bg-chat-icon-bg-hover hover:text-chat-icon-fg-hover",
className,
)}
>
<span>
{formatTokenCount(used)} / {formatTokenCount(total)}
{hasKnownLimit
? `${formatTokenCount(used)} / ${formatTokenCount(total as number)}`
: `${formatTokenCount(used)} tokens`}
</span>
<div className="h-1.5 w-16 rounded-full bg-black/10 dark:bg-white/15 overflow-hidden">
<div
className={cn("h-full rounded-full transition-all", severity.bar)}
style={{ width: `${percent}%` }}
/>
</div>
{hasKnownLimit && percent !== null ? (
<div className="h-1.5 w-16 rounded-full bg-black/10 dark:bg-white/15 overflow-hidden">
<div
className={cn("h-full rounded-full transition-all", severity.bar)}
style={{ width: `${percent}%` }}
/>
</div>
) : null}
</button>
</TooltipTrigger>
<TooltipContent
Expand All @@ -68,12 +97,14 @@ export const ContextUsageBar: FC<{
className="[&_span>svg]:hidden!"
>
<div className="grid min-w-44 gap-1.5 text-xs">
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">Context usage</span>
<span className={cn("font-mono tabular-nums font-medium", severity.text)}>
{percent.toFixed(1)}%
</span>
</div>
{hasKnownLimit && percent !== null ? (
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">Context usage</span>
<span className={cn("font-mono tabular-nums font-medium", severity.text)}>
{percent.toFixed(1)}%
</span>
</div>
) : null}
{promptTokens !== undefined && (
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">Prompt tokens</span>
Expand All @@ -98,20 +129,32 @@ export const ContextUsageBar: FC<{
</span>
</div>
)}
{cacheWrites !== undefined && cacheWrites > 0 && (
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">Cache writes</span>
<span className="font-mono tabular-nums">
{formatTokenCountFull(cacheWrites)}
</span>
</div>
)}
<div className="my-0.5 border-t border-border/40" />
<div className="flex items-center justify-between gap-4">
<span className="text-muted-foreground">Total</span>
<span className="text-muted-foreground">
{hasKnownLimit ? "Total" : "Total tokens"}
</span>
<span className="font-mono tabular-nums">
{formatTokenCountFull(used)} / {formatTokenCountFull(total)}
{hasKnownLimit
? `${formatTokenCountFull(used)} / ${formatTokenCountFull(total as number)}`
: formatTokenCountFull(used)}
</span>
</div>
{percent > 85 && (
{hasKnownLimit && percent !== null && percent > 85 ? (
<div className="mt-1 max-w-64 text-[11px] leading-snug text-muted-foreground/90">
Close to the context limit. Generation will stop at 100%.
Increase <span className="font-medium">Context Length</span> in
the chat Settings panel to keep going.
</div>
)}
) : null}
</div>
</TooltipContent>
</Tooltip>
Expand Down
21 changes: 14 additions & 7 deletions studio/frontend/src/features/chat/runtime-provider.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -826,17 +826,24 @@ function useStudioRuntimeAdapters(): StudioRuntimeAdapters {
completionTokens: number;
totalTokens: number;
cachedTokens: number;
cacheWriteTokens?: number;
modelId?: string;
}
| undefined;
const store = useChatRuntimeStore.getState();
if (
savedUsage &&
store.ggufContextLength &&
savedUsage.totalTokens <= store.ggufContextLength &&
(!savedUsage.modelId ||
savedUsage.modelId === store.params.checkpoint)
) {
// Window check applies only when a local GGUF window is known;
// external providers have ggufContextLength === null.
const withinLocalLimit =
!store.ggufContextLength ||
(savedUsage?.totalTokens ?? 0) <= store.ggufContextLength;
// Legacy unscoped usage (no modelId) is only trusted when a
// known local window bounds the totals, so we can't misattribute
// an old local turn to a newly-selected external provider.
const modelMatches = savedUsage?.modelId
? savedUsage.modelId === store.params.checkpoint
: typeof store.ggufContextLength === "number" &&
store.ggufContextLength > 0;
if (savedUsage && withinLocalLimit && modelMatches) {
store.setContextUsage(savedUsage);
}

Expand Down
Loading
Loading