diff --git a/studio/frontend/src/components/assistant-ui/message-timing.tsx b/studio/frontend/src/components/assistant-ui/message-timing.tsx index 4fb68d4b90..31f742bc5e 100644 --- a/studio/frontend/src/components/assistant-ui/message-timing.tsx +++ b/studio/frontend/src/components/assistant-ui/message-timing.tsx @@ -33,10 +33,24 @@ export const MessageTiming: FC<{ if (timing?.totalStreamTime === undefined) return null; - const serverTimings = ( + const custom = ( message.metadata as Record | undefined - )?.custom as { serverTimings?: Record } | undefined; - const st = serverTimings?.serverTimings; + )?.custom as + | { + serverTimings?: Record; + contextUsage?: { + cachedTokens?: number; + cacheWriteTokens?: number; + }; + } + | undefined; + const st = custom?.serverTimings; + // `??` (not `||`) so an explicit cache_n=0 isn't replaced by a stale + // contextUsage.cachedTokens from a prior turn. + const cacheHits = + st?.cache_n ?? custom?.contextUsage?.cachedTokens ?? 0; + // Anthropic-only cache-write count. + const cacheWrites = custom?.contextUsage?.cacheWriteTokens ?? 0; // Guard unphysical tok/s: llama.cpp emits predicted_ms=0 on no-op // turns, blowing the rate up to Infinity. Require >=1 token AND a @@ -122,11 +136,19 @@ export const MessageTiming: FC<{ )} - {(st?.cache_n ?? 0) > 0 && ( + {cacheHits > 0 && (
Cache hits - {formatNumber(st!.cache_n)} + {formatNumber(cacheHits)} + +
+ )} + {cacheWrites > 0 && ( +
+ Cache writes + + {formatNumber(cacheWrites)}
)} @@ -146,7 +168,7 @@ export const MessageTiming: FC<{ ) : ( <> - {/* Client-side metrics (safetensors fallback) */} + {/* Client-side metrics (safetensors + external provider fallback) */} {timing.firstTokenTime !== undefined && (
First token @@ -155,6 +177,22 @@ export const MessageTiming: FC<{
)} + {cacheHits > 0 && ( +
+ Cache hits + + {formatNumber(cacheHits)} + +
+ )} + {cacheWrites > 0 && ( +
+ Cache writes + + {formatNumber(cacheWrites)} + +
+ )}
Total diff --git a/studio/frontend/src/features/chat/api/chat-adapter.ts b/studio/frontend/src/features/chat/api/chat-adapter.ts index 0c557f1b01..9842e380e0 100644 --- a/studio/frontend/src/features/chat/api/chat-adapter.ts +++ b/studio/frontend/src/features/chat/api/chat-adapter.ts @@ -70,6 +70,13 @@ interface ServerUsage { prompt_tokens: number; completion_tokens: number; total_tokens: number; + // External prompt-cache fields (see _build_usage_chunk in + // external_provider.py). cache_creation is Anthropic-only. + prompt_tokens_details?: { + cached_tokens?: number; + }; + cache_creation_input_tokens?: number; + cache_read_input_tokens?: number; } /** Server-side timing data from llama-server's timings object. */ @@ -1881,18 +1888,31 @@ export function createOpenAIStreamAdapter(): ChatModelAdapter { const finalTokPerSec = meta?.timings?.predicted_per_second; const serverPromptEvalTime = meta?.timings?.prompt_ms; - // Update context usage in store if we got valid server data + // Prefer llama-server timings; fall back to provider usage envelope. + const cachedTokens = + meta?.timings?.cache_n ?? + meta?.usage?.prompt_tokens_details?.cached_tokens ?? + meta?.usage?.cache_read_input_tokens ?? + 0; + // Anthropic-only (billed at the write premium). + const cacheWriteTokens = meta?.usage?.cache_creation_input_tokens ?? 0; + + // Gate on the captured checkpoint still being active so a late + // completion from provider A doesn't populate the bar after the + // user switched to provider B mid-stream. if ( meta?.usage && typeof meta.usage.prompt_tokens === "number" && typeof meta.usage.completion_tokens === "number" && - typeof meta.usage.total_tokens === "number" + typeof meta.usage.total_tokens === "number" && + useChatRuntimeStore.getState().params.checkpoint === params.checkpoint ) { useChatRuntimeStore.getState().setContextUsage({ promptTokens: meta.usage.prompt_tokens, completionTokens: meta.usage.completion_tokens, totalTokens: meta.usage.total_tokens, - cachedTokens: meta.timings?.cache_n ?? 0, + cachedTokens, + cacheWriteTokens, }); } @@ -1922,7 +1942,8 @@ export function createOpenAIStreamAdapter(): ChatModelAdapter { promptTokens: meta.usage.prompt_tokens, completionTokens: meta.usage.completion_tokens, totalTokens: meta.usage.total_tokens, - cachedTokens: meta.timings?.cache_n ?? 0, + cachedTokens, + cacheWriteTokens, modelId: params.checkpoint, } : undefined, diff --git a/studio/frontend/src/features/chat/chat-page.tsx b/studio/frontend/src/features/chat/chat-page.tsx index ce02b0da18..85ed0f7eef 100644 --- a/studio/frontend/src/features/chat/chat-page.tsx +++ b/studio/frontend/src/features/chat/chat-page.tsx @@ -1037,6 +1037,10 @@ export function ChatPage(): ReactElement { ggufMaxContextLength: null, ggufNativeContextLength: null, activeNativePathToken: null, + // Clear previous-model counters; the relaxed external-provider + // render gate would otherwise show stale stats until the next + // completion overwrites them. + contextUsage: null, supportsReasoning: reasoningCaps.supportsReasoning, reasoningAlwaysOn: reasoningCaps.reasoningAlwaysOn, reasoningStyle: reasoningCaps.reasoningStyle, @@ -1161,7 +1165,9 @@ export function ChatPage(): ReactElement { if (!saved) return; viewBeforeCompareRef.current = null; navigate({ to: "/chat", search: saved }); - // Restore context usage from the active thread's last assistant message. + // Restore usage from the last assistant message, but only if it + // matches the currently active checkpoint. Without this guard the + // relaxed render gate would show stale stats from another model. const threadId = saved.thread ?? useChatRuntimeStore.getState().activeThreadId; if (threadId) { @@ -1175,7 +1181,29 @@ export function ChatPage(): ReactElement { const usage = metadata?.contextUsage as ReturnType< typeof useChatRuntimeStore.getState >["contextUsage"]; - if (usage) useChatRuntimeStore.getState().setContextUsage(usage); + if (!usage) return; + const store = useChatRuntimeStore.getState(); + const activeCheckpoint = store.params.checkpoint; + const usageModelId = + (usage as { modelId?: unknown }).modelId; + // Scope by modelId when present; reject if no active checkpoint + // (model-scoped usage cannot be attributed to "nothing"). + if (typeof usageModelId === "string" && usageModelId) { + if (!activeCheckpoint || usageModelId !== activeCheckpoint) { + return; + } + } + // For local turns, also require the restored count to fit in + // the active window. Skip when unknown (external provider). + const limit = store.ggufContextLength; + if ( + typeof limit === "number" && + limit > 0 && + (usage.totalTokens ?? 0) > limit + ) { + return; + } + store.setContextUsage(usage); }) .catch((error) => { if (!isExpectedBackgroundChatStorageError(error)) { @@ -1491,11 +1519,13 @@ export function ChatPage(): ReactElement { ) : null}
- {view.mode === "single" && ggufContextLength && contextUsage ? ( + {view.mode === "single" && contextUsage ? ( = ({ used, total, cached, promptTokens, completionTokens, className }) => { - if (total <= 0) return null; +}> = ({ + used, + total, + cached, + cacheWrites, + promptTokens, + completionTokens, + className, +}) => { + const hasKnownLimit = typeof total === "number" && total > 0; + const hasUsageDetails = + promptTokens !== undefined || + completionTokens !== undefined || + (cached !== undefined && cached > 0) || + (cacheWrites !== undefined && cacheWrites > 0); - const percent = Math.min((used / total) * 100, 100); - const severity = getSeverityColor(percent); + // Nothing to show: no limit and no per-turn counters. + if (!hasKnownLimit && used <= 0 && !hasUsageDetails) return null; + + const percent = hasKnownLimit + ? Math.min((used / (total as number)) * 100, 100) + : null; + const severity = getSeverityColor(percent ?? 0); return (
-
- Context usage - - {percent.toFixed(1)}% - -
+ {hasKnownLimit && percent !== null ? ( +
+ Context usage + + {percent.toFixed(1)}% + +
+ ) : null} {promptTokens !== undefined && (
Prompt tokens @@ -98,20 +129,32 @@ export const ContextUsageBar: FC<{
)} + {cacheWrites !== undefined && cacheWrites > 0 && ( +
+ Cache writes + + {formatTokenCountFull(cacheWrites)} + +
+ )}
- Total + + {hasKnownLimit ? "Total" : "Total tokens"} + - {formatTokenCountFull(used)} / {formatTokenCountFull(total)} + {hasKnownLimit + ? `${formatTokenCountFull(used)} / ${formatTokenCountFull(total as number)}` + : formatTokenCountFull(used)}
- {percent > 85 && ( + {hasKnownLimit && percent !== null && percent > 85 ? (
Close to the context limit. Generation will stop at 100%. Increase Context Length in the chat Settings panel to keep going.
- )} + ) : null}
diff --git a/studio/frontend/src/features/chat/runtime-provider.tsx b/studio/frontend/src/features/chat/runtime-provider.tsx index d01383b309..21be5f6e3e 100644 --- a/studio/frontend/src/features/chat/runtime-provider.tsx +++ b/studio/frontend/src/features/chat/runtime-provider.tsx @@ -826,17 +826,24 @@ function useStudioRuntimeAdapters(): StudioRuntimeAdapters { completionTokens: number; totalTokens: number; cachedTokens: number; + cacheWriteTokens?: number; modelId?: string; } | undefined; const store = useChatRuntimeStore.getState(); - if ( - savedUsage && - store.ggufContextLength && - savedUsage.totalTokens <= store.ggufContextLength && - (!savedUsage.modelId || - savedUsage.modelId === store.params.checkpoint) - ) { + // Window check applies only when a local GGUF window is known; + // external providers have ggufContextLength === null. + const withinLocalLimit = + !store.ggufContextLength || + (savedUsage?.totalTokens ?? 0) <= store.ggufContextLength; + // Legacy unscoped usage (no modelId) is only trusted when a + // known local window bounds the totals, so we can't misattribute + // an old local turn to a newly-selected external provider. + const modelMatches = savedUsage?.modelId + ? savedUsage.modelId === store.params.checkpoint + : typeof store.ggufContextLength === "number" && + store.ggufContextLength > 0; + if (savedUsage && withinLocalLimit && modelMatches) { store.setContextUsage(savedUsage); } diff --git a/studio/frontend/src/features/chat/stores/chat-runtime-store.ts b/studio/frontend/src/features/chat/stores/chat-runtime-store.ts index a00b53a44c..6b60ed51ea 100644 --- a/studio/frontend/src/features/chat/stores/chat-runtime-store.ts +++ b/studio/frontend/src/features/chat/stores/chat-runtime-store.ts @@ -291,6 +291,8 @@ type ChatRuntimeStore = { completionTokens: number; totalTokens: number; cachedTokens: number; + // Anthropic-only; optional so pre-cache-stats persisted entries load. + cacheWriteTokens?: number; } | null; modelLoading: boolean; activeNativePathToken: string | null; @@ -640,7 +642,14 @@ export const useChatRuntimeStore = create((set, get) => ({ if (state.settingsHydrated && hasKeys(changedParams)) { saveSettingsPatch({ inferenceParams: changedParams }); } - return { params }; + // Mirror setCheckpoint: the local model load path can mutate + // params.checkpoint via setParams() before setCheckpoint runs, + // leaving stale per-turn counters under the new checkpoint. + const checkpointChanged = state.params.checkpoint !== params.checkpoint; + return { + params, + ...(checkpointChanged ? { contextUsage: null } : {}), + }; }), setCustomPresets: (customPresets) => set(() => { @@ -704,12 +713,17 @@ export const useChatRuntimeStore = create((set, get) => ({ // mount, and a stale persisted local id would race against the // freshly-loaded model. See LAST_EXTERNAL_CHECKPOINT_KEY notes. saveLastExternalCheckpoint(isExternalModelId(modelId) ? modelId : null); + // Clear stale per-turn usage when the model changes; the relaxed + // external-provider render gate would otherwise show old counters + // until the next completion overwrites them. + const checkpointChanged = state.params.checkpoint !== modelId; return { params: { ...state.params, checkpoint: modelId, }, activeGgufVariant: ggufVariant ?? null, + ...(checkpointChanged ? { contextUsage: null } : {}), }; }), setActiveThreadId: (activeThreadId) =>