unslothai · danielhanchen · May 26, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/studio/frontend/src/components/assistant-ui/message-timing.tsx b/studio/frontend/src/components/assistant-ui/message-timing.tsx
@@ -33,10 +33,24 @@ export const MessageTiming: FC<{
 
   if (timing?.totalStreamTime === undefined) return null;
 
-  const serverTimings = (
+  const custom = (
     message.metadata as Record<string, unknown> | undefined
-  )?.custom as { serverTimings?: Record<string, number> } | undefined;
-  const st = serverTimings?.serverTimings;
+  )?.custom as
+    | {
+        serverTimings?: Record<string, number>;
+        contextUsage?: {
+          cachedTokens?: number;
+          cacheWriteTokens?: number;
+        };
+      }
+    | undefined;
+  const st = custom?.serverTimings;
+  // `??` (not `||`) so an explicit cache_n=0 isn't replaced by a stale
+  // contextUsage.cachedTokens from a prior turn.
+  const cacheHits =
+    st?.cache_n ?? custom?.contextUsage?.cachedTokens ?? 0;
+  // Anthropic-only cache-write count.
+  const cacheWrites = custom?.contextUsage?.cacheWriteTokens ?? 0;
 
   // Guard unphysical tok/s: llama.cpp emits predicted_ms=0 on no-op
   // turns, blowing the rate up to Infinity. Require >=1 token AND a
@@ -122,11 +136,19 @@ export const MessageTiming: FC<{
                   </span>
                 </div>
               )}
-              {(st?.cache_n ?? 0) > 0 && (
+              {cacheHits > 0 && (
                 <div className="flex items-center justify-between gap-4">
                   <span className="text-muted-foreground">Cache hits</span>
                   <span className="font-mono tabular-nums">
-                    {formatNumber(st!.cache_n)}
+                    {formatNumber(cacheHits)}
+                  </span>
+                </div>
+              )}
+              {cacheWrites > 0 && (
+                <div className="flex items-center justify-between gap-4">
+                  <span className="text-muted-foreground">Cache writes</span>
+                  <span className="font-mono tabular-nums">
+                    {formatNumber(cacheWrites)}
                   </span>
                 </div>
               )}
@@ -146,7 +168,7 @@ export const MessageTiming: FC<{
             </>
           ) : (
             <>
-              {/* Client-side metrics (safetensors fallback) */}
+              {/* Client-side metrics (safetensors + external provider fallback) */}
               {timing.firstTokenTime !== undefined && (
                 <div className="flex items-center justify-between gap-4">
                   <span className="text-muted-foreground">First token</span>
@@ -155,6 +177,22 @@ export const MessageTiming: FC<{
                   </span>
                 </div>
               )}
+              {cacheHits > 0 && (
+                <div className="flex items-center justify-between gap-4">
+                  <span className="text-muted-foreground">Cache hits</span>
+                  <span className="font-mono tabular-nums">
+                    {formatNumber(cacheHits)}
+                  </span>
+                </div>
+              )}
+              {cacheWrites > 0 && (
+                <div className="flex items-center justify-between gap-4">
+                  <span className="text-muted-foreground">Cache writes</span>
+                  <span className="font-mono tabular-nums">
+                    {formatNumber(cacheWrites)}
+                  </span>
+                </div>
+              )}
               <div className="flex items-center justify-between gap-4">
                 <span className="text-muted-foreground">Total</span>
                 <span className="font-mono tabular-nums">

diff --git a/studio/frontend/src/features/chat/api/chat-adapter.ts b/studio/frontend/src/features/chat/api/chat-adapter.ts
@@ -70,6 +70,13 @@ interface ServerUsage {
   prompt_tokens: number;
   completion_tokens: number;
   total_tokens: number;
+  // External prompt-cache fields (see _build_usage_chunk in
+  // external_provider.py). cache_creation_input_tokens is Anthropic-only.
+  prompt_tokens_details?: {
+    cached_tokens?: number;
+  };
+  cache_creation_input_tokens?: number;
+  cache_read_input_tokens?: number;
 }
 
 /** Server-side timing data from llama-server's timings object. */
@@ -1881,18 +1888,31 @@ export function createOpenAIStreamAdapter(): ChatModelAdapter {
         const finalTokPerSec = meta?.timings?.predicted_per_second;
         const serverPromptEvalTime = meta?.timings?.prompt_ms;
 
-        // Update context usage in store if we got valid server data
+        // Prefer llama-server timings; fall back to provider usage envelope.
+        const cachedTokens =
+          meta?.timings?.cache_n ??
+          meta?.usage?.prompt_tokens_details?.cached_tokens ??
+          meta?.usage?.cache_read_input_tokens ??
+          0;
+        // Anthropic-only (billed at the write premium).
+        const cacheWriteTokens = meta?.usage?.cache_creation_input_tokens ?? 0;
+
+        // Gate on the captured checkpoint still being active so a late
+        // completion from provider A doesn't populate the bar after the
+        // user switched to provider B mid-stream.
         if (
           meta?.usage &&
           typeof meta.usage.prompt_tokens === "number" &&
           typeof meta.usage.completion_tokens === "number" &&
-          typeof meta.usage.total_tokens === "number"
+          typeof meta.usage.total_tokens === "number" &&
+          useChatRuntimeStore.getState().params.checkpoint === params.checkpoint
         ) {
           useChatRuntimeStore.getState().setContextUsage({
             promptTokens: meta.usage.prompt_tokens,
             completionTokens: meta.usage.completion_tokens,
             totalTokens: meta.usage.total_tokens,
-            cachedTokens: meta.timings?.cache_n ?? 0,
+            cachedTokens,
+            cacheWriteTokens,
           });
         }
 
@@ -1922,7 +1942,8 @@ export function createOpenAIStreamAdapter(): ChatModelAdapter {
                     promptTokens: meta.usage.prompt_tokens,
                     completionTokens: meta.usage.completion_tokens,
                     totalTokens: meta.usage.total_tokens,
-                    cachedTokens: meta.timings?.cache_n ?? 0,
+                    cachedTokens,
+                    cacheWriteTokens,
                     modelId: params.checkpoint,
                   }
                 : undefined,

diff --git a/studio/frontend/src/features/chat/chat-page.tsx b/studio/frontend/src/features/chat/chat-page.tsx
@@ -1037,6 +1037,10 @@ export function ChatPage(): ReactElement {
           ggufMaxContextLength: null,
           ggufNativeContextLength: null,
           activeNativePathToken: null,
+          // Clear previous-model counters; the relaxed external-provider
+          // render gate would otherwise show stale stats until the next
+          // completion overwrites them.
+          contextUsage: null,
           supportsReasoning: reasoningCaps.supportsReasoning,
           reasoningAlwaysOn: reasoningCaps.reasoningAlwaysOn,
           reasoningStyle: reasoningCaps.reasoningStyle,
@@ -1161,7 +1165,9 @@ export function ChatPage(): ReactElement {
     if (!saved) return;
     viewBeforeCompareRef.current = null;
     navigate({ to: "/chat", search: saved });
-    // Restore context usage from the active thread's last assistant message.
+    // Restore usage from the last assistant message, but only if it
+    // matches the currently active checkpoint. Without this guard the
+    // relaxed render gate would show stale stats from another model.
     const threadId =
       saved.thread ?? useChatRuntimeStore.getState().activeThreadId;
     if (threadId) {
@@ -1175,7 +1181,29 @@ export function ChatPage(): ReactElement {
           const usage = metadata?.contextUsage as ReturnType<
             typeof useChatRuntimeStore.getState
           >["contextUsage"];
-          if (usage) useChatRuntimeStore.getState().setContextUsage(usage);
+          if (!usage) return;
+          const store = useChatRuntimeStore.getState();
+          const activeCheckpoint = store.params.checkpoint;
+          const usageModelId =
+            (usage as { modelId?: unknown }).modelId;
+          // Scope by modelId when present; reject if no active checkpoint
+          // (model-scoped usage cannot be attributed to "nothing").
+          if (typeof usageModelId === "string" && usageModelId) {
+            if (!activeCheckpoint || usageModelId !== activeCheckpoint) {
+              return;
+            }
+          }
+          // For local turns, also require the restored count to fit in
+          // the active window. Skip when unknown (external provider).
+          const limit = store.ggufContextLength;
+          if (
+            typeof limit === "number" &&
+            limit > 0 &&
+            (usage.totalTokens ?? 0) > limit
+          ) {
+            return;
+          }
+          store.setContextUsage(usage);
         })
         .catch((error) => {
           if (!isExpectedBackgroundChatStorageError(error)) {
@@ -1491,11 +1519,13 @@ export function ChatPage(): ReactElement {
             ) : null}
           </div>
           <div className="ml-auto flex items-center gap-2">
-            {view.mode === "single" && ggufContextLength && contextUsage ? (
+            {view.mode === "single" && contextUsage ? (
               <ContextUsageBar
                 used={contextUsage.totalTokens}
+                // total is null on external providers; the bar then omits the ratio.
                 total={ggufContextLength}
                 cached={contextUsage.cachedTokens}
+                cacheWrites={contextUsage.cacheWriteTokens}
                 promptTokens={contextUsage.promptTokens}
                 completionTokens={contextUsage.completionTokens}
                 className="h-[34px]"

diff --git a/studio/frontend/src/features/chat/components/context-usage-bar.tsx b/studio/frontend/src/features/chat/components/context-usage-bar.tsx
@@ -28,37 +28,66 @@ function getSeverityColor(percent: number): {
 
 export const ContextUsageBar: FC<{
   used: number;
-  total: number;
+  // null on external providers (no known window); bar then hides the ratio.
+  total?: number | null;
   cached?: number;
+  // Anthropic-only (billed at the write premium).
+  cacheWrites?: number;
   promptTokens?: number;
   completionTokens?: number;
   className?: string;
-}> = ({ used, total, cached, promptTokens, completionTokens, className }) => {
-  if (total <= 0) return null;
+}> = ({
+  used,
+  total,
+  cached,
+  cacheWrites,
+  promptTokens,
+  completionTokens,
+  className,
+}) => {
+  const hasKnownLimit = typeof total === "number" && total > 0;
+  const hasUsageDetails =
+    promptTokens !== undefined ||
+    completionTokens !== undefined ||
+    (cached !== undefined && cached > 0) ||
+    (cacheWrites !== undefined && cacheWrites > 0);
 
-  const percent = Math.min((used / total) * 100, 100);
-  const severity = getSeverityColor(percent);
+  // Nothing to show: no limit and no per-turn counters.
+  if (!hasKnownLimit && used <= 0 && !hasUsageDetails) return null;
+
+  const percent = hasKnownLimit
+    ? Math.min((used / (total as number)) * 100, 100)
+    : null;
+  const severity = getSeverityColor(percent ?? 0);
 
   return (
     <Tooltip>
       <TooltipTrigger asChild>
         <button
           type="button"
-          aria-label={`Context usage: ${formatTokenCount(used)} of ${formatTokenCount(total)} tokens`}
+          aria-label={
+            hasKnownLimit
+              ? `Context usage: ${formatTokenCount(used)} of ${formatTokenCount(total as number)} tokens`
+              : `Token usage: ${formatTokenCount(used)} tokens`
+          }
           className={cn(
             "flex items-center gap-2 rounded-[10px] px-2.5 py-1 font-mono text-chat-icon-fg text-[13px] tabular-nums transition-colors hover:bg-chat-icon-bg-hover hover:text-chat-icon-fg-hover",
             className,
           )}
         >
           <span>
-            {formatTokenCount(used)} / {formatTokenCount(total)}
+            {hasKnownLimit
+              ? `${formatTokenCount(used)} / ${formatTokenCount(total as number)}`
+              : `${formatTokenCount(used)} tokens`}
           </span>
-          <div className="h-1.5 w-16 rounded-full bg-black/10 dark:bg-white/15 overflow-hidden">
-            <div
-              className={cn("h-full rounded-full transition-all", severity.bar)}
-              style={{ width: `${percent}%` }}
-            />
-          </div>
+          {hasKnownLimit && percent !== null ? (
+            <div className="h-1.5 w-16 rounded-full bg-black/10 dark:bg-white/15 overflow-hidden">
+              <div
+                className={cn("h-full rounded-full transition-all", severity.bar)}
+                style={{ width: `${percent}%` }}
+              />
+            </div>
+          ) : null}
         </button>
       </TooltipTrigger>
       <TooltipContent
@@ -68,12 +97,14 @@ export const ContextUsageBar: FC<{
         className="[&_span>svg]:hidden!"
       >
         <div className="grid min-w-44 gap-1.5 text-xs">
-          <div className="flex items-center justify-between gap-4">
-            <span className="text-muted-foreground">Context usage</span>
-            <span className={cn("font-mono tabular-nums font-medium", severity.text)}>
-              {percent.toFixed(1)}%
-            </span>
-          </div>
+          {hasKnownLimit && percent !== null ? (
+            <div className="flex items-center justify-between gap-4">
+              <span className="text-muted-foreground">Context usage</span>
+              <span className={cn("font-mono tabular-nums font-medium", severity.text)}>
+                {percent.toFixed(1)}%
+              </span>
+            </div>
+          ) : null}
           {promptTokens !== undefined && (
             <div className="flex items-center justify-between gap-4">
               <span className="text-muted-foreground">Prompt tokens</span>
@@ -98,20 +129,32 @@ export const ContextUsageBar: FC<{
               </span>
             </div>
           )}
+          {cacheWrites !== undefined && cacheWrites > 0 && (
+            <div className="flex items-center justify-between gap-4">
+              <span className="text-muted-foreground">Cache writes</span>
+              <span className="font-mono tabular-nums">
+                {formatTokenCountFull(cacheWrites)}
+              </span>
+            </div>
+          )}
           <div className="my-0.5 border-t border-border/40" />
           <div className="flex items-center justify-between gap-4">
-            <span className="text-muted-foreground">Total</span>
+            <span className="text-muted-foreground">
+              {hasKnownLimit ? "Total" : "Total tokens"}
+            </span>
             <span className="font-mono tabular-nums">
-              {formatTokenCountFull(used)} / {formatTokenCountFull(total)}
+              {hasKnownLimit
+                ? `${formatTokenCountFull(used)} / ${formatTokenCountFull(total as number)}`
+                : formatTokenCountFull(used)}
             </span>
           </div>
-          {percent > 85 && (
+          {hasKnownLimit && percent !== null && percent > 85 ? (
             <div className="mt-1 max-w-64 text-[11px] leading-snug text-muted-foreground/90">
               Close to the context limit. Generation will stop at 100%.
               Increase <span className="font-medium">Context Length</span> in
               the chat Settings panel to keep going.
             </div>
-          )}
+          ) : null}
         </div>
       </TooltipContent>
     </Tooltip>

diff --git a/studio/frontend/src/features/chat/runtime-provider.tsx b/studio/frontend/src/features/chat/runtime-provider.tsx
@@ -826,17 +826,24 @@ function useStudioRuntimeAdapters(): StudioRuntimeAdapters {
               completionTokens: number;
               totalTokens: number;
               cachedTokens: number;
+              cacheWriteTokens?: number;
               modelId?: string;
             }
           | undefined;
         const store = useChatRuntimeStore.getState();
-        if (
-          savedUsage &&
-          store.ggufContextLength &&
-          savedUsage.totalTokens <= store.ggufContextLength &&
-          (!savedUsage.modelId ||
-            savedUsage.modelId === store.params.checkpoint)
-        ) {
+        // Window check applies only when a local GGUF window is known;
+        // external providers have ggufContextLength === null.
+        const withinLocalLimit =
+          !store.ggufContextLength ||
+          (savedUsage?.totalTokens ?? 0) <= store.ggufContextLength;
+        // Legacy unscoped usage (no modelId) is only trusted when a
+        // known local window bounds the totals, so we can't misattribute
+        // an old local turn to a newly-selected external provider.
+        const modelMatches = savedUsage?.modelId
+          ? savedUsage.modelId === store.params.checkpoint
+          : typeof store.ggufContextLength === "number" &&
+            store.ggufContextLength > 0;
+        if (savedUsage && withinLocalLimit && modelMatches) {
           store.setContextUsage(savedUsage);
         }