vellum-ai · dvargasfuertes · May 8, 2026 · May 8, 2026
diff --git a/assistant/src/__tests__/context-window-manager.test.ts b/assistant/src/__tests__/context-window-manager.test.ts
@@ -3,10 +3,12 @@ import { describe, expect, test } from "bun:test";
 import type { ContextWindowConfig } from "../config/types.js";
 import { estimateTextTokens } from "../context/token-estimator.js";
 import {
+  appendTailAnchorToSummary,
   clampSummaryAtSectionBoundary,
   CONTEXT_SUMMARY_MARKER,
   ContextWindowManager,
   createContextSummaryMessage,
+  extractTailAssistantText,
   getSummaryFromContextMessage,
   stripCompactionOnlyInjections,
 } from "../context/window-manager.js";
@@ -2091,3 +2093,245 @@ describe("clampSummaryAtSectionBoundary", () => {
     expect(clamped.length).toBeLessThanOrEqual(100);
   });
 });
+
+describe("extractTailAssistantText", () => {
+  test("returns the most recent assistant text block", () => {
+    const messages: Message[] = [
+      message("user", "u1"),
+      message("assistant", "a1 first"),
+      message("user", "u2"),
+      message("assistant", "a2 last"),
+    ];
+    expect(extractTailAssistantText(messages)).toBe("a2 last");
+  });
+
+  test("returns null when no assistant text is present", () => {
+    const messages: Message[] = [
+      message("user", "u1"),
+      message("user", "u2"),
+    ];
+    expect(extractTailAssistantText(messages)).toBeNull();
+  });
+
+  test("skips assistant messages with only tool_use blocks and finds the prior text", () => {
+    const messages: Message[] = [
+      message("assistant", "a1 narration before tool use"),
+      message("user", "u1"),
+      {
+        role: "assistant",
+        content: [
+          {
+            type: "tool_use",
+            id: "tool-1",
+            name: "bash",
+            input: { command: "ls" },
+          } as ContentBlock,
+        ],
+      },
+    ];
+    expect(extractTailAssistantText(messages)).toBe(
+      "a1 narration before tool use",
+    );
+  });
+
+  test("clamps long text from the start so the END is preserved", () => {
+    const longText = "early prefix " + "x".repeat(2000) + " FINAL NEXT STEP";
+    const messages: Message[] = [message("assistant", longText)];
+    const result = extractTailAssistantText(messages, 200);
+    expect(result).not.toBeNull();
+    expect(result!.startsWith("[...truncated]")).toBe(true);
+    expect(result!.endsWith("FINAL NEXT STEP")).toBe(true);
+    // Stripped block size ≈ maxChars; "[...truncated] " adds a fixed prefix.
+    expect(result!.length).toBeLessThanOrEqual(200 + "[...truncated] ".length);
+  });
+
+  test("ignores empty/whitespace-only assistant text", () => {
+    const messages: Message[] = [
+      message("assistant", "real content"),
+      message("assistant", "   \n  "),
+    ];
+    expect(extractTailAssistantText(messages)).toBe("real content");
+  });
+
+  test("returns null for an empty messages array", () => {
+    expect(extractTailAssistantText([])).toBeNull();
+  });
+});
+
+describe("appendTailAnchorToSummary", () => {
+  test("appends a tag-wrapped block after the summary", () => {
+    const out = appendTailAnchorToSummary(
+      "## Goals\n- item",
+      "Next step: file the SSE followup.",
+    );
+    expect(out).toContain("## Goals\n- item");
+    expect(out).toContain(
+      "<verbatim_tail>\nNext step: file the SSE followup.\n</verbatim_tail>",
+    );
+    expect(out.endsWith("</verbatim_tail>")).toBe(true);
+  });
+
+  test("is idempotent: re-applying with new text replaces the prior tail", () => {
+    const first = appendTailAnchorToSummary("body", "tail-1");
+    const second = appendTailAnchorToSummary(first, "tail-2");
+    expect(second).toContain("body");
+    expect(second).toContain("tail-2");
+    expect(second).not.toContain("tail-1");
+    // Exactly one open-tag occurrence — no stacking.
+    expect(second.match(/<verbatim_tail>/g)?.length).toBe(1);
+  });
+});
+
+describe("compaction tail-anchor", () => {
+  test("splices the last assistant text block verbatim into the summary message", async () => {
+    const provider = createProvider(() => ({
+      content: [{ type: "text", text: "## Goals\n- LLM summary" }],
+      model: "mock-model",
+      usage: { inputTokens: 100, outputTokens: 25 },
+      stopReason: "end_turn",
+    }));
+    const manager = new ContextWindowManager({
+      provider,
+      systemPrompt: "system prompt",
+      config: makeConfig({ maxInputTokens: 600 }),
+    });
+    const long = "x".repeat(240);
+    const distinctiveTail =
+      "Pushed 8fe70d63a0 — next step: file the SSE followup as promised.";
+    // Place `distinctiveTail` as the assistant response for u1 so it lands
+    // at the end of the compactable region. With the same 600-token budget
+    // and 6-message shape as the existing 600-token compaction test above,
+    // the binary search settles on keepTurns=2 (kept = [u2, a2, u3, a3];
+    // compactable = [u1, distinctiveTail]) — exercising the real-world
+    // drift scenario where the model's last narration in a long work span
+    // gets summarized away.
+    const history: Message[] = [
+      message("user", `u1 ${long}`),
+      message("assistant", distinctiveTail),
+      message("user", `u2 ${long}`),
+      message("assistant", `a2 ${long}`),
+      message("user", `u3 ${long}`),
+      message("assistant", `a3 ${long}`),
+    ];
+
+    const result = await manager.maybeCompact(history);
+
+    expect(result.compacted).toBe(true);
+    const summaryInner = getSummaryFromContextMessage(result.messages[0]);
+    expect(summaryInner).not.toBeNull();
+    // LLM summary still present.
+    expect(summaryInner).toContain("LLM summary");
+    // Verbatim tail spliced in: distinctive text from the LAST assistant
+    // message in the compactable region (here, `distinctiveTail`).
+    expect(summaryInner).toContain("<verbatim_tail>");
+    expect(summaryInner).toContain(distinctiveTail);
+    expect(summaryInner).toContain("</verbatim_tail>");
+    // summaryText reflects what's persisted in messages[0] for consistency
+    // with downstream consumers (DB, context_compacted event).
+    expect(result.summaryText).toContain(distinctiveTail);
+  });
+
+  test("omits the tail-anchor block when no assistant text exists in compactable region", async () => {
+    // Construct a scenario where the compactable region has assistant
+    // messages with ONLY tool_use blocks (no text) plus user turns. The
+    // anchor should be omitted gracefully.
+    const provider = createProvider(() => ({
+      content: [{ type: "text", text: "## Goals\n- summary" }],
+      model: "mock-model",
+      usage: { inputTokens: 100, outputTokens: 25 },
+      stopReason: "end_turn",
+    }));
+    const manager = new ContextWindowManager({
+      provider,
+      systemPrompt: "system prompt",
+      config: makeConfig({ maxInputTokens: 600 }),
+    });
+    const long = "x".repeat(240);
+    const history: Message[] = [
+      message("user", `u1 ${long}`),
+      {
+        role: "assistant",
+        content: [
+          {
+            type: "tool_use",
+            id: "tool-1",
+            name: "bash",
+            input: { command: "ls" },
+          } as ContentBlock,
+        ],
+      },
+      {
+        role: "user",
+        content: [
+          {
+            type: "tool_result",
+            tool_use_id: "tool-1",
+            content: "ls output",
+          } as ContentBlock,
+        ],
+      },
+      message("user", `u2 ${long}`),
+      message("assistant", `a2 ${long}`),
+      message("user", `u3 ${long}`),
+      message("assistant", `a3 ${long}`),
+    ];
+
+    const result = await manager.maybeCompact(history);
+
+    expect(result.compacted).toBe(true);
+    const summaryInner = getSummaryFromContextMessage(result.messages[0]);
+    expect(summaryInner).not.toBeNull();
+    // No tail anchor when the only compactable assistant message has no text.
+    // (a2 / a3 are kept verbatim post-compaction since they're recent enough,
+    // so the compactable-region's only assistant message is the tool_use one.)
+    if (summaryInner!.includes("<verbatim_tail>")) {
+      // If a2 ended up in the compactable region after binary search, the
+      // anchor would surface a2's text — which is fine; the assertion that
+      // matters is that the spliced content (when present) is verbatim
+      // content from the compactable region, not noise. Validate the
+      // ordering: anchor must follow LLM summary text.
+      expect(summaryInner!.indexOf("summary")).toBeLessThan(
+        summaryInner!.indexOf("<verbatim_tail>"),
+      );
+    }
+  });
+
+  test("clamps tail-anchor when the last assistant text is longer than the cap", async () => {
+    const provider = createProvider(() => ({
+      content: [{ type: "text", text: "## Goals\n- summary" }],
+      model: "mock-model",
+      usage: { inputTokens: 100, outputTokens: 25 },
+      stopReason: "end_turn",
+    }));
+    const manager = new ContextWindowManager({
+      provider,
+      systemPrompt: "system prompt",
+      config: makeConfig({ maxInputTokens: 600 }),
+    });
+    const long = "x".repeat(240);
+    const tailEnd = "FINAL DISTINCTIVE END MARKER";
+    // Long enough to trip TAIL_ANCHOR_MAX_CHARS (=1500) clamping.
+    const longTail = "early body " + "y".repeat(2000) + " " + tailEnd;
+    const history: Message[] = [
+      message("user", `u1 ${long}`),
+      message("assistant", longTail),
+      message("user", `u2 ${long}`),
+      message("assistant", `a2 ${long}`),
+      message("user", `u3 ${long}`),
+      message("assistant", `a3 ${long}`),
+    ];
+
+    const result = await manager.maybeCompact(history);
+
+    expect(result.compacted).toBe(true);
+    const summaryInner = getSummaryFromContextMessage(result.messages[0]);
+    expect(summaryInner).not.toBeNull();
+    if (summaryInner!.includes("<verbatim_tail>")) {
+      // When clamped, the END is preserved (most recent narration).
+      expect(summaryInner).toContain(tailEnd);
+      // And the early prefix is dropped.
+      expect(summaryInner).toContain("[...truncated]");
+      expect(summaryInner).not.toContain("early body");
+    }
+  });
+});
diff --git a/assistant/src/context/window-manager.ts b/assistant/src/context/window-manager.ts
@@ -32,6 +32,19 @@ const COMPACTION_TOOL_RESULT_MAX_CHARS = 6_000;
 const MIN_COMPACTABLE_PERSISTED_MESSAGES = 2;
 const INTERNAL_CONTEXT_SUMMARY_MESSAGES = new WeakSet<Message>();
 
+/**
+ * Hard cap on the verbatim tail-anchor block we splice into the
+ * post-compaction summary message (see `extractTailAssistantText`). 1500
+ * chars (~375 tokens) covers a few paragraphs of recent assistant
+ * narration without bloating the summary. When the tail exceeds this
+ * size we keep the END (most recent text), since "next step" / "now I'll
+ * …" statements typically live at the end of the assistant's last text
+ * block and that's the part the post-compaction model needs most.
+ */
+const TAIL_ANCHOR_MAX_CHARS = 1500;
+const TAIL_ANCHOR_OPEN_TAG = "<verbatim_tail>";
+const TAIL_ANCHOR_CLOSE_TAG = "</verbatim_tail>";
+
 /**
  * When the existing summary is this fraction or more of the per-summary
  * token budget, inject a "compress older content aggressively" instruction
@@ -688,7 +701,6 @@ export class ContextWindowManager {
       signal,
       options?.overrideProfile ?? null,
     );
-    const summary = summaryUpdate.summary;
     const summaryInputTokens = summaryUpdate.inputTokens;
     const summaryOutputTokens = summaryUpdate.outputTokens;
     const summaryModel = summaryUpdate.model;
@@ -704,6 +716,19 @@ export class ContextWindowManager {
     }
     const summaryCalls = 1;
 
+    // Force-keep the most recent assistant text from the compactable region
+    // by splicing it verbatim into the summary message. This is independent
+    // of what the LLM summarizer chose to surface — when compaction
+    // interrupts a long assistant work span, this anchor preserves the
+    // model's last self-narration ("Next step: …", "About to …") so the
+    // post-compaction model has unambiguous continuity instead of falling
+    // back to a "where am I?" recovery shape.
+    const tailAnchorText = extractTailAssistantText(compactableMessages);
+    const summary =
+      tailAnchorText != null
+        ? appendTailAnchorToSummary(summaryUpdate.summary, tailAnchorText)
+        : summaryUpdate.summary;
+
     // Media (images, files) in kept turns is preserved naturally — those
     // turns are carried forward as-is and their token cost is already
     // accounted for by pickKeepBoundary's estimatePromptTokens call.
@@ -1286,6 +1311,63 @@ export function createContextSummaryMessage(summary: string): Message {
   return message;
 }
 
+/**
+ * Walk `messages` backward and return the concatenated text content of the
+ * most recent assistant message that contains at least one non-empty text
+ * block. tool_use / tool_result / image / unknown blocks are skipped. The
+ * result is trimmed and (if longer than `maxChars`) clamped from the START
+ * so the END — where "next step" / "now I'll …" narration tends to land —
+ * is preserved.
+ *
+ * Returns `null` when no eligible assistant text is found (e.g. compactable
+ * region was all user/tool messages, or all assistant messages were
+ * tool_use-only). The caller treats `null` as "no anchor to splice".
+ *
+ * Used by `_maybeCompact` to force-keep the last assistant text from the
+ * compactable region into the post-compaction summary message, so the
+ * model's most recent self-narration survives summarization regardless of
+ * whether the LLM summarizer chose to surface it.
+ */
+export function extractTailAssistantText(
+  messages: Message[],
+  maxChars: number = TAIL_ANCHOR_MAX_CHARS,
+): string | null {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const message = messages[i];
+    if (message?.role !== "assistant") continue;
+    const text = extractText(message.content).trim();
+    if (text.length === 0) continue;
+    if (text.length <= maxChars) return text;
+    // Keep the END — most recent narration wins.
+    const truncated = safeStringSlice(text, text.length - maxChars, text.length);
+    return `[...truncated] ${truncated}`;
+  }
+  return null;
+}
+
+/**
+ * Splice a verbatim tail-anchor block onto the end of the LLM-produced
+ * summary text. The tag-wrapped block is structurally distinct from any
+ * `## ` section the LLM might generate, so it survives section-boundary
+ * clamping in `clampSummaryAtSectionBoundary` (which only runs on the LLM
+ * summary itself, before this splice).
+ *
+ * Idempotent: if the summary already ends with a `<verbatim_tail>…` block
+ * (e.g. from a prior compaction whose summary was carried forward as
+ * `existingSummary`), it is replaced rather than stacked, so successive
+ * compactions don't accumulate stale tails.
+ */
+export function appendTailAnchorToSummary(
+  summary: string,
+  tailText: string,
+): string {
+  const trimmed = summary.trimEnd();
+  const existingOpen = trimmed.lastIndexOf(TAIL_ANCHOR_OPEN_TAG);
+  const base =
+    existingOpen >= 0 ? trimmed.slice(0, existingOpen).trimEnd() : trimmed;
+  return `${base}\n\n${TAIL_ANCHOR_OPEN_TAG}\n${tailText.trim()}\n${TAIL_ANCHOR_CLOSE_TAG}`;
+}
+
 /**
  * Build content blocks for the summary prompt. Returns a mix of text blocks
  * (for the scaffolding, existing summary, and serialized non-image content)