diff --git a/assistant/src/memory/v3/__tests__/gate.test.ts b/assistant/src/memory/v3/__tests__/gate.test.ts
new file mode 100644
index 00000000000..bbdd63b947c
--- /dev/null
+++ b/assistant/src/memory/v3/__tests__/gate.test.ts
@@ -0,0 +1,344 @@
+/**
+ * Tests for `assistant/src/memory/v3/gate.ts`.
+ *
+ * Coverage matrix:
+ * - ready + selection → selection maps from candidates, in model order, and
+ *   includes sticky slugs even when the model omits them.
+ * - more + questions → `decision.questions` surfaced; selection still returned.
+ * - more with no/blank questions → decision is `{ decision: "more" }` (no
+ *   empty `questions` array).
+ * - provider === null (no provider configured) → fail-safe: ready, all
+ *   candidates selected, sticky present.
+ * - provider throws → fail-safe (ready, all candidates).
+ * - missing tool_use block → fail-safe (ready, all candidates).
+ * - tool input failing schema → fail-safe (ready, all candidates).
+ * - model selecting a slug outside the candidate set → dropped.
+ * - request shape: forced tool_choice on `decide_selection`, candidate set in
+ *   the user message, abort signal forwarded.
+ *
+ * The provider is injected via `runGate({ provider })` — no real LLM, no
+ * network, no `mock.module`. `~/.vellum/` is never touched.
+ */
+
+import { describe, expect, test } from "bun:test";
+
+import type {
+  Message,
+  Provider,
+  ProviderResponse,
+  SendMessageOptions,
+  ToolDefinition,
+} from "../../../providers/types.js";
+import type { RetrievalInput } from "../../v2/harness/retriever.js";
+import { runGate } from "../gate.js";
+
+// ---------------------------------------------------------------------------
+// Helpers.
+// ---------------------------------------------------------------------------
+
+/** One recorded `sendMessage` invocation: the four arguments exactly as received. */
+interface ProviderCall {
+  messages: Message[];
+  tools: ToolDefinition[] | undefined;
+  systemPrompt: string | undefined;
+  options: SendMessageOptions | undefined;
+}
+
+/**
+ * A stub provider that records its calls and returns a fixed response.
+ * Honors an already-aborted signal by throwing an AbortError so signal
+ * forwarding can be asserted.
+ */
+function makeProvider(
+  response: ProviderResponse,
+  calls: ProviderCall[],
+): Provider {
+  return {
+    name: "stub",
+    sendMessage: async (messages, tools, systemPrompt, options) => {
+      // Record before the abort check so an aborted call is still observable.
+      calls.push({ messages, tools, systemPrompt, options });
+      if (options?.signal?.aborted) {
+        const err = new Error("aborted");
+        err.name = "AbortError";
+        throw err;
+      }
+      return response;
+    },
+  };
+}
+
+/** A provider whose sendMessage always throws. */
+function makeThrowingProvider(): Provider {
+  return {
+    name: "throwing-stub",
+    sendMessage: async () => {
+      throw new Error("boom");
+    },
+  };
+}
+
+/**
+ * A response carrying a single `decide_selection` tool_use block whose input
+ * is `input`. The input is deliberately loose (`unknown` values) so tests can
+ * also hand the gate payloads that violate its schema.
+ */
+function gateToolResponse(input: Record<string, unknown>): ProviderResponse {
+  return {
+    model: "stub-model",
+    stopReason: "tool_use",
+    usage: { inputTokens: 0, outputTokens: 0 },
+    content: [
+      { type: "tool_use", id: "tu-1", name: "decide_selection", input },
+    ],
+  };
+}
+
+/** A response with no tool_use block (e.g. the model emitted only text). */
+function textOnlyResponse(): ProviderResponse {
+  return {
+    model: "stub-model",
+    stopReason: "end_turn",
+    usage: { inputTokens: 0, outputTokens: 0 },
+    content: [{ type: "text", text: "no tool here" }],
+  };
+}
+
+/** Minimal `RetrievalInput` — the gate only reads `nowText` and `signal`.
+ */
+function makeInput(overrides?: Partial<RetrievalInput>): RetrievalInput {
+  return {
+    workspaceDir: "/tmp/does-not-matter",
+    recentTurnPairs: [],
+    nowText: "2026-05-25 10:00 PT",
+    priorEverInjected: [],
+    // The gate never reads `config`, so an empty object stands in for it.
+    config: {} as unknown as RetrievalInput["config"],
+    ...overrides,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Tests.
+// ---------------------------------------------------------------------------
+
+describe("runGate — ready decision", () => {
+  test("maps model selection to slugs in order and includes sticky", async () => {
+    const calls: ProviderCall[] = [];
+    const provider = makeProvider(
+      // Model selects b, a (its own order). Sticky `c` is omitted by the
+      // model but must survive in the final selection.
+      gateToolResponse({ decision: "ready", selected_slugs: ["b", "a"] }),
+      calls,
+    );
+
+    const result = await runGate({
+      input: makeInput(),
+      candidates: new Set(["a", "b", "c"]),
+      sticky: new Set(["c"]),
+      passNumber: 1,
+      provider,
+    });
+
+    expect(result.decision).toEqual({ decision: "ready" });
+    // Model order preserved (b, a), then omitted sticky appended (c).
+    expect(result.selectedSlugs).toEqual(["b", "a", "c"]);
+    expect(calls).toHaveLength(1);
+  });
+
+  test("forces tool_choice on decide_selection and surfaces candidates", async () => {
+    const calls: ProviderCall[] = [];
+    const provider = makeProvider(
+      gateToolResponse({ decision: "ready", selected_slugs: ["alpha-page"] }),
+      calls,
+    );
+
+    await runGate({
+      input: makeInput({ nowText: "NOW-MARKER" }),
+      // Distinctive slugs: a one-letter slug such as "a" is a substring of
+      // almost any prompt text, so asserting on it would prove nothing.
+      candidates: new Set(["alpha-page", "beta-page"]),
+      sticky: new Set<string>(),
+      passNumber: 3,
+      provider,
+    });
+
+    const call = calls[0];
+    expect(call.options?.config?.tool_choice).toEqual({
+      type: "tool",
+      name: "decide_selection",
+    });
+    expect(call.options?.config?.callSite).toBe("memoryV3Gate");
+    expect(call.tools?.[0].name).toBe("decide_selection");
+    const userText = call.messages[0].content
+      .map((b) => (b.type === "text" ? b.text : ""))
+      .join("\n");
+    expect(userText).toContain("NOW-MARKER");
+    expect(userText).toContain("alpha-page");
+    expect(userText).toContain("beta-page");
+  });
+
+  test("drops a model-selected slug outside the candidate set", async () => {
+    const calls: ProviderCall[] = [];
+    const provider = makeProvider(
+      gateToolResponse({ decision: "ready", selected_slugs: ["a", "ghost"] }),
+      calls,
+    );
+
+    const result = await runGate({
+      input: makeInput(),
+      candidates: new Set(["a", "b"]),
+      sticky: new Set<string>(),
+      passNumber: 1,
+      provider,
+    });
+
+    expect(result.selectedSlugs).toEqual(["a"]);
+  });
+
+  test("forwards an abort signal to the provider call", async () => {
+    const calls: ProviderCall[] = [];
+    const controller = new AbortController();
+    controller.abort();
+    const provider = makeProvider(
+      gateToolResponse({ decision: "ready", selected_slugs: ["a"] }),
+      calls,
+    );
+
+    // Aborted signal makes the stub throw → gate fails open (ready, all).
+    const result = await runGate({
+      input: makeInput({ signal: controller.signal }),
+      candidates: new Set(["a", "b"]),
+      sticky: new Set<string>(),
+      passNumber: 1,
+      provider,
+    });
+
+    expect(calls[0].options?.signal).toBe(controller.signal);
+    expect(result.decision).toEqual({ decision: "ready" });
+    expect(result.selectedSlugs).toEqual(["a", "b"]);
+  });
+});
+
+describe("runGate — more decision", () => {
+  test("surfaces generated follow-up questions", async () => {
+    const calls: ProviderCall[] = [];
+    const provider = makeProvider(
+      gateToolResponse({
+        decision: "more",
+        selected_slugs: ["a"],
+        questions: ["What is the user's deadline?", "Who else is involved?"],
+      }),
+      calls,
+    );
+
+    const result = await runGate({
+      input: makeInput(),
+      candidates: new Set(["a", "b"]),
+      sticky: new Set<string>(),
+      passNumber: 1,
+      provider,
+    });
+
+    expect(result.decision).toEqual({
+      decision: "more",
+      questions: ["What is the user's deadline?", "Who else is involved?"],
+    });
+    // Selection is still returned alongside the "more" verdict.
+    expect(result.selectedSlugs).toEqual(["a"]);
+  });
+
+  test("omits questions array when the model gave none (or only blanks)", async () => {
+    const calls: ProviderCall[] = [];
+    const provider = makeProvider(
+      gateToolResponse({
+        decision: "more",
+        selected_slugs: ["a"],
+        questions: [" ", ""],
+      }),
+      calls,
+    );
+
+    const result = await runGate({
+      input: makeInput(),
+      candidates: new Set(["a"]),
+      sticky: new Set<string>(),
+      passNumber: 1,
+      provider,
+    });
+
+    expect(result.decision).toEqual({ decision: "more" });
+  });
+
+  test("preserves sticky even on a more decision", async () => {
+    const calls: ProviderCall[] = [];
+    const provider = makeProvider(
+      gateToolResponse({
+        decision: "more",
+        selected_slugs: ["a"],
+        questions: ["follow-up?"],
+      }),
+      calls,
+    );
+
+    const result = await runGate({
+      input: makeInput(),
+      candidates: new Set(["a", "sticky-page"]),
+      sticky: new Set(["sticky-page"]),
+      passNumber: 1,
+      provider,
+    });
+
+    expect(result.selectedSlugs).toContain("sticky-page");
+  });
+});
+
+describe("runGate — fail-safe", () => {
+  test("provider === null selects all candidates with sticky and ready", async () => {
+    const result = await runGate({
+      input: makeInput(),
+      candidates: new Set(["a", "b", "c"]),
+      sticky: new Set(["c"]),
+      passNumber: 1,
+      provider: null,
+    });
+
+    expect(result.decision).toEqual({ decision: "ready" });
+    expect([...result.selectedSlugs].sort()).toEqual(["a", "b", "c"]);
+    expect(result.selectedSlugs).toContain("c");
+  });
+
+  test("provider throw falls back to ready + all candidates", async () => {
+    const result = await runGate({
+      input: makeInput(),
+      candidates: new Set(["a", "b"]),
+      sticky: new Set<string>(),
+      passNumber: 1,
+      provider: makeThrowingProvider(),
+    });
+
+    expect(result.decision).toEqual({ decision: "ready" });
+    expect([...result.selectedSlugs].sort()).toEqual(["a", "b"]);
+  });
+
+  test("missing tool_use block falls back to ready + all candidates", async () => {
+    const calls: ProviderCall[] = [];
+    const result = await runGate({
+      input: makeInput(),
+      candidates: new Set(["a", "b"]),
+      sticky: new Set<string>(),
+      passNumber: 1,
+      provider: makeProvider(textOnlyResponse(), calls),
+    });
+
+    expect(result.decision).toEqual({ decision: "ready" });
+    expect([...result.selectedSlugs].sort()).toEqual(["a", "b"]);
+  });
+
+  test("schema-mismatched tool input falls back to ready + all candidates", async () => {
+    const calls: ProviderCall[] = [];
+    const result = await runGate({
+      input: makeInput(),
+      candidates: new Set(["a", "b"]),
+      sticky: new Set<string>(),
+      passNumber: 1,
+      // `decision` is required; missing it fails the Zod schema.
+      provider: makeProvider(
+        gateToolResponse({ selected_slugs: ["a"] }),
+        calls,
+      ),
+    });
+
+    expect(result.decision).toEqual({ decision: "ready" });
+    expect([...result.selectedSlugs].sort()).toEqual(["a", "b"]);
+  });
+});
diff --git a/assistant/src/memory/v3/gate.ts b/assistant/src/memory/v3/gate.ts
new file mode 100644
index 00000000000..4abae9452ef
--- /dev/null
+++ b/assistant/src/memory/v3/gate.ts
@@ -0,0 +1,275 @@
+/**
+ * Memory v3 — selection gate.
+ *
+ * The gate is the final step of one retrieval pass. After the scouts, the tree
+ * walk, the edge expansion, and the sticky carry-over have each contributed
+ * candidate page slugs, the gate makes one capable LLM call over the *unioned*
+ * candidate set and decides:
+ *
+ * - **ready** — finalize the selection and inject for the next reply, or
+ * - **more** — the candidates don't yet cover the turn; emit follow-up
+ *   questions that seed the next pass. These questions are the gate's own
+ *   *generated* queries (a refined sub-question), NOT a replay of the
+ *   original user message — the loop feeds them back to the scouts/tree on
+ *   the next iteration.
+ *
+ * The gate also returns the final ordered `selectedSlugs` (the order the model
+ * returned, with sticky slugs guaranteed present).
+ * Sticky pages are never
+ * dropped: they were injected on a prior turn and removing them mid-conversation
+ * would silently amnesia the assistant, so we union them back in even when the
+ * model omits them.
+ *
+ * Scope — brief generation is deferred. The full v3 design pairs the selection
+ * with a ~1000-token voice brief, but that brief is only consumed when v3 is
+ * actually injected (a later cutover). In shadow mode the harness injects v2
+ * and only compares selections, so this module produces the selection +
+ * `GateDecision` only — matching what the harness trace already models. The
+ * brief-generation seam is marked below; do not build voice synthesis here.
+ *
+ * Fail-safe. If no provider is configured or the provider call errors/returns
+ * an unusable response, the gate fails *open*: it returns
+ * `decision: { decision: "ready" }` and selects every candidate. A retrieval
+ * loop that can't reach the model should still inject what it found rather than
+ * inject nothing.
+ *
+ * This module is currently unwired — a later PR composes it into the loop.
+ */
+
+import { z } from "zod";
+
+import {
+  extractToolUse,
+  getConfiguredProvider,
+} from "../../providers/provider-send-message.js";
+import type {
+  Message,
+  Provider,
+  ToolDefinition,
+} from "../../providers/types.js";
+import { getLogger } from "../../util/logger.js";
+import type { RetrievalInput } from "../v2/harness/retriever.js";
+import type { GateDecision } from "../v2/harness/trace.js";
+
+/** Module logger; every fail-open path in this file logs through it at warn level. */
+const log = getLogger("memory-v3-gate");
+
+/**
+ * Tool name used for the tool definition, the forced `tool_choice`, and the
+ * check on the returned tool_use block. It is module-private: the tests assert
+ * the literal "decide_selection", so keep the two in sync when renaming.
+ */
+const GATE_TOOL_NAME = "decide_selection";
+
+/**
+ * Arguments to one gate invocation.
+ *
+ * `candidates` is the accumulated candidate set for this pass — the union of
+ * scouts-kept, tree pages, edge-pulled, and sticky slugs.
+ * `sticky` is the
+ * subset that was injected on a prior turn and must survive: it is always a
+ * subset of `candidates` in practice, but the gate unions it back into both
+ * the prompt and the final selection defensively.
+ */
+export interface RunGateArgs {
+  /** Turn context; the gate reads `nowText` and the optional abort `signal`. */
+  input: RetrievalInput;
+  /** Every page slug gathered for this pass. */
+  candidates: Set<string>;
+  /** Slugs injected on a prior turn; always kept in the final selection. */
+  sticky: Set<string>;
+  /** Pass counter, forwarded verbatim into the prompt. */
+  passNumber: number;
+  /**
+   * Provider override seam for tests. Production leaves this unset and the
+   * gate resolves `getConfiguredProvider("memoryV3Gate")`. `null` is distinct
+   * from `undefined`: passing `null` simulates "no provider configured" and
+   * exercises the fail-safe path without resolving the real registry.
+   */
+  provider?: Provider | null;
+}
+
+export interface RunGateResult {
+  decision: GateDecision;
+  /** Final page slugs in the model's returned order; sticky guaranteed present. */
+  selectedSlugs: string[];
+}
+
+/**
+ * Build the forced tool definition. `selected_slugs` is the ordered final
+ * selection; `decision` is the ready/more verdict; `questions` carries the
+ * generated follow-up queries on "more" (ignored on "ready"). Mirrors the
+ * forced-tool pattern of v2's `select_pages_to_inject`.
+ *
+ * With an empty candidate list the `enum` constraint on the slug items is
+ * omitted: JSON Schema expects `enum` to list at least one value, and a
+ * provider that rejects the schema would otherwise force every zero-candidate
+ * pass down the fail-open path. `orderSelection` still discards any slug
+ * outside the candidate set, so dropping the constraint is safe.
+ */
+function buildGateTool(candidateSlugs: readonly string[]): ToolDefinition {
+  const slugItems =
+    candidateSlugs.length > 0
+      ? { type: "string", enum: [...candidateSlugs] }
+      : { type: "string" };
+
+  return {
+    name: GATE_TOOL_NAME,
+    description:
+      "Decide whether the accumulated candidate pages are sufficient to answer " +
+      "the next turn. Return decision='ready' with the final ordered selection " +
+      "when the candidates cover the turn; return decision='more' with one or " +
+      "more generated follow-up questions (NOT the original message) to seed " +
+      "another retrieval pass when coverage is incomplete.",
+    input_schema: {
+      type: "object",
+      properties: {
+        decision: { type: "string", enum: ["ready", "more"] },
+        selected_slugs: {
+          type: "array",
+          items: slugItems,
+          description:
+            "Final ordered page slugs to inject. Choose only from the candidate set.",
+        },
+        questions: {
+          type: "array",
+          items: { type: "string" },
+          description:
+            "When decision='more', the generated follow-up questions seeding the next pass.",
+        },
+      },
+      required: ["decision"],
+    },
+  };
+}
+
+/**
+ * Runtime validation for the tool input. Only `decision` is required, which
+ * matches `required` in the JSON schema sent to the model.
+ */
+const GateToolResultSchema = z.object({
+  decision: z.enum(["ready", "more"]),
+  selected_slugs: z.array(z.string()).optional(),
+  questions: z.array(z.string()).optional(),
+});
+
+/**
+ * Order a slug selection: keep the model's returned order, restricted to the
+ * candidate set, then append any sticky slugs the model omitted (sticky is
+ * never dropped). De-duplicates while preserving first-seen order.
+ *
+ * A sticky slug that is not in `candidates` is skipped in the first loop even
+ * if the model returned it, and is then appended by the second loop.
+ */
+function orderSelection(
+  modelSlugs: readonly string[],
+  candidates: Set<string>,
+  sticky: Set<string>,
+): string[] {
+  const seen = new Set<string>();
+  const out: string[] = [];
+  for (const slug of modelSlugs) {
+    if (!candidates.has(slug)) continue; // model can only pick from candidates
+    if (seen.has(slug)) continue;
+    seen.add(slug);
+    out.push(slug);
+  }
+  for (const slug of sticky) {
+    if (seen.has(slug)) continue;
+    seen.add(slug);
+    out.push(slug);
+  }
+  return out;
+}
+
+/**
+ * Fail-safe result: inject every candidate and declare the pass ready. Used
+ * when the provider is unavailable or the call cannot produce a usable
+ * decision. The candidates themselves are passed to `orderSelection` as the
+ * "model" selection, so they come first in the set's iteration order; only
+ * sticky slugs missing from `candidates` are appended after them.
+ */
+function failSafe(
+  candidates: Set<string>,
+  sticky: Set<string>,
+): RunGateResult {
+  return {
+    decision: { decision: "ready" },
+    selectedSlugs: orderSelection([...candidates], candidates, sticky),
+  };
+}
+
+/**
+ * Run the gate for one pass.
+ *
+ * Makes one forced-tool LLM call over the candidate set and maps the result to
+ * a `GateDecision` plus the final ordered selection. Sticky slugs are always
+ * present in the selection.
+ * Any failure (no provider, provider throw, missing
+ * tool_use, schema mismatch) falls back to selecting all candidates with a
+ * "ready" decision.
+ *
+ * The user message labels each section with a tag (`<now>`, `<pass>`,
+ * `<sticky>`, `<candidates>`) so the model can tell the timestamp, the pass
+ * number, and the two slug lists apart.
+ */
+export async function runGate(args: RunGateArgs): Promise<RunGateResult> {
+  const { input, candidates, sticky, passNumber } = args;
+
+  const candidateSlugs = [...candidates];
+
+  // Resolve the provider. A `provider` key in args (including explicit `null`)
+  // takes precedence so tests inject a stub; production omits it and resolves
+  // the configured `memoryV3Gate` call site.
+  const provider =
+    args.provider !== undefined
+      ? args.provider
+      : await getConfiguredProvider("memoryV3Gate");
+
+  if (!provider) {
+    log.warn("memoryV3Gate provider unavailable; gate failing open (ready)");
+    return failSafe(candidates, sticky);
+  }
+
+  const systemPrompt =
+    "You are the final selection gate for a memory-retrieval loop. You are " +
+    "given the candidate concept pages gathered so far for the current turn. " +
+    "Decide whether they are sufficient to answer the next reply.";
+
+  const stickySlugs = [...sticky];
+  const userMsg: Message = {
+    role: "user",
+    content: [
+      {
+        type: "text",
+        text: `<now>\n${input.nowText}\n</now>`,
+      },
+      {
+        type: "text",
+        text:
+          `<pass>${passNumber}</pass>\n\n` +
+          `<sticky>\n${stickySlugs.join("\n")}\n</sticky>\n\n` +
+          `<candidates>\n${candidateSlugs.join("\n")}\n</candidates>`,
+      },
+    ],
+  };
+
+  const gateTool = buildGateTool(candidateSlugs);
+
+  let response;
+  try {
+    response = await provider.sendMessage([userMsg], [gateTool], systemPrompt, {
+      config: {
+        callSite: "memoryV3Gate" as const,
+        tool_choice: { type: "tool" as const, name: GATE_TOOL_NAME },
+      },
+      // Only set `signal` when the caller supplied one.
+      ...(input.signal ? { signal: input.signal } : {}),
+    });
+  } catch (err) {
+    // Covers aborts too: an aborted call fails open like any other error.
+    log.warn({ err }, "Gate provider call threw; failing open (ready)");
+    return failSafe(candidates, sticky);
+  }
+
+  const toolBlock = extractToolUse(response);
+  if (!toolBlock || toolBlock.name !== GATE_TOOL_NAME) {
+    log.warn(
+      { stopReason: response.stopReason },
+      "Gate model returned no decide_selection tool_use; failing open (ready)",
+    );
+    return failSafe(candidates, sticky);
+  }
+
+  const parsed = GateToolResultSchema.safeParse(toolBlock.input);
+  if (!parsed.success) {
+    log.warn(
+      { error: parsed.error.message },
+      "Gate tool input did not match schema; failing open (ready)",
+    );
+    return failSafe(candidates, sticky);
+  }
+
+  // A missing `selected_slugs` is treated as an empty pick; sticky still survives.
+  const selectedSlugs = orderSelection(
+    parsed.data.selected_slugs ?? [],
+    candidates,
+    sticky,
+  );
+
+  if (parsed.data.decision === "more") {
+    // Blank questions are dropped; if none remain the key is omitted entirely
+    // rather than returned as an empty array.
+    const questions = (parsed.data.questions ?? []).filter(
+      (q) => q.trim().length > 0,
+    );
+    const decision: GateDecision =
+      questions.length > 0
+        ? { decision: "more", questions }
+        : { decision: "more" };
+    return { decision, selectedSlugs };
+  }
+
+  // brief generation lands at cutover (P5) — shadow mode injects v2, so this
+  // gate produces only the selection + decision. Do NOT synthesize a voice
+  // brief here.
+  return { decision: { decision: "ready" }, selectedSlugs };
+}