From 54977a2b87aaa60a65784ed9a68cdb85659e5772 Mon Sep 17 00:00:00 2001 From: Vellum Assistant Date: Mon, 25 May 2026 01:53:55 -0500 Subject: [PATCH] feat(memory-v3): fast filter judging dense hits (sticky bypass) --- .../src/memory/v3/__tests__/filter.test.ts | 338 ++++++++++++++++++ assistant/src/memory/v3/filter.ts | 258 +++++++++++++ 2 files changed, 596 insertions(+) create mode 100644 assistant/src/memory/v3/__tests__/filter.test.ts create mode 100644 assistant/src/memory/v3/filter.ts diff --git a/assistant/src/memory/v3/__tests__/filter.test.ts b/assistant/src/memory/v3/__tests__/filter.test.ts new file mode 100644 index 00000000000..25a78c76df3 --- /dev/null +++ b/assistant/src/memory/v3/__tests__/filter.test.ts @@ -0,0 +1,338 @@ +/** + * Tests for `assistant/src/memory/v3/filter.ts`. + * + * Coverage matrix: + * - keep-subset → kept = bypass ∪ judged-kept; dropped = judged minus kept; + * bypass slugs are never judged. + * - model keeping a slug outside the judged set → dropped. + * - empty dense → no LLM call, kept = bypass-relevant only. + * - dense entirely covered by bypass → no LLM call (nothing to judge). + * - provider === null (no provider configured) → fail-open: keep all dense, + * failureReason = "no_provider". + * - provider throws → fail-open (keep all, failureReason = "api_error"). + * - missing tool_use block → fail-open (failureReason = "tool_use_missing"). + * - tool input failing schema → fail-open (failureReason = "schema_mismatch"). + * - request shape: forced tool_choice on `filter_dense_hits`, judged set in + * the user message, abort signal forwarded. + * + * The provider is injected via `filterDenseHits({ provider })` — no real LLM, + * no network, no `mock.module`. `~/.vellum/` is never touched. + */ + +import { describe, expect, test } from "bun:test"; + +import type { + Message, + Provider, + ProviderResponse, + SendMessageOptions, + ToolDefinition, +} from "../../../providers/types.js"; +import type { RetrievalInput } from "../../v2/harness/retriever.js"; +import type { ScoutResult } from "../../v2/harness/trace.js"; +import { filterDenseHits } from "../filter.js"; + +// --------------------------------------------------------------------------- +// Helpers. +// --------------------------------------------------------------------------- + +interface ProviderCall { + messages: Message[]; + tools: ToolDefinition[] | undefined; + systemPrompt: string | undefined; + options: SendMessageOptions | undefined; +} + +/** + * A stub provider that records its calls and returns a fixed response. + * Honors an already-aborted signal by throwing an AbortError so signal + * forwarding can be asserted. + */ +function makeProvider( + response: ProviderResponse, + calls: ProviderCall[], +): Provider { + return { + name: "stub", + sendMessage: async (messages, tools, systemPrompt, options) => { + calls.push({ messages, tools, systemPrompt, options }); + if (options?.signal?.aborted) { + const err = new Error("aborted"); + err.name = "AbortError"; + throw err; + } + return response; + }, + }; +} + +/** A provider whose sendMessage always throws. */ +function makeThrowingProvider(): Provider { + return { + name: "throwing-stub", + sendMessage: async () => { + throw new Error("boom"); + }, + }; +} + +/** A provider that must never be called (asserts no LLM round-trip happens). */ +function makeNeverCalledProvider(): Provider { + return { + name: "never-called-stub", + sendMessage: async () => { + throw new Error("provider should not be called"); + }, + }; +} + +function filterToolResponse(input: Record): ProviderResponse { + return { + model: "stub-model", + stopReason: "tool_use", + usage: { inputTokens: 0, outputTokens: 0 }, + content: [ + { type: "tool_use", id: "tu-1", name: "filter_dense_hits", input }, + ], + }; +} + +/** A response with no tool_use block (e.g. the model emitted only text). */ +function textOnlyResponse(): ProviderResponse { + return { + model: "stub-model", + stopReason: "end_turn", + usage: { inputTokens: 0, outputTokens: 0 }, + content: [{ type: "text", text: "no tool here" }], + }; +} + +/** Minimal `RetrievalInput` — the filter only reads `nowText` and `signal`. */ +function makeInput(overrides?: Partial): RetrievalInput { + return { + workspaceDir: "/tmp/does-not-matter", + recentTurnPairs: [], + nowText: "2026-05-25 10:00 PT", + priorEverInjected: [], + config: {} as unknown as RetrievalInput["config"], + ...overrides, + }; +} + +function denseResult(slugs: string[]): ScoutResult { + return { lane: "dense", slugs }; +} + +// --------------------------------------------------------------------------- +// Tests. +// --------------------------------------------------------------------------- + +describe("filterDenseHits — judged keep/drop", () => { + test("kept = bypass ∪ judged-kept; bypass slugs are never judged", async () => { + const calls: ProviderCall[] = []; + // Dense surfaces a, b, c, plus bypass slug `x`. Model keeps a, c; drops b. + const provider = makeProvider( + filterToolResponse({ keep_slugs: ["c", "a"] }), + calls, + ); + + const result = await filterDenseHits({ + input: makeInput(), + dense: denseResult(["a", "b", "c", "x"]), + sticky: new Set(["x"]), + bypass: new Set(["x"]), + provider, + }); + + // bypass first (x), then judged-kept in model order (c, a). + expect(result.kept).toEqual(["x", "c", "a"]); + // Only the non-bypass slugs are judged; b was dropped. + expect(result.trace.judged).toEqual(["a", "b", "c"]); + expect(result.trace.dropped).toEqual(["b"]); + expect(result.failureReason).toBeUndefined(); + expect(calls).toHaveLength(1); + // The bypass slug `x` was never shown to the model. + const userText = calls[0].messages[0].content + .map((b) => (b.type === "text" ? b.text : "")) + .join("\n"); + expect(userText).not.toContain("x"); + }); + + test("forces tool_choice on filter_dense_hits and surfaces judged candidates", async () => { + const calls: ProviderCall[] = []; + const provider = makeProvider( + filterToolResponse({ keep_slugs: ["a"] }), + calls, + ); + + await filterDenseHits({ + input: makeInput({ nowText: "NOW-MARKER" }), + dense: denseResult(["a", "b"]), + sticky: new Set(), + bypass: new Set(), + provider, + }); + + const call = calls[0]; + expect(call.options?.config?.tool_choice).toEqual({ + type: "tool", + name: "filter_dense_hits", + }); + expect(call.options?.config?.callSite).toBe("memoryV3Filter"); + expect(call.tools?.[0].name).toBe("filter_dense_hits"); + const userText = call.messages[0].content + .map((b) => (b.type === "text" ? b.text : "")) + .join("\n"); + expect(userText).toContain("NOW-MARKER"); + expect(userText).toContain("a"); + expect(userText).toContain("b"); + }); + + test("drops a model-kept slug outside the judged set", async () => { + const calls: ProviderCall[] = []; + const provider = makeProvider( + filterToolResponse({ keep_slugs: ["a", "ghost"] }), + calls, + ); + + const result = await filterDenseHits({ + input: makeInput(), + dense: denseResult(["a", "b"]), + sticky: new Set(), + bypass: new Set(), + provider, + }); + + expect(result.kept).toEqual(["a"]); + expect(result.trace.dropped).toEqual(["b"]); + }); + + test("forwards an abort signal to the provider call", async () => { + const calls: ProviderCall[] = []; + const controller = new AbortController(); + controller.abort(); + const provider = makeProvider( + filterToolResponse({ keep_slugs: ["a"] }), + calls, + ); + + // Aborted signal makes the stub throw → filter fails open (keep all). + const result = await filterDenseHits({ + input: makeInput({ signal: controller.signal }), + dense: denseResult(["a", "b"]), + sticky: new Set(), + bypass: new Set(), + provider, + }); + + expect(calls[0].options?.signal).toBe(controller.signal); + expect([...result.kept].sort()).toEqual(["a", "b"]); + expect(result.failureReason).toBe("api_error"); + }); +}); + +describe("filterDenseHits — no LLM call", () => { + test("empty dense → no call, kept = bypass-relevant only", async () => { + const provider = makeNeverCalledProvider(); + + const result = await filterDenseHits({ + input: makeInput(), + dense: denseResult([]), + sticky: new Set(["x"]), + bypass: new Set(["x"]), + provider, + }); + + expect(result.kept).toEqual(["x"]); + expect(result.trace).toEqual({ judged: [], dropped: [] }); + expect(result.failureReason).toBeUndefined(); + }); + + test("dense fully covered by bypass → no call (nothing to judge)", async () => { + const provider = makeNeverCalledProvider(); + + const result = await filterDenseHits({ + input: makeInput(), + dense: denseResult(["x", "y"]), + sticky: new Set(["x", "y"]), + bypass: new Set(["x", "y"]), + provider, + }); + + expect([...result.kept].sort()).toEqual(["x", "y"]); + expect(result.trace).toEqual({ judged: [], dropped: [] }); + }); +}); + +describe("filterDenseHits — fail-open", () => { + test("provider === null keeps all dense with failureReason no_provider", async () => { + const result = await filterDenseHits({ + input: makeInput(), + dense: denseResult(["a", "b", "c"]), + sticky: new Set(), + bypass: new Set(), + provider: null, + }); + + expect([...result.kept].sort()).toEqual(["a", "b", "c"]); + expect(result.trace.judged).toEqual(["a", "b", "c"]); + expect(result.trace.dropped).toEqual([]); + expect(result.failureReason).toBe("no_provider"); + }); + + test("fail-open still unions bypass slugs into kept", async () => { + const result = await filterDenseHits({ + input: makeInput(), + dense: denseResult(["a", "b", "x"]), + sticky: new Set(["x"]), + bypass: new Set(["x"]), + provider: null, + }); + + // bypass `x` first, then the judged-but-kept-by-fail-open slugs a, b. + expect(result.kept).toEqual(["x", "a", "b"]); + expect(result.trace.judged).toEqual(["a", "b"]); + }); + + test("provider throw keeps all dense (failureReason api_error)", async () => { + const result = await filterDenseHits({ + input: makeInput(), + dense: denseResult(["a", "b"]), + sticky: new Set(), + bypass: new Set(), + provider: makeThrowingProvider(), + }); + + expect([...result.kept].sort()).toEqual(["a", "b"]); + expect(result.failureReason).toBe("api_error"); + }); + + test("missing tool_use block keeps all dense (failureReason tool_use_missing)", async () => { + const calls: ProviderCall[] = []; + const result = await filterDenseHits({ + input: makeInput(), + dense: denseResult(["a", "b"]), + sticky: new Set(), + bypass: new Set(), + provider: makeProvider(textOnlyResponse(), calls), + }); + + expect([...result.kept].sort()).toEqual(["a", "b"]); + expect(result.failureReason).toBe("tool_use_missing"); + }); + + test("schema-mismatched tool input keeps all dense (failureReason schema_mismatch)", async () => { + const calls: ProviderCall[] = []; + const result = await filterDenseHits({ + input: makeInput(), + dense: denseResult(["a", "b"]), + sticky: new Set(), + bypass: new Set(), + // `keep_slugs` is required; missing it fails the Zod schema. + provider: makeProvider(filterToolResponse({ wrong_key: ["a"] }), calls), + }); + + expect([...result.kept].sort()).toEqual(["a", "b"]); + expect(result.failureReason).toBe("schema_mismatch"); + }); +}); diff --git a/assistant/src/memory/v3/filter.ts b/assistant/src/memory/v3/filter.ts new file mode 100644 index 00000000000..79892178809 --- /dev/null +++ b/assistant/src/memory/v3/filter.ts @@ -0,0 +1,258 @@ +/** + * Memory v3 — fast dense-hit filter. + * + * The dense scout lane surfaces embedding-similarity candidates that span + * subtrees: some are meaningful cross-domain associations worth carrying into + * the gate, others are spurious near-neighbors that only crowd the slate. This + * module makes **one cheap LLM call** to keep the meaningful associations and + * drop the noise, *before* the more expensive selection gate runs. + * + * What it judges. Only the bounded dense candidate set (the scout lane is + * already capped at ~50–200 by quota/MMR — the filter never sees the whole + * corpus). Hot pages and near-exact sparse hits arrive via the scouts' + * `sticky` / `bypass` sets and are **never judged**: a literal keyword hit or a + * page the user has been touching is a strong enough signal that we shouldn't + * make it earn its place through a fallible cheap judgment. They are unioned + * straight into `kept`. + * + * Fail-open. If no provider is configured or the call errors / returns an + * unusable response, the filter keeps *all* dense candidates and surfaces a + * `failureReason` so the loop can record that the filter was bypassed. Dropping + * candidates on a model outage would silently starve retrieval; keeping them is + * the safe degradation (the downstream gate still narrows the slate). + * + * No LLM call when there is nothing to judge. An empty dense set short-circuits + * to `kept` = the bypass-relevant slugs (no judged additions), with no provider + * round-trip. + * + * This module is currently unwired — a later PR composes it into the loop. + */ + +import { z } from "zod"; + +import { + extractToolUse, + getConfiguredProvider, +} from "../../providers/provider-send-message.js"; +import type { + Message, + Provider, + ToolDefinition, +} from "../../providers/types.js"; +import { getLogger } from "../../util/logger.js"; +import type { RetrievalInput } from "../v2/harness/retriever.js"; +import type { ScoutResult } from "../v2/harness/trace.js"; + +const log = getLogger("memory-v3-filter"); + +/** Tool name forced via `tool_choice`. Shared constant so tests can match it. */ +const FILTER_TOOL_NAME = "filter_dense_hits"; + +/** + * Arguments to one filter invocation. + * + * `dense` is the bounded dense scout result; only its slugs that are *not* + * already in `bypass` are judged. `sticky` is the broader keep-in-the-running + * set (hot + near-exact sparse); `bypass` is the subset strong enough to skip + * judgment entirely. Bypass slugs that also appear in the dense lane are kept + * unconditionally and never sent to the model. + */ +export interface FilterDenseHitsArgs { + input: RetrievalInput; + dense: ScoutResult; + sticky: Set; + bypass: Set; + /** + * Provider override seam for tests. Production leaves this unset and the + * filter resolves `getConfiguredProvider("memoryV3Filter")`. `null` is + * distinct from `undefined`: passing `null` simulates "no provider + * configured" and exercises the fail-open path without resolving the real + * registry. + */ + provider?: Provider | null; +} + +export interface FilterDenseHitsResult { + /** Final kept slugs: bypass ∪ judged-kept. */ + kept: string[]; + /** Inspection trace: which dense slugs were judged and which were dropped. */ + trace: { judged: string[]; dropped: string[] }; + /** + * Non-null when the filter could not judge (no provider, provider throw, + * missing tool_use, schema mismatch) and therefore failed open by keeping all + * dense candidates. The loop can surface this to flag a bypassed filter. + */ + failureReason?: string; +} + +/** + * Build the forced tool definition. `keep_slugs` is the model's subset of the + * judged candidate set to retain; everything judged-but-not-kept is dropped. + * Mirrors the forced-tool pattern of v2's `select_pages_to_inject`. + */ +function buildFilterTool(judgedSlugs: readonly string[]): ToolDefinition { + return { + name: FILTER_TOOL_NAME, + description: + "From the candidate concept pages surfaced by embedding similarity for " + + "the current turn, keep the ones that are meaningful associations worth " + + "surfacing and drop the spurious near-neighbors. Return keep_slugs as the " + + "subset to retain — choose only from the candidate set. Lean toward " + + "keeping a plausible cross-domain association over dropping it.", + input_schema: { + type: "object", + properties: { + keep_slugs: { + type: "array", + items: { type: "string", enum: [...judgedSlugs] }, + description: + "The subset of candidate page slugs to keep. Choose only from the candidate set.", + }, + }, + required: ["keep_slugs"], + }, + }; +} + +const FilterToolResultSchema = z.object({ + keep_slugs: z.array(z.string()), +}); + +/** + * Compose the final result. `kept` = bypass slugs ∪ judged-kept (de-duplicated, + * bypass first then judged-kept in the model's returned order). `trace` records + * exactly which dense slugs were judged and which the model dropped. + */ +function buildResult( + bypass: Set, + judged: readonly string[], + judgedKept: readonly string[], + failureReason?: string, +): FilterDenseHitsResult { + const keptSet = new Set(bypass); + const kept: string[] = [...bypass]; + for (const slug of judgedKept) { + if (keptSet.has(slug)) continue; + keptSet.add(slug); + kept.push(slug); + } + const keptJudged = new Set(judgedKept); + const dropped = judged.filter((slug) => !keptJudged.has(slug)); + return { + kept, + trace: { judged: [...judged], dropped }, + ...(failureReason !== undefined ? { failureReason } : {}), + }; +} + +/** + * Run the fast dense-hit filter for one pass. + * + * Makes at most one forced-tool LLM call over the *judged* set (dense slugs not + * already in `bypass`). Bypass slugs are kept unconditionally. On an empty + * judged set no call is made. Any failure (no provider, provider throw, missing + * tool_use, schema mismatch) fails open: every dense candidate is kept and a + * `failureReason` is returned. + */ +export async function filterDenseHits( + args: FilterDenseHitsArgs, +): Promise { + const { input, dense, bypass } = args; + + // Dense slugs that bypass judgment (near-exact sparse / hot) are kept as-is; + // only the remainder is judged. + const judged = dense.slugs.filter((slug) => !bypass.has(slug)); + + // Nothing to judge → no LLM call. Kept is just the bypass-relevant slugs. + if (judged.length === 0) { + return buildResult(bypass, judged, judged); + } + + // Resolve the provider. A `provider` key in args (including explicit `null`) + // takes precedence so tests inject a stub; production omits it and resolves + // the configured `memoryV3Filter` call site. + const provider = + args.provider !== undefined + ? args.provider + : await getConfiguredProvider("memoryV3Filter"); + + if (!provider) { + log.warn( + "memoryV3Filter provider unavailable; failing open (keeping all dense)", + ); + return buildResult(bypass, judged, judged, "no_provider"); + } + + const systemPrompt = + "You are a fast relevance filter for a memory-retrieval loop. You are given " + + "candidate concept pages surfaced by embedding similarity for the current " + + "turn. Keep the pages that are meaningful associations and drop the " + + "spurious near-neighbors. When in doubt, keep."; + + const userMsg: Message = { + role: "user", + content: [ + { + type: "text", + text: `\n${input.nowText}\n`, + }, + { + type: "text", + text: `\n${judged.join("\n")}\n`, + }, + ], + }; + + const filterTool = buildFilterTool(judged); + + let response; + try { + response = await provider.sendMessage( + [userMsg], + [filterTool], + systemPrompt, + { + config: { + callSite: "memoryV3Filter" as const, + tool_choice: { type: "tool" as const, name: FILTER_TOOL_NAME }, + }, + ...(input.signal ? { signal: input.signal } : {}), + }, + ); + } catch (err) { + log.warn({ err }, "Filter provider call threw; failing open (keep all)"); + return buildResult(bypass, judged, judged, "api_error"); + } + + const toolBlock = extractToolUse(response); + if (!toolBlock || toolBlock.name !== FILTER_TOOL_NAME) { + log.warn( + { stopReason: response.stopReason }, + "Filter model returned no filter_dense_hits tool_use; failing open (keep all)", + ); + return buildResult(bypass, judged, judged, "tool_use_missing"); + } + + const parsed = FilterToolResultSchema.safeParse(toolBlock.input); + if (!parsed.success) { + log.warn( + { error: parsed.error.message }, + "Filter tool input did not match schema; failing open (keep all)", + ); + return buildResult(bypass, judged, judged, "schema_mismatch"); + } + + // Restrict the model's keep set to the judged candidates (it can only keep + // what it was shown) and preserve its returned order. + const judgedSet = new Set(judged); + const seen = new Set(); + const judgedKept: string[] = []; + for (const slug of parsed.data.keep_slugs) { + if (!judgedSet.has(slug)) continue; + if (seen.has(slug)) continue; + seen.add(slug); + judgedKept.push(slug); + } + + return buildResult(bypass, judged, judgedKept); +}