vellum-ai · dvargasfuertes · May 28, 2026 · May 28, 2026
diff --git a/evals/benchmarks/longmemeval-v2/README.md b/evals/benchmarks/longmemeval-v2/README.md
@@ -48,15 +48,48 @@ interface BenchmarkItem {
   questionId: string; // V2 questions.jsonl `id` (stable question id)
   ability: string; // V2 questions.jsonl `question_type` (one of the five abilities)
   question: string;
-  answer: string; // reference answer, used by the GPT-4o judge
+  answer: string; // reference answer, used by the dispatched evaluator
   trajectoryIds: string[]; // ordered haystack from haystacks/lme_v2_<tier>.json
 }
 ```
 
 The V2 schema also ships `domain`, `environment`, `image`, and
 `eval_function` fields (see `SCHEMA.md` in the published dataset). The
-loader's zod schema preserves these via `.passthrough()`; the runner /
-judge will consume them in subsequent PRs without the loader having to
-grow first.
+loader's zod schema preserves these via `.passthrough()`; the judge and
+the runner consume them.
 
-This PR ships the loader and its fixture tests only. The two-conversation runner (`run-ingest-ask`), GPT-4o paper-faithful judge, and Phase 1 wiring land in subsequent PRs against the contract established here.
+## Judge
+
+`src/judge/index.ts` exports `evalFromSpec(spec, inputs, overrides?)` — a
+TypeScript port of V2's `evaluation/qa_eval_metrics.py`. Each V2 question
+carries an `eval_function` spec string of the form `"name|key=value|..."`
+that dispatches to one of six implementations:
+
+**Deterministic (no LLM):**
+
+- `norm_phrase_set_match` — phrase-set membership (unordered)
+- `norm_phrase_set_match_ordered` — phrase-set membership (ordered)
+- `mc_choice_match` — single multiple-choice letter
+- `mc_choice_set_match` — multi-select multiple-choice letters
+
+**LLM judges** (default `gpt-5.2` with `reasoning_effort=medium`, per V2's
+`run_eval.py` defaults):
+
+- `llm_abstention_checker` — flawed-premise (abstention) questions
+- `llm_gotchas_checker` — insight-style gotcha questions
+
+Both LLM judges issue an OpenAI-shaped chat completion request with a strict
+system prompt and rubric, and expect `{"label": 0|1, "reason": "..."}` JSON
+output. The parser tolerates Markdown code fences and falls back to regex parsing.
+Transport is a direct `fetch` to the chat completions endpoint; there is no
+production wrapper, so tests stub `globalThis.fetch`.
+
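+`evalFromSpec` is async and resolves to `{ label: boolean, reason: string, function: string }`.
+`reason` is the empty string for deterministic evaluators; `function` echoes the dispatched
+evaluator name for audit/logging. Unknown function names and malformed options reject.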
+
+## Next
+
+The two-conversation runner (`run-ingest-ask`, shipped in PR #32356) and
+this judge unblock Phase 1 wiring (a 5-item smoke test against
+`vellum-simple-memory`), which lands in a follow-up PR.
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl b/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl
@@ -1,3 +1,3 @@
-{"id": "q_001", "domain": "web", "environment": "shopping_admin", "question_type": "static-state-recall", "question": "What is the URL of the project settings page?", "image": null, "answer": "/settings/project", "eval_function": "exact_match"}
-{"id": "q_002", "domain": "enterprise", "environment": "servicenow", "question_type": "dynamic-state-tracking", "question": "After the bulk import completed, what was the new total record count?", "image": null, "answer": "12,481", "eval_function": "exact_match"}
-{"id": "q_003", "domain": "web", "environment": "gitlab", "question_type": "workflow-knowledge", "question": "What sequence of clicks creates a new dashboard?", "image": null, "answer": "Dashboards > New > template > Save", "eval_function": "exact_match", "extra_field_for_passthrough": true}
+{"id": "q_001", "domain": "web", "environment": "shopping_admin", "question_type": "static-state-recall", "question": "What is the URL of the project settings page?", "image": null, "answer": "/settings/project", "eval_function": "norm_phrase_set_match"}
+{"id": "q_002", "domain": "enterprise", "environment": "servicenow", "question_type": "dynamic-state-tracking", "question": "After the bulk import completed, what was the new total record count?", "image": null, "answer": "12,481", "eval_function": "norm_phrase_set_match|separators="}
+{"id": "q_003", "domain": "web", "environment": "gitlab", "question_type": "workflow-knowledge", "question": "What sequence of clicks creates a new dashboard?", "image": null, "answer": "Dashboards > New > template > Save", "eval_function": "norm_phrase_set_match_ordered|separators=>", "extra_field_for_passthrough": true}
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/deterministic.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/deterministic.test.ts
@@ -0,0 +1,153 @@
+import { describe, expect, test } from "bun:test";
+
+import {
+  extractMultiSelectLetters,
+  mcChoiceMatch,
+  mcChoiceSetMatch,
+  normPhraseSetMatch,
+  normPhraseSetMatchOrdered,
+} from "../../judge/deterministic";
+
+describe("normPhraseSetMatch", () => { // Unordered phrase-set matching: every answer phrase must be found in the prediction.
+  test("matches single phrase contained in prediction", () => {
+    expect(
+      normPhraseSetMatch("The URL is /settings/project.", "/settings/project"),
+    ).toBe(true);
+  });
+
+  test("requires every comma-separated phrase to appear", () => {
+    expect(normPhraseSetMatch("apple and banana", "apple, banana")).toBe(true); // No options passed: the answer is split on the comma by default.
+    expect(normPhraseSetMatch("apple only", "apple, banana")).toBe(false); // "banana" is missing, so the whole match fails.
+  });
+
+  test("order does not matter for the set variant", () => {
+    expect(normPhraseSetMatch("banana apple", "apple, banana")).toBe(true);
+  });
+
+  test("respects custom separator", () => {
+    expect(
+      normPhraseSetMatch("apple and banana", "apple|banana", {
+        separators: ["|"],
+      }),
+    ).toBe(true);
+  });
+
+  test("requireNonEmpty=true rejects empty prediction", () => {
+    expect(normPhraseSetMatch("", "apple")).toBe(false); // Relies on the default options; none are passed here.
+  });
+
+  test("requireNonEmpty=false allows empty answer to match anything", () => {
+    expect(normPhraseSetMatch("anything", "", { requireNonEmpty: false })).toBe(
+      true,
+    );
+  });
+
+  test("normalization makes hyphenated phrase match prediction with spaces", () => {
+    expect(normPhraseSetMatch("project settings", "project-settings")).toBe(
+      true,
+    );
+  });
+
+  test("respects empty separators (single-phrase answer with comma in it)", () => {
+    expect(
+      normPhraseSetMatch("the count was 12,481 records", "12,481", {
+        separators: [], // Empty list is meant to disable splitting so "12,481" is treated as one phrase.
+      }),
+    ).toBe(true);
+  });
+});
+
+describe("normPhraseSetMatchOrdered", () => { // Ordered variant: answer phrases must appear in the prediction in the answer's order.
+  test("matches when answer phrases appear in order", () => {
+    expect(
+      normPhraseSetMatchOrdered(
+        "click Dashboards, then New, then template, then Save",
+        "Dashboards > New > template > Save",
+        { separators: [">"] }, // The answer uses ">" between steps, so split on it instead of the default.
+      ),
+    ).toBe(true);
+  });
+
+  test("fails when order is wrong", () => {
+    expect(
+      normPhraseSetMatchOrdered(
+        "click Save, then template, then New, then Dashboards", // Same phrases as the answer, reversed.
+        "Dashboards > New > template > Save",
+        { separators: [">"] },
+      ),
+    ).toBe(false);
+  });
+
+  test("requireNonEmpty=true rejects empty prediction", () => {
+    expect(normPhraseSetMatchOrdered("", "a > b", { separators: [">"] })).toBe(
+      false,
+    );
+  });
+});
+
+describe("mcChoiceMatch", () => { // Single multiple-choice letter comparison between prediction and answer.
+  test("matches single letter regardless of case", () => {
+    expect(mcChoiceMatch("a", "A")).toBe(true);
+  });
+
+  test("extracts from \\boxed{...}", () => {
+    expect(mcChoiceMatch("After thinking, \\boxed{c}", "C")).toBe(true); // The letter inside \boxed{} is used; the surrounding text is ignored.
+  });
+
+  test('strips "choice" and "option" words', () => {
+    expect(mcChoiceMatch("Choice B", "B")).toBe(true);
+    expect(mcChoiceMatch("Option d.", "D")).toBe(true); // Combines word stripping, case folding and the trailing period.
+  });
+
+  test("strips trailing periods by default", () => {
+    expect(mcChoiceMatch("B.", "B")).toBe(true);
+  });
+
+  test("respects custom stripChars", () => {
+    expect(mcChoiceMatch("B)", "B", { stripChars: ".)" })).toBe(true); // ")" is only stripped because it is listed in stripChars.
+  });
+
+  test("returns false when letters differ", () => {
+    expect(mcChoiceMatch("A", "B")).toBe(false);
+  });
+
+  test("null/undefined prediction or answer → false", () => {
+    expect(mcChoiceMatch(null, "A")).toBe(false); // Missing values yield false rather than throwing.
+    expect(mcChoiceMatch("A", undefined)).toBe(false);
+  });
+});
+
+describe("mcChoiceSetMatch", () => { // Multi-select comparison: the predicted letters must equal the answer letters as a set.
+  test("set-equality on multi-letter answers regardless of order", () => {
+    expect(mcChoiceSetMatch("A, B, D", "D B A")).toBe(true); // Comma- and space-delimited forms are both accepted.
+  });
+
+  test('filters filler words like "and"/"option"', () => {
+    expect(mcChoiceSetMatch("A and B and C", "A, B, C")).toBe(true); // Only "and" is exercised here.
+  });
+
+  test("returns false on different sets", () => {
+    expect(mcChoiceSetMatch("A, B", "A, B, C")).toBe(false); // A subset of the answer is not enough.
+  });
+
+  test("requireNonEmpty=true rejects empty prediction", () => {
+    expect(mcChoiceSetMatch("", "A, B")).toBe(false); // Relies on the default options; none are passed here.
+  });
+});
+
+describe("extractMultiSelectLetters", () => { // Extraction of choice letters from free text into an array of single letters.
+  test("explodes a CSV of letters into per-letter array", () => {
+    expect(extractMultiSelectLetters("A, B, C")).toEqual(["A", "B", "C"]);
+  });
+
+  test("drops filler words", () => {
+    expect(extractMultiSelectLetters("Final answer: A and B")).toEqual([ // "Final answer:" and "and" must not contribute letters.
+      "A",
+      "B",
+    ]);
+  });
+
+  test("explodes multi-letter chunks like 'BD' into ['B','D']", () => {
+    expect(extractMultiSelectLetters("BD")).toEqual(["B", "D"]);
+  });
+});
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/index.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/index.test.ts
@@ -0,0 +1,132 @@
+import { afterEach, describe, expect, test } from "bun:test";
+
+import { evalFromSpec } from "../../judge";
+
+const originalFetch = globalThis.fetch; // Saved at module load so the suite's afterEach can undo the stub that mockOpenAI installs.
+
+function mockOpenAI(content: string) { // Replaces globalThis.fetch with a stub that answers every call with `content` as the assistant message.
+  const captured: { body?: Record<string, unknown> } = {}; // Receives the parsed JSON request body of the latest call that sent one.
+  globalThis.fetch = (async (
+    _url: string | URL | Request, // Ignored: the stub replies the same way whatever the URL.
+    init?: RequestInit,
+  ) => {
+    if (init?.body !== undefined) {
+      captured.body = JSON.parse(String(init.body)) as Record<string, unknown>;
+    }
+    return new Response( // Minimal chat-completion-shaped payload: one choice whose message carries `content`.
+      JSON.stringify({ choices: [{ message: { content } }] }),
+      { status: 200, headers: { "content-type": "application/json" } },
+    );
+  }) as typeof fetch;
+  return captured; // Lets callers inspect the outgoing request; this function does not restore fetch itself.
+}
+
+describe("evalFromSpec dispatcher", () => { // Covers spec parsing, dispatch by function name, and the resolved { label, reason, function } shape.
+  afterEach(() => {
+    globalThis.fetch = originalFetch; // Undo any stub installed by mockOpenAI so tests stay isolated.
+  });
+
+  test("dispatches norm_phrase_set_match and reports the function name", async () => {
+    const result = await evalFromSpec("norm_phrase_set_match", {
+      prediction: "the project settings page is /settings/project",
+      answer: "/settings/project",
+    });
+    expect(result).toEqual({
+      label: true,
+      reason: "", // Deterministic evaluators report an empty reason.
+      function: "norm_phrase_set_match",
+    });
+  });
+
+  test("dispatches norm_phrase_set_match_ordered with separator kwargs from spec", async () => {
+    const result = await evalFromSpec(
+      "norm_phrase_set_match_ordered|separators=>", // Options follow the name as |key=value; here the separator is ">".
+      {
+        prediction: "click Dashboards then New then template then Save",
+        answer: "Dashboards > New > template > Save",
+      },
+    );
+    expect(result.label).toBe(true);
+    expect(result.function).toBe("norm_phrase_set_match_ordered");
+  });
+
+  test("dispatches mc_choice_match", async () => {
+    const result = await evalFromSpec("mc_choice_match", {
+      prediction: "Choice B.",
+      answer: "B",
+    });
+    expect(result.label).toBe(true);
+    expect(result.function).toBe("mc_choice_match");
+  });
+
+  test("dispatches mc_choice_set_match", async () => {
+    const result = await evalFromSpec("mc_choice_set_match", {
+      prediction: "Final answer: A and C",
+      answer: "C, A",
+    });
+    expect(result.label).toBe(true);
+    expect(result.function).toBe("mc_choice_set_match");
+  });
+
+  test("dispatches llm_abstention_checker through the OpenAI transport", async () => {
+    mockOpenAI('{"label": 1, "reason": "premise rejected"}'); // Canned judge reply; label 1 must surface as label: true.
+
+    const result = await evalFromSpec(
+      "llm_abstention_checker",
+      {
+        prediction: "The premise here is wrong because X.",
+        answer: "Reject the premise.",
+        questionItem: { question: "Why does Z fail?" },
+      },
+      { evaluatorModel: "gpt-5.2", evaluatorApiKey: "unit-test" }, // Dummy key: fetch is stubbed above.
+    );
+
+    expect(result).toEqual({
+      label: true,
+      reason: "premise rejected", // The judge's reason is passed through unchanged.
+      function: "llm_abstention_checker",
+    });
+  });
+
+  test("dispatches llm_gotchas_checker through the OpenAI transport", async () => {
+    mockOpenAI('{"label": 1, "reason": "captures insight"}');
+
+    const result = await evalFromSpec(
+      "llm_gotchas_checker",
+      {
+        prediction: "You need to refresh the cache first.",
+        answer: "Cache must be invalidated before reload.",
+      },
+      { evaluatorModel: "gpt-5.2", evaluatorApiKey: "unit-test" },
+    );
+
+    expect(result.label).toBe(true);
+    expect(result.function).toBe("llm_gotchas_checker");
+  });
+
+  test("caller overrides win over spec kwargs", async () => {
+    // Spec sets require_non_empty=true; the requireNonEmpty=false override
+    // must take precedence so an empty answer still matches.
+    const result = await evalFromSpec(
+      "norm_phrase_set_match|require_non_empty=true",
+      { prediction: "non-empty", answer: "" },
+      { requireNonEmpty: false },
+    );
+    expect(result.label).toBe(true);
+  });
+
+  test("throws on unknown function name", async () => {
+    await expect(
+      evalFromSpec("definitely_not_real", { prediction: "x", answer: "y" }),
+    ).rejects.toThrow(/Unknown eval function/); // Surfaces as a rejected promise, not a synchronous throw.
+  });
+
+  test("propagates parse errors on malformed spec strings", async () => {
+    await expect(
+      evalFromSpec("norm_phrase_set_match|noequals", { // "noequals" has no "=", so it is not a valid key=value option.
+        prediction: "x",
+        answer: "y",
+      }),
+    ).rejects.toThrow(/Invalid eval function option/);
+  });
+});