diff --git a/evals/benchmarks/longmemeval-v2/README.md b/evals/benchmarks/longmemeval-v2/README.md
index 9cf68fbf450..a8a8e6d00b7 100644
--- a/evals/benchmarks/longmemeval-v2/README.md
+++ b/evals/benchmarks/longmemeval-v2/README.md
@@ -48,15 +48,48 @@ interface BenchmarkItem {
   questionId: string; // V2 questions.jsonl `id` (stable question id)
   ability: string; // V2 questions.jsonl `question_type` (one of the five abilities)
   question: string;
-  answer: string; // reference answer, used by the GPT-4o judge
+  answer: string; // reference answer, used by the dispatched evaluator
   trajectoryIds: string[]; // ordered haystack from haystacks/lme_v2_<tier>.json
 }
 ```
 
 The V2 schema also ships `domain`, `environment`, `image`, and
 `eval_function` fields (see `SCHEMA.md` in the published dataset). The
-loader's zod schema preserves these via `.passthrough()`; the runner /
-judge will consume them in subsequent PRs without the loader having to
-grow first.
+loader's zod schema preserves these via `.passthrough()`; the judge and
+the runner consume them.
 
-This PR ships the loader and its fixture tests only. The two-conversation runner (`run-ingest-ask`), GPT-4o paper-faithful judge, and Phase 1 wiring land in subsequent PRs against the contract established here.
+## Judge
+
+`src/judge/index.ts` exports `evalFromSpec(spec, inputs, overrides?)` — a
+TypeScript port of V2's `evaluation/qa_eval_metrics.py`. Each V2 question
+carries an `eval_function` spec string of the form `"name|key=value|..."`
+that dispatches to one of six implementations:
+
+**Deterministic (no LLM):**
+
+- `norm_phrase_set_match` — phrase-set membership (unordered)
+- `norm_phrase_set_match_ordered` — phrase-set membership (ordered)
+- `mc_choice_match` — single multiple-choice letter
+- `mc_choice_set_match` — multi-select multiple-choice letters
+
+**LLM judges** (default `gpt-5.2` with `reasoning_effort=medium`, per V2's
+`run_eval.py` defaults):
+
+- `llm_abstention_checker` — flawed-premise (abstention) questions
+- `llm_gotchas_checker` — insight-style gotcha questions
+
+Both LLM judges issue an OpenAI-shaped chat completion with a strict system
+prompt and rubric, and expect `{"label": 0|1, "reason": "..."}` JSON output.
+Markdown code fences are stripped before parsing, and a regex fallback
+recovers the label when the JSON is malformed. Transport is a direct `fetch`
+to the chat completions endpoint (no production wrapper); tests swap `globalThis.fetch`.
+
+`evalFromSpec` is async and resolves to `{ label: boolean, reason: string, function: string }`.
+`reason` is the empty string for deterministic evaluators; `function` echoes
+the dispatched evaluator name for audit/logging.
+
+## Next
+
+The two-conversation runner (`run-ingest-ask`, shipped in PR #32356) and
+this judge unblock Phase 1 wiring (a 5-item smoke test against
+`vellum-simple-memory`), which lands in a follow-up PR.
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl b/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl
index 8317cf09830..a51383069b7 100644
--- a/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl
+++ b/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl
@@ -1,3 +1,3 @@
-{"id": "q_001", "domain": "web", "environment": "shopping_admin", "question_type": "static-state-recall", "question": "What is the URL of the project settings page?", "image": null, "answer": "/settings/project", "eval_function": "exact_match"}
-{"id": "q_002", "domain": "enterprise", "environment": "servicenow", "question_type": "dynamic-state-tracking", "question": "After the bulk import completed, what was the new total record count?", "image": null, "answer": "12,481", "eval_function": "exact_match"}
-{"id": "q_003", "domain": "web", "environment": "gitlab", "question_type": "workflow-knowledge", "question": "What sequence of clicks creates a new dashboard?", "image": null, "answer": "Dashboards > New > template > Save", "eval_function": "exact_match", "extra_field_for_passthrough": true}
+{"id": "q_001", "domain": "web", "environment": "shopping_admin", "question_type": "static-state-recall", "question": "What is the URL of the project settings page?", "image": null, "answer": "/settings/project", "eval_function": "norm_phrase_set_match"}
+{"id": "q_002", "domain": "enterprise", "environment": "servicenow", "question_type": "dynamic-state-tracking", "question": "After the bulk import completed, what was the new total record count?", "image": null, "answer": "12,481", "eval_function": "norm_phrase_set_match|separators="}
+{"id": "q_003", "domain": "web", "environment": "gitlab", "question_type": "workflow-knowledge", "question": "What sequence of clicks creates a new dashboard?", "image": null, "answer": "Dashboards > New > template > Save", "eval_function": "norm_phrase_set_match_ordered|separators=>", "extra_field_for_passthrough": true}
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/deterministic.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/deterministic.test.ts
new file mode 100644
index 00000000000..6a346ae0ff0
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/deterministic.test.ts
@@ -0,0 +1,153 @@
+import { describe, expect, test } from "bun:test";
+
+import {
+  extractMultiSelectLetters,
+  mcChoiceMatch,
+  mcChoiceSetMatch,
+  normPhraseSetMatch,
+  normPhraseSetMatchOrdered,
+} from "../../judge/deterministic";
+
+describe("normPhraseSetMatch", () => {
+  test("matches single phrase contained in prediction", () => {
+    expect(
+      normPhraseSetMatch("The URL is /settings/project.", "/settings/project"),
+    ).toBe(true);
+  });
+
+  test("requires every comma-separated phrase to appear", () => {
+    expect(normPhraseSetMatch("apple and banana", "apple, banana")).toBe(true);
+    expect(normPhraseSetMatch("apple only", "apple, banana")).toBe(false);
+  });
+
+  test("order does not matter for the set variant", () => {
+    expect(normPhraseSetMatch("banana apple", "apple, banana")).toBe(true);
+  });
+
+  test("respects custom separator", () => {
+    expect(
+      normPhraseSetMatch("apple and banana", "apple|banana", {
+        separators: ["|"],
+      }),
+    ).toBe(true);
+  });
+
+  test("requireNonEmpty=true rejects empty prediction", () => {
+    expect(normPhraseSetMatch("", "apple")).toBe(false);
+  });
+
+  test("requireNonEmpty=false allows empty answer to match anything", () => {
+    expect(normPhraseSetMatch("anything", "", { requireNonEmpty: false })).toBe(
+      true,
+    );
+  });
+
+  test("normalization makes hyphenated phrase match prediction with spaces", () => {
+    expect(normPhraseSetMatch("project settings", "project-settings")).toBe(
+      true,
+    );
+  });
+
+  test("respects empty separators (single-phrase answer with comma in it)", () => {
+    expect(
+      normPhraseSetMatch("the count was 12,481 records", "12,481", {
+        separators: [],
+      }),
+    ).toBe(true);
+  });
+});
+
+describe("normPhraseSetMatchOrdered", () => {
+  test("matches when answer phrases appear in order", () => {
+    expect(
+      normPhraseSetMatchOrdered(
+        "click Dashboards, then New, then template, then Save",
+        "Dashboards > New > template > Save",
+        { separators: [">"] },
+      ),
+    ).toBe(true);
+  });
+
+  test("fails when order is wrong", () => {
+    expect(
+      normPhraseSetMatchOrdered(
+        "click Save, then template, then New, then Dashboards",
+        "Dashboards > New > template > Save",
+        { separators: [">"] },
+      ),
+    ).toBe(false);
+  });
+
+  test("requireNonEmpty=true rejects empty prediction", () => {
+    expect(normPhraseSetMatchOrdered("", "a > b", { separators: [">"] })).toBe(
+      false,
+    );
+  });
+});
+
+describe("mcChoiceMatch", () => {
+  test("matches single letter regardless of case", () => {
+    expect(mcChoiceMatch("a", "A")).toBe(true);
+  });
+
+  test("extracts from \\boxed{...}", () => {
+    expect(mcChoiceMatch("After thinking, \\boxed{c}", "C")).toBe(true);
+  });
+
+  test('strips "choice" and "option" words', () => {
+    expect(mcChoiceMatch("Choice B", "B")).toBe(true);
+    expect(mcChoiceMatch("Option d.", "D")).toBe(true);
+  });
+
+  test("strips trailing periods by default", () => {
+    expect(mcChoiceMatch("B.", "B")).toBe(true);
+  });
+
+  test("respects custom stripChars", () => {
+    expect(mcChoiceMatch("B)", "B", { stripChars: ".)" })).toBe(true);
+  });
+
+  test("returns false when letters differ", () => {
+    expect(mcChoiceMatch("A", "B")).toBe(false);
+  });
+
+  test("null/undefined prediction or answer → false", () => {
+    expect(mcChoiceMatch(null, "A")).toBe(false);
+    expect(mcChoiceMatch("A", undefined)).toBe(false);
+  });
+});
+
+describe("mcChoiceSetMatch", () => {
+  test("set-equality on multi-letter answers regardless of order", () => {
+    expect(mcChoiceSetMatch("A, B, D", "D B A")).toBe(true);
+  });
+
+  test('filters filler words like "and"/"option"', () => {
+    expect(mcChoiceSetMatch("A and B and C", "A, B, C")).toBe(true);
+  });
+
+  test("returns false on different sets", () => {
+    expect(mcChoiceSetMatch("A, B", "A, B, C")).toBe(false);
+  });
+
+  test("requireNonEmpty=true rejects empty prediction", () => {
+    expect(mcChoiceSetMatch("", "A, B")).toBe(false);
+  });
+});
+
+describe("extractMultiSelectLetters", () => {
+  test("explodes a CSV of letters into per-letter array", () => {
+    expect(extractMultiSelectLetters("A, B, C")).toEqual(["A", "B", "C"]);
+  });
+
+  test("drops filler words", () => {
+    expect(extractMultiSelectLetters("Final answer: A and B")).toEqual([
+      "A",
+      "B",
+    ]);
+  });
+
+  test("explodes multi-letter chunks like 'BD' into ['B','D']", () => {
+    expect(extractMultiSelectLetters("BD")).toEqual(["B", "D"]);
+  });
+});
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/index.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/index.test.ts
new file mode 100644
index 00000000000..8b84b05f617
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/index.test.ts
@@ -0,0 +1,132 @@
+import { afterEach, describe, expect, test } from "bun:test";
+
+import { evalFromSpec } from "../../judge";
+
+const originalFetch = globalThis.fetch;
+
+function mockOpenAI(content: string) {
+  const captured: { body?: Record<string, unknown> } = {};
+  globalThis.fetch = (async (
+    _url: string | URL | Request,
+    init?: RequestInit,
+  ) => {
+    if (init?.body !== undefined) {
+      captured.body = JSON.parse(String(init.body)) as Record<string, unknown>;
+    }
+    return new Response(
+      JSON.stringify({ choices: [{ message: { content } }] }),
+      { status: 200, headers: { "content-type": "application/json" } },
+    );
+  }) as typeof fetch;
+  return captured;
+}
+
+describe("evalFromSpec dispatcher", () => {
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+  });
+
+  test("dispatches norm_phrase_set_match and reports the function name", async () => {
+    const result = await evalFromSpec("norm_phrase_set_match", {
+      prediction: "the project settings page is /settings/project",
+      answer: "/settings/project",
+    });
+    expect(result).toEqual({
+      label: true,
+      reason: "",
+      function: "norm_phrase_set_match",
+    });
+  });
+
+  test("dispatches norm_phrase_set_match_ordered with separator kwargs from spec", async () => {
+    const result = await evalFromSpec(
+      "norm_phrase_set_match_ordered|separators=>",
+      {
+        prediction: "click Dashboards then New then template then Save",
+        answer: "Dashboards > New > template > Save",
+      },
+    );
+    expect(result.label).toBe(true);
+    expect(result.function).toBe("norm_phrase_set_match_ordered");
+  });
+
+  test("dispatches mc_choice_match", async () => {
+    const result = await evalFromSpec("mc_choice_match", {
+      prediction: "Choice B.",
+      answer: "B",
+    });
+    expect(result.label).toBe(true);
+    expect(result.function).toBe("mc_choice_match");
+  });
+
+  test("dispatches mc_choice_set_match", async () => {
+    const result = await evalFromSpec("mc_choice_set_match", {
+      prediction: "Final answer: A and C",
+      answer: "C, A",
+    });
+    expect(result.label).toBe(true);
+    expect(result.function).toBe("mc_choice_set_match");
+  });
+
+  test("dispatches llm_abstention_checker through the OpenAI transport", async () => {
+    mockOpenAI('{"label": 1, "reason": "premise rejected"}');
+
+    const result = await evalFromSpec(
+      "llm_abstention_checker",
+      {
+        prediction: "The premise here is wrong because X.",
+        answer: "Reject the premise.",
+        questionItem: { question: "Why does Z fail?" },
+      },
+      { evaluatorModel: "gpt-5.2", evaluatorApiKey: "unit-test" },
+    );
+
+    expect(result).toEqual({
+      label: true,
+      reason: "premise rejected",
+      function: "llm_abstention_checker",
+    });
+  });
+
+  test("dispatches llm_gotchas_checker through the OpenAI transport", async () => {
+    mockOpenAI('{"label": 1, "reason": "captures insight"}');
+
+    const result = await evalFromSpec(
+      "llm_gotchas_checker",
+      {
+        prediction: "You need to refresh the cache first.",
+        answer: "Cache must be invalidated before reload.",
+      },
+      { evaluatorModel: "gpt-5.2", evaluatorApiKey: "unit-test" },
+    );
+
+    expect(result.label).toBe(true);
+    expect(result.function).toBe("llm_gotchas_checker");
+  });
+
+  test("caller overrides win over spec kwargs", async () => {
+    // Spec sets requireNonEmpty=true; override forces false so an empty
+    // answer still matches.
+    const result = await evalFromSpec(
+      "norm_phrase_set_match|require_non_empty=true",
+      { prediction: "non-empty", answer: "" },
+      { requireNonEmpty: false },
+    );
+    expect(result.label).toBe(true);
+  });
+
+  test("throws on unknown function name", async () => {
+    await expect(
+      evalFromSpec("definitely_not_real", { prediction: "x", answer: "y" }),
+    ).rejects.toThrow(/Unknown eval function/);
+  });
+
+  test("propagates parse errors on malformed spec strings", async () => {
+    await expect(
+      evalFromSpec("norm_phrase_set_match|noequals", {
+        prediction: "x",
+        answer: "y",
+      }),
+    ).rejects.toThrow(/Invalid eval function option/);
+  });
+});
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/judgement.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/judgement.test.ts
new file mode 100644
index 00000000000..e964bcf5085
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/judgement.test.ts
@@ -0,0 +1,89 @@
+import { describe, expect, test } from "bun:test";
+
+import {
+  parseLlmBinaryJudgement,
+  stripMarkdownCodeFence,
+} from "../../judge/judgement";
+
+describe("stripMarkdownCodeFence", () => {
+  test("removes leading and trailing triple-backtick fences", () => {
+    const text = '```json\n{"label": 1}\n```';
+    expect(stripMarkdownCodeFence(text)).toBe('{"label": 1}');
+  });
+
+  test("passes through non-fenced text unchanged", () => {
+    expect(stripMarkdownCodeFence('{"label": 0}')).toBe('{"label": 0}');
+  });
+});
+
+describe("parseLlmBinaryJudgement", () => {
+  test("parses strict JSON with label=1 and reason", () => {
+    const result = parseLlmBinaryJudgement(
+      '{"label": 1, "reason": "identified flaw"}',
+    );
+    expect(result).toEqual({ label: 1, reason: "identified flaw" });
+  });
+
+  test("parses strict JSON with label=0", () => {
+    const result = parseLlmBinaryJudgement('{"label": 0, "reason": "wrong"}');
+    expect(result).toEqual({ label: 0, reason: "wrong" });
+  });
+
+  test("strips a code fence before JSON parse", () => {
+    const result = parseLlmBinaryJudgement(
+      '```\n{"label": 1, "reason": "ok"}\n```',
+    );
+    expect(result).toEqual({ label: 1, reason: "ok" });
+  });
+
+  test("strips a code fence with a language tag", () => {
+    const result = parseLlmBinaryJudgement(
+      '```json\n{"label": 0, "reason": "nope"}\n```',
+    );
+    expect(result).toEqual({ label: 0, reason: "nope" });
+  });
+
+  test("accepts label as a stringy 0 or 1", () => {
+    expect(
+      parseLlmBinaryJudgement('{"label": "1", "reason": "yep"}').label,
+    ).toBe(1);
+    expect(
+      parseLlmBinaryJudgement('{"label": "0", "reason": "nope"}').label,
+    ).toBe(0);
+  });
+
+  test("falls back to regex when JSON is malformed", () => {
+    const text = "Here is my judgement: {label: 1, reason: oops}";
+    const result = parseLlmBinaryJudgement(text);
+    expect(result.label).toBe(1);
+    // Regex fallback returns the whole cleaned string as the reason.
+    expect(result.reason).toContain("label");
+  });
+
+  test("regex fallback accepts single-quoted label key", () => {
+    const result = parseLlmBinaryJudgement("{'label': 0}");
+    expect(result.label).toBe(0);
+  });
+
+  test("regex fallback accepts label=1 shorthand", () => {
+    const result = parseLlmBinaryJudgement("Output: label=1, reason=ok");
+    expect(result.label).toBe(1);
+  });
+
+  test("throws on empty input", () => {
+    expect(() => parseLlmBinaryJudgement("")).toThrow(/Empty judgement/);
+    expect(() => parseLlmBinaryJudgement("   ")).toThrow(/Empty judgement/);
+  });
+
+  test("throws when no label can be extracted", () => {
+    expect(() => parseLlmBinaryJudgement("I refuse to answer")).toThrow(
+      /Could not parse evaluator binary judgement/,
+    );
+  });
+
+  test("throws when JSON has a non-binary label and no shorthand fallback hits", () => {
+    expect(() => parseLlmBinaryJudgement('{"label": "maybe"}')).toThrow(
+      /Could not parse evaluator binary judgement/,
+    );
+  });
+});
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/llm.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/llm.test.ts
new file mode 100644
index 00000000000..bb736958af0
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/llm.test.ts
@@ -0,0 +1,202 @@
+import { afterEach, describe, expect, test } from "bun:test";
+
+import { llmAbstentionChecker, llmGotchasChecker } from "../../judge/llm";
+
+const originalFetch = globalThis.fetch;
+
+interface CapturedRequest {
+  url: string | URL | Request;
+  init?: RequestInit;
+  body?: Record<string, unknown>;
+}
+
+function mockOpenAIChatCompletions(responseBody: unknown, status = 200) {
+  const captured: CapturedRequest = { url: "" };
+  globalThis.fetch = (async (
+    url: string | URL | Request,
+    init?: RequestInit,
+  ) => {
+    captured.url = url;
+    captured.init = init;
+    if (init?.body !== undefined) {
+      captured.body = JSON.parse(String(init.body)) as Record<string, unknown>;
+    }
+    return new Response(JSON.stringify(responseBody), {
+      status,
+      headers: { "content-type": "application/json" },
+    });
+  }) as typeof fetch;
+  return captured;
+}
+
+function openAIChatBody(content: string): Record<string, unknown> {
+  return { choices: [{ message: { content } }] };
+}
+
+describe("llmAbstentionChecker", () => {
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+  });
+
+  test("returns label=true and reason on a positive JSON judgement", async () => {
+    mockOpenAIChatCompletions(
+      openAIChatBody('{"label": 1, "reason": "identified flaw"}'),
+    );
+
+    const result = await llmAbstentionChecker(
+      "The premise is wrong because X.",
+      "Reject the premise (X is impossible here).",
+      {
+        evaluatorModel: "gpt-5.2",
+        evaluatorApiKey: "unit-test",
+        questionItem: { question: "Why does Z fail?" },
+      },
+    );
+
+    expect(result.label).toBe(true);
+    expect(result.reason).toBe("identified flaw");
+  });
+
+  test("returns label=false on a JSON judgement with label=0", async () => {
+    mockOpenAIChatCompletions(
+      openAIChatBody('{"label": 0, "reason": "followed flawed premise"}'),
+    );
+
+    const result = await llmAbstentionChecker("X.", "Reject premise.", {
+      evaluatorModel: "gpt-5.2",
+      evaluatorApiKey: "unit-test",
+    });
+
+    expect(result.label).toBe(false);
+    expect(result.reason).toBe("followed flawed premise");
+  });
+
+  test("posts to /chat/completions with bearer auth and OpenAI body shape", async () => {
+    const captured = mockOpenAIChatCompletions(
+      openAIChatBody('{"label": 1, "reason": "ok"}'),
+    );
+
+    await llmAbstentionChecker("pred", "ans", {
+      evaluatorModel: "gpt-5.2",
+      evaluatorApiKey: "unit-test",
+      evaluatorReasoningEffort: "medium",
+      evaluatorMaxCompletionTokens: 2048,
+      questionItem: { question: "Q?" },
+    });
+
+    expect(String(captured.url)).toBe(
+      "https://api.openai.com/v1/chat/completions",
+    );
+    expect(captured.init?.method).toBe("POST");
+    const headers = new Headers(captured.init?.headers);
+    expect(headers.get("authorization")).toBe("Bearer unit-test");
+    expect(headers.get("content-type")).toBe("application/json");
+    expect(captured.body?.model).toBe("gpt-5.2");
+    expect(captured.body?.reasoning_effort).toBe("medium");
+    expect(captured.body?.max_completion_tokens).toBe(2048);
+    const messages = captured.body?.messages as Array<{
+      role: string;
+      content: string;
+    }>;
+    expect(messages).toHaveLength(2);
+    expect(messages[0].role).toBe("system");
+    expect(messages[0].content).toContain("flawed-premise");
+    expect(messages[1].role).toBe("user");
+    expect(messages[1].content).toContain("Q?");
+    expect(messages[1].content).toContain("ans");
+  });
+
+  test("respects a custom base URL", async () => {
+    const captured = mockOpenAIChatCompletions(
+      openAIChatBody('{"label": 1, "reason": "ok"}'),
+    );
+
+    await llmAbstentionChecker("pred", "ans", {
+      evaluatorModel: "gpt-5.2",
+      evaluatorApiKey: "unit-test",
+      evaluatorBaseUrl: "http://localhost:8001/v1",
+    });
+
+    expect(String(captured.url)).toBe(
+      "http://localhost:8001/v1/chat/completions",
+    );
+  });
+
+  test("throws when evaluatorApiKey is missing and env is unset and no base URL", async () => {
+    const previous = process.env.OPENAI_API_KEY;
+    delete process.env.OPENAI_API_KEY;
+    try {
+      await expect(
+        llmAbstentionChecker("p", "a", { evaluatorModel: "gpt-5.2" }),
+      ).rejects.toThrow(/API key/);
+    } finally {
+      if (previous !== undefined) process.env.OPENAI_API_KEY = previous;
+    }
+  });
+
+  test("returns label=false with explanatory reason on empty prediction", async () => {
+    // No response mock: fetch is stubbed to throw, so the checker must short-circuit before calling it.
+    globalThis.fetch = (async () => {
+      throw new Error("fetch should not be called");
+    }) as unknown as typeof fetch;
+
+    const result = await llmAbstentionChecker("", "ref", {
+      evaluatorModel: "gpt-5.2",
+      evaluatorApiKey: "unit-test",
+    });
+
+    expect(result.label).toBe(false);
+    expect(result.reason).toContain("empty");
+  });
+
+  test("non-2xx HTTP response surfaces as an error with status code", async () => {
+    mockOpenAIChatCompletions({ error: "rate limited" }, 429);
+
+    await expect(
+      llmAbstentionChecker("p", "a", {
+        evaluatorModel: "gpt-5.2",
+        evaluatorApiKey: "unit-test",
+      }),
+    ).rejects.toThrow(/HTTP 429/);
+  });
+
+  test("parses code-fenced judgement output", async () => {
+    mockOpenAIChatCompletions(
+      openAIChatBody('```json\n{"label": 1, "reason": "ok"}\n```'),
+    );
+
+    const result = await llmAbstentionChecker("p", "a", {
+      evaluatorModel: "gpt-5.2",
+      evaluatorApiKey: "unit-test",
+    });
+
+    expect(result.label).toBe(true);
+    expect(result.reason).toBe("ok");
+  });
+});
+
+describe("llmGotchasChecker", () => {
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+  });
+
+  test("uses the gotchas system prompt and rubric", async () => {
+    const captured = mockOpenAIChatCompletions(
+      openAIChatBody('{"label": 1, "reason": "covers insight"}'),
+    );
+
+    await llmGotchasChecker("response covers insight", "insight A; insight B", {
+      evaluatorModel: "gpt-5.2",
+      evaluatorApiKey: "unit-test",
+      questionItem: { question: "What gotcha applies?" },
+    });
+
+    const messages = captured.body?.messages as Array<{
+      role: string;
+      content: string;
+    }>;
+    expect(messages[0].content).toContain("gotchas-style insight");
+    expect(messages[1].content).toContain("gotcha insight");
+    expect(messages[1].content).toContain("insight A; insight B");
+  });
+});
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/normalize.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/normalize.test.ts
new file mode 100644
index 00000000000..b74891d2142
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/normalize.test.ts
@@ -0,0 +1,71 @@
+import { describe, expect, test } from "bun:test";
+
+import {
+  DEFAULT_SEPARATORS,
+  normalizePhrase,
+  splitPhrases,
+} from "../../judge/normalize";
+
+describe("normalizePhrase", () => {
+  test("lowercases, replaces hyphens/underscores, strips punctuation", () => {
+    expect(normalizePhrase("New-Dashboard_Layout!")).toBe(
+      "new dashboard layout",
+    );
+  });
+
+  test("collapses runs of whitespace and trims", () => {
+    expect(normalizePhrase("  hello   world  ")).toBe("hello world");
+  });
+
+  test("returns empty string for null and undefined", () => {
+    expect(normalizePhrase(null)).toBe("");
+    expect(normalizePhrase(undefined)).toBe("");
+  });
+
+  test("respects opts.lower=false", () => {
+    expect(normalizePhrase("AaBb", { lower: false })).toBe("AaBb");
+  });
+
+  test("respects opts.normalizeHyphen=false", () => {
+    expect(normalizePhrase("a-b_c", { normalizeHyphen: false })).toBe("ab_c");
+  });
+
+  test("respects opts.stripPunct=false (preserves non-word non-space)", () => {
+    expect(normalizePhrase("hello, world!", { stripPunct: false })).toBe(
+      "hello world!",
+    );
+  });
+
+  test("stringifies non-string input via String()", () => {
+    expect(normalizePhrase(42)).toBe("42");
+  });
+});
+
+describe("splitPhrases", () => {
+  test("default separators split on commas and semicolons", () => {
+    expect(splitPhrases("foo, bar; baz")).toEqual(["foo", "bar", "baz"]);
+  });
+
+  test("empty separators returns a single normalized phrase", () => {
+    expect(splitPhrases("12,481", { separators: [] })).toEqual(["12 481"]);
+  });
+
+  test("custom separator >", () => {
+    expect(
+      splitPhrases("Dashboards > New > template > Save", { separators: [">"] }),
+    ).toEqual(["dashboards", "new", "template", "save"]);
+  });
+
+  test("filters out parts that normalize to empty", () => {
+    expect(splitPhrases("foo,,bar")).toEqual(["foo", "bar"]);
+  });
+
+  test("null/undefined → empty array", () => {
+    expect(splitPhrases(null)).toEqual([]);
+    expect(splitPhrases(undefined)).toEqual([]);
+  });
+
+  test("DEFAULT_SEPARATORS exports comma and semicolon", () => {
+    expect(DEFAULT_SEPARATORS).toEqual([",", ";"]);
+  });
+});
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/spec.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/spec.test.ts
new file mode 100644
index 00000000000..7ddaa0fc8fe
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/spec.test.ts
@@ -0,0 +1,110 @@
+import { describe, expect, test } from "bun:test";
+
+import { parseEvalFunctionSpec, parseEvalValue } from "../../judge/spec";
+
+describe("parseEvalFunctionSpec", () => {
+  test("bare function name with no kwargs", () => {
+    expect(parseEvalFunctionSpec("norm_phrase_set_match")).toEqual({
+      name: "norm_phrase_set_match",
+      kwargs: {},
+    });
+  });
+
+  test("converts snake_case kwarg keys to camelCase", () => {
+    const parsed = parseEvalFunctionSpec(
+      "norm_phrase_set_match|require_non_empty=false",
+    );
+    expect(parsed.name).toBe("norm_phrase_set_match");
+    expect(parsed.kwargs).toEqual({ requireNonEmpty: false });
+  });
+
+  test("parses bool true/false case-insensitively", () => {
+    const parsed = parseEvalFunctionSpec(
+      "norm_phrase_set_match|lower=TRUE|strip_punct=False",
+    );
+    expect(parsed.kwargs).toEqual({ lower: true, stripPunct: false });
+  });
+
+  test("special-cases separators with empty value", () => {
+    expect(
+      parseEvalFunctionSpec("norm_phrase_set_match|separators=").kwargs,
+    ).toEqual({ separators: [] });
+  });
+
+  test("special-cases separators with single char", () => {
+    expect(
+      parseEvalFunctionSpec("norm_phrase_set_match_ordered|separators=>")
+        .kwargs,
+    ).toEqual({ separators: [">"] });
+  });
+
+  test("special-cases separators with bracketed JSON list", () => {
+    // Note: `|` is reserved as the spec-level kwarg separator and cannot
+    // appear inside a JSON list value — the input would already be
+    // pipe-split before this branch runs. This matches V2's Python behavior.
+    expect(
+      parseEvalFunctionSpec('mc_choice_set_match|separators=[",", ";"]').kwargs,
+    ).toEqual({ separators: [",", ";"] });
+  });
+
+  test("parses none/null to null", () => {
+    expect(
+      parseEvalFunctionSpec("mc_choice_match|strip_chars=NONE").kwargs,
+    ).toEqual({ stripChars: null });
+  });
+
+  test("parses integers and floats", () => {
+    const parsed = parseEvalFunctionSpec(
+      "norm_phrase_set_match|some_int=42|some_float=1.5",
+    );
+    expect(parsed.kwargs).toEqual({ someInt: 42, someFloat: 1.5 });
+  });
+
+  test("leaves non-numeric values as strings", () => {
+    expect(
+      parseEvalFunctionSpec("mc_choice_match|strip_chars=.").kwargs,
+    ).toEqual({ stripChars: "." });
+  });
+
+  test("rejects empty spec", () => {
+    expect(() => parseEvalFunctionSpec("")).toThrow(/non-empty string/);
+  });
+
+  test("rejects non-string spec", () => {
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    expect(() => parseEvalFunctionSpec(42 as any)).toThrow(/non-empty string/);
+  });
+
+  test("rejects kwargs missing =", () => {
+    expect(() =>
+      parseEvalFunctionSpec("norm_phrase_set_match|noequals"),
+    ).toThrow(/Invalid eval function option/);
+  });
+
+  test("rejects duplicate kwarg keys", () => {
+    expect(() =>
+      parseEvalFunctionSpec("norm_phrase_set_match|lower=true|lower=false"),
+    ).toThrow(/Duplicate eval function option/);
+  });
+
+  test("rejects empty kwarg key", () => {
+    expect(() => parseEvalFunctionSpec("norm_phrase_set_match|=value")).toThrow(
+      /Invalid eval function option/,
+    );
+  });
+});
+
+describe("parseEvalValue", () => {
+  test("integer-shaped values parse as int", () => {
+    expect(parseEvalValue("some_int", "42")).toBe(42);
+    expect(parseEvalValue("some_int", "-3")).toBe(-3);
+  });
+
+  test("float-shaped values parse as float", () => {
+    expect(parseEvalValue("some_float", "1.5")).toBe(1.5);
+  });
+
+  test("numeric-with-trailing-text stays a string", () => {
+    expect(parseEvalValue("some_str", "12abc")).toBe("12abc");
+  });
+});
diff --git a/evals/benchmarks/longmemeval-v2/src/judge/deterministic.ts b/evals/benchmarks/longmemeval-v2/src/judge/deterministic.ts
new file mode 100644
index 00000000000..ea9bf7e8497
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/judge/deterministic.ts
@@ -0,0 +1,148 @@
+/**
+ * Deterministic (no-LLM) evaluators. TypeScript ports of the corresponding
+ * functions in V2's `evaluation/qa_eval_metrics.py`:
+ *
+ * - `norm_phrase_set_match` — phrase-set membership (unordered)
+ * - `norm_phrase_set_match_ordered` — phrase-set membership (ordered)
+ * - `mc_choice_match` — single multiple-choice letter
+ * - `mc_choice_set_match` — multi-select multiple-choice letters
+ */
+
+import {
+  DEFAULT_SEPARATORS,
+  escapeRegex,
+  normalizePhrase,
+  splitPhrases,
+  type SplitOptions,
+} from "./normalize";
+
+export interface PhraseSetMatchOptions extends SplitOptions { // options for both phrase-set matchers
+  requireNonEmpty?: boolean; // default true: an empty prediction or empty answer phrase list never matches
+}
+
+/** True when every distinct answer phrase occurs, on word boundaries, in the normalized
+ *  prediction. With `requireNonEmpty` (default true) an empty prediction/answer is a mismatch. */
+export function normPhraseSetMatch(
+  prediction: unknown,
+  answer: unknown,
+  opts: PhraseSetMatchOptions = {},
+): boolean {
+  const pred = normalizePhrase(prediction, opts);
+  const separators = opts.separators ?? DEFAULT_SEPARATORS;
+  const phrases = splitPhrases(answer, { ...opts, separators });
+  if ((opts.requireNonEmpty ?? true) && (!pred || phrases.length === 0)) {
+    return false;
+  }
+  // Unicode-aware stand-in for `\b`: JS `\b` only knows ASCII `\w` (Python's is Unicode).
+  const w = "[\\p{L}\\p{N}_]";
+  const edge = `(?:(?<=${w})(?!${w})|(?<!${w})(?=${w}))`;
+  return [...new Set(phrases)].every((phrase) =>
+    new RegExp(edge + escapeRegex(phrase) + edge, "u").test(pred),
+  );
+}
+
+/** Ordered phrase-set match: answer phrases must occur in sequence, each found
+ *  after the previous hit. Word edges are Unicode-aware (JS `\b` is ASCII-only). */
+export function normPhraseSetMatchOrdered(
+  prediction: unknown,
+  answer: unknown,
+  opts: PhraseSetMatchOptions = {},
+): boolean {
+  const pred = normalizePhrase(prediction, opts);
+  const separators = opts.separators ?? DEFAULT_SEPARATORS;
+  const phrases = splitPhrases(answer, { ...opts, separators });
+  if ((opts.requireNonEmpty ?? true) && (!pred || phrases.length === 0)) {
+    return false;
+  }
+  const w = "[\\p{L}\\p{N}_]";
+  const edge = `(?:(?<=${w})(?!${w})|(?<!${w})(?=${w}))`;
+  let rest = pred;
+  for (const phrase of phrases) {
+    const hit = new RegExp(edge + escapeRegex(phrase) + edge, "u").exec(rest);
+    if (!hit) return false;
+    rest = rest.slice(hit.index + hit[0].length);
+  }
+  return true;
+}
+
+export interface McChoiceMatchOptions {
+  stripChars?: string; // characters deleted from the candidate before comparing; default "."
+  requireNonEmpty?: boolean; // default true: an empty cleaned prediction or answer never matches
+}
+
+/**
+ * Single-letter multiple-choice check. The candidate is the contents of a
+ * `\boxed{...}` in the prediction if present, else the whole prediction; the
+ * words "choice"/"option" and every `stripChars` character (default ".") are
+ * removed before a trimmed, case-insensitive comparison with the answer.
+ */
+export function mcChoiceMatch(
+  prediction: unknown,
+  answer: unknown,
+  opts: McChoiceMatchOptions = {},
+): boolean {
+  if (prediction === null || prediction === undefined) return false;
+  if (answer === null || answer === undefined) return false;
+  const rawPred = String(prediction);
+  const wanted = String(answer).trim().toUpperCase();
+  const boxed = /\\boxed\{([^}]*)\}/.exec(rawPred.toLowerCase());
+  const stripped = [...(opts.stripChars ?? ".")].reduce(
+    (text, ch) => text.split(ch).join(""),
+    (boxed ? boxed[1] : rawPred).replace(/\b(choice|option)\b/gi, ""),
+  );
+  const got = stripped.trim().toUpperCase();
+  if ((opts.requireNonEmpty ?? true) && (!got || !wanted)) return false;
+  return got === wanted;
+}
+
+const MULTI_SELECT_FILLER_WORDS = new Set([ // upper-case words that extractMultiSelectLetters skips whole
+  "AND",
+  "ANSWER",
+  "ANSWERS",
+  "CHOICE",
+  "CHOICES",
+  "FINAL",
+  "LETTER",
+  "LETTERS",
+  "OPTION",
+  "OPTIONS",
+]);
+
+/**
+ * Collects every A-Z letter in `text` (after upper-casing), skipping whole
+ * filler words such as "ANSWER" or "OPTIONS". Null/undefined give `[]`.
+ */
+export function extractMultiSelectLetters(text: unknown): string[] {
+  if (text === null || text === undefined) return [];
+  const words = String(text).toUpperCase().match(/[A-Z]+/g) ?? [];
+  return words
+    .filter((word) => !MULTI_SELECT_FILLER_WORDS.has(word))
+    .flatMap((word) => [...word]);
+}
+
+export interface McChoiceSetMatchOptions {
+  requireNonEmpty?: boolean; // default true: no extracted letters on either side never matches
+}
+
+/**
+ * Multi-select check: the set of letters extracted from the prediction must
+ * equal the set extracted from the answer (order and repeats are ignored).
+ * With `requireNonEmpty` (default true) an empty side never matches.
+ */
+export function mcChoiceSetMatch(
+  prediction: unknown,
+  answer: unknown,
+  opts: McChoiceSetMatchOptions = {},
+): boolean {
+  const predicted = new Set(extractMultiSelectLetters(prediction));
+  const expected = new Set(extractMultiSelectLetters(answer));
+  const anyEmpty = predicted.size === 0 || expected.size === 0;
+  if ((opts.requireNonEmpty ?? true) && anyEmpty) {
+    return false;
+  }
+  // Equal sizes plus one-way containment implies set equality.
+  return (
+    predicted.size === expected.size &&
+    [...predicted].every((letter) => expected.has(letter))
+  );
+}
diff --git a/evals/benchmarks/longmemeval-v2/src/judge/index.ts b/evals/benchmarks/longmemeval-v2/src/judge/index.ts
new file mode 100644
index 00000000000..ab79e00e47a
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/judge/index.ts
@@ -0,0 +1,162 @@
+/**
+ * Public entry point for the LongMemEval-V2 evaluator. Dispatches per
+ * question's `eval_function` spec string to one of:
+ *
+ *  Deterministic (no LLM):
+ *    - norm_phrase_set_match
+ *    - norm_phrase_set_match_ordered
+ *    - mc_choice_match
+ *    - mc_choice_set_match
+ *
+ *  LLM judges (default model `gpt-5.2`, reasoning_effort=medium):
+ *    - llm_abstention_checker — flawed-premise questions
+ *    - llm_gotchas_checker    — insight-style gotchas
+ *
+ * Mirrors `eval_from_spec` in V2's `evaluation/qa_eval_metrics.py`:
+ * caller-supplied overrides win over per-question spec kwargs.
+ */
+
+import {
+  mcChoiceMatch,
+  mcChoiceSetMatch,
+  normPhraseSetMatch,
+  normPhraseSetMatchOrdered,
+  type McChoiceMatchOptions,
+  type McChoiceSetMatchOptions,
+  type PhraseSetMatchOptions,
+} from "./deterministic";
+import {
+  llmAbstentionChecker,
+  llmGotchasChecker,
+  type LlmJudgeOptions,
+} from "./llm";
+import { parseEvalFunctionSpec } from "./spec";
+
+export interface EvalInputs {
+  prediction: unknown;
+  answer: unknown;
+  /** Question record (used by LLM judges to pull `question.text`). */
+  questionItem?: Record<string, unknown> | null;
+  /** Extracted "final answer" from the model, when distinct from prediction. */
+  parsedPrediction?: string | null;
+  /** Raw full model response, when distinct from prediction. */
+  modelResponse?: string | null;
+}
+
+/** Caller-side overrides applied after the per-question spec kwargs. */
+export type EvalOverrides = LlmJudgeOptions &
+  PhraseSetMatchOptions &
+  McChoiceMatchOptions &
+  McChoiceSetMatchOptions;
+
+export interface EvalResult {
+  label: boolean;
+  /** Populated by LLM judges; empty string for deterministic functions. */
+  reason: string;
+  /** The dispatched function name in V2 snake_case, for logging/audit. */
+  function: string;
+}
+
+/**
+ * Evaluates one prediction with the evaluator named by `spec` (TypeScript
+ * counterpart of `eval_from_spec` in V2's `evaluation/qa_eval_metrics.py`).
+ *
+ * `spec` is parsed with `parseEvalFunctionSpec`; its kwargs are merged with
+ * `overrides`, and overrides win. Override entries whose value is `undefined`
+ * are ignored, so forwarding an unset optional field (`{ lower: cfg.lower }`)
+ * cannot erase an option the question's spec asked for.
+ *
+ * @param spec - `"name|key=value|..."` string from the question record.
+ * @param inputs - Prediction, reference answer, and optional judge context.
+ * @param overrides - Caller-side options applied on top of the spec kwargs.
+ * @returns The label, the judge's reason ("" for deterministic evaluators),
+ *   and the dispatched function name.
+ * @throws Error for a malformed spec or an unknown function name; errors from
+ *   the LLM judges propagate unchanged.
+ */
+export async function evalFromSpec(
+  spec: string,
+  inputs: EvalInputs,
+  overrides: EvalOverrides = {},
+): Promise<EvalResult> {
+  const { name, kwargs } = parseEvalFunctionSpec(spec);
+  // `{ ...kwargs, ...overrides }` would let an explicit `undefined` override
+  // shadow a spec kwarg, so only defined override values are merged.
+  const definedOverrides = Object.fromEntries(
+    Object.entries(overrides ?? {}).filter(([, value]) => value !== undefined),
+  );
+  const merged = { ...kwargs, ...definedOverrides };
+  const { prediction, answer } = inputs;
+
+  const deterministic = (label: boolean): EvalResult => ({
+    label,
+    reason: "",
+    function: name,
+  });
+  // Judge context always comes from `inputs`, never from spec kwargs.
+  const llm: LlmJudgeOptions = {
+    ...(merged as LlmJudgeOptions),
+    questionItem: inputs.questionItem ?? null,
+    parsedPrediction: inputs.parsedPrediction ?? null,
+    modelResponse: inputs.modelResponse ?? null,
+  };
+
+  switch (name) {
+    // Deterministic evaluators: no network call, empty reason.
+    case "norm_phrase_set_match": {
+      const opts = merged as PhraseSetMatchOptions;
+      return deterministic(normPhraseSetMatch(prediction, answer, opts));
+    }
+    case "norm_phrase_set_match_ordered": {
+      const opts = merged as PhraseSetMatchOptions;
+      return deterministic(normPhraseSetMatchOrdered(prediction, answer, opts));
+    }
+    case "mc_choice_match": {
+      const opts = merged as McChoiceMatchOptions;
+      return deterministic(mcChoiceMatch(prediction, answer, opts));
+    }
+    case "mc_choice_set_match": {
+      const opts = merged as McChoiceSetMatchOptions;
+      return deterministic(mcChoiceSetMatch(prediction, answer, opts));
+    }
+    // LLM judges: label and reason come from the evaluator model.
+    case "llm_abstention_checker": {
+      const verdict = await llmAbstentionChecker(prediction, answer, llm);
+      return { ...verdict, function: name };
+    }
+    case "llm_gotchas_checker": {
+      const verdict = await llmGotchasChecker(prediction, answer, llm);
+      return { ...verdict, function: name };
+    }
+    default:
+      throw new Error(`Unknown eval function: ${name}`);
+  }
+}
+
+export { parseEvalFunctionSpec, parseEvalValue } from "./spec";
+export { normalizePhrase, splitPhrases, DEFAULT_SEPARATORS } from "./normalize";
+export {
+  mcChoiceMatch,
+  mcChoiceSetMatch,
+  normPhraseSetMatch,
+  normPhraseSetMatchOrdered,
+  extractMultiSelectLetters,
+} from "./deterministic";
+export {
+  llmAbstentionChecker,
+  llmGotchasChecker,
+  DEFAULT_EVALUATOR_MODEL,
+  DEFAULT_EVALUATOR_REASONING_EFFORT,
+  DEFAULT_EVALUATOR_MAX_COMPLETION_TOKENS,
+  DEFAULT_EVALUATOR_TIMEOUT_SECONDS,
+  DEFAULT_EVALUATOR_API_KEY_ENV,
+  DEFAULT_OPENAI_BASE_URL,
+  type LlmJudgeOptions,
+  type LlmJudgeResult,
+  type ReasoningEffort,
+} from "./llm";
+export {
+  parseLlmBinaryJudgement,
+  stripMarkdownCodeFence,
+  type ParsedJudgement,
+} from "./judgement";
diff --git a/evals/benchmarks/longmemeval-v2/src/judge/judgement.ts b/evals/benchmarks/longmemeval-v2/src/judge/judgement.ts
new file mode 100644
index 00000000000..4fc25ee07e6
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/judge/judgement.ts
@@ -0,0 +1,76 @@
+/**
+ * Parses the LLM judge's binary judgement output.
+ *
+ * V2 instructs both judges to emit `{"label": 0 or 1, "reason": "..."}`.
+ * Real models still go off-script — Markdown code fences, prose around the
+ * JSON, single-quoted "JSON", `label=1` shorthand. `parseLlmBinaryJudgement`
+ * mirrors V2's `_parse_llm_binary_judgement`:
+ *
+ *  1. Strip a wrapping triple-backtick fence (with or without a language tag).
+ *  2. Try to JSON-parse the span from the first `{` to the last `}`.
+ *  3. Fall back to a regex on `label: 0|1` in any common quote style.
+ *  4. Throw if none of the above matches.
+ */
+
+export interface ParsedJudgement {
+  label: 0 | 1; // numeric verdict as emitted by the judge
+  reason: string; // JSON `reason` field, or the whole cleaned reply on the regex fallback
+}
+
+/** Trims `text` and, when it is wrapped in a ``` fence spanning at least three
+ *  lines, returns only the trimmed lines between the opening and closing fence. */
+export function stripMarkdownCodeFence(text: string): string {
+  const body = text.trim();
+  const fenced = body.startsWith("```") && body.endsWith("```");
+  const rows = fenced ? body.split("\n") : [];
+  if (rows.length < 3) return body;
+  // Drop the opening fence line (with any language tag) and the closing one.
+  return rows.slice(1, -1).join("\n").trim();
+}
+
+/**
+ * Extracts the `{label, reason}` verdict from an evaluator reply. After
+ * trimming and removing a wrapping code fence it JSON-parses the span from
+ * the first `{` to the last `}` (label may be 0, 1, "0" or "1"); failing that
+ * it searches for `"label": N`, `'label': N` or `label[:=]N` and returns the
+ * whole cleaned reply as the reason.
+ * @throws Error when the reply is empty or no 0/1 label is found.
+ */
+export function parseLlmBinaryJudgement(text: unknown): ParsedJudgement {
+  const cleaned = stripMarkdownCodeFence(stringify(text));
+  if (!cleaned) {
+    throw new Error("Empty judgement response from evaluator model.");
+  }
+  const braces = /\{[\s\S]*\}/.exec(cleaned);
+  let payload: unknown;
+  try {
+    payload = braces ? JSON.parse(braces[0]) : undefined;
+  } catch {
+    payload = undefined; // invalid JSON: fall back to the label regexes
+  }
+  if (payload && typeof payload === "object" && !Array.isArray(payload)) {
+    const { label, reason } = payload as Record<string, unknown>;
+    const accepted: unknown[] = [0, 1, "0", "1"];
+    if (accepted.includes(label)) {
+      return { label: Number(label) as 0 | 1, reason: stringify(reason) };
+    }
+  }
+  const shapes = [
+    /"label"\s*:\s*([01])/i,
+    /'label'\s*:\s*([01])/i,
+    /\blabel\b\s*[:=]\s*([01])/i,
+  ];
+  for (const shape of shapes) {
+    const hit = shape.exec(cleaned);
+    if (hit) return { label: Number(hit[1]) as 0 | 1, reason: cleaned };
+  }
+  throw new Error(
+    `Could not parse evaluator binary judgement: ${JSON.stringify(cleaned)}`,
+  );
+}
+
+/** Coerces any value to a trimmed string; null and undefined become "". */
+function stringify(value: unknown): string {
+  const missing = value === null || value === undefined;
+  return missing ? "" : String(value).trim();
+}
diff --git a/evals/benchmarks/longmemeval-v2/src/judge/llm.ts b/evals/benchmarks/longmemeval-v2/src/judge/llm.ts
new file mode 100644
index 00000000000..8643160a587
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/judge/llm.ts
@@ -0,0 +1,342 @@
+/**
+ * LLM judges. TypeScript ports of `llm_abstention_checker` and
+ * `llm_gotchas_checker` from V2's `evaluation/qa_eval_metrics.py`.
+ *
+ * Both judges:
+ *  - issue an OpenAI-shape chat completion against `evaluatorModel`
+ *    (V2 default = `gpt-5.2` with `reasoning_effort=medium`)
+ *  - use a strict system prompt + a rubric-laden user prompt
+ *  - require a JSON binary judgement `{"label": 0|1, "reason": "..."}`
+ *
+ * Transport is a direct `fetch` to the chat completions endpoint, matching
+ * `simulator/user-simulator.ts`. Tests swap `globalThis.fetch`; no
+ * production wrapper.
+ */
+
+import { parseLlmBinaryJudgement } from "./judgement";
+
+const ABSTENTION_JUDGE_SYSTEM_PROMPT =
+  "You are a strict grader for flawed-premise (abstention) questions. " +
+  "Judge whether a model answer correctly identifies that the question premise is wrong, " +
+  "consistent with the reference answer. " +
+  "If the model follows the flawed premise and gives a concrete answer under that premise, " +
+  "it must be graded 0. " +
+  "If the model's final answer is just UNKNOWN / cannot determine without identifying the flaw, grade 0. " +
+  "If the model is contradictory (both rejects premise and also gives a concrete premise-following answer), grade 0. " +
+  "Paraphrases are allowed when they preserve the same core flaw described by the reference answer.";
+
+const GOTCHAS_JUDGE_SYSTEM_PROMPT =
+  "You are a strict grader for gotchas-style insight questions. " +
+  "The reference answer describes the key insight(s). " +
+  "Grade 1 if the model response includes at least one correct insight point from the reference answer " +
+  "(paraphrase allowed), and does not contradict any reference point. " +
+  "If the model's direction is wrong, or it contains contradictions against any reference point, grade 0. " +
+  "If the model gives multiple points, partial coverage is enough for 1 as long as no contradictions appear.";
+
+/** V2 `run_eval.py` defaults. */
+export const DEFAULT_EVALUATOR_MODEL = "gpt-5.2";
+export const DEFAULT_EVALUATOR_REASONING_EFFORT: ReasoningEffort = "medium";
+export const DEFAULT_EVALUATOR_MAX_COMPLETION_TOKENS = 2048;
+export const DEFAULT_EVALUATOR_TIMEOUT_SECONDS = 43200; // 12 hours
+export const DEFAULT_EVALUATOR_API_KEY_ENV = "OPENAI_API_KEY"; // env var read when no explicit key is given
+export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"; // "/chat/completions" is appended per request
+
+export type ReasoningEffort = "low" | "medium" | "high";
+
+export interface LlmJudgeOptions {
+  evaluatorModel?: string;
+  evaluatorBaseUrl?: string;
+  evaluatorApiKey?: string;
+  /** Env var to read the API key from when `evaluatorApiKey` is omitted. */
+  evaluatorApiKeyEnv?: string;
+  evaluatorReasoningEffort?: ReasoningEffort;
+  evaluatorMaxCompletionTokens?: number;
+  evaluatorTemperature?: number;
+  evaluatorTopP?: number;
+  evaluatorTimeoutSeconds?: number;
+  requireNonEmpty?: boolean;
+
+  /** Question record, used to pull `question.text` into the prompt. */
+  questionItem?: Record<string, unknown> | null;
+  /** Extracted "final answer" from the model response, if available. */
+  parsedPrediction?: string | null;
+  /** Raw full model response, if it differs from `prediction`. */
+  modelResponse?: string | null;
+}
+
+export interface LlmJudgeResult {
+  label: boolean; // true when the judge's parsed label is 1
+  reason: string; // judge rationale, or a fixed note when an empty input short-circuits
+}
+
+/** LLM judge for flawed-premise (abstention) questions: hands the inputs to
+ *  `runLlmJudge` with the abstention system prompt and user-prompt builder. */
+export async function llmAbstentionChecker(
+  prediction: unknown,
+  answer: unknown,
+  opts: LlmJudgeOptions = {},
+): Promise<LlmJudgeResult> {
+  const systemPrompt = ABSTENTION_JUDGE_SYSTEM_PROMPT;
+  const buildUserPrompt = buildAbstentionUserPrompt;
+  const inputs = { prediction, answer, opts };
+  const run: JudgeRun = { systemPrompt, buildUserPrompt, ...inputs };
+  return runLlmJudge(run);
+}
+
+/** LLM judge for gotchas-style insight questions: hands the inputs to
+ *  `runLlmJudge` with the gotchas system prompt and user-prompt builder. */
+export async function llmGotchasChecker(
+  prediction: unknown,
+  answer: unknown,
+  opts: LlmJudgeOptions = {},
+): Promise<LlmJudgeResult> {
+  const systemPrompt = GOTCHAS_JUDGE_SYSTEM_PROMPT;
+  const buildUserPrompt = buildGotchasUserPrompt;
+  const inputs = { prediction, answer, opts };
+  const run: JudgeRun = { systemPrompt, buildUserPrompt, ...inputs };
+  return runLlmJudge(run);
+}
+
+interface JudgeRun { // everything runLlmJudge needs for one judgement
+  systemPrompt: string;
+  buildUserPrompt: (args: UserPromptInputs) => string; // renders the user message sent after the system prompt
+  prediction: unknown;
+  answer: unknown;
+  opts: LlmJudgeOptions;
+}
+
+/**
+ * Shared driver for both LLM judges: short-circuits to label=false on empty
+ * inputs (when `requireNonEmpty`, default true), otherwise resolves an API
+ * key, sends one chat completion and maps the parsed 0/1 label to a boolean.
+ * @throws Error if no API key resolves; request and parse errors propagate.
+ */
+async function runLlmJudge(run: JudgeRun): Promise<LlmJudgeResult> {
+  const { opts } = run;
+  const predictionText = stringify(run.prediction);
+  const answerText = stringify(run.answer);
+  const mustBeNonEmpty = opts.requireNonEmpty ?? true;
+  if (mustBeNonEmpty && !(predictionText && answerText)) {
+    return { label: false, reason: "empty prediction or answer" };
+  }
+  const apiKey = resolveApiKey(opts);
+  if (!apiKey) {
+    throw new Error(
+      "LLM judge requires an API key. Set evaluatorApiKey, set the env var named " +
+        "by evaluatorApiKeyEnv (default OPENAI_API_KEY), or supply evaluatorBaseUrl " +
+        "for a key-less local endpoint.",
+    );
+  }
+  const questionText = extractQuestionText(opts.questionItem);
+  const modelFinalAnswer = stringify(opts.parsedPrediction) || predictionText;
+  const modelFullResponse = stringify(opts.modelResponse) || predictionText;
+  if (mustBeNonEmpty && !modelFinalAnswer) {
+    return { label: false, reason: "empty final answer" };
+  }
+  const userPrompt = run.buildUserPrompt({
+    questionText,
+    referenceAnswer: answerText,
+    modelFullResponse,
+    modelFinalAnswer,
+  });
+  const request: ChatCompletionRequest = {
+    apiKey,
+    baseUrl: opts.evaluatorBaseUrl ?? DEFAULT_OPENAI_BASE_URL,
+    model: opts.evaluatorModel ?? DEFAULT_EVALUATOR_MODEL,
+    messages: [
+      { role: "system", content: run.systemPrompt },
+      { role: "user", content: userPrompt },
+    ],
+    maxCompletionTokens:
+      opts.evaluatorMaxCompletionTokens ??
+      DEFAULT_EVALUATOR_MAX_COMPLETION_TOKENS,
+    reasoningEffort:
+      opts.evaluatorReasoningEffort ?? DEFAULT_EVALUATOR_REASONING_EFFORT,
+    temperature: opts.evaluatorTemperature,
+    topP: opts.evaluatorTopP,
+    timeoutSeconds:
+      opts.evaluatorTimeoutSeconds ?? DEFAULT_EVALUATOR_TIMEOUT_SECONDS,
+  };
+  const verdict = parseLlmBinaryJudgement(await callChatCompletion(request));
+  return { label: verdict.label === 1, reason: verdict.reason };
+}
+
+interface UserPromptInputs {
+  questionText: string;
+  referenceAnswer: string;
+  modelFullResponse: string;
+  modelFinalAnswer: string;
+}
+
+/** Renders the user message for the abstention judge: question, reference
+ *  answer, full response, extracted final answer, the scoring rubric, and the
+ *  JSON output instruction, concatenated in that order. */
+function buildAbstentionUserPrompt(args: UserPromptInputs): string {
+  const parts = [
+    "Evaluate whether the model correctly handled a flawed-premise question.\n\n",
+    `Question:\n${args.questionText}\n\n`,
+    `Reference answer (the expected flaw-aware conclusion):\n${args.referenceAnswer}\n\n`,
+    `Model full response:\n${args.modelFullResponse}\n\n`,
+    `Model extracted final answer:\n${args.modelFinalAnswer}\n\n`,
+    "Scoring rubric:\n",
+    "- Label 1 only if the model clearly identifies the flawed premise and reaches a conclusion ",
+    "equivalent to the reference answer.\n",
+    "- Label 1 also if the model explicitly says it lacks access to the user's specific ",
+    "live environment/instance/configuration and therefore cannot verify, provided it does not ",
+    "give a concrete premise-following answer.\n",
+    "- Label 0 if the model follows the flawed premise and gives a concrete answer under that premise.\n",
+    "- Label 0 for generic UNKNOWN/insufficient-info replies that do not identify a flaw and do not ",
+    "make the explicit environment-access limitation clear.\n",
+    "- Label 0 if contradictory.\n\n",
+    "Output JSON only:\n",
+    '{"label": 0 or 1, "reason": "short rationale"}',
+  ];
+  return parts.join("");
+}
+
+/** Renders the user message for the gotchas judge: question, reference insight
+ *  points, full response, extracted final answer, the scoring rubric, and the
+ *  JSON output instruction, concatenated in that order. */
+function buildGotchasUserPrompt(args: UserPromptInputs): string {
+  const parts = [
+    "Evaluate whether the model answer captures the gotcha insight.\n\n",
+    `Question:\n${args.questionText}\n\n`,
+    `Reference answer (insight points):\n${args.referenceAnswer}\n\n`,
+    `Model full response:\n${args.modelFullResponse}\n\n`,
+    `Model extracted final answer:\n${args.modelFinalAnswer}\n\n`,
+    "Scoring rubric:\n",
+    "- Label 1 if the model includes at least one correct insight point from the reference answer ",
+    "(paraphrase acceptable), and does not contradict any reference point.\n",
+    "- Label 1 even if only part of a multi-point reference answer is covered, as long as there is ",
+    "no contradiction.\n",
+    "- Label 0 if direction is wrong (suggests opposite action/cause), even if some wording overlaps.\n",
+    "- Label 0 if any point in the model response contradicts any reference point.\n",
+    "- Label 0 if the response is irrelevant or generic without insight.\n\n",
+    "Output JSON only:\n",
+    '{"label": 0 or 1, "reason": "short rationale"}',
+  ];
+  return parts.join("");
+}
+
+interface ChatCompletionRequest {
+  apiKey: string;
+  baseUrl: string;
+  model: string;
+  messages: ReadonlyArray<{
+    role: "system" | "user" | "assistant";
+    content: string;
+  }>;
+  maxCompletionTokens: number;
+  reasoningEffort?: ReasoningEffort;
+  temperature?: number;
+  topP?: number;
+  timeoutSeconds: number;
+}
+
+/**
+ * Sends one OpenAI-style chat completion request and returns the reply text.
+ *
+ * The abort timer stays armed until the response body has been read, so a
+ * server that sends headers and then stalls cannot hang the run. Optional
+ * sampling fields are only sent when the caller set them.
+ *
+ * @returns The trimmed first-choice content: a string, or the `text` parts of
+ *   array content joined with newlines.
+ * @throws Error on a non-2xx status or empty content; other errors propagate.
+ */
+async function callChatCompletion(
+  request: ChatCompletionRequest,
+): Promise<string> {
+  const url = `${request.baseUrl.replace(/\/+$/, "")}/chat/completions`;
+  const body: Record<string, unknown> = {
+    model: request.model,
+    messages: request.messages,
+    max_completion_tokens: request.maxCompletionTokens,
+  };
+  if (request.reasoningEffort !== undefined) {
+    body.reasoning_effort = request.reasoningEffort;
+  }
+  if (request.temperature !== undefined) body.temperature = request.temperature;
+  if (request.topP !== undefined) body.top_p = request.topP;
+
+  const controller = new AbortController();
+  const timeoutHandle = setTimeout(
+    () => controller.abort(),
+    Math.max(1, request.timeoutSeconds) * 1000,
+  );
+  let data: ChatCompletionResponse | null;
+  try {
+    const response = await fetch(url, {
+      method: "POST",
+      headers: {
+        "content-type": "application/json",
+        authorization: `Bearer ${request.apiKey}`,
+      },
+      body: JSON.stringify(body),
+      signal: controller.signal,
+    });
+    if (!response.ok) {
+      const errorBody = await response.text().catch(() => "");
+      throw new Error(
+        `Evaluator chat completion failed: HTTP ${response.status} ${response.statusText}` +
+          (errorBody ? ` — ${errorBody.slice(0, 400)}` : ""),
+      );
+    }
+    data = (await response.json()) as ChatCompletionResponse | null;
+  } finally {
+    clearTimeout(timeoutHandle);
+  }
+  const content = data?.choices?.[0]?.message?.content;
+  const pieces: ReadonlyArray<{ text?: unknown } | null | undefined> =
+    Array.isArray(content) ? content : [{ text: content }];
+  const text = pieces
+    .map((piece) => piece?.text)
+    .filter((t): t is string => typeof t === "string")
+    .join("\n")
+    .trim();
+  if (!text) {
+    throw new Error("Evaluator model returned empty response content.");
+  }
+  return text;
+}
+
+interface ChatCompletionResponse {
+  choices?: Array<{
+    message?: {
+      content?: string | Array<{ text?: string }>;
+    };
+  }>;
+}
+
+/** Resolves the API key: a non-empty `evaluatorApiKey`, else the env var named
+ *  by `evaluatorApiKeyEnv`, else "EMPTY" when a custom base URL is set. An
+ *  empty/null explicit key counts as unset so it cannot mask those fallbacks. */
+function resolveApiKey(opts: LlmJudgeOptions): string | undefined {
+  const envName = opts.evaluatorApiKeyEnv ?? DEFAULT_EVALUATOR_API_KEY_ENV;
+  const key = opts.evaluatorApiKey || process.env[envName];
+  // Mirror Python: a base URL implies a local server that may accept "EMPTY".
+  return key || (opts.evaluatorBaseUrl ? "EMPTY" : undefined);
+}
+
+/** Trimmed string form of any value; null and undefined map to "". */
+function stringify(value: unknown): string {
+  if (value === null || value === undefined) return "";
+  return String(value).trim();
+}
+
+/**
+ * Pulls the question text out of a question record. Accepts either a string
+ * `question` field or an object `question` with a string `text` field; the
+ * result is trimmed. Any other shape (or a missing record) yields "".
+ */
+function extractQuestionText(
+  item: Record<string, unknown> | null | undefined,
+): string {
+  if (!item || typeof item !== "object") return "";
+  const question: unknown = item.question;
+  if (typeof question === "string") return question.trim();
+  if (!question || typeof question !== "object") return "";
+  // Nested form, e.g. { question: { text: "..." } }.
+  const text: unknown = (question as { text?: unknown }).text;
+  return typeof text === "string" ? text.trim() : "";
+}
diff --git a/evals/benchmarks/longmemeval-v2/src/judge/normalize.ts b/evals/benchmarks/longmemeval-v2/src/judge/normalize.ts
new file mode 100644
index 00000000000..8c6739c841a
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/judge/normalize.ts
@@ -0,0 +1,60 @@
+/**
+ * Text normalization primitives used by the deterministic evaluators.
+ *
+ * Mirrors `normalize_phrase` and `split_phrases` from V2's
+ * `evaluation/qa_eval_metrics.py`. Defaults match the Python version:
+ * lowercase, hyphen/underscore → space, comma/semicolon → space, strip
+ * non-word characters, collapse runs of whitespace.
+ */
+
+export const DEFAULT_SEPARATORS: ReadonlyArray<string> = [",", ";"]; // used when options give no `separators`
+
+export interface NormalizeOptions {
+  lower?: boolean; // default true: lowercase the text
+  normalizeHyphen?: boolean; // default true: "-" and "_" become spaces
+  stripPunct?: boolean; // default true: drop characters matching /[^\w\s]/
+}
+
+export interface SplitOptions extends NormalizeOptions {
+  separators?: ReadonlyArray<string>; // literal delimiters; default DEFAULT_SEPARATORS, [] disables splitting
+}
+
+/**
+ * Normalizes free text for phrase matching. In order: optional lowercasing,
+ * optional hyphen/underscore -> space, comma/semicolon -> space (always),
+ * optional removal of characters outside `[\w\s]`, whitespace collapse + trim.
+ * NOTE(review): Python 3's `\w` is Unicode-aware for str patterns while JS
+ * `\w` is ASCII-only, so `stripPunct` also drops non-ASCII letters here
+ * (e.g. "café" -> "caf") — confirm whether parity with V2 matters.
+ */
+export function normalizePhrase(
+  text: unknown,
+  opts: NormalizeOptions = {},
+): string {
+  if (text === null || text === undefined) return "";
+  let out = String(text);
+  if (opts.lower ?? true) out = out.toLowerCase();
+  if (opts.normalizeHyphen ?? true) out = out.replace(/[-_]/g, " ");
+  out = out.replace(/[,;]/g, " ");
+  if (opts.stripPunct ?? true) out = out.replace(/[^\w\s]/g, "");
+  return out.replace(/\s+/g, " ").trim();
+}
+
+/**
+ * Splits `text` on any of `separators` (default `DEFAULT_SEPARATORS`) and
+ * returns the normalized, non-empty pieces. An empty separator list means
+ * "do not split": the whole text is normalized as a single phrase.
+ */
+export function splitPhrases(text: unknown, opts: SplitOptions = {}): string[] {
+  if (text === null || text === undefined) return [];
+  const separators = opts.separators ?? DEFAULT_SEPARATORS;
+  const pieces: unknown[] =
+    separators.length === 0
+      ? [text]
+      : String(text).split(new RegExp(separators.map(escapeRegex).join("|")));
+  return pieces.map((piece) => normalizePhrase(piece, opts)).filter(Boolean);
+}
+
+export function escapeRegex(s: string): string {
+  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // backslash-prefix each regex metacharacter so `s` matches literally
+}
diff --git a/evals/benchmarks/longmemeval-v2/src/judge/spec.ts b/evals/benchmarks/longmemeval-v2/src/judge/spec.ts
new file mode 100644
index 00000000000..237837f54fe
--- /dev/null
+++ b/evals/benchmarks/longmemeval-v2/src/judge/spec.ts
@@ -0,0 +1,87 @@
+/**
+ * Parser for V2's `eval_function` spec strings.
+ *
+ * Spec format (from `parse_eval_function_spec` in V2's
+ * `evaluation/qa_eval_metrics.py`):
+ *
+ *     "<name>|<key1>=<value1>|<key2>=<value2>|..."
+ *
+ * Where `<name>` is the eval function identifier (in V2's snake_case) and
+ * each `<keyN>=<valueN>` is a kwarg override. We keep `name` in V2's
+ * snake_case so the dispatcher can match it verbatim; we convert kwarg
+ * keys to TypeScript-idiomatic camelCase so callers can spread them
+ * alongside other camelCase options.
+ */
+
+export interface ParsedEvalSpec {
+  /** Function identifier in V2 snake_case (e.g. `norm_phrase_set_match`). */
+  name: string;
+  /** Kwargs from the spec string, converted to camelCase. */
+  kwargs: Record<string, unknown>;
+}
+
+/**
+ * Parses `"name|key=value|..."` into the function name (kept in snake_case)
+ * and kwargs keyed in camelCase. Empty `|` segments are skipped and values
+ * are typed by `parseEvalValue`.
+ *
+ * @throws Error if `spec` is not a non-empty string, has no name, has an
+ *   option without `=` or without a key, or repeats a key.
+ */
+export function parseEvalFunctionSpec(spec: unknown): ParsedEvalSpec {
+  if (typeof spec !== "string" || spec.length === 0) {
+    throw new Error("eval function spec must be a non-empty string.");
+  }
+  const [name, ...options] = spec.split("|").map((part) => part.trim());
+  if (!name) throw new Error("eval function spec missing function name.");
+  const kwargs: Record<string, unknown> = {};
+  for (const option of options.filter(Boolean)) {
+    const eq = option.indexOf("=");
+    const rawKey = eq === -1 ? "" : option.slice(0, eq).trim();
+    if (!rawKey) throw new Error(`Invalid eval function option: ${option}`);
+    const camelKey = snakeToCamel(rawKey);
+    // Own-property test: `in` also sees Object.prototype members, so a first
+    // use of a key such as `constructor` would be reported as a duplicate.
+    if (Object.prototype.hasOwnProperty.call(kwargs, camelKey)) {
+      throw new Error(`Duplicate eval function option: ${rawKey}`);
+    }
+    kwargs[camelKey] = parseEvalValue(rawKey, option.slice(eq + 1).trim());
+  }
+  return { name, kwargs };
+}
+
+/**
+ * Types one raw spec value: "true"/"false" -> boolean and "none"/"null" ->
+ * null (any case); for the `separators`/`separator` key a JSON array literal
+ * is used as-is, otherwise the value's non-whitespace characters; integer and
+ * decimal literals -> number; anything else is returned unchanged.
+ */
+export function parseEvalValue(rawKey: string, value: string): unknown {
+  switch (value.toLowerCase()) {
+    case "true":
+      return true;
+    case "false":
+      return false;
+    case "none":
+    case "null":
+      return null;
+  }
+  if (rawKey === "separators" || rawKey === "separator") {
+    const literal = value.trim();
+    if (literal.startsWith("[") && literal.endsWith("]")) {
+      try {
+        const parsed: unknown = JSON.parse(literal);
+        if (Array.isArray(parsed)) return parsed;
+      } catch {
+        // Not valid JSON: treat the value as a bag of separator characters.
+      }
+    }
+    return [...value].filter((ch) => !/\s/.test(ch));
+  }
+  if (/^-?\d+\.\d+$/.test(value)) return Number.parseFloat(value);
+  return /^-?\d+$/.test(value) ? Number.parseInt(value, 10) : value;
+}
+
+function snakeToCamel(key: string): string {
+  return key.replace(/_([a-z0-9])/g, (_, ch: string) => ch.toUpperCase()); // "_x" -> "X"; other underscores are kept
+}