diff --git a/evals/benchmarks/longmemeval-v2/README.md b/evals/benchmarks/longmemeval-v2/README.md index 9cf68fbf450..a8a8e6d00b7 100644 --- a/evals/benchmarks/longmemeval-v2/README.md +++ b/evals/benchmarks/longmemeval-v2/README.md @@ -48,15 +48,48 @@ interface BenchmarkItem { questionId: string; // V2 questions.jsonl `id` (stable question id) ability: string; // V2 questions.jsonl `question_type` (one of the five abilities) question: string; - answer: string; // reference answer, used by the GPT-4o judge + answer: string; // reference answer, used by the dispatched evaluator trajectoryIds: string[]; // ordered haystack from haystacks/lme_v2_.json } ``` The V2 schema also ships `domain`, `environment`, `image`, and `eval_function` fields (see `SCHEMA.md` in the published dataset). The -loader's zod schema preserves these via `.passthrough()`; the runner / -judge will consume them in subsequent PRs without the loader having to -grow first. +loader's zod schema preserves these via `.passthrough()`; the judge and +the runner consume them. -This PR ships the loader and its fixture tests only. The two-conversation runner (`run-ingest-ask`), GPT-4o paper-faithful judge, and Phase 1 wiring land in subsequent PRs against the contract established here. +## Judge + +`src/judge/index.ts` exports `evalFromSpec(spec, inputs, overrides?)` — a +TypeScript port of V2's `evaluation/qa_eval_metrics.py`. 
Each V2 question +carries an `eval_function` spec string of the form `"name|key=value|..."` +that dispatches to one of six implementations: + +**Deterministic (no LLM):** + +- `norm_phrase_set_match` — phrase-set membership (unordered) +- `norm_phrase_set_match_ordered` — phrase-set membership (ordered) +- `mc_choice_match` — single multiple-choice letter +- `mc_choice_set_match` — multi-select multiple-choice letters + +**LLM judges** (default `gpt-5.2` with `reasoning_effort=medium`, per V2's +`run_eval.py` defaults): + +- `llm_abstention_checker` — flawed-premise (abstention) questions +- `llm_gotchas_checker` — insight-style gotcha questions + +Both LLM judges issue an OpenAI-shape chat completion with a strict +system prompt + rubric, expect `{"label": 0|1, "reason": "..."}` JSON +output, and tolerate Markdown code fences + regex-fallback parsing. +Transport is a direct `fetch` to the chat completions endpoint — tests +swap `globalThis.fetch`, no production wrapper. + +`evalFromSpec` returns `{ label: boolean, reason: string, function: string }`. +`reason` is empty for deterministic evaluators; the function name is +echoed for audit/logging. + +## Next + +The two-conversation runner (`run-ingest-ask`, shipped in PR #32356) and +this judge unblock Phase 1 wiring (5-item smoke against +`vellum-simple-memory`), which lands in a follow-up PR. 
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl b/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl index 8317cf09830..a51383069b7 100644 --- a/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl +++ b/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl @@ -1,3 +1,3 @@ -{"id": "q_001", "domain": "web", "environment": "shopping_admin", "question_type": "static-state-recall", "question": "What is the URL of the project settings page?", "image": null, "answer": "/settings/project", "eval_function": "exact_match"} -{"id": "q_002", "domain": "enterprise", "environment": "servicenow", "question_type": "dynamic-state-tracking", "question": "After the bulk import completed, what was the new total record count?", "image": null, "answer": "12,481", "eval_function": "exact_match"} -{"id": "q_003", "domain": "web", "environment": "gitlab", "question_type": "workflow-knowledge", "question": "What sequence of clicks creates a new dashboard?", "image": null, "answer": "Dashboards > New > template > Save", "eval_function": "exact_match", "extra_field_for_passthrough": true} +{"id": "q_001", "domain": "web", "environment": "shopping_admin", "question_type": "static-state-recall", "question": "What is the URL of the project settings page?", "image": null, "answer": "/settings/project", "eval_function": "norm_phrase_set_match"} +{"id": "q_002", "domain": "enterprise", "environment": "servicenow", "question_type": "dynamic-state-tracking", "question": "After the bulk import completed, what was the new total record count?", "image": null, "answer": "12,481", "eval_function": "norm_phrase_set_match|separators="} +{"id": "q_003", "domain": "web", "environment": "gitlab", "question_type": "workflow-knowledge", "question": "What sequence of clicks creates a new dashboard?", "image": null, "answer": "Dashboards > New > template > Save", "eval_function": 
"norm_phrase_set_match_ordered|separators=>", "extra_field_for_passthrough": true} diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/deterministic.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/deterministic.test.ts new file mode 100644 index 00000000000..6a346ae0ff0 --- /dev/null +++ b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/deterministic.test.ts @@ -0,0 +1,153 @@ +import { describe, expect, test } from "bun:test"; + +import { + extractMultiSelectLetters, + mcChoiceMatch, + mcChoiceSetMatch, + normPhraseSetMatch, + normPhraseSetMatchOrdered, +} from "../../judge/deterministic"; + +describe("normPhraseSetMatch", () => { + test("matches single phrase contained in prediction", () => { + expect( + normPhraseSetMatch("The URL is /settings/project.", "/settings/project"), + ).toBe(true); + }); + + test("requires every comma-separated phrase to appear", () => { + expect(normPhraseSetMatch("apple and banana", "apple, banana")).toBe(true); + expect(normPhraseSetMatch("apple only", "apple, banana")).toBe(false); + }); + + test("order does not matter for the set variant", () => { + expect(normPhraseSetMatch("banana apple", "apple, banana")).toBe(true); + }); + + test("respects custom separator", () => { + expect( + normPhraseSetMatch("apple and banana", "apple|banana", { + separators: ["|"], + }), + ).toBe(true); + }); + + test("requireNonEmpty=true rejects empty prediction", () => { + expect(normPhraseSetMatch("", "apple")).toBe(false); + }); + + test("requireNonEmpty=false allows empty answer to match anything", () => { + expect(normPhraseSetMatch("anything", "", { requireNonEmpty: false })).toBe( + true, + ); + }); + + test("normalization makes hyphenated phrase match prediction with spaces", () => { + expect(normPhraseSetMatch("project settings", "project-settings")).toBe( + true, + ); + }); + + test("respects empty separators (single-phrase answer with comma in it)", () => { + expect( + normPhraseSetMatch("the count was 12,481 
records", "12,481", { + separators: [], + }), + ).toBe(true); + }); +}); + +describe("normPhraseSetMatchOrdered", () => { + test("matches when answer phrases appear in order", () => { + expect( + normPhraseSetMatchOrdered( + "click Dashboards, then New, then template, then Save", + "Dashboards > New > template > Save", + { separators: [">"] }, + ), + ).toBe(true); + }); + + test("fails when order is wrong", () => { + expect( + normPhraseSetMatchOrdered( + "click Save, then template, then New, then Dashboards", + "Dashboards > New > template > Save", + { separators: [">"] }, + ), + ).toBe(false); + }); + + test("requireNonEmpty=true rejects empty prediction", () => { + expect(normPhraseSetMatchOrdered("", "a > b", { separators: [">"] })).toBe( + false, + ); + }); +}); + +describe("mcChoiceMatch", () => { + test("matches single letter regardless of case", () => { + expect(mcChoiceMatch("a", "A")).toBe(true); + }); + + test("extracts from \\boxed{...}", () => { + expect(mcChoiceMatch("After thinking, \\boxed{c}", "C")).toBe(true); + }); + + test('strips "choice" and "option" words', () => { + expect(mcChoiceMatch("Choice B", "B")).toBe(true); + expect(mcChoiceMatch("Option d.", "D")).toBe(true); + }); + + test("strips trailing periods by default", () => { + expect(mcChoiceMatch("B.", "B")).toBe(true); + }); + + test("respects custom stripChars", () => { + expect(mcChoiceMatch("B)", "B", { stripChars: ".)" })).toBe(true); + }); + + test("returns false when letters differ", () => { + expect(mcChoiceMatch("A", "B")).toBe(false); + }); + + test("null/undefined prediction or answer → false", () => { + expect(mcChoiceMatch(null, "A")).toBe(false); + expect(mcChoiceMatch("A", undefined)).toBe(false); + }); +}); + +describe("mcChoiceSetMatch", () => { + test("set-equality on multi-letter answers regardless of order", () => { + expect(mcChoiceSetMatch("A, B, D", "D B A")).toBe(true); + }); + + test('filters filler words like "and"/"option"', () => { + 
/**
 * Test helper: swaps `globalThis.fetch` for a stub that answers every request
 * with an HTTP 200 OpenAI-shape chat completion whose single choice carries
 * `content`.
 *
 * This function does not restore the real `fetch`; the caller must put the
 * saved original back (the suite does so in `afterEach`).
 *
 * @param content - Text placed at `choices[0].message.content` of the stubbed
 *   response body.
 * @returns A holder whose `body` is set to the JSON-parsed request body of the
 *   most recent stubbed call that supplied one; it stays unset otherwise.
 */
function mockOpenAI(content: string): { body?: Record<string, unknown> } {
  // `Record` needs explicit type arguments; a bare `Record` does not compile.
  const captured: { body?: Record<string, unknown> } = {};
  globalThis.fetch = (async (
    _url: string | URL | Request,
    init?: RequestInit,
  ) => {
    if (init?.body !== undefined) {
      // The judge sends a JSON string body, so stringify-then-parse recovers it.
      captured.body = JSON.parse(String(init.body)) as Record<string, unknown>;
    }
    return new Response(
      JSON.stringify({ choices: [{ message: { content } }] }),
      { status: 200, headers: { "content-type": "application/json" } },
    );
  }) as typeof fetch;
  return captured;
}
"the project settings page is /settings/project", + answer: "/settings/project", + }); + expect(result).toEqual({ + label: true, + reason: "", + function: "norm_phrase_set_match", + }); + }); + + test("dispatches norm_phrase_set_match_ordered with separator kwargs from spec", async () => { + const result = await evalFromSpec( + "norm_phrase_set_match_ordered|separators=>", + { + prediction: "click Dashboards then New then template then Save", + answer: "Dashboards > New > template > Save", + }, + ); + expect(result.label).toBe(true); + expect(result.function).toBe("norm_phrase_set_match_ordered"); + }); + + test("dispatches mc_choice_match", async () => { + const result = await evalFromSpec("mc_choice_match", { + prediction: "Choice B.", + answer: "B", + }); + expect(result.label).toBe(true); + expect(result.function).toBe("mc_choice_match"); + }); + + test("dispatches mc_choice_set_match", async () => { + const result = await evalFromSpec("mc_choice_set_match", { + prediction: "Final answer: A and C", + answer: "C, A", + }); + expect(result.label).toBe(true); + expect(result.function).toBe("mc_choice_set_match"); + }); + + test("dispatches llm_abstention_checker through the OpenAI transport", async () => { + mockOpenAI('{"label": 1, "reason": "premise rejected"}'); + + const result = await evalFromSpec( + "llm_abstention_checker", + { + prediction: "The premise here is wrong because X.", + answer: "Reject the premise.", + questionItem: { question: "Why does Z fail?" 
}, + }, + { evaluatorModel: "gpt-5.2", evaluatorApiKey: "unit-test" }, + ); + + expect(result).toEqual({ + label: true, + reason: "premise rejected", + function: "llm_abstention_checker", + }); + }); + + test("dispatches llm_gotchas_checker through the OpenAI transport", async () => { + mockOpenAI('{"label": 1, "reason": "captures insight"}'); + + const result = await evalFromSpec( + "llm_gotchas_checker", + { + prediction: "You need to refresh the cache first.", + answer: "Cache must be invalidated before reload.", + }, + { evaluatorModel: "gpt-5.2", evaluatorApiKey: "unit-test" }, + ); + + expect(result.label).toBe(true); + expect(result.function).toBe("llm_gotchas_checker"); + }); + + test("caller overrides win over spec kwargs", async () => { + // Spec sets requireNonEmpty=true; override forces false so an empty + // answer still matches. + const result = await evalFromSpec( + "norm_phrase_set_match|require_non_empty=true", + { prediction: "non-empty", answer: "" }, + { requireNonEmpty: false }, + ); + expect(result.label).toBe(true); + }); + + test("throws on unknown function name", async () => { + await expect( + evalFromSpec("definitely_not_real", { prediction: "x", answer: "y" }), + ).rejects.toThrow(/Unknown eval function/); + }); + + test("propagates parse errors on malformed spec strings", async () => { + await expect( + evalFromSpec("norm_phrase_set_match|noequals", { + prediction: "x", + answer: "y", + }), + ).rejects.toThrow(/Invalid eval function option/); + }); +}); diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/judgement.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/judgement.test.ts new file mode 100644 index 00000000000..e964bcf5085 --- /dev/null +++ b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/judgement.test.ts @@ -0,0 +1,89 @@ +import { describe, expect, test } from "bun:test"; + +import { + parseLlmBinaryJudgement, + stripMarkdownCodeFence, +} from "../../judge/judgement"; + 
+describe("stripMarkdownCodeFence", () => { + test("removes leading and trailing triple-backtick fences", () => { + const text = '```json\n{"label": 1}\n```'; + expect(stripMarkdownCodeFence(text)).toBe('{"label": 1}'); + }); + + test("passes through non-fenced text unchanged", () => { + expect(stripMarkdownCodeFence('{"label": 0}')).toBe('{"label": 0}'); + }); +}); + +describe("parseLlmBinaryJudgement", () => { + test("parses strict JSON with label=1 and reason", () => { + const result = parseLlmBinaryJudgement( + '{"label": 1, "reason": "identified flaw"}', + ); + expect(result).toEqual({ label: 1, reason: "identified flaw" }); + }); + + test("parses strict JSON with label=0", () => { + const result = parseLlmBinaryJudgement('{"label": 0, "reason": "wrong"}'); + expect(result).toEqual({ label: 0, reason: "wrong" }); + }); + + test("strips a code fence before JSON parse", () => { + const result = parseLlmBinaryJudgement( + '```\n{"label": 1, "reason": "ok"}\n```', + ); + expect(result).toEqual({ label: 1, reason: "ok" }); + }); + + test("strips a code fence with a language tag", () => { + const result = parseLlmBinaryJudgement( + '```json\n{"label": 0, "reason": "nope"}\n```', + ); + expect(result).toEqual({ label: 0, reason: "nope" }); + }); + + test("accepts label as a stringy 0 or 1", () => { + expect( + parseLlmBinaryJudgement('{"label": "1", "reason": "yep"}').label, + ).toBe(1); + expect( + parseLlmBinaryJudgement('{"label": "0", "reason": "nope"}').label, + ).toBe(0); + }); + + test("falls back to regex when JSON is malformed", () => { + const text = "Here is my judgement: {label: 1, reason: oops}"; + const result = parseLlmBinaryJudgement(text); + expect(result.label).toBe(1); + // Regex fallback returns the whole cleaned string as the reason. 
/** Record of the most recent call made to the stubbed `fetch` in these tests. */
interface CapturedRequest {
  /** First argument the code under test passed to `fetch`. */
  url: string | URL | Request;
  /** Second argument passed to `fetch`, when one was supplied. */
  init?: RequestInit;
  /**
   * JSON-parsed `init.body`, set only when the call carried a body.
   * `Record` needs explicit type arguments; a bare `Record` does not compile.
   */
  body?: Record<string, unknown>;
}
/**
 * Builds the minimal OpenAI chat-completions response body used by these
 * tests: one choice whose message carries `content`.
 *
 * @param content - Text to place at `choices[0].message.content`.
 * @returns A plain JSON-serializable object of that shape.
 */
function openAIChatBody(content: string): Record<string, unknown> {
  // `Record` needs explicit type arguments; a bare `Record` does not compile.
  return { choices: [{ message: { content } }] };
}
}, + }); + + expect(String(captured.url)).toBe( + "https://api.openai.com/v1/chat/completions", + ); + expect(captured.init?.method).toBe("POST"); + const headers = new Headers(captured.init?.headers); + expect(headers.get("authorization")).toBe("Bearer unit-test"); + expect(headers.get("content-type")).toBe("application/json"); + expect(captured.body?.model).toBe("gpt-5.2"); + expect(captured.body?.reasoning_effort).toBe("medium"); + expect(captured.body?.max_completion_tokens).toBe(2048); + const messages = captured.body?.messages as Array<{ + role: string; + content: string; + }>; + expect(messages).toHaveLength(2); + expect(messages[0].role).toBe("system"); + expect(messages[0].content).toContain("flawed-premise"); + expect(messages[1].role).toBe("user"); + expect(messages[1].content).toContain("Q?"); + expect(messages[1].content).toContain("ans"); + }); + + test("respects a custom base URL", async () => { + const captured = mockOpenAIChatCompletions( + openAIChatBody('{"label": 1, "reason": "ok"}'), + ); + + await llmAbstentionChecker("pred", "ans", { + evaluatorModel: "gpt-5.2", + evaluatorApiKey: "unit-test", + evaluatorBaseUrl: "http://localhost:8001/v1", + }); + + expect(String(captured.url)).toBe( + "http://localhost:8001/v1/chat/completions", + ); + }); + + test("throws when evaluatorApiKey is missing and env is unset and no base URL", async () => { + const previous = process.env.OPENAI_API_KEY; + delete process.env.OPENAI_API_KEY; + try { + await expect( + llmAbstentionChecker("p", "a", { evaluatorModel: "gpt-5.2" }), + ).rejects.toThrow(/API key/); + } finally { + if (previous !== undefined) process.env.OPENAI_API_KEY = previous; + } + }); + + test("returns label=false with explanatory reason on empty prediction", async () => { + // No mock — should short-circuit before any fetch call. 
+ globalThis.fetch = (async () => { + throw new Error("fetch should not be called"); + }) as unknown as typeof fetch; + + const result = await llmAbstentionChecker("", "ref", { + evaluatorModel: "gpt-5.2", + evaluatorApiKey: "unit-test", + }); + + expect(result.label).toBe(false); + expect(result.reason).toContain("empty"); + }); + + test("non-2xx HTTP response surfaces as an error with status code", async () => { + mockOpenAIChatCompletions({ error: "rate limited" }, 429); + + await expect( + llmAbstentionChecker("p", "a", { + evaluatorModel: "gpt-5.2", + evaluatorApiKey: "unit-test", + }), + ).rejects.toThrow(/HTTP 429/); + }); + + test("parses code-fenced judgement output", async () => { + mockOpenAIChatCompletions( + openAIChatBody('```json\n{"label": 1, "reason": "ok"}\n```'), + ); + + const result = await llmAbstentionChecker("p", "a", { + evaluatorModel: "gpt-5.2", + evaluatorApiKey: "unit-test", + }); + + expect(result.label).toBe(true); + expect(result.reason).toBe("ok"); + }); +}); + +describe("llmGotchasChecker", () => { + afterEach(() => { + globalThis.fetch = originalFetch; + }); + + test("uses the gotchas system prompt and rubric", async () => { + const captured = mockOpenAIChatCompletions( + openAIChatBody('{"label": 1, "reason": "covers insight"}'), + ); + + await llmGotchasChecker("response covers insight", "insight A; insight B", { + evaluatorModel: "gpt-5.2", + evaluatorApiKey: "unit-test", + questionItem: { question: "What gotcha applies?" 
}, + }); + + const messages = captured.body?.messages as Array<{ + role: string; + content: string; + }>; + expect(messages[0].content).toContain("gotchas-style insight"); + expect(messages[1].content).toContain("gotcha insight"); + expect(messages[1].content).toContain("insight A; insight B"); + }); +}); diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/normalize.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/normalize.test.ts new file mode 100644 index 00000000000..b74891d2142 --- /dev/null +++ b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/normalize.test.ts @@ -0,0 +1,71 @@ +import { describe, expect, test } from "bun:test"; + +import { + DEFAULT_SEPARATORS, + normalizePhrase, + splitPhrases, +} from "../../judge/normalize"; + +describe("normalizePhrase", () => { + test("lowercases, replaces hyphens/underscores, strips punctuation", () => { + expect(normalizePhrase("New-Dashboard_Layout!")).toBe( + "new dashboard layout", + ); + }); + + test("collapses runs of whitespace and trims", () => { + expect(normalizePhrase(" hello world ")).toBe("hello world"); + }); + + test("returns empty string for null and undefined", () => { + expect(normalizePhrase(null)).toBe(""); + expect(normalizePhrase(undefined)).toBe(""); + }); + + test("respects opts.lower=false", () => { + expect(normalizePhrase("AaBb", { lower: false })).toBe("AaBb"); + }); + + test("respects opts.normalizeHyphen=false", () => { + expect(normalizePhrase("a-b_c", { normalizeHyphen: false })).toBe("ab_c"); + }); + + test("respects opts.stripPunct=false (preserves non-word non-space)", () => { + expect(normalizePhrase("hello, world!", { stripPunct: false })).toBe( + "hello world!", + ); + }); + + test("stringifies non-string input via String()", () => { + expect(normalizePhrase(42)).toBe("42"); + }); +}); + +describe("splitPhrases", () => { + test("default separators split on commas and semicolons", () => { + expect(splitPhrases("foo, bar; baz")).toEqual(["foo", "bar", 
"baz"]); + }); + + test("empty separators returns a single normalized phrase", () => { + expect(splitPhrases("12,481", { separators: [] })).toEqual(["12 481"]); + }); + + test("custom separator >", () => { + expect( + splitPhrases("Dashboards > New > template > Save", { separators: [">"] }), + ).toEqual(["dashboards", "new", "template", "save"]); + }); + + test("filters out parts that normalize to empty", () => { + expect(splitPhrases("foo,,bar")).toEqual(["foo", "bar"]); + }); + + test("null/undefined → empty array", () => { + expect(splitPhrases(null)).toEqual([]); + expect(splitPhrases(undefined)).toEqual([]); + }); + + test("DEFAULT_SEPARATORS exports comma and semicolon", () => { + expect(DEFAULT_SEPARATORS).toEqual([",", ";"]); + }); +}); diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/judge/spec.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/spec.test.ts new file mode 100644 index 00000000000..7ddaa0fc8fe --- /dev/null +++ b/evals/benchmarks/longmemeval-v2/src/__tests__/judge/spec.test.ts @@ -0,0 +1,110 @@ +import { describe, expect, test } from "bun:test"; + +import { parseEvalFunctionSpec, parseEvalValue } from "../../judge/spec"; + +describe("parseEvalFunctionSpec", () => { + test("bare function name with no kwargs", () => { + expect(parseEvalFunctionSpec("norm_phrase_set_match")).toEqual({ + name: "norm_phrase_set_match", + kwargs: {}, + }); + }); + + test("converts snake_case kwarg keys to camelCase", () => { + const parsed = parseEvalFunctionSpec( + "norm_phrase_set_match|require_non_empty=false", + ); + expect(parsed.name).toBe("norm_phrase_set_match"); + expect(parsed.kwargs).toEqual({ requireNonEmpty: false }); + }); + + test("parses bool true/false case-insensitively", () => { + const parsed = parseEvalFunctionSpec( + "norm_phrase_set_match|lower=TRUE|strip_punct=False", + ); + expect(parsed.kwargs).toEqual({ lower: true, stripPunct: false }); + }); + + test("special-cases separators with empty value", () => { + expect( + 
parseEvalFunctionSpec("norm_phrase_set_match|separators=").kwargs, + ).toEqual({ separators: [] }); + }); + + test("special-cases separators with single char", () => { + expect( + parseEvalFunctionSpec("norm_phrase_set_match_ordered|separators=>") + .kwargs, + ).toEqual({ separators: [">"] }); + }); + + test("special-cases separators with bracketed JSON list", () => { + // Note: `|` is reserved as the spec-level kwarg separator and cannot + // appear inside a JSON list value — the input would already be + // pipe-split before this branch runs. This matches V2's Python behavior. + expect( + parseEvalFunctionSpec('mc_choice_set_match|separators=[",", ";"]').kwargs, + ).toEqual({ separators: [",", ";"] }); + }); + + test("parses none/null to null", () => { + expect( + parseEvalFunctionSpec("mc_choice_match|strip_chars=NONE").kwargs, + ).toEqual({ stripChars: null }); + }); + + test("parses integers and floats", () => { + const parsed = parseEvalFunctionSpec( + "norm_phrase_set_match|some_int=42|some_float=1.5", + ); + expect(parsed.kwargs).toEqual({ someInt: 42, someFloat: 1.5 }); + }); + + test("leaves non-numeric values as strings", () => { + expect( + parseEvalFunctionSpec("mc_choice_match|strip_chars=.").kwargs, + ).toEqual({ stripChars: "." 
}); + }); + + test("rejects empty spec", () => { + expect(() => parseEvalFunctionSpec("")).toThrow(/non-empty string/); + }); + + test("rejects non-string spec", () => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + expect(() => parseEvalFunctionSpec(42 as any)).toThrow(/non-empty string/); + }); + + test("rejects kwargs missing =", () => { + expect(() => + parseEvalFunctionSpec("norm_phrase_set_match|noequals"), + ).toThrow(/Invalid eval function option/); + }); + + test("rejects duplicate kwarg keys", () => { + expect(() => + parseEvalFunctionSpec("norm_phrase_set_match|lower=true|lower=false"), + ).toThrow(/Duplicate eval function option/); + }); + + test("rejects empty kwarg key", () => { + expect(() => parseEvalFunctionSpec("norm_phrase_set_match|=value")).toThrow( + /Invalid eval function option/, + ); + }); +}); + +describe("parseEvalValue", () => { + test("integer-shaped values parse as int", () => { + expect(parseEvalValue("some_int", "42")).toBe(42); + expect(parseEvalValue("some_int", "-3")).toBe(-3); + }); + + test("float-shaped values parse as float", () => { + expect(parseEvalValue("some_float", "1.5")).toBe(1.5); + }); + + test("numeric-with-trailing-text stays a string", () => { + expect(parseEvalValue("some_str", "12abc")).toBe("12abc"); + }); +}); diff --git a/evals/benchmarks/longmemeval-v2/src/judge/deterministic.ts b/evals/benchmarks/longmemeval-v2/src/judge/deterministic.ts new file mode 100644 index 00000000000..ea9bf7e8497 --- /dev/null +++ b/evals/benchmarks/longmemeval-v2/src/judge/deterministic.ts @@ -0,0 +1,148 @@ +/** + * Deterministic (no-LLM) evaluators. 
/**
 * Unordered phrase-set check: passes when every distinct phrase of `answer`
 * occurs in the normalized `prediction`, delimited by regex word boundaries.
 *
 * `answer` is split with `opts.separators`, falling back to
 * `DEFAULT_SEPARATORS` when none are given; the same `opts` drive the
 * normalization of both sides.
 *
 * @param prediction - Model output to search in.
 * @param answer - Reference answer holding one or more phrases.
 * @param opts - Normalization/splitting options plus `requireNonEmpty`
 *   (treated as `true` when unset).
 * @returns `false` when `requireNonEmpty` is on and either side normalizes to
 *   nothing; otherwise whether all phrases were found (vacuously `true` when
 *   there are no phrases).
 */
export function normPhraseSetMatch(
  prediction: unknown,
  answer: unknown,
  opts: PhraseSetMatchOptions = {},
): boolean {
  const mustBeNonEmpty = opts.requireNonEmpty ?? true;
  const haystack = normalizePhrase(prediction, opts);
  const separators = opts.separators ?? DEFAULT_SEPARATORS;
  const wanted = splitPhrases(answer, { ...opts, separators });

  if (mustBeNonEmpty && !(haystack && wanted.length > 0)) {
    return false;
  }

  // Deduplicate so a repeated phrase is only searched for once.
  return Array.from(new Set(wanted)).every((phrase) =>
    new RegExp(`\\b${escapeRegex(phrase)}\\b`).test(haystack),
  );
}
/** Options accepted by {@link mcChoiceMatch}. */
export interface McChoiceMatchOptions {
  /** Characters removed from the prediction wherever they occur; `"."` when unset. */
  stripChars?: string;
  /** When on (the default), an empty cleaned prediction or empty answer never matches. */
  requireNonEmpty?: boolean;
}

/**
 * Single multiple-choice letter comparison.
 *
 * The prediction is reduced to a candidate letter: the contents of a
 * `\boxed{...}` group if one is present (otherwise the whole text), minus the
 * words "choice"/"option" and every `stripChars` character, then trimmed and
 * upper-cased. The answer is only trimmed and upper-cased.
 *
 * @param prediction - Model output; `null`/`undefined` never matches.
 * @param answer - Reference letter; `null`/`undefined` never matches.
 * @param opts - See {@link McChoiceMatchOptions}.
 * @returns Whether the cleaned prediction equals the cleaned answer.
 */
export function mcChoiceMatch(
  prediction: unknown,
  answer: unknown,
  opts: McChoiceMatchOptions = {},
): boolean {
  if (prediction == null || answer == null) return false;

  const rawPrediction = String(prediction);
  const rawAnswer = String(answer);
  const charsToDrop = opts.stripChars ?? ".";
  const mustBeNonEmpty = opts.requireNonEmpty ?? true;

  // Searched on the lower-cased text, so the captured group is lower-case too;
  // everything is upper-cased again before the comparison.
  const boxed = /\\boxed\{([^}]*)\}/.exec(rawPrediction.toLowerCase());
  const withoutFiller = (boxed ? boxed[1] : rawPrediction).replace(
    /\b(choice|option)\b/gi,
    "",
  );
  const stripped = [...charsToDrop].reduce(
    (text, ch) => text.split(ch).join(""),
    withoutFiller,
  );

  const got = stripped.trim().toUpperCase();
  const want = rawAnswer.trim().toUpperCase();
  if (mustBeNonEmpty && (got === "" || want === "")) return false;
  return got === want;
}
[]; + const letters: string[] = []; + for (const chunk of chunks) { + if (MULTI_SELECT_FILLER_WORDS.has(chunk)) continue; + for (const ch of chunk) letters.push(ch); + } + return letters; +} + +export interface McChoiceSetMatchOptions { + requireNonEmpty?: boolean; +} + +export function mcChoiceSetMatch( + prediction: unknown, + answer: unknown, + opts: McChoiceSetMatchOptions = {}, +): boolean { + const requireNonEmpty = opts.requireNonEmpty ?? true; + const predLetters = extractMultiSelectLetters(prediction); + const ansLetters = extractMultiSelectLetters(answer); + if ( + requireNonEmpty && + (predLetters.length === 0 || ansLetters.length === 0) + ) { + return false; + } + const predSet = new Set(predLetters); + const ansSet = new Set(ansLetters); + if (predSet.size !== ansSet.size) return false; + for (const letter of predSet) { + if (!ansSet.has(letter)) return false; + } + return true; +} diff --git a/evals/benchmarks/longmemeval-v2/src/judge/index.ts b/evals/benchmarks/longmemeval-v2/src/judge/index.ts new file mode 100644 index 00000000000..ab79e00e47a --- /dev/null +++ b/evals/benchmarks/longmemeval-v2/src/judge/index.ts @@ -0,0 +1,162 @@ +/** + * Public entry point for the LongMemEval-V2 evaluator. Dispatches per + * question's `eval_function` spec string to one of: + * + * Deterministic (no LLM): + * - norm_phrase_set_match + * - norm_phrase_set_match_ordered + * - mc_choice_match + * - mc_choice_set_match + * + * LLM judges (default model `gpt-5.2`, reasoning_effort=medium): + * - llm_abstention_checker — flawed-premise questions + * - llm_gotchas_checker — insight-style gotchas + * + * Mirrors `eval_from_spec` in V2's `evaluation/qa_eval_metrics.py`: + * caller-supplied overrides win over per-question spec kwargs. 
+ */ + +import { + mcChoiceMatch, + mcChoiceSetMatch, + normPhraseSetMatch, + normPhraseSetMatchOrdered, + type McChoiceMatchOptions, + type McChoiceSetMatchOptions, + type PhraseSetMatchOptions, +} from "./deterministic"; +import { + llmAbstentionChecker, + llmGotchasChecker, + type LlmJudgeOptions, +} from "./llm"; +import { parseEvalFunctionSpec } from "./spec"; + +export interface EvalInputs { + prediction: unknown; + answer: unknown; + /** Question record (used by LLM judges to pull `question.text`). */ + questionItem?: Record | null; + /** Extracted "final answer" from the model, when distinct from prediction. */ + parsedPrediction?: string | null; + /** Raw full model response, when distinct from prediction. */ + modelResponse?: string | null; +} + +/** Caller-side overrides applied after the per-question spec kwargs. */ +export type EvalOverrides = LlmJudgeOptions & + PhraseSetMatchOptions & + McChoiceMatchOptions & + McChoiceSetMatchOptions; + +export interface EvalResult { + label: boolean; + /** Populated by LLM judges; empty string for deterministic functions. */ + reason: string; + /** The dispatched function name in V2 snake_case, for logging/audit. 
*/ + function: string; +} + +export async function evalFromSpec( + spec: string, + inputs: EvalInputs, + overrides: EvalOverrides = {}, +): Promise { + const { name, kwargs } = parseEvalFunctionSpec(spec); + const merged = { ...kwargs, ...overrides }; + + switch (name) { + case "norm_phrase_set_match": + return { + label: normPhraseSetMatch( + inputs.prediction, + inputs.answer, + merged as PhraseSetMatchOptions, + ), + reason: "", + function: name, + }; + case "norm_phrase_set_match_ordered": + return { + label: normPhraseSetMatchOrdered( + inputs.prediction, + inputs.answer, + merged as PhraseSetMatchOptions, + ), + reason: "", + function: name, + }; + case "mc_choice_match": + return { + label: mcChoiceMatch( + inputs.prediction, + inputs.answer, + merged as McChoiceMatchOptions, + ), + reason: "", + function: name, + }; + case "mc_choice_set_match": + return { + label: mcChoiceSetMatch( + inputs.prediction, + inputs.answer, + merged as McChoiceSetMatchOptions, + ), + reason: "", + function: name, + }; + case "llm_abstention_checker": { + const result = await llmAbstentionChecker( + inputs.prediction, + inputs.answer, + { + ...(merged as LlmJudgeOptions), + questionItem: inputs.questionItem ?? null, + parsedPrediction: inputs.parsedPrediction ?? null, + modelResponse: inputs.modelResponse ?? null, + }, + ); + return { ...result, function: name }; + } + case "llm_gotchas_checker": { + const result = await llmGotchasChecker(inputs.prediction, inputs.answer, { + ...(merged as LlmJudgeOptions), + questionItem: inputs.questionItem ?? null, + parsedPrediction: inputs.parsedPrediction ?? null, + modelResponse: inputs.modelResponse ?? 
null, + }); + return { ...result, function: name }; + } + default: + throw new Error(`Unknown eval function: ${name}`); + } +} + +export { parseEvalFunctionSpec, parseEvalValue } from "./spec"; +export { normalizePhrase, splitPhrases, DEFAULT_SEPARATORS } from "./normalize"; +export { + mcChoiceMatch, + mcChoiceSetMatch, + normPhraseSetMatch, + normPhraseSetMatchOrdered, + extractMultiSelectLetters, +} from "./deterministic"; +export { + llmAbstentionChecker, + llmGotchasChecker, + DEFAULT_EVALUATOR_MODEL, + DEFAULT_EVALUATOR_REASONING_EFFORT, + DEFAULT_EVALUATOR_MAX_COMPLETION_TOKENS, + DEFAULT_EVALUATOR_TIMEOUT_SECONDS, + DEFAULT_EVALUATOR_API_KEY_ENV, + DEFAULT_OPENAI_BASE_URL, + type LlmJudgeOptions, + type LlmJudgeResult, + type ReasoningEffort, +} from "./llm"; +export { + parseLlmBinaryJudgement, + stripMarkdownCodeFence, + type ParsedJudgement, +} from "./judgement"; diff --git a/evals/benchmarks/longmemeval-v2/src/judge/judgement.ts b/evals/benchmarks/longmemeval-v2/src/judge/judgement.ts new file mode 100644 index 00000000000..4fc25ee07e6 --- /dev/null +++ b/evals/benchmarks/longmemeval-v2/src/judge/judgement.ts @@ -0,0 +1,76 @@ +/** + * Parses the LLM judge's binary judgement output. + * + * V2 instructs both judges to emit `{"label": 0 or 1, "reason": "..."}`. + * Real models still go off-script — Markdown code fences, prose around the + * JSON, single-quoted "JSON", `label=1` shorthand. `parseLlmBinaryJudgement` + * mirrors V2's `_parse_llm_binary_judgement`: + * + * 1. Strip a wrapping triple-backtick fence (with or without a language tag). + * 2. Try to JSON-parse the first balanced `{...}` block. + * 3. Fall back to a regex on `label: 0|1` in any common quote style. + * 4. Throw if none of the above matches. 
+ */ + +export interface ParsedJudgement { + label: 0 | 1; + reason: string; +} + +export function stripMarkdownCodeFence(text: string): string { + const stripped = text.trim(); + if (stripped.startsWith("```") && stripped.endsWith("```")) { + const lines = stripped.split("\n"); + if (lines.length >= 3) { + return lines.slice(1, -1).join("\n").trim(); + } + } + return stripped; +} + +export function parseLlmBinaryJudgement(text: unknown): ParsedJudgement { + const cleaned = stripMarkdownCodeFence(stringify(text)); + if (!cleaned) { + throw new Error("Empty judgement response from evaluator model."); + } + + // 1) Strict JSON in the first {…} block. + const jsonMatch = cleaned.match(/\{[\s\S]*\}/); + if (jsonMatch) { + try { + const payload = JSON.parse(jsonMatch[0]); + if (payload && typeof payload === "object" && !Array.isArray(payload)) { + const label = (payload as Record).label; + if (label === 0 || label === 1 || label === "0" || label === "1") { + const reason = stringify((payload as Record).reason); + return { label: Number(label) as 0 | 1, reason }; + } + } + } catch { + // Fall through to regex extraction — matches Python. + } + } + + // 2) Regex fallback for non-strict JSON-shaped outputs. 
+ const patterns: ReadonlyArray = [ + /"label"\s*:\s*([01])/i, + /'label'\s*:\s*([01])/i, + /\blabel\b\s*[:=]\s*([01])/i, + ]; + for (const pattern of patterns) { + const match = cleaned.match(pattern); + if (match) { + return { label: Number(match[1]) as 0 | 1, reason: cleaned }; + } + } + + throw new Error( + `Could not parse evaluator binary judgement: ${JSON.stringify(cleaned)}`, + ); +} + +function stringify(value: unknown): string { + if (value === null || value === undefined) return ""; + if (typeof value === "string") return value.trim(); + return String(value).trim(); +} diff --git a/evals/benchmarks/longmemeval-v2/src/judge/llm.ts b/evals/benchmarks/longmemeval-v2/src/judge/llm.ts new file mode 100644 index 00000000000..8643160a587 --- /dev/null +++ b/evals/benchmarks/longmemeval-v2/src/judge/llm.ts @@ -0,0 +1,342 @@ +/** + * LLM judges. TypeScript ports of `llm_abstention_checker` and + * `llm_gotchas_checker` from V2's `evaluation/qa_eval_metrics.py`. + * + * Both judges: + * - issue an OpenAI-shape chat completion against `evaluatorModel` + * (V2 default = `gpt-5.2` with `reasoning_effort=medium`) + * - use a strict system prompt + a rubric-laden user prompt + * - require a JSON binary judgement `{"label": 0|1, "reason": "..."}` + * + * Transport is a direct `fetch` to the chat completions endpoint, matching + * `simulator/user-simulator.ts`. Tests swap `globalThis.fetch`; no + * production wrapper. + */ + +import { parseLlmBinaryJudgement } from "./judgement"; + +const ABSTENTION_JUDGE_SYSTEM_PROMPT = + "You are a strict grader for flawed-premise (abstention) questions. " + + "Judge whether a model answer correctly identifies that the question premise is wrong, " + + "consistent with the reference answer. " + + "If the model follows the flawed premise and gives a concrete answer under that premise, " + + "it must be graded 0. " + + "If the model's final answer is just UNKNOWN / cannot determine without identifying the flaw, grade 0. 
" + + "If the model is contradictory (both rejects premise and also gives a concrete premise-following answer), grade 0. " + + "Paraphrases are allowed when they preserve the same core flaw described by the reference answer."; + +const GOTCHAS_JUDGE_SYSTEM_PROMPT = + "You are a strict grader for gotchas-style insight questions. " + + "The reference answer describes the key insight(s). " + + "Grade 1 if the model response includes at least one correct insight point from the reference answer " + + "(paraphrase allowed), and does not contradict any reference point. " + + "If the model's direction is wrong, or it contains contradictions against any reference point, grade 0. " + + "If the model gives multiple points, partial coverage is enough for 1 as long as no contradictions appear."; + +/** V2 `run_eval.py` defaults. */ +export const DEFAULT_EVALUATOR_MODEL = "gpt-5.2"; +export const DEFAULT_EVALUATOR_REASONING_EFFORT: ReasoningEffort = "medium"; +export const DEFAULT_EVALUATOR_MAX_COMPLETION_TOKENS = 2048; +export const DEFAULT_EVALUATOR_TIMEOUT_SECONDS = 43200; +export const DEFAULT_EVALUATOR_API_KEY_ENV = "OPENAI_API_KEY"; +export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"; + +export type ReasoningEffort = "low" | "medium" | "high"; + +export interface LlmJudgeOptions { + evaluatorModel?: string; + evaluatorBaseUrl?: string; + evaluatorApiKey?: string; + /** Env var to read the API key from when `evaluatorApiKey` is omitted. */ + evaluatorApiKeyEnv?: string; + evaluatorReasoningEffort?: ReasoningEffort; + evaluatorMaxCompletionTokens?: number; + evaluatorTemperature?: number; + evaluatorTopP?: number; + evaluatorTimeoutSeconds?: number; + requireNonEmpty?: boolean; + + /** Question record, used to pull `question.text` into the prompt. */ + questionItem?: Record | null; + /** Extracted "final answer" from the model response, if available. */ + parsedPrediction?: string | null; + /** Raw full model response, if it differs from `prediction`. 
*/ + modelResponse?: string | null; +} + +export interface LlmJudgeResult { + label: boolean; + reason: string; +} + +export async function llmAbstentionChecker( + prediction: unknown, + answer: unknown, + opts: LlmJudgeOptions = {}, +): Promise { + return runLlmJudge({ + systemPrompt: ABSTENTION_JUDGE_SYSTEM_PROMPT, + buildUserPrompt: buildAbstentionUserPrompt, + prediction, + answer, + opts, + }); +} + +export async function llmGotchasChecker( + prediction: unknown, + answer: unknown, + opts: LlmJudgeOptions = {}, +): Promise { + return runLlmJudge({ + systemPrompt: GOTCHAS_JUDGE_SYSTEM_PROMPT, + buildUserPrompt: buildGotchasUserPrompt, + prediction, + answer, + opts, + }); +} + +interface JudgeRun { + systemPrompt: string; + buildUserPrompt: (args: UserPromptInputs) => string; + prediction: unknown; + answer: unknown; + opts: LlmJudgeOptions; +} + +async function runLlmJudge(run: JudgeRun): Promise { + const predictionText = stringify(run.prediction); + const answerText = stringify(run.answer); + const requireNonEmpty = run.opts.requireNonEmpty ?? true; + if (requireNonEmpty && (!predictionText || !answerText)) { + return { label: false, reason: "empty prediction or answer" }; + } + + const evaluatorModel = run.opts.evaluatorModel ?? DEFAULT_EVALUATOR_MODEL; + const apiKey = resolveApiKey(run.opts); + if (!apiKey) { + throw new Error( + "LLM judge requires an API key. 
Set evaluatorApiKey, set the env var named " + + "by evaluatorApiKeyEnv (default OPENAI_API_KEY), or supply evaluatorBaseUrl " + + "for a key-less local endpoint.", + ); + } + + const questionText = extractQuestionText(run.opts.questionItem); + const finalAnswerText = + stringify(run.opts.parsedPrediction) || predictionText; + const fullResponseText = stringify(run.opts.modelResponse) || predictionText; + if (requireNonEmpty && !finalAnswerText) { + return { label: false, reason: "empty final answer" }; + } + + const userPrompt = run.buildUserPrompt({ + questionText, + referenceAnswer: answerText, + modelFullResponse: fullResponseText, + modelFinalAnswer: finalAnswerText, + }); + + const judgeText = await callChatCompletion({ + apiKey, + baseUrl: run.opts.evaluatorBaseUrl ?? DEFAULT_OPENAI_BASE_URL, + model: evaluatorModel, + messages: [ + { role: "system", content: run.systemPrompt }, + { role: "user", content: userPrompt }, + ], + maxCompletionTokens: + run.opts.evaluatorMaxCompletionTokens ?? + DEFAULT_EVALUATOR_MAX_COMPLETION_TOKENS, + reasoningEffort: + run.opts.evaluatorReasoningEffort ?? DEFAULT_EVALUATOR_REASONING_EFFORT, + temperature: run.opts.evaluatorTemperature, + topP: run.opts.evaluatorTopP, + timeoutSeconds: + run.opts.evaluatorTimeoutSeconds ?? 
DEFAULT_EVALUATOR_TIMEOUT_SECONDS, + }); + + const parsed = parseLlmBinaryJudgement(judgeText); + return { label: parsed.label === 1, reason: parsed.reason }; +} + +interface UserPromptInputs { + questionText: string; + referenceAnswer: string; + modelFullResponse: string; + modelFinalAnswer: string; +} + +function buildAbstentionUserPrompt(args: UserPromptInputs): string { + return ( + "Evaluate whether the model correctly handled a flawed-premise question.\n\n" + + "Question:\n" + + `${args.questionText}\n\n` + + "Reference answer (the expected flaw-aware conclusion):\n" + + `${args.referenceAnswer}\n\n` + + "Model full response:\n" + + `${args.modelFullResponse}\n\n` + + "Model extracted final answer:\n" + + `${args.modelFinalAnswer}\n\n` + + "Scoring rubric:\n" + + "- Label 1 only if the model clearly identifies the flawed premise and reaches a conclusion " + + "equivalent to the reference answer.\n" + + "- Label 1 also if the model explicitly says it lacks access to the user's specific " + + "live environment/instance/configuration and therefore cannot verify, provided it does not " + + "give a concrete premise-following answer.\n" + + "- Label 0 if the model follows the flawed premise and gives a concrete answer under that premise.\n" + + "- Label 0 for generic UNKNOWN/insufficient-info replies that do not identify a flaw and do not " + + "make the explicit environment-access limitation clear.\n" + + "- Label 0 if contradictory.\n\n" + + "Output JSON only:\n" + + '{"label": 0 or 1, "reason": "short rationale"}' + ); +} + +function buildGotchasUserPrompt(args: UserPromptInputs): string { + return ( + "Evaluate whether the model answer captures the gotcha insight.\n\n" + + "Question:\n" + + `${args.questionText}\n\n` + + "Reference answer (insight points):\n" + + `${args.referenceAnswer}\n\n` + + "Model full response:\n" + + `${args.modelFullResponse}\n\n` + + "Model extracted final answer:\n" + + `${args.modelFinalAnswer}\n\n` + + "Scoring rubric:\n" + + "- 
Label 1 if the model includes at least one correct insight point from the reference answer " + + "(paraphrase acceptable), and does not contradict any reference point.\n" + + "- Label 1 even if only part of a multi-point reference answer is covered, as long as there is " + + "no contradiction.\n" + + "- Label 0 if direction is wrong (suggests opposite action/cause), even if some wording overlaps.\n" + + "- Label 0 if any point in the model response contradicts any reference point.\n" + + "- Label 0 if the response is irrelevant or generic without insight.\n\n" + + "Output JSON only:\n" + + '{"label": 0 or 1, "reason": "short rationale"}' + ); +} + +interface ChatCompletionRequest { + apiKey: string; + baseUrl: string; + model: string; + messages: ReadonlyArray<{ + role: "system" | "user" | "assistant"; + content: string; + }>; + maxCompletionTokens: number; + reasoningEffort?: ReasoningEffort; + temperature?: number; + topP?: number; + timeoutSeconds: number; +} + +async function callChatCompletion( + request: ChatCompletionRequest, +): Promise { + const url = `${request.baseUrl.replace(/\/+$/, "")}/chat/completions`; + const body: Record = { + model: request.model, + messages: request.messages, + max_completion_tokens: request.maxCompletionTokens, + }; + if (request.reasoningEffort !== undefined) { + body.reasoning_effort = request.reasoningEffort; + } + if (request.temperature !== undefined) body.temperature = request.temperature; + if (request.topP !== undefined) body.top_p = request.topP; + + const controller = new AbortController(); + const timeoutHandle = setTimeout( + () => controller.abort(), + Math.max(1, request.timeoutSeconds) * 1000, + ); + let response: Response; + try { + response = await fetch(url, { + method: "POST", + headers: { + "content-type": "application/json", + authorization: `Bearer ${request.apiKey}`, + }, + body: JSON.stringify(body), + signal: controller.signal, + }); + } finally { + clearTimeout(timeoutHandle); + } + + if (!response.ok) 
{ + const errorBody = await response.text().catch(() => ""); + throw new Error( + `Evaluator chat completion failed: HTTP ${response.status} ${response.statusText}` + + (errorBody ? ` — ${errorBody.slice(0, 400)}` : ""), + ); + } + + const data = (await response.json()) as ChatCompletionResponse; + const messageContent = data.choices?.[0]?.message?.content; + if (typeof messageContent === "string") { + const trimmed = messageContent.trim(); + if (trimmed) return trimmed; + } + if (Array.isArray(messageContent)) { + const textParts: string[] = []; + for (const item of messageContent) { + if ( + item && + typeof item === "object" && + "text" in item && + typeof (item as { text: unknown }).text === "string" + ) { + textParts.push((item as { text: string }).text); + } + } + const joined = textParts.join("\n").trim(); + if (joined) return joined; + } + throw new Error("Evaluator model returned empty response content."); +} + +interface ChatCompletionResponse { + choices?: Array<{ + message?: { + content?: string | Array<{ text?: string }>; + }; + }>; +} + +function resolveApiKey(opts: LlmJudgeOptions): string | undefined { + if (opts.evaluatorApiKey !== undefined) return opts.evaluatorApiKey; + const envKey = opts.evaluatorApiKeyEnv ?? DEFAULT_EVALUATOR_API_KEY_ENV; + const envValue = process.env[envKey]; + if (envValue) return envValue; + // Mirror Python: a base URL implies a local server that may accept "EMPTY". 
+ if (opts.evaluatorBaseUrl) return "EMPTY"; + return undefined; +} + +function stringify(value: unknown): string { + if (value === null || value === undefined) return ""; + if (typeof value === "string") return value.trim(); + return String(value).trim(); +} + +function extractQuestionText( + item: Record | null | undefined, +): string { + if (!item || typeof item !== "object") return ""; + const q = (item as Record).question; + if (typeof q === "string") return q.trim(); + if ( + q && + typeof q === "object" && + "text" in q && + typeof (q as { text: unknown }).text === "string" + ) { + return (q as { text: string }).text.trim(); + } + return ""; +} diff --git a/evals/benchmarks/longmemeval-v2/src/judge/normalize.ts b/evals/benchmarks/longmemeval-v2/src/judge/normalize.ts new file mode 100644 index 00000000000..8c6739c841a --- /dev/null +++ b/evals/benchmarks/longmemeval-v2/src/judge/normalize.ts @@ -0,0 +1,60 @@ +/** + * Text normalization primitives used by the deterministic evaluators. + * + * Mirrors `normalize_phrase` and `split_phrases` from V2's + * `evaluation/qa_eval_metrics.py`. Defaults match the Python version: + * lowercase, hyphen/underscore → space, comma/semicolon → space, strip + * non-word characters, collapse runs of whitespace. + */ + +export const DEFAULT_SEPARATORS: ReadonlyArray = [",", ";"]; + +export interface NormalizeOptions { + lower?: boolean; + normalizeHyphen?: boolean; + stripPunct?: boolean; +} + +export interface SplitOptions extends NormalizeOptions { + separators?: ReadonlyArray; +} + +export function normalizePhrase( + text: unknown, + opts: NormalizeOptions = {}, +): string { + if (text === null || text === undefined) return ""; + let s = typeof text === "string" ? text : String(text); + const lower = opts.lower ?? true; + const normalizeHyphen = opts.normalizeHyphen ?? true; + const stripPunct = opts.stripPunct ?? 
true; + if (lower) s = s.toLowerCase(); + if (normalizeHyphen) s = s.replace(/[-_]/g, " "); + s = s.replace(/[,;]/g, " "); + if (stripPunct) { + // Python re.sub(r"[^\w\s]", "", text). JS \w is [A-Za-z0-9_], same ASCII + // semantics as Python's default str regex. + s = s.replace(/[^\w\s]/g, ""); + } + s = s.replace(/\s+/g, " ").trim(); + return s; +} + +export function splitPhrases(text: unknown, opts: SplitOptions = {}): string[] { + if (text === null || text === undefined) return []; + const separators = opts.separators ?? DEFAULT_SEPARATORS; + if (separators.length === 0) { + const normalized = normalizePhrase(text, opts); + return normalized ? [normalized] : []; + } + const s = typeof text === "string" ? text : String(text); + const pattern = new RegExp(separators.map(escapeRegex).join("|")); + const parts = s.split(pattern); + return parts + .map((part) => normalizePhrase(part, opts)) + .filter((part) => part.length > 0); +} + +export function escapeRegex(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} diff --git a/evals/benchmarks/longmemeval-v2/src/judge/spec.ts b/evals/benchmarks/longmemeval-v2/src/judge/spec.ts new file mode 100644 index 00000000000..237837f54fe --- /dev/null +++ b/evals/benchmarks/longmemeval-v2/src/judge/spec.ts @@ -0,0 +1,87 @@ +/** + * Parser for V2's `eval_function` spec strings. + * + * Spec format (from `parse_eval_function_spec` in V2's + * `evaluation/qa_eval_metrics.py`): + * + * "|=|=|..." + * + * Where `` is the eval function identifier (in V2's snake_case) and + * each `=` is a kwarg override. We keep `name` in V2's + * snake_case so the dispatcher can match it verbatim; we convert kwarg + * keys to TypeScript-idiomatic camelCase so callers can spread them + * alongside other camelCase options. + */ + +export interface ParsedEvalSpec { + /** Function identifier in V2 snake_case (e.g. `norm_phrase_set_match`). */ + name: string; + /** Kwargs from the spec string, converted to camelCase. 
*/ + kwargs: Record; +} + +export function parseEvalFunctionSpec(spec: unknown): ParsedEvalSpec { + if (typeof spec !== "string" || spec.length === 0) { + throw new Error("eval function spec must be a non-empty string."); + } + const parts = spec.split("|").map((part) => part.trim()); + const name = parts[0]; + if (!name) { + throw new Error("eval function spec missing function name."); + } + const kwargs: Record = {}; + for (const part of parts.slice(1)) { + if (!part) continue; + const eq = part.indexOf("="); + if (eq === -1) { + throw new Error(`Invalid eval function option: ${part}`); + } + const rawKey = part.slice(0, eq).trim(); + const rawValue = part.slice(eq + 1).trim(); + if (!rawKey) { + throw new Error(`Invalid eval function option: ${part}`); + } + const camelKey = snakeToCamel(rawKey); + if (camelKey in kwargs) { + throw new Error(`Duplicate eval function option: ${rawKey}`); + } + kwargs[camelKey] = parseEvalValue(rawKey, rawValue); + } + return { name, kwargs }; +} + +export function parseEvalValue(rawKey: string, value: string): unknown { + const lowered = value.toLowerCase(); + if (lowered === "true" || lowered === "false") { + return lowered === "true"; + } + if (lowered === "none" || lowered === "null") { + return null; + } + if (rawKey === "separators" || rawKey === "separator") { + if (value.length === 0) return []; + const stripped = value.trim(); + if (stripped.startsWith("[") && stripped.endsWith("]")) { + try { + const parsed = JSON.parse(stripped); + if (Array.isArray(parsed)) return parsed; + } catch { + // Fall through to char split — matches Python behavior when JSON + // parsing would fail. + } + } + // Mirror Python: split into individual non-whitespace characters. 
+ return Array.from(value).filter((ch) => !/\s/.test(ch)); + } + if (/^-?\d+\.\d+$/.test(value)) { + return Number.parseFloat(value); + } + if (/^-?\d+$/.test(value)) { + return Number.parseInt(value, 10); + } + return value; +} + +function snakeToCamel(key: string): string { + return key.replace(/_([a-z0-9])/g, (_, ch: string) => ch.toUpperCase()); +}