vellum-ai · dvargasfuertes · May 28, 2026 · May 27, 2026
diff --git a/evals/README.md b/evals/README.md
@@ -69,12 +69,17 @@ evals/
 │   └── p2/
 │       └── manifest.json
 ├── benchmarks/              # One subdirectory per benchmark
-│   └── personal-intelligence/
+│   ├── personal-intelligence/
+│   │   ├── manifest.json    # displayName + unitDirName + unitNoun
+│   │   └── tests/           # Unit definitions (`unitDirName` per manifest)
+│   │       └── timeline-recall/
+│   │           ├── SPEC.md  # simulator briefing
+│   │           └── metrics/ # (optional) per-metric `.ts` scorers
+│   └── longmemeval-v2/
 │       ├── manifest.json    # displayName + unitDirName + unitNoun
-│       └── tests/           # Unit definitions (`unitDirName` per manifest)
-│           └── timeline-recall/
-│               ├── SPEC.md  # simulator briefing
-│               └── metrics/ # (optional) per-metric `.ts` scorers
+│       ├── data/            # gitignored; populate via `data/download.sh`
+│       ├── items/           # virtual unit dir — items materialized by `src/loader.ts`
+│       └── src/             # benchmark-local code (loader, fixtures, tests)
 ├── .env.example             # API key contract
 ├── package.json
 └── AGENTS.md                # Conventions

diff --git a/evals/benchmarks/longmemeval-v2/README.md b/evals/benchmarks/longmemeval-v2/README.md
@@ -0,0 +1,57 @@
+# LongMemEval v2
+
+The first public benchmark we run through the eval harness. It comprises 451 manually curated questions and 1,870 task trajectories that test five memory abilities:
+
+- **Static state recall** — remembers important landmarks and page layouts.
+- **Dynamic state tracking** — understands how states change over time.
+- **Workflow knowledge** — knows the steps needed for recurring tasks.
+- **Environment gotchas** — recognizes recurring local failure modes.
+- **Premise awareness** — detects assumptions valid elsewhere but wrong here.
+
+Source: [LongMemEval-V2 paper (arXiv 2605.12493)](https://arxiv.org/abs/2605.12493) · [dataset on Hugging Face](https://huggingface.co/datasets/xiaowu0162/longmemeval-v2) · Apache-2.0.
+
+Integration spec: `/workspace/scratch/evals-longmemeval-v2-spec.md`.
+
+## Layout
+
+```
+benchmarks/longmemeval-v2/
+├── manifest.json              # displayName + unitDirName + unitNoun
+├── README.md                  # this file
+├── data/                      # 7+ GB dataset payload (gitignored)
+│   ├── .gitignore
+│   ├── download.sh            # huggingface-cli download wrapper
+│   └── …                      # questions.jsonl, trajectories.jsonl,
+│                              #   haystacks/lme_v2_{small,medium}.json,
+│                              #   question_screenshots/, trajectory_screenshots/
+├── items/                     # virtual unit dir — populated on demand by the loader
+└── src/
+    ├── loader.ts              # questions.jsonl + haystacks/lme_v2_<tier>.json → BenchmarkItem[]
+    └── __tests__/             # fixture-backed loader tests
+```
+
+## Getting the data
+
+```bash
+cd evals/benchmarks/longmemeval-v2/data
+./download.sh
+```
+
+`download.sh` is idempotent. The dataset is 7.12 GB; the `data/` directory stays gitignored.
+
+## Loader
+
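+`src/loader.ts` exports the async function `loadLongMemEvalV2({ dataRoot, tier })`, where `tier` (`"small"` or `"medium"`) selects `haystacks/lme_v2_<tier>.json`. It resolves to an array of `BenchmarkItem`s: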
+
+```ts
+interface BenchmarkItem {
+  questionId: string; // V2 questions.jsonl `question_id`
+  ability: string; // V2 questions.jsonl `question_type` (one of the five abilities)
+  question: string;
+  answer: string; // gold answer, used by the GPT-4o judge
+  questionDate?: string;
+  trajectoryIds: string[]; // ordered haystack from haystacks/lme_v2_<tier>.json
+}
+```
+
+This PR ships the loader and its fixture tests only. The two-conversation runner (`run-ingest-ask`), GPT-4o paper-faithful judge, and Phase 1 wiring land in subsequent PRs against the contract established here.
diff --git a/evals/benchmarks/longmemeval-v2/data/.gitignore b/evals/benchmarks/longmemeval-v2/data/.gitignore
@@ -0,0 +1,5 @@
+# LongMemEval-V2 dataset payload is 7+ GB — kept out of the repo.
+# Pull via `bash download.sh` (or `huggingface-cli download xiaowu0162/longmemeval-v2 --repo-type dataset --local-dir .`).
+*
+!.gitignore
+!download.sh
diff --git a/evals/benchmarks/longmemeval-v2/data/download.sh b/evals/benchmarks/longmemeval-v2/data/download.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# Fetch the LongMemEval-V2 dataset from Hugging Face into this directory.
+#
+# The dataset is ~7.12 GB and stays gitignored. This script is idempotent:
+# re-running skips already-downloaded files (the CLI compares by hash).
+#
+# Defaults:
+#   - target dir: this script's parent (i.e. evals/benchmarks/longmemeval-v2/data/)
+#   - repo:       xiaowu0162/longmemeval-v2
+#
+# Override via env: DATA_ROOT=... REPO=...
+#
+# Requires: the Hugging Face CLI (`pip install -U "huggingface_hub[cli]"`).
+# Newer huggingface_hub releases ship it as `hf`; older ones as
+# `huggingface-cli`. Either works: both accept the `download` flags used below.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DATA_ROOT="${DATA_ROOT:-$SCRIPT_DIR}"
+REPO="${REPO:-xiaowu0162/longmemeval-v2}"
+
+# Prefer the current `hf` entry point, falling back to the legacy name.
+HF_CLI=""
+for candidate in hf huggingface-cli; do
+  if command -v "$candidate" >/dev/null 2>&1; then
+    HF_CLI="$candidate"
+    break
+  fi
+done
+
+if [[ -z "$HF_CLI" ]]; then
+  cat >&2 <<'EOF'
+error: neither hf nor huggingface-cli found on PATH.
+
+Install it with:
+  pip install -U "huggingface_hub[cli]"
+
+Then re-run this script.
+EOF
+  exit 1
+fi
+
+echo "Downloading $REPO into $DATA_ROOT ..."
+"$HF_CLI" download "$REPO" \
+  --repo-type dataset \
+  --local-dir "$DATA_ROOT"
+
+echo
+echo "Done. Top-level files:"
+# Informational only: `|| true` keeps an empty listing (grep exits 1) or an
+# early `head` exit from failing the script under `set -euo pipefail`.
+ls -1 "$DATA_ROOT" | grep -v -E '^(\.gitignore|download\.sh)$' | head -20 || true
+
+cat <<EOF
+
+Next steps:
+  - Optional: extract trajectory screenshots
+      mkdir -p "$DATA_ROOT/screenshots"
+      tar -xzf "$DATA_ROOT/trajectory_screenshots/web_screenshots.tar.gz" \\
+              -C "$DATA_ROOT/screenshots"
+      tar -xzf "$DATA_ROOT/trajectory_screenshots/enterprise_screenshots_base.tar.gz" \\
+              -C "$DATA_ROOT/screenshots"
+  - Validate (optional): (cd "$DATA_ROOT" && sha256sum -c checksums.sha256)
+
+The loader (\`src/loader.ts\`) reads:
+  - questions.jsonl
+  - haystacks/lme_v2_{small,medium}.json
+
+trajectories.jsonl and *_screenshots/ are consumed by the runner, not the loader.
+EOF
diff --git a/evals/benchmarks/longmemeval-v2/manifest.json b/evals/benchmarks/longmemeval-v2/manifest.json
@@ -0,0 +1,5 @@
+{
+  "displayName": "LongMemEval v2",
+  "unitDirName": "items",
+  "unitNoun": "item"
+}
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/haystacks/lme_v2_medium.json b/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/haystacks/lme_v2_medium.json
@@ -0,0 +1,4 @@
+{
+  "q_001": ["traj_a", "traj_b", "traj_x", "traj_y"],
+  "q_002": ["traj_b", "traj_c", "traj_d", "traj_z"]
+}
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/haystacks/lme_v2_small.json b/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/haystacks/lme_v2_small.json
@@ -0,0 +1,5 @@
+{
+  "q_001": ["traj_a", "traj_b"],
+  "q_002": ["traj_b", "traj_c", "traj_d"],
+  "q_003": ["traj_e"]
+}
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl b/evals/benchmarks/longmemeval-v2/src/__tests__/fixtures/questions.jsonl
@@ -0,0 +1,3 @@
+{"question_id": "q_001", "question_type": "static-state-recall", "question": "What is the URL of the project settings page?", "answer": "/settings/project", "question_date": "2026-01-15"}
+{"question_id": "q_002", "question_type": "dynamic-state-tracking", "question": "After the bulk import completed, what was the new total record count?", "answer": "12,481"}
+{"question_id": "q_003", "question_type": "workflow-knowledge", "question": "What sequence of clicks creates a new dashboard?", "answer": "Dashboards > New > template > Save", "extra_field_for_passthrough": true}
diff --git a/evals/benchmarks/longmemeval-v2/src/__tests__/loader.test.ts b/evals/benchmarks/longmemeval-v2/src/__tests__/loader.test.ts
@@ -0,0 +1,203 @@
+import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+
+import { afterEach, describe, expect, test } from "bun:test";
+
+import { loadLongMemEvalV2 } from "../loader";
+
+const HERE = dirname(fileURLToPath(import.meta.url));
+const FIXTURES = join(HERE, "fixtures");
+
+// Temp data roots created by the running test; drained by `afterEach`.
+const tempRoots: string[] = [];
+
+/**
+ * Creates an empty throwaway data root under the OS temp dir and registers
+ * it for removal, so test runs don't leave `lme-v2-*` directories behind.
+ */
+async function makeDataRoot(): Promise<string> {
+  const dir = await mkdtemp(join(tmpdir(), "lme-v2-"));
+  tempRoots.push(dir);
+  return dir;
+}
+
+describe("loadLongMemEvalV2", () => {
+  afterEach(async () => {
+    // `splice(0)` empties the registry; `force` tolerates already-removed dirs.
+    const dirs = tempRoots.splice(0);
+    await Promise.all(
+      dirs.map((dir) => rm(dir, { recursive: true, force: true })),
+    );
+  });
+
+  test("joins questions against the small-tier haystack", async () => {
+    const items = await loadLongMemEvalV2({
+      dataRoot: FIXTURES,
+      tier: "small",
+    });
+
+    expect(items.map((i) => i.questionId)).toEqual(["q_001", "q_002", "q_003"]);
+    expect(items[0]).toMatchObject({
+      questionId: "q_001",
+      ability: "static-state-recall",
+      question: "What is the URL of the project settings page?",
+      answer: "/settings/project",
+      questionDate: "2026-01-15",
+      trajectoryIds: ["traj_a", "traj_b"],
+    });
+    expect(items[1].trajectoryIds).toEqual(["traj_b", "traj_c", "traj_d"]);
+    expect(items[2].trajectoryIds).toEqual(["traj_e"]);
+    // `extra_field_for_passthrough` on q_003 should not crash the loader
+    // — passthrough on the raw schema preserves forward compatibility.
+    expect(items[2].ability).toBe("workflow-knowledge");
+  });
+
+  test("throws when the medium-tier haystack omits a question", async () => {
+    // medium-tier fixture intentionally omits q_003, so loading it should
+    // surface the strict-join error rather than silently dropping items.
+    await expect(
+      loadLongMemEvalV2({ dataRoot: FIXTURES, tier: "medium" }),
+    ).rejects.toThrow(
+      /Tier "medium" haystack mapping is missing 1 question id.*q_003/,
+    );
+  });
+
+  test("loads medium tier when every question has a haystack", async () => {
+    const dir = await makeDataRoot();
+    await mkdir(join(dir, "haystacks"), { recursive: true });
+    await writeFile(
+      join(dir, "questions.jsonl"),
+      [
+        JSON.stringify({
+          question_id: "q1",
+          question_type: "static-state-recall",
+          question: "Q1?",
+          answer: "A1",
+        }),
+        JSON.stringify({
+          question_id: "q2",
+          question_type: "premise-awareness",
+          question: "Q2?",
+          answer: "A2",
+        }),
+      ].join("\n"),
+      "utf8",
+    );
+    await writeFile(
+      join(dir, "haystacks", "lme_v2_medium.json"),
+      JSON.stringify({
+        q1: ["t1", "t2", "t3"],
+        q2: ["t4"],
+      }),
+      "utf8",
+    );
+
+    const items = await loadLongMemEvalV2({ dataRoot: dir, tier: "medium" });
+    expect(items).toHaveLength(2);
+    expect(items[0].trajectoryIds).toEqual(["t1", "t2", "t3"]);
+  });
+
+  test("reports a helpful error when questions.jsonl is missing", async () => {
+    const dir = await makeDataRoot();
+    await expect(
+      loadLongMemEvalV2({ dataRoot: dir, tier: "small" }),
+    ).rejects.toThrow(/questions\.jsonl not found.*data\/download\.sh/);
+  });
+
+  test("reports a helpful error when the tier haystack is missing", async () => {
+    const dir = await makeDataRoot();
+    await writeFile(join(dir, "questions.jsonl"), "", "utf8");
+    await expect(
+      loadLongMemEvalV2({ dataRoot: dir, tier: "small" }),
+    ).rejects.toThrow(
+      /haystack mapping for tier "small" not found.*data\/download\.sh/,
+    );
+  });
+
+  test("rejects malformed questions.jsonl with line numbers", async () => {
+    const dir = await makeDataRoot();
+    await mkdir(join(dir, "haystacks"), { recursive: true });
+    await writeFile(
+      join(dir, "questions.jsonl"),
+      [
+        JSON.stringify({
+          question_id: "q1",
+          question_type: "static-state-recall",
+          question: "Q1?",
+          answer: "A1",
+        }),
+        // Missing required `answer` field.
+        JSON.stringify({
+          question_id: "q2",
+          question_type: "premise-awareness",
+          question: "Q2?",
+        }),
+      ].join("\n"),
+      "utf8",
+    );
+    await writeFile(
+      join(dir, "haystacks", "lme_v2_small.json"),
+      JSON.stringify({ q1: ["t1"], q2: ["t2"] }),
+      "utf8",
+    );
+
+    await expect(
+      loadLongMemEvalV2({ dataRoot: dir, tier: "small" }),
+    ).rejects.toThrow(/questions\.jsonl line 2 failed schema validation/);
+  });
+
+  test("rejects haystack mappings with empty trajectory lists", async () => {
+    const dir = await makeDataRoot();
+    await mkdir(join(dir, "haystacks"), { recursive: true });
+    await writeFile(
+      join(dir, "questions.jsonl"),
+      JSON.stringify({
+        question_id: "q1",
+        question_type: "static-state-recall",
+        question: "Q1?",
+        answer: "A1",
+      }),
+      "utf8",
+    );
+    await writeFile(
+      join(dir, "haystacks", "lme_v2_small.json"),
+      JSON.stringify({ q1: [] }),
+      "utf8",
+    );
+
+    await expect(
+      loadLongMemEvalV2({ dataRoot: dir, tier: "small" }),
+    ).rejects.toThrow(/failed schema validation/);
+  });
+
+  test("skips blank lines in questions.jsonl", async () => {
+    const dir = await makeDataRoot();
+    await mkdir(join(dir, "haystacks"), { recursive: true });
+    await writeFile(
+      join(dir, "questions.jsonl"),
+      [
+        "",
+        JSON.stringify({
+          question_id: "q1",
+          question_type: "static-state-recall",
+          question: "Q1?",
+          answer: "A1",
+        }),
+        "",
+        "",
+      ].join("\n"),
+      "utf8",
+    );
+    await writeFile(
+      join(dir, "haystacks", "lme_v2_small.json"),
+      JSON.stringify({ q1: ["t1"] }),
+      "utf8",
+    );
+
+    const items = await loadLongMemEvalV2({ dataRoot: dir, tier: "small" });
+    expect(items).toHaveLength(1);
+    expect(items[0].questionId).toBe("q1");
+  });
+});