vellum-ai · dvargasfuertes · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/evals/AGENTS.md b/evals/AGENTS.md
@@ -27,7 +27,9 @@ Single (1×1), suite (1×M), ablation (N×1), full matrix (N×M). Same codepath.
 
 **Profile:** declarative directory under `profiles/`. `manifest.json` declares species, optional version, optional setup commands. Optional `workspace/` subdirectory provides initial files for the agent. Plugins are installed via setup commands like `vellum exec -- assistant plugins install simple-memory`.
 
-**Test:** declarative directory under `tests/`. `SPEC.md` briefs the simulator agent. Optional `metrics/` subdirectory holds per-metric `.ts` scorers.
+**Benchmark:** top-level directory under `benchmarks/`. Each benchmark groups a coherent set of units (tests, items, tasks, …) that share a definition shape. `personal-intelligence/` is our in-house benchmark and is currently the only one. Public benchmarks (e.g. `longmemeval-v2/`) will live as peers.
+
+**Test:** declarative directory under `benchmarks/personal-intelligence/tests/`. `SPEC.md` briefs the simulator agent. Optional `metrics/` subdirectory holds per-metric `.ts` scorers. The on-disk root is resolved via `getTestsDir()` (overridable via `EVALS_TESTS_DIR`).
 
 **Agent adapter (per species):** thin CLI process wrapper. Owns invocation, stdin/stdout format, session resume, cost extraction. Each test gets a fresh process — no sharing across tests (parallelization-ready). The Vellum adapter hatches a fresh Docker instance, sends user messages via `vellum message`, and reads assistant output from `vellum events --json`.
 

diff --git a/evals/README.md b/evals/README.md
@@ -62,10 +62,12 @@ evals/
 │   │   └── manifest.json
 │   └── p2/
 │       └── manifest.json
-├── tests/                   # Committed test definitions
-│   └── timeline-recall/
-│       ├── SPEC.md          # simulator briefing
-│       └── metrics/         # (optional) per-metric `.ts` scorers
+├── benchmarks/              # One subdirectory per benchmark
+│   └── personal-intelligence/
+│       └── tests/           # Test definitions for this benchmark
+│           └── timeline-recall/
+│               ├── SPEC.md  # simulator briefing
+│               └── metrics/ # (optional) per-metric `.ts` scorers
 ├── .env.example             # API key contract
 ├── package.json
 └── AGENTS.md                # Conventions
@@ -90,7 +92,7 @@ Run `evals profiles list` to see all committed profiles and their setup.
 
 ## Test
 
-A test lives at `tests/<id>/`. The directory name is the test id.
+A test lives at `benchmarks/personal-intelligence/tests/<id>/`. The directory name is the test id. (Other benchmarks will live as peers under `benchmarks/` and may use different unit names.)
 
 `SPEC.md` briefs the simulator agent on the role it plays and how it should interact with the assistant. It does not describe assertion behavior.
 

diff --git a/evals/tests/timeline-recall/SPEC.md → ...ntelligence/tests/timeline-recall/SPEC.md b/evals/tests/timeline-recall/SPEC.md → ...ntelligence/tests/timeline-recall/SPEC.md
diff --git a/evals/tests/timeline-recall/constants.ts → ...igence/tests/timeline-recall/constants.ts b/evals/tests/timeline-recall/constants.ts → ...igence/tests/timeline-recall/constants.ts
diff --git a/...timeline-recall/metrics/assistant-cost.ts → ...timeline-recall/metrics/assistant-cost.ts b/...timeline-recall/metrics/assistant-cost.ts → ...timeline-recall/metrics/assistant-cost.ts
@@ -2,7 +2,7 @@ import {
   readUsage,
   type MetricInput,
   type MetricResult,
-} from "../../../src/lib/metrics";
+} from "../../../../../src/lib/metrics";
 
 /**
  * Cost is reported as a negative number so "higher score = better" — a

diff --git a/...timeline-recall/metrics/date-mentioned.ts → ...timeline-recall/metrics/date-mentioned.ts b/...timeline-recall/metrics/date-mentioned.ts → ...timeline-recall/metrics/date-mentioned.ts
@@ -2,7 +2,7 @@ import {
   readTranscript,
   type MetricInput,
   type MetricResult,
-} from "../../../src/lib/metrics";
+} from "../../../../../src/lib/metrics";
 import { PEANUT_ALLERGY_DATE } from "../constants";
 
 export default async function scoreDateMentioned(

diff --git a/evals/tests/timeline-recall/setup.ts → ...telligence/tests/timeline-recall/setup.ts b/evals/tests/timeline-recall/setup.ts → ...telligence/tests/timeline-recall/setup.ts
diff --git a/evals/src/lib/__tests__/metrics.test.ts b/evals/src/lib/__tests__/metrics.test.ts
@@ -19,8 +19,8 @@ import {
   writeUsage,
 } from "../metrics";
 import type { TestDef } from "../test-def";
-import scoreAssistantCost from "../../../tests/timeline-recall/metrics/assistant-cost";
-import scoreDateMentioned from "../../../tests/timeline-recall/metrics/date-mentioned";
+import scoreAssistantCost from "../../../benchmarks/personal-intelligence/tests/timeline-recall/metrics/assistant-cost";
+import scoreDateMentioned from "../../../benchmarks/personal-intelligence/tests/timeline-recall/metrics/date-mentioned";
 
 const testDef: TestDef = {
   id: "timeline-recall",

diff --git a/evals/src/lib/catalog.ts b/evals/src/lib/catalog.ts
@@ -7,7 +7,14 @@ const SAFE_ID = /^[a-z0-9][a-z0-9-]*$/;
 
 const HERE = dirname(fileURLToPath(import.meta.url));
 const DEFAULT_PROFILES_DIR = join(HERE, "..", "..", "profiles");
-const DEFAULT_TESTS_DIR = join(HERE, "..", "..", "tests");
+const DEFAULT_TESTS_DIR = join(
+  HERE,
+  "..",
+  "..",
+  "benchmarks",
+  "personal-intelligence",
+  "tests",
+);
 
 export function getProfilesDir(): string {
   return process.env.EVALS_PROFILES_DIR ?? DEFAULT_PROFILES_DIR;

diff --git a/evals/src/lib/test-def.ts b/evals/src/lib/test-def.ts
@@ -1,28 +1,30 @@
 /**
  * Test definition — directory layout describing what the harness runs.
  *
- * Each test lives at `tests/<id>/` with:
+ * Each test lives at `benchmarks/personal-intelligence/tests/<id>/` by default, with:
  *   - `SPEC.md`  — markdown briefing for the simulator agent.
  *   - `setup.ts` — optional deterministic setup commands.
  *   - `metrics/` — directory of `.ts` files. Each file exports a scorer.
  *
- * The test id is the directory name.
+ * The test id is the directory name. The on-disk root is resolved via
+ * `getTestsDir()`, so the `EVALS_TESTS_DIR` env override (or a future
+ * `--benchmark` flag) can point at a different benchmark's units.
  */
 import { readdir, stat } from "node:fs/promises";
 import { assertSafeId, getTestsDir, resolveUnder } from "./catalog";
 
 import type { TestSetupCommand } from "./setup-command";
 
 export interface TestDef {
-  /** Directory name under `tests/`. */
+  /** Directory name under the benchmark's `tests/` root. */
   id: string;
-  /** Absolute path to `tests/<id>/SPEC.md`. */
+  /** Absolute path to `<benchmark>/tests/<id>/SPEC.md`. */
   specPath: string;
-  /** Absolute path to optional `tests/<id>/setup.ts`. */
+  /** Absolute path to optional `<benchmark>/tests/<id>/setup.ts`. */
   setupPath: string;
   /** Deterministic commands run before the simulator starts. */
   setupCommands: TestSetupCommand[];
-  /** Absolute path to `tests/<id>/metrics/` — may be empty or absent. */
+  /** Absolute path to `<benchmark>/tests/<id>/metrics/` — may be empty or absent. */
   metricsDir: string;
   /** Absolute paths to each `.ts` file in the metrics directory, sorted. */
   metricPaths: string[];