diff --git a/evals/AGENTS.md b/evals/AGENTS.md index 092fdc63c10..54d7ddfa012 100644 --- a/evals/AGENTS.md +++ b/evals/AGENTS.md @@ -27,7 +27,9 @@ Single (1×1), suite (1×M), ablation (N×1), full matrix (N×M). Same codepath. **Profile:** declarative directory under `profiles/`. `manifest.json` declares species, optional version, optional setup commands. Optional `workspace/` subdirectory provides initial files for the agent. Plugins are installed via setup commands like `vellum exec -- assistant plugins install simple-memory`. -**Test:** declarative directory under `tests/`. `SPEC.md` briefs the simulator agent. Optional `metrics/` subdirectory holds per-metric `.ts` scorers. +**Benchmark:** top-level directory under `benchmarks/`. Each benchmark groups a coherent set of units (tests, items, tasks, …) that share a definition shape. `personal-intelligence/` is our in-house benchmark and is currently the only one. Public benchmarks (e.g. `longmemeval-v2/`) will live as peers. + +**Test:** declarative directory under `benchmarks/personal-intelligence/tests/`. `SPEC.md` briefs the simulator agent. Optional `metrics/` subdirectory holds per-metric `.ts` scorers. The on-disk root is resolved via `getTestsDir()` (overridable via `EVALS_TESTS_DIR`). **Agent adapter (per species):** thin CLI process wrapper. Owns invocation, stdin/stdout format, session resume, cost extraction. Each test gets a fresh process — no sharing across tests (parallelization-ready). The Vellum adapter hatches a fresh Docker instance, sends user messages via `vellum message`, and reads assistant output from `vellum events --json`. 
diff --git a/evals/README.md b/evals/README.md index 19ab0e4f01d..62519b5436b 100644 --- a/evals/README.md +++ b/evals/README.md @@ -62,10 +62,12 @@ evals/ │ │ └── manifest.json │ └── p2/ │ └── manifest.json -├── tests/ # Committed test definitions -│ └── timeline-recall/ -│ ├── SPEC.md # simulator briefing -│ └── metrics/ # (optional) per-metric `.ts` scorers +├── benchmarks/ # One subdirectory per benchmark +│ └── personal-intelligence/ +│ └── tests/ # Test definitions for this benchmark +│ └── timeline-recall/ +│ ├── SPEC.md # simulator briefing +│ └── metrics/ # (optional) per-metric `.ts` scorers ├── .env.example # API key contract ├── package.json └── AGENTS.md # Conventions @@ -90,7 +92,7 @@ Run `evals profiles list` to see all committed profiles and their setup. ## Test -A test lives at `tests/<id>/`. The directory name is the test id. +A test lives at `benchmarks/personal-intelligence/tests/<id>/`. The directory name is the test id. (Other benchmarks live as peers under `benchmarks/` and may use different unit names.) `SPEC.md` briefs the simulator agent on the role it plays and how it should interact with the assistant. It does not describe assertion behavior. 
diff --git a/evals/tests/timeline-recall/SPEC.md b/evals/benchmarks/personal-intelligence/tests/timeline-recall/SPEC.md similarity index 100% rename from evals/tests/timeline-recall/SPEC.md rename to evals/benchmarks/personal-intelligence/tests/timeline-recall/SPEC.md diff --git a/evals/tests/timeline-recall/constants.ts b/evals/benchmarks/personal-intelligence/tests/timeline-recall/constants.ts similarity index 100% rename from evals/tests/timeline-recall/constants.ts rename to evals/benchmarks/personal-intelligence/tests/timeline-recall/constants.ts diff --git a/evals/tests/timeline-recall/metrics/assistant-cost.ts b/evals/benchmarks/personal-intelligence/tests/timeline-recall/metrics/assistant-cost.ts similarity index 96% rename from evals/tests/timeline-recall/metrics/assistant-cost.ts rename to evals/benchmarks/personal-intelligence/tests/timeline-recall/metrics/assistant-cost.ts index 8ed925bb829..b4ea6908f84 100644 --- a/evals/tests/timeline-recall/metrics/assistant-cost.ts +++ b/evals/benchmarks/personal-intelligence/tests/timeline-recall/metrics/assistant-cost.ts @@ -2,7 +2,7 @@ import { readUsage, type MetricInput, type MetricResult, -} from "../../../src/lib/metrics"; +} from "../../../../../src/lib/metrics"; /** * Cost is reported as a negative number so "higher score = better" — a diff --git a/evals/tests/timeline-recall/metrics/date-mentioned.ts b/evals/benchmarks/personal-intelligence/tests/timeline-recall/metrics/date-mentioned.ts similarity index 95% rename from evals/tests/timeline-recall/metrics/date-mentioned.ts rename to evals/benchmarks/personal-intelligence/tests/timeline-recall/metrics/date-mentioned.ts index df6d9ea5e54..98920708a40 100644 --- a/evals/tests/timeline-recall/metrics/date-mentioned.ts +++ b/evals/benchmarks/personal-intelligence/tests/timeline-recall/metrics/date-mentioned.ts @@ -2,7 +2,7 @@ import { readTranscript, type MetricInput, type MetricResult, -} from "../../../src/lib/metrics"; +} from 
"../../../../../src/lib/metrics"; import { PEANUT_ALLERGY_DATE } from "../constants"; export default async function scoreDateMentioned( diff --git a/evals/tests/timeline-recall/setup.ts b/evals/benchmarks/personal-intelligence/tests/timeline-recall/setup.ts similarity index 100% rename from evals/tests/timeline-recall/setup.ts rename to evals/benchmarks/personal-intelligence/tests/timeline-recall/setup.ts diff --git a/evals/src/lib/__tests__/metrics.test.ts b/evals/src/lib/__tests__/metrics.test.ts index f0589b14cd0..2d8e3fd6412 100644 --- a/evals/src/lib/__tests__/metrics.test.ts +++ b/evals/src/lib/__tests__/metrics.test.ts @@ -19,8 +19,8 @@ import { writeUsage, } from "../metrics"; import type { TestDef } from "../test-def"; -import scoreAssistantCost from "../../../tests/timeline-recall/metrics/assistant-cost"; -import scoreDateMentioned from "../../../tests/timeline-recall/metrics/date-mentioned"; +import scoreAssistantCost from "../../../benchmarks/personal-intelligence/tests/timeline-recall/metrics/assistant-cost"; +import scoreDateMentioned from "../../../benchmarks/personal-intelligence/tests/timeline-recall/metrics/date-mentioned"; const testDef: TestDef = { id: "timeline-recall", diff --git a/evals/src/lib/catalog.ts b/evals/src/lib/catalog.ts index 787fb1c6420..c71933657e7 100644 --- a/evals/src/lib/catalog.ts +++ b/evals/src/lib/catalog.ts @@ -7,7 +7,14 @@ const SAFE_ID = /^[a-z0-9][a-z0-9-]*$/; const HERE = dirname(fileURLToPath(import.meta.url)); const DEFAULT_PROFILES_DIR = join(HERE, "..", "..", "profiles"); -const DEFAULT_TESTS_DIR = join(HERE, "..", "..", "tests"); +const DEFAULT_TESTS_DIR = join( + HERE, + "..", + "..", + "benchmarks", + "personal-intelligence", + "tests", +); export function getProfilesDir(): string { return process.env.EVALS_PROFILES_DIR ?? 
DEFAULT_PROFILES_DIR; diff --git a/evals/src/lib/test-def.ts b/evals/src/lib/test-def.ts index 1c1c621705a..94f110fcc32 100644 --- a/evals/src/lib/test-def.ts +++ b/evals/src/lib/test-def.ts @@ -1,12 +1,14 @@ /** * Test definition — directory layout describing what the harness runs. * - * Each test lives at `tests/<id>/` with: + * Each test lives at `benchmarks/personal-intelligence/tests/<id>/` with: * - `SPEC.md` — markdown briefing for the simulator agent. * - `setup.ts` — optional deterministic setup commands. * - `metrics/` — directory of `.ts` files. Each file exports a scorer. * - * The test id is the directory name. + * The test id is the directory name. The on-disk root is resolved via + * `getTestsDir()` so a future `--benchmark` flag (or env override) can + * point at a different benchmark's units. */ import { readdir, stat } from "node:fs/promises"; import { assertSafeId, getTestsDir, resolveUnder } from "./catalog"; @@ -14,15 +16,15 @@ import { assertSafeId, getTestsDir, resolveUnder } from "./catalog"; import type { TestSetupCommand } from "./setup-command"; export interface TestDef { - /** Directory name under `tests/`. */ + /** Directory name under the benchmark's `tests/` root. */ id: string; - /** Absolute path to `tests/<id>/SPEC.md`. */ + /** Absolute path to `<benchmark>/tests/<id>/SPEC.md`. */ specPath: string; - /** Absolute path to optional `tests/<id>/setup.ts`. */ + /** Absolute path to optional `<benchmark>/tests/<id>/setup.ts`. */ setupPath: string; /** Deterministic commands run before the simulator starts. */ setupCommands: TestSetupCommand[]; - /** Absolute path to `tests/<id>/metrics/` — may be empty or absent. */ + /** Absolute path to `<benchmark>/tests/<id>/metrics/` — may be empty or absent. */ metricsDir: string; /** Absolute paths to each `.ts` file in the metrics directory, sorted. */ metricPaths: string[];