Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions scripts/eval/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,22 @@ node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --claude-effort
node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --claude-efforts max,high
node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --agents codex --codex-effort xhigh

# Restrict to specific projects (works with both agents)
node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --projects mealdrop,edgy,echarts

# Fan out across multiple prompts in one batch
node scripts/eval/run-batch.ts --prompts pattern-copy-play,optimized-tests --yes --repetitions 2

# Targeted matrix: medium + high effort, 3 projects, 2 reps each (12 Claude trials)
node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes \
--agents claude --claude-efforts medium,high \
--projects mealdrop,edgy,echarts --repetitions 2

# Same project subset on Codex
node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes \
--agents codex --codex-effort high \
--projects mealdrop,edgy,echarts --repetitions 2

# Different prompt or concurrency
node scripts/eval/run-batch.ts --prompt setup --yes
node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --concurrency 4
Expand Down
26 changes: 26 additions & 0 deletions scripts/eval/lib/publish-trial.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,32 @@ describe('buildTrialLabels', () => {
'prompt:setup',
]);
});

it('truncates labels longer than 50 characters', async () => {
  const { buildTrialLabels } = await import('./publish-trial.ts');

  // The prompt alone is longer than 50 characters, so only the `prompt:` label
  // is expected to be shortened; every other label fits as-is.
  const oversizedPrompt = 'monorepo-optimized-tests-relaxed-limits-no-story-deletion';

  const labels = buildTrialLabels(
    {
      name: 'mealdrop',
      repo: 'https://github.com/storybook-tmp/mealdrop',
      branch: 'main',
      githubSlug: 'storybook-tmp/mealdrop',
    },
    { agent: 'claude', model: 'sonnet-4.6', effort: 'high' },
    oversizedPrompt
  );

  const expectedLabels = [
    'eval',
    'project:mealdrop',
    'agent:claude',
    'model:sonnet-4.6',
    'effort:high',
    'prompt:monorepo-optimized-tests-relaxed-limits-no-',
  ];
  expect(labels).toEqual(expectedLabels);

  // Independently of the exact values, no label may exceed the cap.
  for (const label of labels) {
    expect(label.length).toBeLessThanOrEqual(50);
  }
});
});

describe('publishTrialBranch', () => {
Expand Down
8 changes: 7 additions & 1 deletion scripts/eval/lib/publish-trial.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ import { existsSync } from 'node:fs';
import { readFile } from 'node:fs/promises';
import { join, relative } from 'node:path';
import { x } from 'tinyexec';

// Maximum length applied to every label returned by buildTrialLabels.
// NOTE(review): presumably mirrors GitHub's label-name length limit — confirm against the GitHub API docs.
const GITHUB_LABEL_MAX_LENGTH = 50;
import type { TrialWorkspace } from './prepare-trial.ts';
import type { EvalData } from './result-docs.ts';
import {
Expand Down Expand Up @@ -32,7 +34,11 @@ export function buildTrialLabels(
`model:${variant.model}`,
`effort:${variant.effort}`,
`prompt:${prompt}`,
];
].map(truncateLabel);
}

/**
 * Clamps a label to at most GITHUB_LABEL_MAX_LENGTH UTF-16 code units.
 *
 * Labels that already fit are returned unchanged. When the cut would fall in
 * the middle of a surrogate pair (e.g. an emoji straddling the limit), the
 * dangling high surrogate is dropped as well, so the result is always a
 * well-formed string and never longer than the limit.
 *
 * Takes exactly one parameter on purpose: it is passed directly to
 * `Array.prototype.map`, which also supplies the index and the array.
 *
 * @param label - The full label text.
 * @returns The label, shortened if necessary.
 */
function truncateLabel(label: string): string {
  if (label.length <= GITHUB_LABEL_MAX_LENGTH) {
    return label;
  }

  const truncated = label.slice(0, GITHUB_LABEL_MAX_LENGTH);

  // A trailing code unit in 0xD800–0xDBFF is the first half of a surrogate
  // pair whose second half was just cut off; keeping it would yield a lone
  // surrogate.
  const lastUnit = truncated.charCodeAt(truncated.length - 1);
  const endsWithHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;

  return endsWithHighSurrogate ? truncated.slice(0, -1) : truncated;
}

export async function publishTrialBranch(opts: {
Expand Down
167 changes: 167 additions & 0 deletions scripts/eval/run-batch.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ import {
BATCH_VARIANTS,
buildBatchRunDescriptors,
buildBatchVariants,
formatBatchHeader,
formatDuration,
formatPerProjectSummary,
main,
parseRunBatchArgs,
runBatch,
Expand Down Expand Up @@ -255,6 +258,170 @@ describe('parseRunBatchArgs', () => {
claudeEfforts: ['max', 'high'],
});
});

it('parses a comma-separated --projects list from the CLI', () => {
  const [firstProject, secondProject] = BATCH_PROJECT_NAMES;

  // The space after the comma checks that entries are trimmed while parsing.
  const argv = ['--prompt', TEST_PROMPT, '--projects', `${firstProject}, ${secondProject}`];
  const parsed = parseRunBatchArgs(argv);

  expect(parsed).toEqual({
    prompt: TEST_PROMPT,
    projects: [firstProject, secondProject],
  });
});
});

describe('formatDuration', () => {
  // Each case pairs a duration in milliseconds with its expected rendering.
  const expectAll = (cases: Array<[number, string]>) => {
    for (const [durationMs, rendered] of cases) {
      expect(formatDuration(durationMs)).toBe(rendered);
    }
  };

  it('formats sub-minute durations as seconds', () => {
    expectAll([
      [0, '0s'],
      [45_000, '45s'],
      [59_499, '59s'],
    ]);
  });

  it('formats minutes and seconds for under-an-hour durations', () => {
    expectAll([
      [60_000, '1m'],
      [338_759, '5m 39s'],
      [1_120_358, '18m 40s'],
    ]);
  });

  it('formats hours and minutes for long durations', () => {
    expectAll([
      [3_600_000, '1h 0m'],
      [3_900_000, '1h 5m'],
    ]);
  });
});

describe('formatBatchHeader', () => {
  it('summarizes the matrix and lists distinct values', () => {
    // 2 projects × 1 agent × 2 efforts × 2 repetitions = 8 descriptors.
    const descriptors = buildBatchRunDescriptors({
      prompt: TEST_PROMPT,
      agents: ['claude'],
      claudeEfforts: ['medium', 'high'],
      projects: ['mealdrop', 'edgy'],
      repetitions: 2,
    });

    const lines = formatBatchHeader({
      batchTimestamp: '2026-05-05T12-09-55-151Z',
      descriptors,
      concurrency: 8,
      logsDir: '/tmp/logs',
    });

    // The title line is pinned exactly; the remaining details only need to
    // appear somewhere in the rendered header.
    expect(lines[0]).toBe('Eval batch 2026-05-05T12-09-55-151Z');

    const header = lines.join('\n');
    const expectedFragments = [
      'runs: 8 (2 projects × 1 agent(s) × 2 effort(s) × 2 rep(s))',
      'prompt: pattern-copy-play',
      'projects: edgy, mealdrop',
      'efforts: high, medium',
      'concurrency: 8',
      'logs: /tmp/logs',
    ];
    for (const fragment of expectedFragments) {
      expect(header).toContain(fragment);
    }
  });
});

describe('formatPerProjectSummary', () => {
  it('produces a column-aligned table grouped by project', () => {
    // mealdrop: 2 of 3 runs succeed; edgy: its single run succeeds.
    const mealdropRuns = [
      makeRun({ project: 'mealdrop', status: 'success', durationMs: 60_000 }),
      makeRun({ project: 'mealdrop', status: 'success', durationMs: 120_000 }),
      makeRun({ project: 'mealdrop', status: 'failed', durationMs: 30_000 }),
    ];
    const edgyRuns = [makeRun({ project: 'edgy', status: 'success', durationMs: 240_000 })];

    const [spacer, heading, ...tableLines] = formatPerProjectSummary([
      ...mealdropRuns,
      ...edgyRuns,
    ]);

    expect(spacer).toBe('');
    expect(heading).toBe('Per-project summary:');

    const table = tableLines.join('\n');
    for (const fragment of ['project', 'ok']) {
      expect(table).toContain(fragment);
    }
    expect(table).toMatch(/edgy\s+1\/1/);
    expect(table).toMatch(/mealdrop\s+2\/3/);
    for (const duration of ['30s', '4m']) {
      expect(table).toContain(duration);
    }
  });

  it('returns an empty array when there are no runs', () => {
    const summary = formatPerProjectSummary([]);
    expect(summary).toEqual([]);
  });
});

/**
 * Builds a finished-run fixture for the `formatPerProjectSummary` tests.
 *
 * Only the project, the outcome and the duration vary between fixtures; all
 * other fields are fixed placeholder values. The end timestamp is derived
 * from the fixed start plus `durationMs`, so the record never contradicts
 * itself (a 4-minute run ends 4 minutes after it starts).
 *
 * @param opts.project - Project name; also used for the label and log path.
 * @param opts.status - Run outcome; `'success'` maps to exit code 0, `'failed'` to 1.
 * @param opts.durationMs - Run duration in milliseconds.
 * @returns A run record shaped like one element of `formatPerProjectSummary`'s input.
 */
function makeRun(opts: { project: string; status: 'success' | 'failed'; durationMs: number }) {
  const startTimestamp = '2026-05-05T12:00:00.000Z';
  const endTimestamp = new Date(Date.parse(startTimestamp) + opts.durationMs).toISOString();

  return {
    project: opts.project,
    agent: 'claude' as const,
    model: 'opus-4.6',
    effort: 'high',
    prompt: TEST_PROMPT,
    repetition: 1,
    label: `${opts.project}-r01`,
    args: [],
    startTimestamp,
    endTimestamp,
    durationMs: opts.durationMs,
    exitCode: opts.status === 'success' ? 0 : 1,
    signal: null,
    status: opts.status,
    logPath: `/tmp/${opts.project}.log`,
  } as Parameters<typeof formatPerProjectSummary>[0][number];
}

describe('buildBatchRunDescriptors with --projects', () => {
  it('restricts the matrix to the requested projects, deduplicating', () => {
    const [projectA, projectB] = BATCH_PROJECT_NAMES;

    // projectA is listed twice on purpose; it must only be counted once.
    const descriptors = buildBatchRunDescriptors({
      prompt: TEST_PROMPT,
      agents: ['claude'],
      claudeEfforts: ['medium', 'high'],
      projects: [projectA, projectB, projectA],
      repetitions: 2,
    });

    const seenProjects = new Set(descriptors.map((descriptor) => descriptor.project));
    const seenEfforts = new Set(descriptors.map((descriptor) => descriptor.effort));

    expect(descriptors).toHaveLength(2 * 2 * 2); // 2 projects × 2 efforts × 2 reps
    expect(seenProjects).toEqual(new Set([projectA, projectB]));
    expect(seenEfforts).toEqual(new Set(['medium', 'high']));
  });

  it('throws when an unknown project is requested', () => {
    const buildWithUnknownProject = () =>
      buildBatchRunDescriptors({
        prompt: TEST_PROMPT,
        projects: ['not-a-real-project'],
      });

    expect(buildWithUnknownProject).toThrow(/Unknown project/);
  });

  it('fans out across multiple prompts when --prompts is set', () => {
    const [onlyProject] = BATCH_PROJECT_NAMES;
    const descriptors = buildBatchRunDescriptors({
      prompts: ['pattern-copy-play', 'setup'],
      agents: ['claude'],
      claudeEffort: 'high',
      projects: [onlyProject],
      repetitions: 2,
    });

    expect(descriptors).toHaveLength(2 * 1 * 2); // 2 prompts × 1 project × 2 reps

    const seenPrompts = new Set(descriptors.map((descriptor) => descriptor.prompt));
    expect(seenPrompts).toEqual(new Set(['pattern-copy-play', 'setup']));

    // Every descriptor must carry a distinct label.
    const labels = descriptors.map((descriptor) => descriptor.label);
    expect(new Set(labels).size).toBe(labels.length);
  });

  it('throws when an unknown prompt is requested', () => {
    const buildWithUnknownPrompt = () =>
      buildBatchRunDescriptors({
        prompts: ['pattern-copy-play', 'not-a-real-prompt'],
      });

    expect(buildWithUnknownPrompt).toThrow(/Unknown prompt/);
  });

  it('parses --prompts from the CLI', () => {
    const parsed = parseRunBatchArgs(['--prompts', 'pattern-copy-play, setup']);

    expect(parsed).toMatchObject({
      prompts: ['pattern-copy-play', 'setup'],
    });
  });

  it('rejects CLI invocations with neither --prompt nor --prompts', () => {
    const parseWithoutPrompt = () => parseRunBatchArgs(['--repetitions', '1']);

    expect(parseWithoutPrompt).toThrow(/--prompt or --prompts/);
  });
});

describe('runBatch', () => {
Expand Down
Loading
Loading