storybookjs · Sidnioulz · May 6, 2026 · May 5, 2026 · May 5, 2026 · May 6, 2026
diff --git a/scripts/eval/README.md b/scripts/eval/README.md
@@ -92,6 +92,22 @@ node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --claude-effort
 node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --claude-efforts max,high
 node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --agents codex --codex-effort xhigh
 
+# Restrict to specific projects (works with both agents)
+node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --projects mealdrop,edgy,echarts
+
+# Fan out across multiple prompts in one batch
+node scripts/eval/run-batch.ts --prompts pattern-copy-play,optimized-tests --yes --repetitions 2
+
+# Targeted matrix: medium + high effort, 3 projects, 2 reps each (12 Claude trials)
+node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes \
+  --agents claude --claude-efforts medium,high \
+  --projects mealdrop,edgy,echarts --repetitions 2
+
+# Same project subset on Codex
+node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes \
+  --agents codex --codex-effort high \
+  --projects mealdrop,edgy,echarts --repetitions 2
+
 # Different prompt or concurrency
 node scripts/eval/run-batch.ts --prompt setup --yes
 node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --concurrency 4

diff --git a/scripts/eval/lib/publish-trial.test.ts b/scripts/eval/lib/publish-trial.test.ts
@@ -93,6 +93,32 @@ describe('buildTrialLabels', () => {
       'prompt:setup',
     ]);
   });
+
+  it('truncates labels longer than 50 characters', async () => {
+    const { buildTrialLabels } = await import('./publish-trial.ts');
+
+    const longPrompt = 'monorepo-optimized-tests-relaxed-limits-no-story-deletion'; // 57 chars
+    const labels = buildTrialLabels(
+      {
+        name: 'mealdrop',
+        repo: 'https://github.com/storybook-tmp/mealdrop',
+        branch: 'main',
+        githubSlug: 'storybook-tmp/mealdrop',
+      },
+      { agent: 'claude', model: 'sonnet-4.6', effort: 'high' },
+      longPrompt
+    );
+
+    expect(labels).toEqual([
+      'eval',
+      'project:mealdrop',
+      'agent:claude',
+      'model:sonnet-4.6',
+      'effort:high',
+      'prompt:monorepo-optimized-tests-relaxed-limits-no-', // 7-char prefix + 43 chars = 50
+    ]);
+    expect(labels.every((label) => label.length <= 50)).toBe(true);
+  });
 });
 
 describe('publishTrialBranch', () => {

diff --git a/scripts/eval/lib/publish-trial.ts b/scripts/eval/lib/publish-trial.ts
@@ -2,6 +2,8 @@ import { existsSync } from 'node:fs';
 import { readFile } from 'node:fs/promises';
 import { join, relative } from 'node:path';
 import { x } from 'tinyexec';
+
+const GITHUB_LABEL_MAX_LENGTH = 50; // assumed GitHub label limit (confirm); NOTE(review): sits between imports
 import type { TrialWorkspace } from './prepare-trial.ts';
 import type { EvalData } from './result-docs.ts';
 import {
@@ -32,7 +34,11 @@ export function buildTrialLabels(
     `model:${variant.model}`,
     `effort:${variant.effort}`,
     `prompt:${prompt}`,
-  ];
+  ].map(truncateLabel);
+}
+
+function truncateLabel(label: string) {
+  return label.slice(0, GITHUB_LABEL_MAX_LENGTH); // hard cut: the tail is dropped, no ellipsis
 }
 
 export async function publishTrialBranch(opts: {

diff --git a/scripts/eval/run-batch.test.ts b/scripts/eval/run-batch.test.ts
@@ -18,6 +18,9 @@ import {
   BATCH_VARIANTS,
   buildBatchRunDescriptors,
   buildBatchVariants,
+  formatBatchHeader,
+  formatDuration,
+  formatPerProjectSummary,
   main,
   parseRunBatchArgs,
   runBatch,
@@ -255,6 +258,170 @@ describe('parseRunBatchArgs', () => {
       claudeEfforts: ['max', 'high'],
     });
   });
+
+  it('parses a comma-separated --projects list from the CLI', () => {
+    const [first, second] = BATCH_PROJECT_NAMES;
+    expect(
+      parseRunBatchArgs(['--prompt', TEST_PROMPT, '--projects', `${first}, ${second}`])
+    ).toEqual({
+      prompt: TEST_PROMPT,
+      projects: [first, second], // the space after the comma in the raw flag value is gone
+    });
+  });
+});
+
+describe('formatDuration', () => {
+  it('formats sub-minute durations as seconds', () => {
+    expect(formatDuration(0)).toBe('0s');
+    expect(formatDuration(45_000)).toBe('45s');
+    expect(formatDuration(59_499)).toBe('59s'); // 59.499s stays below the minute boundary
+  });
+
+  it('formats minutes and seconds for under-an-hour durations', () => {
+    expect(formatDuration(60_000)).toBe('1m'); // no trailing '0s' on an exact minute
+    expect(formatDuration(338_759)).toBe('5m 39s'); // 5m 38.759s: seconds are not floored
+    expect(formatDuration(1_120_358)).toBe('18m 40s'); // 18m 40.358s: seconds are not ceiled
+  });
+
+  it('formats hours and minutes for long durations', () => {
+    expect(formatDuration(3_600_000)).toBe('1h 0m'); // unlike '1m', a zero remainder is kept
+    expect(formatDuration(3_900_000)).toBe('1h 5m'); // 65 minutes
+  });
+});
+
+describe('formatBatchHeader', () => {
+  it('summarizes the matrix and lists distinct values', () => {
+    const descriptors = buildBatchRunDescriptors({
+      prompt: TEST_PROMPT,
+      agents: ['claude'],
+      claudeEfforts: ['medium', 'high'],
+      projects: ['mealdrop', 'edgy'],
+      repetitions: 2,
+    });
+
+    const lines = formatBatchHeader({
+      batchTimestamp: '2026-05-05T12-09-55-151Z',
+      descriptors,
+      concurrency: 8,
+      logsDir: '/tmp/logs',
+    });
+
+    expect(lines[0]).toBe('Eval batch 2026-05-05T12-09-55-151Z'); // title line comes first
+    expect(lines.join('\n')).toContain(
+      'runs:        8 (2 projects × 1 agent(s) × 2 effort(s) × 2 rep(s))'
+    );
+    expect(lines.join('\n')).toContain('prompt:      pattern-copy-play'); // i.e. TEST_PROMPT
+    expect(lines.join('\n')).toContain('projects:    edgy, mealdrop'); // not input order; sorted?
+    expect(lines.join('\n')).toContain('efforts:     high, medium'); // not input order; sorted?
+    expect(lines.join('\n')).toContain('concurrency: 8');
+    expect(lines.join('\n')).toContain('logs:        /tmp/logs');
+  });
+});
+
+describe('formatPerProjectSummary', () => {
+  it('produces a column-aligned table grouped by project', () => {
+    const runs = [
+      makeRun({ project: 'mealdrop', status: 'success', durationMs: 60_000 }),
+      makeRun({ project: 'mealdrop', status: 'success', durationMs: 120_000 }),
+      makeRun({ project: 'mealdrop', status: 'failed', durationMs: 30_000 }),
+      makeRun({ project: 'edgy', status: 'success', durationMs: 240_000 }),
+    ];
+    const lines = formatPerProjectSummary(runs);
+    expect(lines[0]).toBe(''); // output opens with an empty line
+    expect(lines[1]).toBe('Per-project summary:');
+    const body = lines.slice(2).join('\n'); // everything after the heading
+    expect(body).toContain('project');
+    expect(body).toContain('ok');
+    expect(body).toMatch(/edgy\s+1\/1/); // successes/total: the only edgy run succeeded
+    expect(body).toMatch(/mealdrop\s+2\/3/); // two of the three mealdrop runs succeeded
+    expect(body).toContain('30s'); // NOTE(review): loose; '3m 30s' (60+120+30 s) matches too
+    expect(body).toContain('4m'); // only edgy's 240_000 ms run can produce this
+  });
+
+  it('returns an empty array when there are no runs', () => {
+    expect(formatPerProjectSummary([])).toEqual([]); // no blank line or heading either
+  });
+});
+
+function makeRun(opts: { project: string; status: 'success' | 'failed'; durationMs: number }) {
+  return {
+    project: opts.project,
+    agent: 'claude' as const,
+    model: 'opus-4.6',
+    effort: 'high',
+    prompt: TEST_PROMPT,
+    repetition: 1,
+    label: `${opts.project}-r01`,
+    args: [],
+    startTimestamp: '2026-05-05T12:00:00.000Z',
+    endTimestamp: '2026-05-05T12:01:00.000Z', // fixed value; not derived from durationMs
+    durationMs: opts.durationMs,
+    exitCode: opts.status === 'success' ? 0 : 1, // any non-success status maps to exit code 1
+    signal: null,
+    status: opts.status,
+    logPath: `/tmp/${opts.project}.log`,
+  } as Parameters<typeof formatPerProjectSummary>[0][number]; // element type of the runs argument
+}
+
+describe('buildBatchRunDescriptors with --projects', () => {
+  it('restricts the matrix to the requested projects, deduplicating', () => {
+    const [first, second] = BATCH_PROJECT_NAMES;
+    const descriptors = buildBatchRunDescriptors({
+      prompt: TEST_PROMPT,
+      agents: ['claude'],
+      claudeEfforts: ['medium', 'high'],
+      projects: [first, second, first], // `first` is repeated on purpose
+      repetitions: 2,
+    });
+
+    expect(descriptors).toHaveLength(2 * 2 * 2); // 2 unique projects × 2 efforts × 2 reps
+    expect(new Set(descriptors.map((d) => d.project))).toEqual(new Set([first, second]));
+    expect(new Set(descriptors.map((d) => d.effort))).toEqual(new Set(['medium', 'high']));
+  });
+
+  it('throws when an unknown project is requested', () => {
+    expect(() =>
+      buildBatchRunDescriptors({
+        prompt: TEST_PROMPT,
+        projects: ['not-a-real-project'],
+      })
+    ).toThrow(/Unknown project/);
+  });
+
+  it('fans out across multiple prompts when --prompts is set', () => {
+    const [first] = BATCH_PROJECT_NAMES; // NOTE(review): --prompts cases sit in the --projects suite
+    const descriptors = buildBatchRunDescriptors({
+      prompts: ['pattern-copy-play', 'setup'],
+      agents: ['claude'],
+      claudeEffort: 'high', // singular option here; the first test uses claudeEfforts
+      projects: [first],
+      repetitions: 2,
+    });
+
+    expect(descriptors).toHaveLength(2 * 1 * 2); // 2 prompts × 1 project × 2 reps
+    expect(new Set(descriptors.map((d) => d.prompt))).toEqual(
+      new Set(['pattern-copy-play', 'setup'])
+    );
+    expect(new Set(descriptors.map((d) => d.label)).size).toBe(descriptors.length); // all unique
+  });
+
+  it('throws when an unknown prompt is requested', () => {
+    expect(() =>
+      buildBatchRunDescriptors({
+        prompts: ['pattern-copy-play', 'not-a-real-prompt'], // one valid name, one invalid
+      })
+    ).toThrow(/Unknown prompt/);
+  });
+
+  it('parses --prompts from the CLI', () => {
+    expect(parseRunBatchArgs(['--prompts', 'pattern-copy-play, setup'])).toMatchObject({
+      prompts: ['pattern-copy-play', 'setup'], // the space after the comma is gone
+    });
+  });
+
+  it('rejects CLI invocations with neither --prompt nor --prompts', () => {
+    expect(() => parseRunBatchArgs(['--repetitions', '1'])).toThrow(/--prompt or --prompts/);
+  });
 });
 
 describe('runBatch', () => {