diff --git a/.circleci/config.yml b/.circleci/config.yml
index 59dd245f7676..d2798c319911 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -32,7 +32,7 @@ jobs:
generate-and-run-config:
executor:
name: node/default
- resource_class: small
+ resource_class: large
steps:
- node/install:
install-yarn: true
diff --git a/.gitignore b/.gitignore
index 3460788946c0..c8417670f741 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,4 +80,13 @@ CLAUDE.local.md
.vscode/mcp.json
.mcp.json
.nx/polygraph
-.omc
\ No newline at end of file
+
+# Eval system
+scripts/eval/.cache
+scripts/eval/results
+
+# review-pr skill output
+.pr-review
+
+# Unknown
+.omc
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6bc4e4805902..ac32fd688694 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -23,6 +23,10 @@
"[jsonc]": {
"editor.defaultFormatter": "oxc.oxc-vscode"
},
+ "[typescript]": {
+ "editor.defaultFormatter": "oxc.oxc-vscode",
+ "editor.formatOnSave": true
+ },
"editor.codeActionsOnSave": {
"source.fixAll.eslint": "explicit",
"source.fixAll.oxc": "explicit"
diff --git a/AGENTS.md b/AGENTS.md
index ad985033a0db..9179e4c1c05c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -9,10 +9,11 @@ This file is the canonical instruction source for coding agents. Files like `CLA
Storybook is a large TypeScript monorepo. The git root is the repo root, the main code lives in `code/`, and build tooling lives in `scripts/`. The default branch is `next`.
- **Base branch**: `next` (all PRs should target `next`, not `main`)
-- **Node.js**: `22.21.1` (see `.nvmrc`)
+- **Node.js**: `22.22.1` (see `.nvmrc`) — supports `.ts` natively via type stripping (no loader needed)
- **Package Manager**: Yarn Berry
- **Task orchestration**: NX plus the custom `yarn task` runner
- **CI environment**: Linux and Windows
+- **TS execution**: Migrating from `jiti` to native `node` for running `.ts` files. New scripts should use `node ./path/file.ts` with explicit `.ts` import extensions (enabled by `allowImportingTsExtensions` in tsconfig). Legacy scripts still use `jiti` but should be migrated over time.
## Repository Structure
@@ -236,7 +237,7 @@ When writing tests:
After changing files:
-1. Format with `cd code && oxfmt`
+1. Format with `yarn fmt:write` (run from the repo root)
2. Lint with `yarn --cwd code lint:js:cmd --fix` or `cd code && yarn lint:js:cmd `
3. Run relevant tests before submitting a PR
diff --git a/code/addons/a11y/src/preview.tsx b/code/addons/a11y/src/preview.tsx
index c3a32f7b9067..e97ef0369fcb 100644
--- a/code/addons/a11y/src/preview.tsx
+++ b/code/addons/a11y/src/preview.tsx
@@ -20,6 +20,7 @@ export const afterEach: AfterEach = async ({
}) => {
const a11yParameter: A11yParameters | undefined = parameters.a11y;
const a11yGlobals = globals.a11y;
+ // we do not run a11y checks as part of ghost stories runs
const isGhostStories = !!globals.ghostStories;
const shouldRunEnvironmentIndependent =
diff --git a/code/addons/vitest/src/vitest-plugin/agent-telemetry-reporter.test.ts b/code/addons/vitest/src/vitest-plugin/agent-telemetry-reporter.test.ts
new file mode 100644
index 000000000000..7214a770ac39
--- /dev/null
+++ b/code/addons/vitest/src/vitest-plugin/agent-telemetry-reporter.test.ts
@@ -0,0 +1,206 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+import { AgentTelemetryReporter } from './agent-telemetry-reporter.ts';
+
+vi.mock('storybook/internal/telemetry', () => ({
+ telemetry: vi.fn(),
+ isExampleStoryId: vi.fn(
+ (id: string) =>
+ id.startsWith('example-button--') ||
+ id.startsWith('example-header--') ||
+ id.startsWith('example-page--')
+ ),
+}));
+
+const { telemetry } = await import('storybook/internal/telemetry');
+
+function createMockTestCase({
+ storyId,
+ status,
+ reports = [],
+ errors = [],
+}: {
+ storyId?: string;
+ status: 'passed' | 'failed' | 'pending';
+ reports?: Array<{ type: string; result?: Record }>;
+ errors?: Array<{ message: string; stack?: string }>;
+}) {
+ return {
+ meta: () => ({ storyId, reports }),
+ result: () => ({
+ state: status,
+ errors: status === 'failed' ? errors : [],
+ }),
+ };
+}
+
+function createMockTestModules(testCounts: { passed: number; failed: number }) {
+ const tests: Array<{ result: () => { state: string } }> = [];
+ for (let i = 0; i < testCounts.passed; i++) {
+ tests.push({ result: () => ({ state: 'passed' }) });
+ }
+ for (let i = 0; i < testCounts.failed; i++) {
+ tests.push({ result: () => ({ state: 'failed' }) });
+ }
+ return [
+ {
+ children: {
+ allTests: function* (filter?: string) {
+ for (const t of tests) {
+ if (!filter || t.result().state === filter) {
+ yield t;
+ }
+ }
+ },
+ },
+ errors: () => [],
+ },
+ ];
+}
+
+describe('AgentTelemetryReporter', () => {
+ let reporter: AgentTelemetryReporter;
+
+ beforeEach(() => {
+ vi.clearAllMocks();
+ reporter = new AgentTelemetryReporter({
+ configDir: '.storybook',
+ agent: { name: 'claude' },
+ });
+ });
+
+ describe('onTestCaseResult', () => {
+ it('should collect story test results', () => {
+ const testCase = createMockTestCase({
+ storyId: 'my-story--primary',
+ status: 'passed',
+ });
+ reporter.onTestCaseResult(testCase as any);
+ });
+
+ it('should skip tests without storyId', () => {
+ const testCase = createMockTestCase({
+ storyId: undefined,
+ status: 'passed',
+ });
+ reporter.onTestCaseResult(testCase as any);
+ });
+
+ it('should skip example story IDs', () => {
+ const testCase = createMockTestCase({
+ storyId: 'example-button--primary',
+ status: 'passed',
+ });
+ reporter.onTestCaseResult(testCase as any);
+ });
+ });
+
+ describe('onTestRunEnd', () => {
+ it('should send telemetry with analysis of collected results', async () => {
+ reporter.onInit({ config: { watch: false } } as any);
+
+ reporter.onTestCaseResult(createMockTestCase({ storyId: 's1', status: 'passed' }) as any);
+ reporter.onTestCaseResult(
+ createMockTestCase({
+ storyId: 's2',
+ status: 'failed',
+ errors: [{ message: 'Error: Module not found: foo' }],
+ }) as any
+ );
+ reporter.onTestCaseResult(
+ createMockTestCase({
+ storyId: 's3',
+ status: 'passed',
+ reports: [{ type: 'render-analysis', result: { emptyRender: true } }],
+ }) as any
+ );
+
+ await reporter.onTestRunEnd(createMockTestModules({ passed: 2, failed: 1 }) as any, []);
+
+ expect(telemetry).toHaveBeenCalledWith(
+ 'ai-setup-self-healing-scoring',
+ expect.objectContaining({
+ agent: { name: 'claude' },
+ analysis: expect.objectContaining({
+ total: 3,
+ passed: 2,
+ passedButEmptyRender: 1,
+ successRate: 0.67,
+ successRateWithoutEmptyRender: 0.33,
+ uniqueErrorCount: 1,
+ }),
+ unhandledErrorCount: 0,
+ watch: false,
+ }),
+ { configDir: '.storybook', stripMetadata: true }
+ );
+ });
+
+ it('should filter out example stories from analysis', async () => {
+ reporter.onInit({ config: { watch: false } } as any);
+
+ reporter.onTestCaseResult(
+ createMockTestCase({ storyId: 'my-story--primary', status: 'passed' }) as any
+ );
+ reporter.onTestCaseResult(
+ createMockTestCase({ storyId: 'example-button--primary', status: 'passed' }) as any
+ );
+
+ await reporter.onTestRunEnd(createMockTestModules({ passed: 2, failed: 0 }) as any, []);
+
+ expect(telemetry).toHaveBeenCalledWith(
+ 'ai-setup-self-healing-scoring',
+ expect.objectContaining({
+ analysis: expect.objectContaining({
+ total: 1,
+ passed: 1,
+ }),
+ }),
+ expect.anything()
+ );
+ });
+
+ it('should count unhandled errors', async () => {
+ reporter.onInit({ config: { watch: false } } as any);
+
+ await reporter.onTestRunEnd(
+ createMockTestModules({ passed: 0, failed: 0 }) as any,
+ [{ message: 'unhandled' }, { message: 'another' }] as any
+ );
+
+ expect(telemetry).toHaveBeenCalledWith(
+ 'ai-setup-self-healing-scoring',
+ expect.objectContaining({
+ unhandledErrorCount: 2,
+ }),
+ expect.anything()
+ );
+ });
+
+ it('should reset collected results after each run', async () => {
+ reporter.onInit({ config: { watch: false } } as any);
+
+ reporter.onTestCaseResult(createMockTestCase({ storyId: 's1', status: 'passed' }) as any);
+ await reporter.onTestRunEnd(createMockTestModules({ passed: 1, failed: 0 }) as any, []);
+
+ reporter.onTestCaseResult(
+ createMockTestCase({
+ storyId: 's2',
+ status: 'failed',
+ errors: [{ message: 'err' }],
+ }) as any
+ );
+ await reporter.onTestRunEnd(createMockTestModules({ passed: 0, failed: 1 }) as any, []);
+
+ const secondCall = vi.mocked(telemetry).mock.calls[1];
+ expect(secondCall[1]).toEqual(
+ expect.objectContaining({
+ analysis: expect.objectContaining({
+ total: 1,
+ passed: 0,
+ }),
+ })
+ );
+ });
+ });
+});
diff --git a/code/addons/vitest/src/vitest-plugin/agent-telemetry-reporter.ts b/code/addons/vitest/src/vitest-plugin/agent-telemetry-reporter.ts
new file mode 100644
index 000000000000..40c58aa65cca
--- /dev/null
+++ b/code/addons/vitest/src/vitest-plugin/agent-telemetry-reporter.ts
@@ -0,0 +1,88 @@
+import type { SerializedError } from 'vitest';
+import type { TestCase, TestModule, Vitest } from 'vitest/node';
+import type { Reporter } from 'vitest/reporters';
+
+import type { TaskMeta } from '@vitest/runner';
+import type { Report } from 'storybook/preview-api';
+import { analyzeTestResults, toStoryTestResult } from 'storybook/internal/core-server';
+import type { StoryTestResult } from 'storybook/internal/core-server';
+import { isExampleStoryId, telemetry } from 'storybook/internal/telemetry';
+import type { AgentInfo } from 'storybook/internal/telemetry';
+
+interface AgentTelemetryReporterOptions {
+ configDir: string;
+ agent: AgentInfo;
+}
+
+export class AgentTelemetryReporter implements Reporter {
+ private ctx!: Vitest;
+
+ private testResults: StoryTestResult[] = [];
+
+ private startTime = Date.now();
+
+ private configDir: string;
+
+ private agent: AgentInfo;
+
+ constructor(options: AgentTelemetryReporterOptions) {
+ this.configDir = options.configDir;
+ this.agent = options.agent;
+ }
+
+ onInit(ctx: Vitest) {
+ this.ctx = ctx;
+ }
+
+ onTestRunStart() {
+ this.startTime = Date.now();
+ }
+
+ onTestCaseResult(testCase: TestCase) {
+ const { storyId, reports } = testCase.meta() as TaskMeta &
+ Partial<{ storyId: string; reports: Report[] }>;
+
+ if (!storyId || isExampleStoryId(storyId)) {
+ return;
+ }
+
+ const testResult = testCase.result();
+ const result = toStoryTestResult({
+ storyId,
+ statusRaw: testResult.state,
+ reports,
+ errors: testResult.errors,
+ });
+
+ if (result) {
+ this.testResults.push(result);
+ }
+ }
+
+ async onTestRunEnd(
+ testModules: readonly TestModule[],
+ unhandledErrors: readonly SerializedError[]
+ ) {
+ const analysis = analyzeTestResults(this.testResults);
+ const duration = Date.now() - this.startTime;
+
+ const testModulesErrors = testModules.flatMap((t) => t.errors());
+ const unhandledErrorCount = unhandledErrors.length + testModulesErrors.length;
+
+ // Fire and forget — same pattern as the existing test-run telemetry
+ telemetry(
+ 'ai-setup-self-healing-scoring',
+ {
+ agent: this.agent,
+ analysis,
+ unhandledErrorCount,
+ duration,
+ watch: this.ctx.config.watch,
+ },
+ { configDir: this.configDir, stripMetadata: true }
+ );
+
+ // Reset for next run (watch mode)
+ this.testResults = [];
+ }
+}
diff --git a/code/addons/vitest/src/vitest-plugin/index.ts b/code/addons/vitest/src/vitest-plugin/index.ts
index 9f1bd5c7bbde..8c61af2e65a0 100644
--- a/code/addons/vitest/src/vitest-plugin/index.ts
+++ b/code/addons/vitest/src/vitest-plugin/index.ts
@@ -20,8 +20,14 @@ import {
} from 'storybook/internal/core-server';
import { componentTransform, readConfig, vitestTransform } from 'storybook/internal/csf-tools';
import { MainFileMissingError } from 'storybook/internal/server-errors';
-import { setTelemetryEnabled, telemetry } from 'storybook/internal/telemetry';
-import { oneWayHash } from 'storybook/internal/telemetry';
+import {
+ detectAgent,
+ isTelemetryModuleEnabled,
+ isWithinInitialSession,
+ oneWayHash,
+ telemetry,
+ setTelemetryEnabled,
+} from 'storybook/internal/telemetry';
import type { Presets } from 'storybook/internal/types';
import { match } from 'micromatch';
@@ -36,6 +42,7 @@ import type { PluginOption } from 'vite';
import { withoutVitePlugins } from '../../../../builders/builder-vite/src/utils/without-vite-plugins.ts';
import type { InternalOptions, UserOptions } from './types.ts';
import { requiresProjectAnnotations } from './utils.ts';
+import { AgentTelemetryReporter } from './agent-telemetry-reporter.ts';
const WORKING_DIR = process.cwd();
@@ -241,6 +248,8 @@ export const storybookTest = async (options?: UserOptions): Promise =>
plugins.push(mdxStubPlugin);
}
+ let withinAgenticSetupSession = false;
+
const storybookTestPlugin: Plugin = {
name: 'vite-plugin-storybook-test',
async transformIndexHtml(html) {
@@ -385,6 +394,15 @@ export const storybookTest = async (options?: UserOptions): Promise =>
globals.ghostStories = {
enabled: true,
};
+ globals.renderAnalysis = {
+ enabled: true,
+ };
+ }
+
+ if (withinAgenticSetupSession) {
+ globals.renderAnalysis = {
+ enabled: true,
+ };
}
return globals;
@@ -441,7 +459,7 @@ export const storybookTest = async (options?: UserOptions): Promise =>
// return the new config, it will be deep-merged by vite
return config;
},
- configureVitest(context) {
+ async configureVitest(context) {
context.vitest.config.coverage.exclude.push('storybook-static');
// NOTE: we start telemetry immediately but do not wait on it. Typically it should complete
@@ -455,6 +473,21 @@ export const storybookTest = async (options?: UserOptions): Promise =>
},
{ configDir: finalOptions.configDir }
);
+
+ if (isTelemetryModuleEnabled()) {
+ // When an agent is running vitest via CLI, inject a reporter that sends
+ // detailed test result telemetry (pass/fail, error analysis, empty renders)
+ const agent = detectAgent();
+ withinAgenticSetupSession = !!agent && (await isWithinInitialSession('ai-setup'));
+ if (agent && withinAgenticSetupSession) {
+ context.vitest.config.reporters.push(
+ new AgentTelemetryReporter({
+ configDir: finalOptions.configDir,
+ agent,
+ })
+ );
+ }
+ }
},
async configureServer(server) {
if (staticDirs) {
diff --git a/code/core/src/cli/globalSettings.ts b/code/core/src/cli/globalSettings.ts
index ee20cf140d35..6dcc016e92ba 100644
--- a/code/core/src/cli/globalSettings.ts
+++ b/code/core/src/cli/globalSettings.ts
@@ -29,6 +29,7 @@ const userSettingSchema = z.object({
items: z
.object({
accessibilityTests: statusValue,
+ aiSetup: statusValue,
autodocs: statusValue,
ciTests: statusValue,
controls: statusValue,
diff --git a/code/core/src/common/js-package-manager/JsPackageManagerFactory.test.ts b/code/core/src/common/js-package-manager/JsPackageManagerFactory.test.ts
index 59302b4fbcda..a77668765642 100644
--- a/code/core/src/common/js-package-manager/JsPackageManagerFactory.test.ts
+++ b/code/core/src/common/js-package-manager/JsPackageManagerFactory.test.ts
@@ -1,11 +1,14 @@
+import { existsSync, readFileSync } from 'node:fs';
import { join } from 'node:path';
import { beforeEach, describe, expect, it, vi } from 'vitest';
import * as find from 'empathic/find';
+import * as walk from 'empathic/walk';
import { PackageManagerName } from './index.ts';
import { executeCommandSync } from '../utils/command.ts';
+import { getProjectRoot } from '../utils/paths.ts';
import { BUNProxy } from './BUNProxy.ts';
import { JsPackageManagerFactory } from './JsPackageManagerFactory.ts';
import { NPMProxy } from './NPMProxy.ts';
@@ -19,11 +22,24 @@ const executeCommandSyncMock = vi.mocked(executeCommandSync);
vi.mock('empathic/find');
const findMock = vi.mocked(find);
+vi.mock('empathic/walk');
+const walkMock = vi.mocked(walk);
+
+vi.mock('../utils/paths', { spy: true });
+const getProjectRootMock = vi.mocked(getProjectRoot);
+
+vi.mock('node:fs', { spy: true });
+const readFileSyncMock = vi.mocked(readFileSync);
+const existsSyncMock = vi.mocked(existsSync);
+
describe('CLASS: JsPackageManagerFactory', () => {
beforeEach(() => {
JsPackageManagerFactory.clearCache();
findMock.up.mockReturnValue(undefined);
findMock.any.mockReturnValue(undefined);
+ walkMock.up.mockReturnValue([]);
+ getProjectRootMock.mockReturnValue(process.cwd());
+ existsSyncMock.mockReturnValue(false);
executeCommandSyncMock.mockImplementation(() => {
throw new Error('Command not found');
});
@@ -115,10 +131,15 @@ describe('CLASS: JsPackageManagerFactory', () => {
});
it('PNPM LOCK IF CLOSER: when a pnpm-lock.yaml file is closer than a yarn.lock', async () => {
- // Allow find to work as normal, we'll set the cwd to our fixture package
- findMock.up.mockImplementation(
- (await vi.importActual('empathic/find')).up
- );
+ // Use real find.up for lockfile resolution but exclude .yarnrc.yml
+ // so the test doesn't depend on the host repo's own Yarn Berry config
+ const realFind = await vi.importActual('empathic/find');
+ findMock.up.mockImplementation((filename, opts) => {
+ if (typeof filename === 'string' && filename === '.yarnrc.yml') {
+ return undefined;
+ }
+ return realFind.up(filename, opts);
+ });
executeCommandSyncMock.mockImplementation((options) => {
// Yarn is ok
@@ -212,10 +233,15 @@ describe('CLASS: JsPackageManagerFactory', () => {
});
it('when multiple lockfiles are in a project, prefers yarn', async () => {
- // Allow find to work as normal, we'll set the cwd to our fixture package
- findMock.up.mockImplementation(
- (await vi.importActual('empathic/find')).up
- );
+ // Use real find.up for lockfile resolution but exclude .yarnrc.yml
+ // so the test doesn't depend on the host repo's own Yarn Berry config
+ const realFind = await vi.importActual('empathic/find');
+ findMock.up.mockImplementation((filename, opts) => {
+ if (typeof filename === 'string' && filename === '.yarnrc.yml') {
+ return undefined;
+ }
+ return realFind.up(filename, opts);
+ });
executeCommandSyncMock.mockImplementation((options) => {
// Yarn is ok
@@ -306,6 +332,180 @@ describe('CLASS: JsPackageManagerFactory', () => {
expect(JsPackageManagerFactory.getPackageManager()).toBeInstanceOf(Yarn2Proxy);
});
+
+ it('BERRY VIA .yarnrc.yml: when yarn --version reports 1.x but .yarnrc.yml exists', () => {
+ executeCommandSyncMock.mockImplementation((options) => {
+ // Yarn reports 1.x (global yarn classic)
+ if (options.command === 'yarn' && options.args?.[0] === '--version') {
+ return '1.22.4';
+ }
+ // NPM is ko
+ if (options.command === 'npm' && options.args?.[0] === '--version') {
+ throw new Error('Command not found');
+ }
+ throw new Error('Command not found');
+ });
+
+ findMock.up.mockImplementation((filename) => {
+ if (typeof filename === 'string' && filename === 'yarn.lock') {
+ return '/Users/johndoe/Documents/yarn.lock';
+ }
+ if (typeof filename === 'string' && filename === '.yarnrc.yml') {
+ return '/Users/johndoe/Documents/.yarnrc.yml';
+ }
+ return undefined;
+ });
+
+ expect(JsPackageManagerFactory.getPackageManager()).toBeInstanceOf(Yarn2Proxy);
+ });
+
+ it('BERRY VIA packageManager FIELD: when yarn --version reports 1.x but package.json has packageManager yarn@4.1.0', () => {
+ executeCommandSyncMock.mockImplementation((options) => {
+ // Yarn reports 1.x (global yarn classic)
+ if (options.command === 'yarn' && options.args?.[0] === '--version') {
+ return '1.22.4';
+ }
+ // NPM is ko
+ if (options.command === 'npm' && options.args?.[0] === '--version') {
+ throw new Error('Command not found');
+ }
+ throw new Error('Command not found');
+ });
+
+ findMock.up.mockImplementation((filename) => {
+ if (typeof filename === 'string' && filename === 'yarn.lock') {
+ return '/Users/johndoe/Documents/yarn.lock';
+ }
+ return undefined;
+ });
+
+ const cwd = process.cwd();
+ walkMock.up.mockReturnValue([cwd]);
+ existsSyncMock.mockImplementation((p) => {
+ if (p === join(cwd, 'package.json')) {
+ return true;
+ }
+ return false;
+ });
+
+ readFileSyncMock.mockImplementation((filePath, encoding) => {
+ if (
+ typeof filePath === 'string' &&
+ filePath === join(cwd, 'package.json') &&
+ encoding === 'utf-8'
+ ) {
+ return JSON.stringify({ packageManager: 'yarn@4.1.0' });
+ }
+ throw new Error('File not found');
+ });
+
+ expect(JsPackageManagerFactory.getPackageManager()).toBeInstanceOf(Yarn2Proxy);
+ });
+
+ it('BERRY VIA packageManager FIELD WITH HASH: when package.json has packageManager yarn@4.1.0+sha256.xxx', () => {
+ executeCommandSyncMock.mockImplementation((options) => {
+ if (options.command === 'yarn' && options.args?.[0] === '--version') {
+ return '1.22.4';
+ }
+ if (options.command === 'npm' && options.args?.[0] === '--version') {
+ throw new Error('Command not found');
+ }
+ throw new Error('Command not found');
+ });
+
+ findMock.up.mockImplementation((filename) => {
+ if (typeof filename === 'string' && filename === 'yarn.lock') {
+ return '/Users/johndoe/Documents/yarn.lock';
+ }
+ return undefined;
+ });
+
+ const cwd = process.cwd();
+ walkMock.up.mockReturnValue([cwd]);
+ existsSyncMock.mockImplementation((p) => {
+ if (p === join(cwd, 'package.json')) {
+ return true;
+ }
+ return false;
+ });
+
+ readFileSyncMock.mockImplementation((filePath, encoding) => {
+ if (
+ typeof filePath === 'string' &&
+ filePath === join(cwd, 'package.json') &&
+ encoding === 'utf-8'
+ ) {
+ return JSON.stringify({
+ packageManager:
+ 'yarn@4.1.0+sha256.81a00df816059803e6b5148acf03ce313cad36b7f6e5af6efa040a8db86b7e8f',
+ });
+ }
+ throw new Error('File not found');
+ });
+
+ expect(JsPackageManagerFactory.getPackageManager()).toBeInstanceOf(Yarn2Proxy);
+ });
+
+ it('BERRY v3: when yarn --version reports 3.x', () => {
+ executeCommandSyncMock.mockImplementation((options) => {
+ if (options.command === 'yarn' && options.args?.[0] === '--version') {
+ return '3.6.4';
+ }
+ if (options.command === 'npm' && options.args?.[0] === '--version') {
+ throw new Error('Command not found');
+ }
+ throw new Error('Command not found');
+ });
+
+ findMock.up.mockImplementation((filename) => {
+ if (typeof filename === 'string' && filename === 'yarn.lock') {
+ return '/Users/johndoe/Documents/yarn.lock';
+ }
+ return undefined;
+ });
+
+ expect(JsPackageManagerFactory.getPackageManager()).toBeInstanceOf(Yarn2Proxy);
+ });
+
+ it('BERRY v4: when yarn --version reports 4.x', () => {
+ executeCommandSyncMock.mockImplementation((options) => {
+ if (options.command === 'yarn' && options.args?.[0] === '--version') {
+ return '4.1.0';
+ }
+ if (options.command === 'npm' && options.args?.[0] === '--version') {
+ throw new Error('Command not found');
+ }
+ throw new Error('Command not found');
+ });
+
+ findMock.up.mockImplementation((filename) => {
+ if (typeof filename === 'string' && filename === 'yarn.lock') {
+ return '/Users/johndoe/Documents/yarn.lock';
+ }
+ return undefined;
+ });
+
+ expect(JsPackageManagerFactory.getPackageManager()).toBeInstanceOf(Yarn2Proxy);
+ });
+
+ it('BERRY WHEN YARN FAILS: when yarn command fails but .yarnrc.yml exists', () => {
+ executeCommandSyncMock.mockImplementation(() => {
+ // All commands fail
+ throw new Error('Command not found');
+ });
+
+ findMock.up.mockImplementation((filename) => {
+ if (typeof filename === 'string' && filename === 'yarn.lock') {
+ return '/Users/johndoe/Documents/yarn.lock';
+ }
+ if (typeof filename === 'string' && filename === '.yarnrc.yml') {
+ return '/Users/johndoe/Documents/.yarnrc.yml';
+ }
+ return undefined;
+ });
+
+ expect(JsPackageManagerFactory.getPackageManager()).toBeInstanceOf(Yarn2Proxy);
+ });
});
describe('BUN proxy', () => {
@@ -357,4 +557,204 @@ describe('CLASS: JsPackageManagerFactory', () => {
expect(() => JsPackageManagerFactory.getPackageManager()).toThrow();
});
});
+
+ describe('getYarnVersionFromPackageJson walks upward', () => {
+ /**
+ * Helper: make yarn.lock discoverable and have `yarn --version` report 1.x, so a
+ * Berry result can only come from the packageManager lookup that getYarnVersion runs first.
+ */
+ function setupYarnScenario() {
+ executeCommandSyncMock.mockImplementation((options) => {
+ if (options.command === 'yarn' && options.args?.[0] === '--version') {
+ return '1.22.4';
+ }
+ throw new Error('Command not found');
+ });
+ findMock.up.mockImplementation((filename) => {
+ if (typeof filename === 'string' && filename === 'yarn.lock') {
+ return '/repo/yarn.lock';
+ }
+ return undefined;
+ });
+ }
+
+ it('walks past a workspace package.json without packageManager to find root packageManager', () => {
+ setupYarnScenario();
+ getProjectRootMock.mockReturnValue('/repo');
+
+ // walk.up returns directories from cwd upward to root
+ walkMock.up.mockReturnValue(['/repo/packages/my-app', '/repo/packages', '/repo']);
+
+ existsSyncMock.mockImplementation((p) => {
+ if (p === join('/repo/packages/my-app', 'package.json')) {
+ return true;
+ }
+ if (p === join('/repo', 'package.json')) {
+ return true;
+ }
+ return false;
+ });
+
+ readFileSyncMock.mockImplementation((filePath, encoding) => {
+ if (filePath === join('/repo/packages/my-app', 'package.json') && encoding === 'utf-8') {
+ return JSON.stringify({ name: 'my-app', version: '1.0.0' });
+ }
+ if (filePath === join('/repo', 'package.json') && encoding === 'utf-8') {
+ return JSON.stringify({ packageManager: 'yarn@4.1.0' });
+ }
+ throw new Error('File not found');
+ });
+
+ const cwd = '/repo/packages/my-app';
+ expect(JsPackageManagerFactory.getPackageManager({}, cwd)).toBeInstanceOf(Yarn2Proxy);
+ });
+
+ it('uses getProjectRoot() as the starting context for the walk', () => {
+ setupYarnScenario();
+ getProjectRootMock.mockReturnValue('/custom-root');
+
+ walkMock.up.mockReturnValue([
+ '/custom-root/packages/app',
+ '/custom-root/packages',
+ '/custom-root',
+ ]);
+
+ existsSyncMock.mockImplementation((p) => {
+ if (p === join('/custom-root', 'package.json')) {
+ return true;
+ }
+ return false;
+ });
+
+ readFileSyncMock.mockImplementation((filePath, encoding) => {
+ if (filePath === join('/custom-root', 'package.json') && encoding === 'utf-8') {
+ return JSON.stringify({ packageManager: 'yarn@4.1.0' });
+ }
+ throw new Error('File not found');
+ });
+
+ const cwd = '/custom-root/packages/app';
+ expect(JsPackageManagerFactory.getPackageManager({}, cwd)).toBeInstanceOf(Yarn2Proxy);
+ // Verify walk.up was called with the project root as the `last` boundary
+ expect(walkMock.up).toHaveBeenCalledWith(cwd, { last: '/custom-root' });
+ });
+
+ it('falls back to cwd when getProjectRoot returns cwd', () => {
+ setupYarnScenario();
+ const cwd = '/some/project';
+ getProjectRootMock.mockReturnValue(cwd);
+
+ walkMock.up.mockReturnValue([cwd]);
+
+ existsSyncMock.mockImplementation((p) => {
+ if (p === join(cwd, 'package.json')) {
+ return true;
+ }
+ return false;
+ });
+
+ readFileSyncMock.mockImplementation((filePath, encoding) => {
+ if (filePath === join(cwd, 'package.json') && encoding === 'utf-8') {
+ return JSON.stringify({ packageManager: 'yarn@1.22.0' });
+ }
+ throw new Error('File not found');
+ });
+
+ expect(JsPackageManagerFactory.getPackageManager({}, cwd)).toBeInstanceOf(Yarn1Proxy);
+ });
+
+ it('prefers the closest package.json that declares packageManager', () => {
+ setupYarnScenario();
+ getProjectRootMock.mockReturnValue('/repo');
+
+ walkMock.up.mockReturnValue(['/repo/packages/my-app', '/repo/packages', '/repo']);
+
+ existsSyncMock.mockImplementation((p) => {
+ if (p === join('/repo/packages/my-app', 'package.json')) {
+ return true;
+ }
+ if (p === join('/repo', 'package.json')) {
+ return true;
+ }
+ return false;
+ });
+
+ readFileSyncMock.mockImplementation((filePath, encoding) => {
+ // Closest has yarn@1 packageManager
+ if (filePath === join('/repo/packages/my-app', 'package.json') && encoding === 'utf-8') {
+ return JSON.stringify({ packageManager: 'yarn@1.22.0' });
+ }
+ // Root has yarn@4 packageManager
+ if (filePath === join('/repo', 'package.json') && encoding === 'utf-8') {
+ return JSON.stringify({ packageManager: 'yarn@4.1.0' });
+ }
+ throw new Error('File not found');
+ });
+
+ const cwd = '/repo/packages/my-app';
+ // Should pick up the closest one (yarn@1)
+ expect(JsPackageManagerFactory.getPackageManager({}, cwd)).toBeInstanceOf(Yarn1Proxy);
+ });
+
+ it('returns undefined when no package.json in any ancestor declares packageManager', () => {
+ setupYarnScenario();
+ getProjectRootMock.mockReturnValue('/repo');
+
+ walkMock.up.mockReturnValue(['/repo/packages/my-app', '/repo']);
+
+ existsSyncMock.mockImplementation((p) => {
+ if (p === join('/repo/packages/my-app', 'package.json')) {
+ return true;
+ }
+ if (p === join('/repo', 'package.json')) {
+ return true;
+ }
+ return false;
+ });
+
+ readFileSyncMock.mockImplementation((filePath, encoding) => {
+ if (filePath === join('/repo/packages/my-app', 'package.json') && encoding === 'utf-8') {
+ return JSON.stringify({ name: 'my-app' });
+ }
+ if (filePath === join('/repo', 'package.json') && encoding === 'utf-8') {
+ return JSON.stringify({ name: 'monorepo' });
+ }
+ throw new Error('File not found');
+ });
+
+ const cwd = '/repo/packages/my-app';
+ // No packageManager anywhere, yarn --version reports 1.x, no .yarnrc.yml → Yarn1
+ expect(JsPackageManagerFactory.getPackageManager({}, cwd)).toBeInstanceOf(Yarn1Proxy);
+ });
+
+ it('skips unparsable package.json files and continues walking', () => {
+ setupYarnScenario();
+ getProjectRootMock.mockReturnValue('/repo');
+
+ walkMock.up.mockReturnValue(['/repo/packages/broken', '/repo']);
+
+ existsSyncMock.mockImplementation((p) => {
+ if (p === join('/repo/packages/broken', 'package.json')) {
+ return true;
+ }
+ if (p === join('/repo', 'package.json')) {
+ return true;
+ }
+ return false;
+ });
+
+ readFileSyncMock.mockImplementation((filePath, encoding) => {
+ if (filePath === join('/repo/packages/broken', 'package.json') && encoding === 'utf-8') {
+ return '{ invalid json';
+ }
+ if (filePath === join('/repo', 'package.json') && encoding === 'utf-8') {
+ return JSON.stringify({ packageManager: 'yarn@4.1.0' });
+ }
+ throw new Error('File not found');
+ });
+
+ const cwd = '/repo/packages/broken';
+ expect(JsPackageManagerFactory.getPackageManager({}, cwd)).toBeInstanceOf(Yarn2Proxy);
+ });
+ });
});
diff --git a/code/core/src/common/js-package-manager/JsPackageManagerFactory.ts b/code/core/src/common/js-package-manager/JsPackageManagerFactory.ts
index 853fdf05db87..91586eabf503 100644
--- a/code/core/src/common/js-package-manager/JsPackageManagerFactory.ts
+++ b/code/core/src/common/js-package-manager/JsPackageManagerFactory.ts
@@ -1,6 +1,8 @@
-import { basename, parse, relative } from 'node:path';
+import { existsSync, readFileSync } from 'node:fs';
+import { basename, join, parse, relative } from 'node:path';
import * as find from 'empathic/find';
+import * as walk from 'empathic/walk';
import { executeCommandSync } from '../utils/command.ts';
import { getProjectRoot } from '../utils/paths.ts';
@@ -249,7 +251,63 @@ function hasPNPM(cwd?: string) {
}
}
+/**
+ * Walk upward from `cwd` to `root`, checking each package.json for a
+ * `packageManager` field specifying yarn. Returns 1 for `yarn@1.x` and 2 for any
+ * other yarn major version, or undefined once every directory has been checked.
+ *
+ * This avoids a common monorepo pitfall where the closest package.json
+ * (a workspace package) lacks `packageManager` while the repo-root
+ * package.json declares it.
+ */
+function getYarnVersionFromPackageJson(cwd?: string, root?: string): 1 | 2 | undefined {
+ const effectiveRoot = root ?? getProjectRoot();
+ const directories = walk.up(cwd ?? process.cwd(), { last: effectiveRoot });
+
+ for (const dir of directories) {
+ const packageJsonPath = join(dir, 'package.json');
+ if (!existsSync(packageJsonPath)) {
+ continue;
+ }
+
+ try {
+ const content = JSON.parse(readFileSync(packageJsonPath, 'utf-8'));
+ const packageManager: unknown = content.packageManager;
+ if (typeof packageManager === 'string') {
+ const match = packageManager.match(/^yarn@(\d+)\./);
+ if (match) {
+ return match[1] === '1' ? 1 : 2;
+ }
+ }
+ } catch {
+ // Ignore parse errors and continue walking
+ }
+ }
+
+ return undefined;
+}
+
+/**
+ * Check whether a `.yarnrc.yml` file exists in the project tree.
+ * This file only exists in Yarn Berry (v2+) projects.
+ */
+function hasYarnBerryConfig(cwd?: string, root?: string): boolean {
+ return find.up('.yarnrc.yml', { cwd, last: root }) !== undefined;
+}
+
function getYarnVersion(cwd?: string): 1 | 2 | undefined {
+ const root = getProjectRoot();
+
+ // 1. Check the packageManager field of each package.json from cwd up to the project root (highest priority)
+ const versionFromPackageJson = getYarnVersionFromPackageJson(cwd, root);
+ if (versionFromPackageJson !== undefined) {
+ return versionFromPackageJson;
+ }
+
+ // 2. Check for .yarnrc.yml (Berry-only config file)
+ const hasBerryConfig = hasYarnBerryConfig(cwd, root);
+
+ // 3. Run yarn --version
try {
const yarnVersion = executeCommandSync({
command: 'yarn',
@@ -259,8 +317,20 @@ function getYarnVersion(cwd?: string): 1 | 2 | undefined {
Object.entries(process.env).filter(([, value]) => value !== undefined)
) as Record,
});
- return /^1\.+/.test(yarnVersion.trim()) ? 1 : 2;
+
+ if (/^1\./.test(yarnVersion.trim())) {
+ // yarn --version reports 1.x, but .yarnrc.yml means it's actually Berry; happens if the user's global Yarn is used because they forgot to enable corepack
+ return hasBerryConfig ? 2 : 1;
+ }
+
+ return 2;
} catch (err) {
+ // 4. yarn command failed — fall back to .yarnrc.yml presence
+ if (hasBerryConfig) {
+ return 2;
+ }
+
+ // 5. No yarn command and no .yarnrc.yml
return undefined;
}
}
diff --git a/code/core/src/common/satellite-addons.ts b/code/core/src/common/satellite-addons.ts
index 122d3b59cba7..98d2a89cc928 100644
--- a/code/core/src/common/satellite-addons.ts
+++ b/code/core/src/common/satellite-addons.ts
@@ -7,6 +7,7 @@ export default [
'@storybook/addon-coverage',
'@storybook/addon-webpack5-compiler-babel',
'@storybook/addon-webpack5-compiler-swc',
+ '@storybook/addon-mcp',
// Storybook for React Native related packages
// TODO: For Storybook 10, we should check about possible automigrations
'@storybook/addon-ondevice-actions',
diff --git a/code/core/src/components/components/syntaxhighlighter/syntaxhighlighter.tsx b/code/core/src/components/components/syntaxhighlighter/syntaxhighlighter.tsx
index f10d31906470..f55e91f87802 100644
--- a/code/core/src/components/components/syntaxhighlighter/syntaxhighlighter.tsx
+++ b/code/core/src/components/components/syntaxhighlighter/syntaxhighlighter.tsx
@@ -1,9 +1,4 @@
-import type { MouseEvent } from 'react';
-import React, { useCallback, useEffect, useState } from 'react';
-
-import { logger } from 'storybook/internal/client-logger';
-
-import { global } from '@storybook/global';
+import React, { useEffect, useState } from 'react';
import memoize from 'memoizerific';
// @ts-expect-error (Converted from ts-ignore)
@@ -26,15 +21,13 @@ import { styled } from 'storybook/theming';
import { ActionBar } from '../ActionBar/ActionBar.tsx';
import type { ScrollAreaProps } from '../ScrollArea/ScrollArea.tsx';
import { ScrollArea } from '../ScrollArea/ScrollArea.tsx';
-import { createCopyToClipboardFunction } from './clipboard.ts';
+import { useCopyButton } from '../../../shared/useCopyButton.ts';
import type {
SyntaxHighlighterProps,
SyntaxHighlighterRenderer,
SyntaxHighlighterRendererProps,
} from './syntaxhighlighter-types.ts';
-const { window: globalWindow } = global;
-
export const supportedLanguages = {
jsextra: jsExtras,
jsx,
@@ -57,8 +50,6 @@ const themedSyntax = memoize(2)((theme) =>
Object.entries(theme.code || {}).reduce((acc, [key, val]) => ({ ...acc, [`* .${key}`]: val }), {})
);
-const copyToClipboard: (text: string) => Promise = createCopyToClipboardFunction();
-
export interface WrapperProps {
bordered?: boolean;
padded?: boolean;
@@ -179,10 +170,6 @@ const wrapRenderer = (
return defaultRenderer;
};
-export interface SyntaxHighlighterState {
- copied: boolean;
-}
-
// copied from @types/react-syntax-highlighter/index.d.ts
export const SyntaxHighlighter = ({
@@ -211,20 +198,9 @@ export const SyntaxHighlighter = ({
}
}, [children, format, formatter]);
- const [copied, setCopied] = useState(false);
-
- const onClick = useCallback(
- (e: MouseEvent) => {
- e.preventDefault();
- copyToClipboard(highlightableCode)
- .then(() => {
- setCopied(true);
- globalWindow.setTimeout(() => setCopied(false), 1500);
- })
- .catch(logger.error);
- },
- [highlightableCode]
- );
+ const { children: copyChildren, buttonProps: copyButtonProps } = useCopyButton({
+ content: highlightableCode,
+ });
const renderer = wrapRenderer(rest.renderer, showLineNumbers);
return (
@@ -252,7 +228,15 @@ export const SyntaxHighlighter = ({
{copyable ? (
-
+
) : null}
);
diff --git a/code/core/src/core-events/index.ts b/code/core/src/core-events/index.ts
index cd0f562308d9..8c61b60ae7fc 100644
--- a/code/core/src/core-events/index.ts
+++ b/code/core/src/core-events/index.ts
@@ -91,6 +91,9 @@ enum events {
// Story discovery and testing flow
GHOST_STORIES_REQUEST = 'ghostStoriesRequest',
GHOST_STORIES_RESPONSE = 'ghostStoriesResponse',
+ // AI analytics - ai setup command
+ AI_SETUP_ANALYTICS_RESPONSE = 'aiSetupAnalyticsResponse',
+ AI_SETUP_ANALYTICS_REQUEST = 'aiSetupAnalyticsRequest',
// Open a file in the code editor
OPEN_IN_EDITOR_REQUEST = 'openInEditorRequest',
OPEN_IN_EDITOR_RESPONSE = 'openInEditorResponse',
@@ -100,6 +103,8 @@ enum events {
SHARE_STORY_LINK = 'shareStoryLink',
SHARE_ISOLATE_MODE = 'shareIsolateMode',
SHARE_POPOVER_OPENED = 'sharePopoverOpened',
+
+ AI_PROMPT_NUDGE = 'aiPromptNudge',
SIDEBAR_FILTER_CHANGED = 'sidebarFilterChanged',
}
@@ -169,12 +174,15 @@ export const {
ARGTYPES_INFO_RESPONSE,
GHOST_STORIES_REQUEST,
GHOST_STORIES_RESPONSE,
+ AI_SETUP_ANALYTICS_RESPONSE,
+ AI_SETUP_ANALYTICS_REQUEST,
OPEN_IN_EDITOR_REQUEST,
OPEN_IN_EDITOR_RESPONSE,
MANAGER_INERT_ATTRIBUTE_CHANGED,
SHARE_STORY_LINK,
SHARE_ISOLATE_MODE,
SHARE_POPOVER_OPENED,
+ AI_PROMPT_NUDGE,
SIDEBAR_FILTER_CHANGED,
} = events;
diff --git a/code/core/src/core-server/build-dev.onboarding.test.ts b/code/core/src/core-server/build-dev.onboarding.test.ts
new file mode 100644
index 000000000000..679c921304da
--- /dev/null
+++ b/code/core/src/core-server/build-dev.onboarding.test.ts
@@ -0,0 +1,64 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+// Use vi.hoisted so these mocks already exist when the hoisted vi.mock factories run
+const { mockCacheGet, mockCacheRemove, mockDetectAgent } = vi.hoisted(() => ({
+ mockCacheGet: vi.fn(),
+ mockCacheRemove: vi.fn(),
+ mockDetectAgent: vi.fn(),
+}));
+
+vi.mock('storybook/internal/common', async (importOriginal) => {
+ const actual = await importOriginal<typeof import('storybook/internal/common')>();
+ return {
+ ...actual,
+ cache: {
+ get: mockCacheGet,
+ set: vi.fn(),
+ remove: mockCacheRemove,
+ },
+ };
+});
+
+vi.mock('storybook/internal/telemetry', async (importOriginal) => {
+ const actual = await importOriginal<typeof import('storybook/internal/telemetry')>();
+ return { ...actual, detectAgent: mockDetectAgent };
+});
+
+import { resolveOnboardingInitialPath } from './build-dev.ts';
+
+describe('resolveOnboardingInitialPath', () => {
+ beforeEach(() => {
+ vi.clearAllMocks();
+ mockDetectAgent.mockReturnValue(undefined); // default: not an agent
+ mockCacheRemove.mockResolvedValue(undefined);
+ });
+
+ it('returns /onboarding and removes cache entry when onboarding-pending is set and no CLI initialPath', async () => {
+ mockCacheGet.mockResolvedValue(true);
+ const result = await resolveOnboardingInitialPath(undefined);
+ expect(result).toBe('/onboarding');
+ expect(mockCacheRemove).toHaveBeenCalledWith('onboarding-pending');
+ });
+
+ it('returns undefined and does not remove cache when onboarding-pending is absent', async () => {
+ mockCacheGet.mockResolvedValue(undefined);
+ const result = await resolveOnboardingInitialPath(undefined);
+ expect(result).toBeUndefined();
+ expect(mockCacheRemove).not.toHaveBeenCalled();
+ });
+
+ it('returns CLI initialPath and does NOT remove cache when CLI initialPath is already set', async () => {
+ mockCacheGet.mockResolvedValue(true);
+ const result = await resolveOnboardingInitialPath('/my-story');
+ expect(result).toBe('/my-story');
+ expect(mockCacheRemove).not.toHaveBeenCalled();
+ });
+
+ it('returns undefined and does NOT remove cache when running in an agent context', async () => {
+ mockDetectAgent.mockReturnValue({ name: 'claude' });
+ mockCacheGet.mockResolvedValue(true);
+ const result = await resolveOnboardingInitialPath(undefined);
+ expect(result).toBeUndefined();
+ expect(mockCacheRemove).not.toHaveBeenCalled();
+ });
+});
diff --git a/code/core/src/core-server/build-dev.ts b/code/core/src/core-server/build-dev.ts
index 29e58d421a07..5de72fdfdae3 100644
--- a/code/core/src/core-server/build-dev.ts
+++ b/code/core/src/core-server/build-dev.ts
@@ -2,6 +2,7 @@ import { readFile } from 'node:fs/promises';
import {
JsPackageManagerFactory,
+ cache,
getConfigInfo,
getInterpretedFile,
getProjectRoot,
@@ -15,7 +16,12 @@ import {
} from 'storybook/internal/common';
import { CLI_COLORS, deprecate, logger, prompt } from 'storybook/internal/node-logger';
import { MissingBuilderError, NoStatsForViteDevError } from 'storybook/internal/server-errors';
-import { oneWayHash, setTelemetryEnabled, telemetry } from 'storybook/internal/telemetry';
+import {
+ detectAgent,
+ oneWayHash,
+ setTelemetryEnabled,
+ telemetry,
+} from 'storybook/internal/telemetry';
import type { BuilderOptions, CLIOptions, LoadOptions, Options } from 'storybook/internal/types';
import { global } from '@storybook/global';
@@ -41,6 +47,32 @@ import { updateCheck } from './utils/update-check.ts';
import { warnOnIncompatibleAddons } from './utils/warnOnIncompatibleAddons.ts';
import { warnWhenUsingArgTypesRegex } from './utils/warnWhenUsingArgTypesRegex.ts';
+/**
+ * Resolves the initialPath for the browser open URL.
+ * CLI-provided initialPath always wins. If not set and not running in an agent context,
+ * checks the project cache for an `onboarding-pending` entry written by `storybook init`.
+ * If found, returns '/onboarding' and removes the cache entry so it only triggers once.
+ * The cache entry is only written by init when onboarding is known to be supported, so no
+ * further addon check is needed here. Cache read and removal failures are ignored.
+ */
+export async function resolveOnboardingInitialPath(
+ cliInitialPath: string | undefined
+): Promise<string | undefined> {
+ if (cliInitialPath || detectAgent()) {
+ // Explicit CLI flag wins; leave cache intact for next run.
+ // Agent environments skip onboarding (no browser to open).
+ return cliInitialPath;
+ }
+ const onboardingPending = await cache.get('onboarding-pending').catch(() => {});
+ if (onboardingPending) {
+ try {
+ await cache.remove('onboarding-pending');
+ } catch {}
+ return '/onboarding';
+ }
+ return undefined;
+}
+
export async function buildDevStandalone(
options: CLIOptions &
LoadOptions &
@@ -87,13 +119,10 @@ export async function buildDevStandalone(
const cacheKey = oneWayHash(relative(getProjectRoot(), configDir));
- const cacheOutputDir = resolvePathInStorybookCache('public', cacheKey);
- let outputDir = resolve(options.outputDir || cacheOutputDir);
- if (options.smokeTest) {
- outputDir = cacheOutputDir;
- }
-
+ // Resolve initialPath: CLI flag takes precedence; fall back to onboarding-pending cache entry.
invariant(port, 'expected options to have a port');
+ options.initialPath = await resolveOnboardingInitialPath(options.initialPath);
+
const { address: localAddress, networkAddress } = getServerAddresses(
port,
options.host,
@@ -101,6 +130,12 @@ export async function buildDevStandalone(
options.initialPath
);
+ const cacheOutputDir = resolvePathInStorybookCache('public', cacheKey);
+ let outputDir = resolve(options.outputDir || cacheOutputDir);
+ if (options.smokeTest) {
+ outputDir = cacheOutputDir;
+ }
+
options.port = port;
options.versionCheck = versionCheck;
options.configType = 'DEVELOPMENT';
@@ -124,6 +159,7 @@ export async function buildDevStandalone(
const config = await loadMainConfig(options);
const { core, framework } = config;
+
const corePresets = [];
let frameworkName = typeof framework === 'string' ? framework : framework?.name;
diff --git a/code/core/src/core-server/index.ts b/code/core/src/core-server/index.ts
index f5f4d75a3be9..7f0aa12b6824 100644
--- a/code/core/src/core-server/index.ts
+++ b/code/core/src/core-server/index.ts
@@ -36,4 +36,10 @@ export {
universalTestProviderStore as internal_universalTestProviderStore,
} from './stores/test-provider.ts';
+export { getComponentCandidates } from './utils/ghost-stories/get-candidates.ts';
+export { runStoryTests } from './utils/ghost-stories/run-story-tests.ts';
export { getServerPort } from './utils/server-address.ts';
+
+export { analyzeTestResults } from '../shared/utils/analyze-test-results.ts';
+export type { StoryTestResult } from '../shared/utils/test-result-types.ts';
+export { toStoryTestResult } from '../shared/utils/to-story-test-result.ts';
diff --git a/code/core/src/core-server/presets/common-preset.ts b/code/core/src/core-server/presets/common-preset.ts
index 2c6a804326c9..89fa7cbf5900 100644
--- a/code/core/src/core-server/presets/common-preset.ts
+++ b/code/core/src/core-server/presets/common-preset.ts
@@ -41,6 +41,7 @@ import { initializeSaveStory } from '../utils/save-story/save-story.ts';
import { parseStaticDir } from '../utils/server-statics.ts';
import { type OptionsWithRequiredCache, initializeWhatsNew } from '../utils/whats-new.ts';
import { getWsToken } from './wsToken.ts';
+import { initAIAnalyticsChannel } from '../server-channel/ai-setup-channel.ts';
const interpolate = (string: string, data: Record = {}) =>
Object.entries(data).reduce((acc, [k, v]) => acc.replace(new RegExp(`%${k}%`, 'g'), v), string);
@@ -275,15 +276,15 @@ export const experimental_serverChannel = async (
await setTelemetryEnabled(!coreOptions?.disableTelemetry);
- initializeChecklist();
+ initAIAnalyticsChannel(channel, options, () => storyIndexGeneratorPromise);
+ initializeChecklist(channel);
initializeWhatsNew(channel, options);
initializeSaveStory(channel, options);
-
initFileSearchChannel(channel, options);
initCreateNewStoryChannel(channel, options);
initGhostStoriesChannel(channel, options);
initOpenInEditorChannel(channel);
- initTelemetryChannel(channel, options);
+ initTelemetryChannel(channel);
return channel;
};
diff --git a/code/core/src/core-server/server-channel/ai-setup-channel.ts b/code/core/src/core-server/server-channel/ai-setup-channel.ts
new file mode 100644
index 000000000000..758229aba8d3
--- /dev/null
+++ b/code/core/src/core-server/server-channel/ai-setup-channel.ts
@@ -0,0 +1,126 @@
+import type { Channel } from 'storybook/internal/channels';
+import {
+ AI_SETUP_ANALYTICS_REQUEST,
+ AI_SETUP_ANALYTICS_RESPONSE,
+} from 'storybook/internal/core-events';
+import {
+ getLastEvents,
+ getStorybookMetadata,
+ isStoryCreatedByAISetup,
+ telemetry,
+} from 'storybook/internal/telemetry';
+import type { Options } from 'storybook/internal/types';
+import { logger } from 'storybook/internal/node-logger';
+
+import { runStoryTests } from '../utils/ghost-stories/run-story-tests.ts';
+import type { StoryIndexGenerator } from 'storybook/internal/core-server';
+import { waitForIdleVitest } from '../utils/wait-for-idle-vitest.ts';
+
+export function initAIAnalyticsChannel(
+ channel: Channel,
+ options: Options,
+ getStoryIndexGeneratorPromise?: () => Promise<StoryIndexGenerator> | undefined
+) {
+ /** On request, score the stories created by ai setup (once) and report the result as telemetry. */
+ channel.on(AI_SETUP_ANALYTICS_REQUEST, async () => {
+ const stats: {
+ fileCount?: number;
+ storyCount?: number;
+ testRunDuration?: number;
+ } = {}; // NOTE(review): never written to below, so the catch path always reports empty stats
+
+ try {
+ const lastEvents = await getLastEvents();
+ const lastAISetup = lastEvents?.['ai-setup'];
+ const lastSetupStoryScoringRun = lastEvents?.['ai-setup-final-scoring'];
+
+ // Only run if sb ai setup has been called
+ if (!lastAISetup) {
+ return;
+ }
+
+ // Already ran once for this project — never run again
+ if (lastSetupStoryScoringRun) {
+ return;
+ }
+
+ const metadata = await getStorybookMetadata(options.configDir);
+ const isReactStorybook = metadata?.renderer?.includes('@storybook/react');
+ const hasVitestAddon =
+ !!metadata?.addons &&
+ Object.keys(metadata.addons).some((addonKey) =>
+ addonKey.includes('@storybook/addon-vitest')
+ );
+
+ // For now this is gated by React + Vitest
+ if (!isReactStorybook || !hasVitestAddon) {
+ return;
+ }
+
+ // Wait for any running tests to finish before launching scoring, so we don't
+ // disturb end user activities.
+ const isIdle = await waitForIdleVitest();
+ if (!isIdle) {
+ logger.debug('AI_SETUP_ANALYTICS_REQUEST timed out waiting for vitest to be available.');
+ return;
+ }
+
+ // Fetch AI-generated stories and score them with the ghost stories metrics, if any are found.
+ const generatorPromise = getStoryIndexGeneratorPromise?.();
+ if (!generatorPromise) {
+ logger.debug(
+ 'AI_SETUP_ANALYTICS_REQUEST could not proceed as the index generator is not ready.'
+ );
+ return;
+ }
+
+ const generator = await generatorPromise;
+ const indexAndStats = await generator.getIndexAndStats();
+ if (!indexAndStats) {
+ logger.debug('AI_SETUP_ANALYTICS_REQUEST could not proceed as the index is not ready.');
+ return;
+ }
+
+ const aiStoryFiles = new Set<string>();
+ let aiStoryCount = 0;
+ for (const entry of Object.values(indexAndStats.storyIndex.entries)) {
+ if (isStoryCreatedByAISetup(entry)) {
+ aiStoryFiles.add(entry.importPath);
+ aiStoryCount++;
+ }
+ }
+
+ if (aiStoryFiles.size > 0) {
+ const aiTestRunResult = await runStoryTests([...aiStoryFiles]);
+ telemetry('ai-setup-final-scoring', {
+ stats: {
+ fileCount: aiStoryFiles.size,
+ storyCount: aiStoryCount,
+ testRunDuration: aiTestRunResult.duration,
+ },
+ results: aiTestRunResult.summary,
+ ...(aiTestRunResult.runError ? { runError: aiTestRunResult.runError } : {}),
+ });
+ } else {
+ telemetry('ai-setup-final-scoring', {
+ stats: {
+ fileCount: 0,
+ storyCount: 0,
+ testRunDuration: 0,
+ },
+ runError: 'No stories found that were generated by ai setup',
+ });
+ }
+ } catch {
+ telemetry('ai-setup-final-scoring', {
+ stats,
+ runError: 'Unknown error during AI story scoring',
+ });
+ } finally {
+ // Emitted on every path, including the early returns above; nothing consumes it yet
+ channel.emit(AI_SETUP_ANALYTICS_RESPONSE);
+ }
+ });
+
+ return channel;
+}
diff --git a/code/core/src/core-server/server-channel/ghost-stories-channel.test.ts b/code/core/src/core-server/server-channel/ghost-stories-channel.test.ts
index fd42626dea38..f02bf9ac06a7 100644
--- a/code/core/src/core-server/server-channel/ghost-stories-channel.test.ts
+++ b/code/core/src/core-server/server-channel/ghost-stories-channel.test.ts
@@ -150,9 +150,7 @@ describe('ghostStoriesChannel', () => {
});
// Has ran tests successfully and written reports to JSON file in cache directory
- vi.mocked(mockCommon.resolvePathInStorybookCache).mockReturnValue(
- '/cache/ghost-stories-tests'
- );
+ vi.mocked(mockCommon.resolvePathInStorybookCache).mockReturnValue('/cache/story-tests');
vi.mocked(mockCommon.executeCommand).mockResolvedValue({} as any);
mockFs.existsSync.mockReturnValue(true);
mockFs.readFile.mockResolvedValue(
@@ -193,7 +191,7 @@ describe('ghostStoriesChannel', () => {
'run',
'--reporter=json',
'--testTimeout=1000',
- expect.stringContaining('--outputFile=/cache/ghost-stories-tests/test-results-'),
+ expect.stringContaining('--outputFile=/cache/story-tests/test-results-'),
'component1.tsx',
'component2.tsx',
],
@@ -220,6 +218,7 @@ describe('ghostStoriesChannel', () => {
successRate: 1,
successRateWithoutEmptyRender: 1,
categorizedErrors: expect.any(Object),
+ cssCheck: 'not-run',
uniqueErrorCount: 0,
passedButEmptyRender: 0,
},
@@ -247,9 +246,7 @@ describe('ghostStoriesChannel', () => {
});
// Has ran tests but with failures, reports written to JSON file in cache directory
- vi.mocked(mockCommon.resolvePathInStorybookCache).mockReturnValue(
- '/cache/ghost-stories-tests'
- );
+ vi.mocked(mockCommon.resolvePathInStorybookCache).mockReturnValue('/cache/story-tests');
vi.mocked(mockCommon.executeCommand).mockResolvedValue({} as any);
mockFs.existsSync.mockReturnValue(true);
mockFs.readFile.mockResolvedValue(
@@ -292,7 +289,7 @@ describe('ghostStoriesChannel', () => {
'run',
'--reporter=json',
'--testTimeout=1000',
- expect.stringContaining('--outputFile=/cache/ghost-stories-tests/test-results-'),
+ expect.stringContaining('--outputFile=/cache/story-tests/test-results-'),
'component1.tsx',
'component2.tsx',
],
@@ -320,6 +317,7 @@ describe('ghostStoriesChannel', () => {
successRate: 0,
// categorizedErrors is now an object with categories as keys
categorizedErrors: expect.any(Object),
+ cssCheck: 'not-run',
uniqueErrorCount: expect.any(Number),
passedButEmptyRender: 0,
}),
@@ -363,7 +361,9 @@ describe('ghostStoriesChannel', () => {
});
expect(mockTelemetry.getLastEvents).toHaveBeenCalled();
- expect(mockTelemetry.getSessionId).toHaveBeenCalled();
+ // getSessionId is no longer checked by ghost stories — session matching
+ // was removed to support mid-session ai-setup triggers.
+ expect(mockTelemetry.getSessionId).not.toHaveBeenCalled();
expect(mockTelemetry.getStorybookMetadata).not.toHaveBeenCalled();
expect(mockStoryGeneration.getComponentCandidates).not.toHaveBeenCalled();
});
@@ -389,7 +389,6 @@ describe('ghostStoriesChannel', () => {
});
expect(mockTelemetry.getLastEvents).toHaveBeenCalled();
- expect(mockTelemetry.getSessionId).toHaveBeenCalled();
expect(mockTelemetry.getStorybookMetadata).toHaveBeenCalled();
expect(mockStoryGeneration.getComponentCandidates).not.toHaveBeenCalled();
});
@@ -415,7 +414,6 @@ describe('ghostStoriesChannel', () => {
});
expect(mockTelemetry.getLastEvents).toHaveBeenCalled();
- expect(mockTelemetry.getSessionId).toHaveBeenCalled();
expect(mockTelemetry.getStorybookMetadata).toHaveBeenCalled();
expect(mockStoryGeneration.getComponentCandidates).not.toHaveBeenCalled();
});
@@ -518,9 +516,7 @@ describe('ghostStoriesChannel', () => {
analyzedCount: 2,
avgComplexity: 1.0,
});
- vi.mocked(mockCommon.resolvePathInStorybookCache).mockReturnValue(
- '/cache/ghost-stories-tests'
- );
+ vi.mocked(mockCommon.resolvePathInStorybookCache).mockReturnValue('/cache/story-tests');
vi.mocked(mockCommon.executeCommand).mockRejectedValue(new Error('Test execution failed'));
mockFs.existsSync.mockReturnValue(false);
@@ -563,9 +559,7 @@ describe('ghostStoriesChannel', () => {
analyzedCount: 2,
avgComplexity: 1.0,
});
- vi.mocked(mockCommon.resolvePathInStorybookCache).mockReturnValue(
- '/cache/ghost-stories-tests'
- );
+ vi.mocked(mockCommon.resolvePathInStorybookCache).mockReturnValue('/cache/story-tests');
vi.mocked(mockCommon.executeCommand).mockRejectedValue(new Error('Startup Error'));
mockFs.existsSync.mockReturnValue(true);
mockFs.readFile.mockResolvedValue(
diff --git a/code/core/src/core-server/server-channel/ghost-stories-channel.ts b/code/core/src/core-server/server-channel/ghost-stories-channel.ts
index 870f61997b1e..b92766da0dc9 100644
--- a/code/core/src/core-server/server-channel/ghost-stories-channel.ts
+++ b/code/core/src/core-server/server-channel/ghost-stories-channel.ts
@@ -1,15 +1,11 @@
import type { Channel } from 'storybook/internal/channels';
import { GHOST_STORIES_REQUEST, GHOST_STORIES_RESPONSE } from 'storybook/internal/core-events';
-import {
- getLastEvents,
- getSessionId,
- getStorybookMetadata,
- telemetry,
-} from 'storybook/internal/telemetry';
+import { getLastEvents, getStorybookMetadata, telemetry } from 'storybook/internal/telemetry';
import type { Options } from 'storybook/internal/types';
import { getComponentCandidates } from '../utils/ghost-stories/get-candidates.ts';
import { runStoryTests } from '../utils/ghost-stories/run-story-tests.ts';
+import { waitForIdleVitest } from '../utils/wait-for-idle-vitest.ts';
class SkipGhostStoriesTelemetry extends Error {}
@@ -32,19 +28,24 @@ export function initGhostStoriesChannel(channel: Channel, options: Options) {
const ghostRunStart = Date.now();
const lastEvents = await getLastEvents();
const lastInit = lastEvents?.init;
- if (!lastEvents || !lastInit) {
+ const lastAISetup = lastEvents?.['ai-setup'];
+ const lastGhostStoriesRun = lastEvents?.['ghost-stories'];
+
+ // We only want to run ghost stories immediately after init or ai setup.
+ const lastRelevantEvent = lastAISetup ?? lastInit;
+ if (!lastRelevantEvent) {
throw new SkipGhostStoriesTelemetry();
}
- const sessionId = await getSessionId();
- const lastGhostStoriesRun = lastEvents['ghost-stories'];
- if (
- lastGhostStoriesRun ||
- (lastInit.body?.sessionId && lastInit.body.sessionId !== sessionId)
- ) {
+ // Already ran once for this project — never run again
+ if (lastGhostStoriesRun) {
throw new SkipGhostStoriesTelemetry();
}
+ // No session-ID match: `storybook ai setup` runs as a separate CLI
+ // process, so its sessionId never matches the dev server's. The
+ // `lastGhostStoriesRun` guard above is enough to enforce once-per-project.
+
const metadata = await getStorybookMetadata(options.configDir);
const isReactStorybook = metadata?.renderer?.includes('@storybook/react');
const hasVitestAddon =
@@ -58,6 +59,13 @@ export function initGhostStoriesChannel(channel: Channel, options: Options) {
throw new SkipGhostStoriesTelemetry();
}
+ // Wait for any running tests to finish before launching scoring, so we don't
+ // disturb end user activities.
+ const isIdle = await waitForIdleVitest();
+ if (!isIdle) {
+ return;
+ }
+
// Phase 1: find candidates from components
const candidateAnalysisStart = Date.now();
const candidatesResult = await getComponentCandidates();
@@ -85,7 +93,9 @@ export function initGhostStoriesChannel(channel: Channel, options: Options) {
// Phase 2: Run tests on those candidates Vitest. The components will be transformed directly to tests
// If they pass, it means that creating a story file for them would succeed.
- const testRunResult = await runStoryTests(candidatesResult.candidates);
+ const testRunResult = await runStoryTests(candidatesResult.candidates, {
+ ghostRun: true,
+ });
stats.totalRunDuration = Date.now() - ghostRunStart;
stats.testRunDuration = testRunResult.duration;
if (testRunResult.runError) {
diff --git a/code/core/src/core-server/server-channel/telemetry-channel.test.ts b/code/core/src/core-server/server-channel/telemetry-channel.test.ts
index d3c664818ddc..3236de948d93 100644
--- a/code/core/src/core-server/server-channel/telemetry-channel.test.ts
+++ b/code/core/src/core-server/server-channel/telemetry-channel.test.ts
@@ -24,7 +24,7 @@ describe('telemetry-channel', () => {
},
} as any;
- initTelemetryChannel(channel, { disableTelemetry: false } as any);
+ initTelemetryChannel(channel);
const payload = {
trigger: 'interaction' as const,
diff --git a/code/core/src/core-server/server-channel/telemetry-channel.ts b/code/core/src/core-server/server-channel/telemetry-channel.ts
index 6de9a0d278e4..0a3b4cafaa9b 100644
--- a/code/core/src/core-server/server-channel/telemetry-channel.ts
+++ b/code/core/src/core-server/server-channel/telemetry-channel.ts
@@ -5,11 +5,11 @@ import {
SHARE_POPOVER_OPENED,
SHARE_STORY_LINK,
SIDEBAR_FILTER_CHANGED,
+ AI_PROMPT_NUDGE,
} from 'storybook/internal/core-events';
import { type InitPayload, telemetry } from 'storybook/internal/telemetry';
import { type CacheEntry, getLastEvents } from 'storybook/internal/telemetry';
import { getSessionId } from 'storybook/internal/telemetry';
-import type { Options } from 'storybook/internal/types';
export const makePayload = (
userAgent: string,
@@ -30,7 +30,7 @@ export const makePayload = (
return payload;
};
-export function initTelemetryChannel(channel: Channel, options: Options) {
+export function initTelemetryChannel(channel: Channel) {
channel.on(PREVIEW_INITIALIZED, async ({ userAgent }) => {
try {
const sessionId = await getSessionId();
@@ -55,4 +55,7 @@ export function initTelemetryChannel(channel: Channel, options: Options) {
channel.on(SIDEBAR_FILTER_CHANGED, (payload) => {
telemetry('sidebar-filter', payload);
});
+ channel.on(AI_PROMPT_NUDGE, async ({ id, origin }: { id: string; origin: string }) => {
+ telemetry('ai-prompt-nudge', { id, origin });
+ });
}
diff --git a/code/core/src/core-server/utils/checklist.test.ts b/code/core/src/core-server/utils/checklist.test.ts
new file mode 100644
index 000000000000..203236a51188
--- /dev/null
+++ b/code/core/src/core-server/utils/checklist.test.ts
@@ -0,0 +1,485 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+import type { CacheEntry } from '../../telemetry/event-cache.ts';
+import type { TelemetryEvent } from '../../telemetry/types.ts';
+import { MockUniversalStore } from '../../shared/universal-store/mock.ts';
+import {
+ type StoreEvent,
+ type StoreState,
+ UNIVERSAL_CHECKLIST_STORE_OPTIONS,
+} from '../../shared/checklist-store/index.ts';
+
+vi.mock('storybook/internal/common', () => ({
+ createFileSystemCache: vi.fn(() => ({
+ get: vi.fn().mockResolvedValue(undefined),
+ set: vi.fn().mockResolvedValue(undefined),
+ })),
+ resolvePathInStorybookCache: vi.fn(() => '/tmp/test-cache'),
+}));
+
+vi.mock('storybook/internal/core-server', () => ({
+ experimental_UniversalStore: {
+ create: vi.fn(),
+ },
+}));
+
+vi.mock('storybook/internal/node-logger');
+vi.mock('storybook/internal/telemetry', () => ({
+ telemetry: vi.fn(),
+}));
+
+vi.mock('es-toolkit/function', () => ({
+ throttle: vi.fn((fn: () => void) => fn),
+}));
+
+vi.mock('es-toolkit/object', async () => {
+ const actual = await vi.importActual('es-toolkit/object');
+ return actual;
+});
+
+vi.mock('../../cli/index.ts', () => ({
+ globalSettings: vi.fn(),
+}));
+
+const testProviderStateChangeListeners: Array<(...args: any[]) => void> = [];
+vi.mock('../stores/test-provider.ts', () => ({
+ universalTestProviderStore: {
+ onStateChange: vi.fn((listener: (...args: any[]) => void) => {
+ testProviderStateChangeListeners.push(listener);
+ return () => {
+ const idx = testProviderStateChangeListeners.indexOf(listener);
+ if (idx >= 0) {
+ testProviderStateChangeListeners.splice(idx, 1);
+ }
+ };
+ }),
+ },
+}));
+
+vi.mock('../../telemetry/event-cache.ts', () => ({
+ get: vi.fn(),
+}));
+
+const AI_IDLE_DELAY_MS = 4 * 60 * 1000;
+
+const aiSetupCacheEntry = {
+ timestamp: Date.now(),
+ body: { eventType: 'ai-setup' } as TelemetryEvent,
+} satisfies CacheEntry;
+
+const aiInitOptInCacheEntry = {
+ timestamp: Date.now(),
+ body: { eventType: 'ai-init-opt-in' } as TelemetryEvent,
+} satisfies CacheEntry;
+
+/** Mock getEventCacheEntry: resolves the entry registered for an event type, else undefined. */
+function mockEventCache(events: Record<string, CacheEntry | undefined>) {
+  return async (eventType: string) => events[eventType];
+}
+
+describe('initializeChecklist', () => {
+  let mockStore: MockUniversalStore<StoreState, StoreEvent>;
+  let mockSettingsValue: { checklist?: Record<string, unknown> };
+
+  beforeEach(async () => {
+    vi.useFakeTimers();
+    testProviderStateChangeListeners.length = 0;
+    // Fresh in-memory checklist store for every test.
+    mockStore = MockUniversalStore.create<StoreState, StoreEvent>(
+      UNIVERSAL_CHECKLIST_STORE_OPTIONS,
+      vi
+    );
+    // Make the mocked experimental_UniversalStore.create() hand out the mock store.
+    const { experimental_UniversalStore } = await import('storybook/internal/core-server');
+    vi.mocked(experimental_UniversalStore.create).mockReturnValue(
+      mockStore as unknown as ReturnType<typeof experimental_UniversalStore.create>
+    );
+    // Settings start without a persisted checklist; individual tests mutate this object.
+    mockSettingsValue = { checklist: undefined };
+    const { globalSettings } = await import('../../cli/index.ts');
+    vi.mocked(globalSettings).mockResolvedValue({
+      filePath: '/mock/path',
+      value: mockSettingsValue,
+      save: vi.fn(),
+    } as unknown as Awaited<ReturnType<typeof globalSettings>>);
+  });
+
+ it('sets loaded immediately, even before the ai-setup check resolves', async () => {
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+ // Make the AI cache check hang — it should NOT block loaded: true
+ vi.mocked(getEventCacheEntry).mockReturnValue(new Promise(() => {}));
+
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist();
+
+ const state = mockStore.getState();
+ expect(state.loaded).toBe(true);
+ expect(state.items.aiSetup.status).toBe('open');
+ });
+
+ it('keeps aiSetup as open when no ai-setup event exists in cache', async () => {
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+ vi.mocked(getEventCacheEntry).mockResolvedValue(undefined);
+
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist();
+ await vi.advanceTimersByTimeAsync(0);
+
+ const state = mockStore.getState();
+ expect(state.items.aiSetup.status).toBe('open');
+ });
+
+ it('marks aiSetup as done when ai-setup event exists in cache', async () => {
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+ vi.mocked(getEventCacheEntry).mockResolvedValue(aiSetupCacheEntry);
+
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist();
+ await vi.advanceTimersByTimeAsync(0);
+
+ const state = mockStore.getState();
+ expect(state.items.aiSetup.status).toBe('done');
+ });
+
+ it('still initializes when reading ai-setup from the event cache fails', async () => {
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+ vi.mocked(getEventCacheEntry).mockRejectedValue(new Error('cache read failed'));
+
+ const { initializeChecklist } = await import('./checklist.ts');
+ await expect(initializeChecklist()).resolves.toBeUndefined();
+ await vi.advanceTimersByTimeAsync(0);
+
+ const state = mockStore.getState();
+ expect(state.loaded).toBe(true);
+ expect(state.items.aiSetup.status).toBe('open');
+ });
+
+ it('marks aiSetup as done when ai-setup ran even if persisted status was skipped', async () => {
+ mockSettingsValue.checklist = {
+ items: { aiSetup: { status: 'skipped' } },
+ widget: {},
+ };
+
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+ vi.mocked(getEventCacheEntry).mockResolvedValue(aiSetupCacheEntry);
+
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist();
+ await vi.advanceTimersByTimeAsync(0);
+
+ const state = mockStore.getState();
+ expect(state.items.aiSetup.status).toBe('done');
+ });
+
+ describe('debounced analytics and ghost stories', () => {
+    /** Channel double that records listeners per event so tests can fire them directly. */
+    function createMockChannel() {
+      const listeners: Record<string, Array<(...args: any[]) => void>> = {};
+      return {
+        channel: {
+          emit: vi.fn(),
+          on: vi.fn((event: string, fn: (...args: any[]) => void) => {
+            (listeners[event] ??= []).push(fn);
+          }),
+          off: vi.fn((event: string, fn: (...args: any[]) => void) => {
+            listeners[event] = listeners[event]?.filter((f) => f !== fn) ?? [];
+          }),
+        },
+        listeners,
+      };
+    }
+
+ it('does not emit events immediately when ai-setup detected at startup', async () => {
+ const { AI_SETUP_ANALYTICS_REQUEST, GHOST_STORIES_REQUEST } =
+ await import('storybook/internal/core-events');
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({
+ 'ai-setup': aiSetupCacheEntry,
+ 'ai-init-opt-in': aiInitOptInCacheEntry,
+ })
+ );
+
+ const { channel } = createMockChannel();
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist(channel as any);
+ await vi.advanceTimersByTimeAsync(0);
+
+ expect(channel.emit).not.toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ expect(channel.emit).not.toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+ });
+
+ it('emits ghost stories and analytics after 4 minutes of idle when ai-setup detected at startup', async () => {
+ const { AI_SETUP_ANALYTICS_REQUEST, GHOST_STORIES_REQUEST } =
+ await import('storybook/internal/core-events');
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({
+ 'ai-setup': aiSetupCacheEntry,
+ 'ai-init-opt-in': aiInitOptInCacheEntry,
+ })
+ );
+
+ const { channel } = createMockChannel();
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist(channel as any);
+ await vi.advanceTimersByTimeAsync(0);
+
+ // Advance past the 4-minute idle delay
+ await vi.advanceTimersByTimeAsync(AI_IDLE_DELAY_MS);
+
+ expect(channel.emit).toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+ expect(channel.emit).toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ });
+
+ it('resets the idle timer on each STORY_INDEX_INVALIDATED', async () => {
+ const { AI_SETUP_ANALYTICS_REQUEST, GHOST_STORIES_REQUEST, STORY_INDEX_INVALIDATED } =
+ await import('storybook/internal/core-events');
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({
+ 'ai-setup': aiSetupCacheEntry,
+ 'ai-init-opt-in': aiInitOptInCacheEntry,
+ })
+ );
+
+ const { channel, listeners } = createMockChannel();
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist(channel as any);
+ await vi.advanceTimersByTimeAsync(0);
+
+ // Advance 3 minutes (within the 4-minute window)
+ await vi.advanceTimersByTimeAsync(3 * 60 * 1000);
+
+ // Simulate index change — resets the timer
+ listeners[STORY_INDEX_INVALIDATED]?.forEach((fn) => fn());
+ await vi.advanceTimersByTimeAsync(0);
+
+ // 3 more minutes after reset — still within the new 4-minute window
+ await vi.advanceTimersByTimeAsync(3 * 60 * 1000);
+ expect(channel.emit).not.toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ expect(channel.emit).not.toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+
+ // 1 more minute — now 4 minutes since last index change
+ await vi.advanceTimersByTimeAsync(1 * 60 * 1000);
+ expect(channel.emit).toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+ expect(channel.emit).toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ });
+
+ it('resets the idle timer when test provider state changes', async () => {
+ const { AI_SETUP_ANALYTICS_REQUEST, GHOST_STORIES_REQUEST } =
+ await import('storybook/internal/core-events');
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({
+ 'ai-setup': aiSetupCacheEntry,
+ 'ai-init-opt-in': aiInitOptInCacheEntry,
+ })
+ );
+
+ const { channel } = createMockChannel();
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist(channel as any);
+ await vi.advanceTimersByTimeAsync(0);
+
+ // Advance 3 minutes (within the 4-minute window)
+ await vi.advanceTimersByTimeAsync(3 * 60 * 1000);
+
+ // Simulate test provider state change (e.g. tests started running) — resets the timer
+ testProviderStateChangeListeners.forEach((fn) => fn({}, {}, {}));
+ await vi.advanceTimersByTimeAsync(0);
+
+ // 3 more minutes after reset — still within the new 4-minute window
+ await vi.advanceTimersByTimeAsync(3 * 60 * 1000);
+ expect(channel.emit).not.toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ expect(channel.emit).not.toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+
+ // 1 more minute — now 4 minutes since last test provider state change
+ await vi.advanceTimersByTimeAsync(1 * 60 * 1000);
+ expect(channel.emit).toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+ expect(channel.emit).toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ });
+
+ it('detects ai-setup mid-session when checked at idle time (race condition fix)', async () => {
+ const { AI_SETUP_ANALYTICS_REQUEST, GHOST_STORIES_REQUEST, STORY_INDEX_INVALIDATED } =
+ await import('storybook/internal/core-events');
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+
+ // Initially: ai-init-opt-in exists but ai-setup does NOT
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({ 'ai-init-opt-in': aiInitOptInCacheEntry })
+ );
+
+ const { channel, listeners } = createMockChannel();
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist(channel as any);
+ await vi.advanceTimersByTimeAsync(0);
+
+ expect(mockStore.getState().items.aiSetup.status).toBe('open');
+
+ // Simulate: agent creates files → STORY_INDEX_INVALIDATED fires multiple times
+ listeners[STORY_INDEX_INVALIDATED]?.forEach((fn) => fn());
+ await vi.advanceTimersByTimeAsync(10_000);
+ listeners[STORY_INDEX_INVALIDATED]?.forEach((fn) => fn());
+ await vi.advanceTimersByTimeAsync(10_000);
+ listeners[STORY_INDEX_INVALIDATED]?.forEach((fn) => fn());
+
+ // ai-setup command finishes AFTER last file change — event now in cache.
+ // This is the race condition: no more STORY_INDEX_INVALIDATED events fire,
+ // but the idle timer is still running and will check at idle time.
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({
+ 'ai-setup': aiSetupCacheEntry,
+ 'ai-init-opt-in': aiInitOptInCacheEntry,
+ })
+ );
+
+ // No more index changes. After 4 minutes of quiet, the idle timer fires.
+ await vi.advanceTimersByTimeAsync(AI_IDLE_DELAY_MS);
+
+ // The idle check found ai-setup in the cache → marked done + emitted events
+ expect(mockStore.getState().items.aiSetup.status).toBe('done');
+ expect(channel.emit).toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+ expect(channel.emit).toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ });
+
+ it('does not emit if user did not opt into AI', async () => {
+ const { AI_SETUP_ANALYTICS_REQUEST, GHOST_STORIES_REQUEST, STORY_INDEX_INVALIDATED } =
+ await import('storybook/internal/core-events');
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+
+ // ai-setup exists but ai-init-opt-in does NOT
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({ 'ai-setup': aiSetupCacheEntry })
+ );
+
+ const { channel, listeners } = createMockChannel();
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist(channel as any);
+ await vi.advanceTimersByTimeAsync(0);
+
+ // Trigger index change and wait for idle
+ listeners[STORY_INDEX_INVALIDATED]?.forEach((fn) => fn());
+ await vi.advanceTimersByTimeAsync(AI_IDLE_DELAY_MS);
+
+ expect(channel.emit).not.toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ expect(channel.emit).not.toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+ });
+
+ it('only emits once even after multiple idle cycles', async () => {
+ const { AI_SETUP_ANALYTICS_REQUEST, GHOST_STORIES_REQUEST, STORY_INDEX_INVALIDATED } =
+ await import('storybook/internal/core-events');
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({
+ 'ai-setup': aiSetupCacheEntry,
+ 'ai-init-opt-in': aiInitOptInCacheEntry,
+ })
+ );
+
+ const { channel, listeners } = createMockChannel();
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist(channel as any);
+ await vi.advanceTimersByTimeAsync(0);
+
+ // Let it fire once
+ await vi.advanceTimersByTimeAsync(AI_IDLE_DELAY_MS);
+ expect(channel.emit).toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+ expect(channel.emit).toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+
+ channel.emit.mockClear();
+
+ // More index changes after it already fired — should not fire again
+ listeners[STORY_INDEX_INVALIDATED]?.forEach((fn) => fn());
+ await vi.advanceTimersByTimeAsync(AI_IDLE_DELAY_MS);
+ expect(channel.emit).not.toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ expect(channel.emit).not.toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+ });
+
+ it('reschedules when a recent external test-run is detected (e.g. npx vitest by AI agent)', async () => {
+ const { AI_SETUP_ANALYTICS_REQUEST, GHOST_STORIES_REQUEST } =
+ await import('storybook/internal/core-events');
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({
+ 'ai-setup': aiSetupCacheEntry,
+ 'ai-init-opt-in': aiInitOptInCacheEntry,
+ })
+ );
+
+ const { channel } = createMockChannel();
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist(channel as any);
+ await vi.advanceTimersByTimeAsync(0);
+
+ // 2 minutes in: agent starts running `npx vitest` — a test-run event is recorded
+ await vi.advanceTimersByTimeAsync(2 * 60 * 1000);
+ const testRunEntry = {
+ timestamp: Date.now(),
+ body: {} as any,
+ } satisfies CacheEntry;
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({
+ 'ai-setup': aiSetupCacheEntry,
+ 'ai-init-opt-in': aiInitOptInCacheEntry,
+ 'test-run': testRunEntry,
+ })
+ );
+
+ // 2 more minutes: idle timer fires (4 min total). test-run was only 2 min ago
+ // → still within the idle window → reschedule, don't emit yet
+ await vi.advanceTimersByTimeAsync(2 * 60 * 1000);
+ expect(channel.emit).not.toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ expect(channel.emit).not.toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+
+ // Another 4 minutes (8 min total). test-run was 6 min ago — older than the
+ // idle window → agent is done → emit events
+ await vi.advanceTimersByTimeAsync(AI_IDLE_DELAY_MS);
+ expect(channel.emit).toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+ expect(channel.emit).toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ });
+
+ it('reschedules when a recent ai-setup-self-healing-scoring event is detected', async () => {
+ const { AI_SETUP_ANALYTICS_REQUEST, GHOST_STORIES_REQUEST } =
+ await import('storybook/internal/core-events');
+ const { get: getEventCacheEntry } = await import('../../telemetry/event-cache.ts');
+
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({
+ 'ai-setup': aiSetupCacheEntry,
+ 'ai-init-opt-in': aiInitOptInCacheEntry,
+ })
+ );
+
+ const { channel } = createMockChannel();
+ const { initializeChecklist } = await import('./checklist.ts');
+ await initializeChecklist(channel as any);
+ await vi.advanceTimersByTimeAsync(0);
+
+ // 2 minutes in: agent finishes a vitest run — self-healing scoring event recorded
+ await vi.advanceTimersByTimeAsync(2 * 60 * 1000);
+ const selfHealingEntry = {
+ timestamp: Date.now(),
+ body: {} as any,
+ } satisfies CacheEntry;
+ vi.mocked(getEventCacheEntry).mockImplementation(
+ mockEventCache({
+ 'ai-setup': aiSetupCacheEntry,
+ 'ai-init-opt-in': aiInitOptInCacheEntry,
+ 'ai-setup-self-healing-scoring': selfHealingEntry,
+ })
+ );
+
+ // Timer fires (4 min total). self-healing was 2 min ago → reschedule
+ await vi.advanceTimersByTimeAsync(2 * 60 * 1000);
+ expect(channel.emit).not.toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ expect(channel.emit).not.toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+
+ // Another 4 minutes: self-healing was 6 min ago → emit
+ await vi.advanceTimersByTimeAsync(AI_IDLE_DELAY_MS);
+ expect(channel.emit).toHaveBeenCalledWith(GHOST_STORIES_REQUEST);
+ expect(channel.emit).toHaveBeenCalledWith(AI_SETUP_ANALYTICS_REQUEST);
+ });
+ });
+});
diff --git a/code/core/src/core-server/utils/checklist.ts b/code/core/src/core-server/utils/checklist.ts
index ed79f7147c12..751f1e87ff75 100644
--- a/code/core/src/core-server/utils/checklist.ts
+++ b/code/core/src/core-server/utils/checklist.ts
@@ -1,5 +1,11 @@
+import type { Channel } from 'storybook/internal/channels';
import { createFileSystemCache, resolvePathInStorybookCache } from 'storybook/internal/common';
import { experimental_UniversalStore } from 'storybook/internal/core-server';
+import {
+ AI_SETUP_ANALYTICS_REQUEST,
+ GHOST_STORIES_REQUEST,
+ STORY_INDEX_INVALIDATED,
+} from 'storybook/internal/core-events';
import { logger } from 'storybook/internal/node-logger';
import { telemetry } from 'storybook/internal/telemetry';
@@ -7,6 +13,8 @@ import { throttle } from 'es-toolkit/function';
import { toMerged } from 'es-toolkit/object';
import { globalSettings } from '../../cli/index.ts';
+import { universalTestProviderStore } from '../stores/test-provider.ts';
+import { get as getEventCacheEntry } from '../../telemetry/event-cache.ts';
import {
type ChecklistState,
type StoreEvent,
@@ -14,7 +22,7 @@ import {
UNIVERSAL_CHECKLIST_STORE_OPTIONS,
} from '../../shared/checklist-store/index.ts';
-export async function initializeChecklist() {
+export async function initializeChecklist(channel?: Channel) {
try {
const store = experimental_UniversalStore.create({
...UNIVERSAL_CHECKLIST_STORE_OPTIONS,
@@ -54,6 +62,7 @@ export async function initializeChecklist() {
}),
]);
+ // Load the checklist immediately so the UI is never blocked.
store.setState(
(value) =>
({
@@ -62,6 +71,129 @@ export async function initializeChecklist() {
}) satisfies StoreState
);
+    // AI opt-in flag (set during `storybook init`). Non-blocking so a cache
+    // failure cannot hide the checklist.
+    getEventCacheEntry('ai-init-opt-in')
+      .then((event) => {
+        if (event) {
+          store.setState((state) => ({ ...state, aiOptIn: true }));
+        }
+      })
+      .catch(() => {});
+
+    // Mark the aiSetup item done if an `ai-setup` event is in the event cache.
+    // Resolves to whether the event was found; failures are swallowed and
+    // reported as `false`, so callers never need to catch.
+    const markAiSetupDone = async () => {
+      try {
+        const aiSetupEvent = await getEventCacheEntry('ai-setup');
+        if (!aiSetupEvent) {
+          return false;
+        }
+        if (store.getState().items.aiSetup?.status !== 'done') {
+          store.setState((state) => ({
+            ...state,
+            items: {
+              ...state.items,
+              aiSetup: { ...state.items.aiSetup, status: 'done' },
+            },
+          }));
+        }
+        return true;
+      } catch {
+        return false;
+      }
+    };
+
+    // Debounced analytics + ghost stories: emit exactly once, 4 minutes after
+    // activity stops. The timer resets on story-index changes, test-provider
+    // state changes, and detected external vitest runs (`npx vitest`). We check
+    // for `ai-setup` at idle time rather than eagerly, because the event is
+    // cached AFTER `storybook ai setup` finishes its file writes.
+    const AI_IDLE_DELAY_MS = 4 * 60 * 1000;
+    let analyticsTimer: ReturnType<typeof setTimeout> | undefined;
+    let analyticsEmitted = false;
+
+    // Story-index invalidations can arrive in flurries. Throttle the
+    // fire-and-forget cache read so we don't hit disk on every tick. The
+    // timer-internal markAiSetupDone() below is awaited separately.
+    const throttledSyncAiSetupStatus = throttle(() => markAiSetupDone().catch(() => {}), 1000);
+
+    const scheduleIdleCheck = () => {
+      if (!channel || analyticsEmitted) {
+        return;
+      }
+      // Sync aiSetup UI immediately so the copy-prompt button disappears as
+      // soon as setup completes, instead of after the 4-minute delay.
+      throttledSyncAiSetupStatus();
+      clearTimeout(analyticsTimer);
+      analyticsTimer = setTimeout(async () => {
+        // Nothing awaits this async callback, so a throw would surface as an
+        // unhandled rejection. Analytics are best-effort: log and move on.
+        try {
+          if (!store.getState().aiOptIn) {
+            return;
+          }
+          // Agents often run `npx vitest` for many minutes. If a recent
+          // `test-run` or `ai-setup-self-healing-scoring` event is in the cache,
+          // the agent is still active — reschedule. `CacheEntry.timestamp` is
+          // the cache-write time (= event-firing time, writes are synchronous).
+          const now = Date.now();
+          const [lastTestRun, lastSelfHealing] = await Promise.all([
+            getEventCacheEntry('test-run').catch(() => undefined),
+            getEventCacheEntry('ai-setup-self-healing-scoring').catch(() => undefined),
+          ]);
+          const hasRecentTestActivity = [lastTestRun, lastSelfHealing].some(
+            (e) => e && now - e.timestamp < AI_IDLE_DELAY_MS
+          );
+          if (hasRecentTestActivity) {
+            scheduleIdleCheck();
+            return;
+          }
+          // Final re-check: ai-setup may have been cached after the last trigger.
+          await markAiSetupDone();
+          if (store.getState().items.aiSetup?.status !== 'done') {
+            return;
+          }
+          analyticsEmitted = true;
+          channel.off(STORY_INDEX_INVALIDATED, onIndexInvalidated);
+          unsubscribeTestProvider();
+          channel.emit(GHOST_STORIES_REQUEST);
+          channel.emit(AI_SETUP_ANALYTICS_REQUEST);
+        } catch (error) {
+          logger.debug(`AI setup idle check failed: ${String(error)}`);
+        }
+      }, AI_IDLE_DELAY_MS);
+    };
+
+    // Startup check: covers the case where the dev server was restarted
+    // mid-agentic-session and `ai-setup` was already cached.
+    markAiSetupDone().then((detected) => {
+      if (detected) {
+        scheduleIdleCheck();
+      }
+    });
+
+    const onIndexInvalidated = () => {
+      if (analyticsEmitted) {
+        return;
+      }
+      scheduleIdleCheck();
+    };
+    if (channel) {
+      channel.on(STORY_INDEX_INVALIDATED, onIndexInvalidated);
+    }
+
+    // Test-provider state changes also reset the timer — an agent can spend
+    // long stretches running tests without touching story files. Captured so
+    // we can unsubscribe symmetrically when analytics fires.
+    const unsubscribeTestProvider = universalTestProviderStore.onStateChange(() => {
+      if (analyticsEmitted) {
+        return;
+      }
+      scheduleIdleCheck();
+    });
+
store.onStateChange((state: StoreState, previousState: StoreState) => {
const entries = Object.entries(state.items);
diff --git a/code/core/src/core-server/utils/doTelemetry.ts b/code/core/src/core-server/utils/doTelemetry.ts
index 4ac52f9cf57a..9df7551a71b1 100644
--- a/code/core/src/core-server/utils/doTelemetry.ts
+++ b/code/core/src/core-server/utils/doTelemetry.ts
@@ -1,4 +1,8 @@
-import { getPrecedingUpgrade, telemetry } from 'storybook/internal/telemetry';
+import {
+ collectAiSetupEvidence,
+ getPrecedingUpgrade,
+ telemetry,
+} from 'storybook/internal/telemetry';
import type { CoreConfig, Options } from 'storybook/internal/types';
import type { Polka } from 'polka';
@@ -38,6 +42,12 @@ export async function doTelemetry(
precedingUpgrade: await getPrecedingUpgrade(),
};
if (indexAndStats) {
+      // sb ai commands trigger side effects performed by agent harnesses, which can't be observed
+      // directly. This is the entry point for collecting evidence about those side effects and
+      // recording them in telemetry.
+      // NOTE(review): the result is not awaited here — confirm the helper handles its own errors.
+      collectAiSetupEvidence('dev', options.configDir, indexAndStats.storyIndex);
+
Object.assign(payload, {
versionStatus: versionUpdates && versionCheck ? versionStatus(versionCheck) : 'disabled',
storyIndex: summarizeIndex(indexAndStats.storyIndex),
diff --git a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts
index 1e0b5d90c76e..661196a3ebea 100644
--- a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts
+++ b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts
@@ -1,7 +1,6 @@
import { readFile } from 'node:fs/promises';
import { babelParse, traverse } from 'storybook/internal/babel';
-import { logger } from 'storybook/internal/node-logger';
// eslint-disable-next-line depend/ban-dependencies
import { glob } from 'glob';
@@ -128,9 +127,12 @@ export async function getCandidatesForStorybook(
export async function getComponentCandidates({
sampleSize = 20,
globPattern = '**/*.{tsx,jsx}',
+ cwd = process.cwd(),
}: {
sampleSize?: number;
globPattern?: string;
+ /** Working directory for glob. Defaults to process.cwd(). */
+ cwd?: string;
} = {}): Promise<{
candidates: string[];
error?: string;
@@ -145,7 +147,7 @@ export async function getComponentCandidates({
// Find files matching the glob pattern
files = await glob(globPattern, {
- cwd: process.cwd(),
+ cwd,
absolute: true,
ignore: [
'**/node_modules/**',
diff --git a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.test.ts b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.test.ts
index 0e29ade6c591..e86024aed86d 100644
--- a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.test.ts
+++ b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.test.ts
@@ -49,6 +49,7 @@ describe('parse-vitest-report', () => {
successRateWithoutEmptyRender: 1.0,
uniqueErrorCount: 0,
categorizedErrors: {},
+ cssCheck: 'not-run',
});
});
@@ -94,9 +95,9 @@ describe('parse-vitest-report', () => {
it('should categorize errors and include them in the summary', () => {
const mockVitestResults = {
success: false,
- numTotalTests: 4,
+ numTotalTests: 5,
numPassedTests: 1,
- numFailedTests: 3,
+ numFailedTests: 4,
testResults: [
{
assertionResults: [
@@ -136,7 +137,7 @@ describe('parse-vitest-report', () => {
const result = parseVitestResults(mockVitestResults);
- expect(result.summary?.total).toBe(4);
+ expect(result.summary?.total).toBe(5);
expect(result.summary?.passed).toBe(1);
expect(result.summary?.uniqueErrorCount).toBe(3);
expect(result.summary?.categorizedErrors).toEqual({
@@ -261,5 +262,32 @@ describe('parse-vitest-report', () => {
expect(result.summary?.total).toBe(0);
expect(result.summary?.successRate).toBe(0);
});
+
+    it('surfaces the CssCheck story outcome via summary.cssCheck', () => {
+      // One ordinary passing story plus a failing story whose id ends in `--css-check`.
+      const passingStory = {
+        fullName: 'components-button--primary',
+        status: 'passed',
+        meta: { storyId: 'components-button--primary' },
+        failureMessages: [],
+      };
+      const failingCssCheckStory = {
+        fullName: 'components-button--css-check',
+        status: 'failed',
+        meta: { storyId: 'components-button--css-check' },
+        failureMessages: ['Error: expected rgb(37, 99, 235) but got rgba(0, 0, 0, 0)'],
+      };
+      const report = {
+        success: false,
+        numTotalTests: 2,
+        numPassedTests: 1,
+        numFailedTests: 1,
+        testResults: [{ assertionResults: [passingStory, failingCssCheckStory] }],
+      };
+
+      const { summary } = parseVitestResults(report);
+
+      expect(summary?.cssCheck).toBe('fail');
+    });
});
});
diff --git a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts
index e0bd41cc53a6..87c272d6d648 100644
--- a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts
+++ b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts
@@ -1,125 +1,28 @@
-import type { ErrorCategory } from '../../../shared/utils/categorize-render-errors.ts';
-import { categorizeError } from '../../../shared/utils/categorize-render-errors.ts';
-import {
- type ErrorCategorizationResult,
- type StoryTestResult,
- type TestRunSummary,
-} from './types.ts';
+import { analyzeTestResults } from '../../../shared/utils/analyze-test-results.ts';
+import type { StoryTestResult } from '../../../shared/utils/test-result-types.ts';
+import { toStoryTestResult } from '../../../shared/utils/to-story-test-result.ts';
+import type { TestRunSummary } from './types.ts';
-/**
- * For a given list of test results:
- *
- * - Go through failures
- * - Categorize errors into categories
- * - Return structured data about the run, with categorized errors instead of the actual error
- * messages
- */
-function extractCategorizedErrors(testResults: StoryTestResult[]): ErrorCategorizationResult {
- const failed = testResults.filter((r) => r.status === 'FAIL' && r.error);
-
- // Map: category -> { count, uniqueErrors: Set, matchedDependencies }
- const map = new Map<
- ErrorCategory,
- { count: number; uniqueErrors: Set; matchedDependencies: Set }
- >();
-
- // To count unique error messages (by their message, not by category)
- const uniqueErrorMessages = new Set();
-
- for (const r of failed) {
- const { category, matchedDependencies } = categorizeError(r.error!, r.stack);
-
- if (!map.has(category)) {
- map.set(category, { count: 0, uniqueErrors: new Set(), matchedDependencies: new Set() });
- }
-
- const data = map.get(category)!;
- data.count++;
- matchedDependencies.forEach((dep) => data.matchedDependencies.add(dep));
-
- // Use the full error message for unique error message counting
- uniqueErrorMessages.add(r.error!);
- data.uniqueErrors.add(r.error!);
- }
-
- const categorizedErrors = Array.from(map.entries()).reduce>(
- (acc, [category, data]) => {
- acc[category] = {
- uniqueCount: data.uniqueErrors.size,
- count: data.count,
- matchedDependencies: Array.from(data.matchedDependencies).sort(),
- };
- return acc;
- },
- {}
- );
-
- return {
- totalErrors: failed.length,
- uniqueErrorCount: uniqueErrorMessages.size,
- categorizedErrors,
- };
-}
-
-/** Transform the Vitest test results to our expected format and return a TestRunSummary */
+/** Transform the Vitest JSON reporter output to our expected format and return a TestRunSummary */
export function parseVitestResults(report: any): TestRunSummary {
- // Transform the Vitest test results to our expected format
const storyTestResults: StoryTestResult[] = [];
- let passedButEmptyRender = 0;
for (const testSuite of report.testResults) {
for (const assertion of testSuite.assertionResults) {
- const storyId = assertion.meta?.storyId || assertion.fullName;
-
- const status =
- assertion.status === 'passed' ? 'PASS' : assertion.status === 'failed' ? 'FAIL' : 'PENDING';
-
- // Check for empty render in reports
- const hasEmptyRender = assertion.meta?.reports?.some(
- (report: { type: string; result?: { emptyRender?: boolean } }) =>
- report.type === 'render-analysis' && report.result?.emptyRender === true
- );
-
- if (status === 'PASS' && hasEmptyRender) {
- passedButEmptyRender++;
- }
+ const result = toStoryTestResult({
+ storyId: assertion.meta?.storyId ?? assertion.fullName,
+ statusRaw: assertion.status,
+ reports: assertion.meta?.reports,
+ errors: assertion.failureMessages?.map((message: string) => ({ stack: message })),
+ });
- // Extract error message (first line of failureMessages)
- let error: string | undefined;
- let stack: string | undefined;
- if (assertion.failureMessages && assertion.failureMessages.length > 0) {
- stack = assertion.failureMessages[0];
- error = stack?.split('\n')[0]; // Take only the first line
+ if (result) {
+ storyTestResults.push(result);
}
-
- storyTestResults.push({
- storyId,
- status,
- error,
- stack,
- });
}
}
- const total = report.numTotalTests;
- const passed = report.numPassedTests;
- const successRate = total > 0 ? parseFloat((passed / total).toFixed(2)) : 0;
- const successRateWithoutEmptyRender =
- total > 0 ? parseFloat(((passed - passedButEmptyRender) / total).toFixed(2)) : 0;
-
- // Extract and categorize unique errors
- const errorClassification = extractCategorizedErrors(storyTestResults);
- const categorizedErrors = errorClassification.categorizedErrors;
-
return {
- summary: {
- total,
- passed,
- passedButEmptyRender,
- successRate,
- successRateWithoutEmptyRender,
- uniqueErrorCount: errorClassification.uniqueErrorCount,
- categorizedErrors,
- },
+ summary: analyzeTestResults(storyTestResults),
};
}
diff --git a/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts b/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts
index 7c2350aa2fa6..ebaa2491113f 100644
--- a/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts
+++ b/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts
@@ -8,10 +8,21 @@ import { join } from 'pathe';
import { parseVitestResults } from './parse-vitest-report.ts';
import type { TestRunSummary } from './types.ts';
-export async function runStoryTests(componentFilePaths: string[]): Promise {
+/**
+ * Run ghost stories: execute vitest on component file paths to auto-generate
+ * and test stories that don't exist on disk.
+ *
+ * @param componentFilePaths - Absolute paths to component files to test.
+ * @param options.cwd - Working directory for vitest. Defaults to process.cwd().
+ */
+export async function runStoryTests(
+ componentFilePaths: string[],
+ options?: { cwd?: string; ghostRun?: boolean }
+): Promise {
+ const cwd = options?.cwd;
try {
// Create the cache directory for story discovery tests
- const cacheDir = resolvePathInStorybookCache('ghost-stories-tests');
+ const cacheDir = resolvePathInStorybookCache('story-tests');
await mkdir(cacheDir, { recursive: true });
// Create timestamped output file
@@ -34,10 +45,11 @@ export async function runStoryTests(componentFilePaths: string[]): Promise {
const afterEach: AfterEach = async ({ reporting, canvasElement, globals }) => {
try {
- // We only run this through ghost stories runs
- if (!globals.ghostStories) {
+ // Render analysis runs during ghost stories and agent-mode vitest runs
+ if (!globals.renderAnalysis?.enabled) {
return;
}
diff --git a/code/core/src/core-server/utils/ghost-stories/types.ts b/code/core/src/core-server/utils/ghost-stories/types.ts
index 741506db7f85..b93b0aeed28c 100644
--- a/code/core/src/core-server/utils/ghost-stories/types.ts
+++ b/code/core/src/core-server/utils/ghost-stories/types.ts
@@ -1,34 +1,8 @@
-export interface StoryTestResult {
- storyId: string;
- status: 'PASS' | 'FAIL' | 'PENDING';
- error?: string;
- stack?: string;
-}
-
-export interface CategorizedError {
- category: string;
- count: number;
- uniqueCount: number;
- matchedDependencies: string[];
-}
-
-export interface ErrorCategorizationResult {
- totalErrors: number;
- categorizedErrors: Record;
- uniqueErrorCount: number;
-}
+import type { TestRunAnalysis } from '../../../shared/utils/test-result-types.ts';
export interface TestRunSummary {
duration?: number;
- summary?: {
- total: number;
- passed: number;
- passedButEmptyRender: number;
- successRate: number;
- successRateWithoutEmptyRender: number;
- uniqueErrorCount: number;
- categorizedErrors: Record;
- };
+ summary?: TestRunAnalysis;
// Error message if the operation failed
runError?: string;
}
diff --git a/code/core/src/core-server/utils/wait-for-idle-vitest.test.ts b/code/core/src/core-server/utils/wait-for-idle-vitest.test.ts
new file mode 100644
index 000000000000..a5135656c68f
--- /dev/null
+++ b/code/core/src/core-server/utils/wait-for-idle-vitest.test.ts
@@ -0,0 +1,120 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+vi.mock('../stores/test-provider.ts', () => ({
+  fullTestProviderStore: {
+    getFullState: vi.fn(),
+  },
+}));
+
+import { fullTestProviderStore } from '../stores/test-provider.ts';
+import { waitForIdleVitest } from './wait-for-idle-vitest.ts';
+
+const getFullState = vi.mocked(fullTestProviderStore.getFullState);
+
+// Fake timers let each test drive the polling loop deterministically.
+beforeEach(() => {
+  vi.useFakeTimers();
+  getFullState.mockReset();
+});
+
+// Restore real timers so the fake clock never outlives the test that installed it.
+afterEach(() => {
+  vi.useRealTimers();
+});
+
+describe('waitForIdleVitest', () => {
+ it('returns true immediately when no providers are running', async () => {
+ getFullState.mockReturnValue({ a: 'test-provider-state:pending' });
+ await expect(waitForIdleVitest()).resolves.toBe(true);
+ });
+
+ it('returns true immediately when state is empty', async () => {
+ getFullState.mockReturnValue({});
+ await expect(waitForIdleVitest()).resolves.toBe(true);
+ });
+
+ it('returns true when getFullState throws (store not initialized)', async () => {
+ getFullState.mockImplementation(() => {
+ throw new Error('not initialized');
+ });
+ await expect(waitForIdleVitest()).resolves.toBe(true);
+ });
+
+ it('waits and returns true when provider transitions from running to succeeded', async () => {
+ getFullState
+ .mockReturnValueOnce({ a: 'test-provider-state:running' })
+ .mockReturnValueOnce({ a: 'test-provider-state:succeeded' });
+
+ const promise = waitForIdleVitest(60_000, 100);
+
+ // First poll finds running, schedules a timeout
+ await vi.advanceTimersByTimeAsync(100);
+
+ await expect(promise).resolves.toBe(true);
+ expect(getFullState).toHaveBeenCalledTimes(2);
+ });
+
+ it('waits and returns true when provider transitions from running to crashed', async () => {
+ getFullState
+ .mockReturnValueOnce({ a: 'test-provider-state:running' })
+ .mockReturnValueOnce({ a: 'test-provider-state:crashed' });
+
+ const promise = waitForIdleVitest(60_000, 100);
+
+ // First poll finds running, schedules a timeout
+ await vi.advanceTimersByTimeAsync(100);
+
+ await expect(promise).resolves.toBe(true);
+ expect(getFullState).toHaveBeenCalledTimes(2);
+ });
+
+ it('returns false when maxWaitMs is exceeded', async () => {
+ getFullState.mockReturnValue({ a: 'test-provider-state:running' });
+
+ const promise = waitForIdleVitest(250, 100);
+
+ // Advance past the deadline
+ await vi.advanceTimersByTimeAsync(100);
+ await vi.advanceTimersByTimeAsync(100);
+ await vi.advanceTimersByTimeAsync(100);
+
+ await expect(promise).resolves.toBe(false);
+ });
+
+ it('treats multiple providers correctly — running if any is running', async () => {
+ getFullState.mockReturnValue({
+ a: 'test-provider-state:pending',
+ b: 'test-provider-state:running',
+ });
+
+ const promise = waitForIdleVitest(50, 100);
+ await vi.advanceTimersByTimeAsync(100);
+
+ await expect(promise).resolves.toBe(false);
+ });
+
+ it('returns true when all multiple providers are idle', async () => {
+ getFullState.mockReturnValue({
+ a: 'test-provider-state:pending',
+ b: 'test-provider-state:pending',
+ });
+ await expect(waitForIdleVitest()).resolves.toBe(true);
+ });
+
+ it('polls at the configured interval', async () => {
+ getFullState
+ .mockReturnValueOnce({ a: 'test-provider-state:running' })
+ .mockReturnValueOnce({ a: 'test-provider-state:running' })
+ .mockReturnValueOnce({ a: 'test-provider-state:pending' });
+
+ const promise = waitForIdleVitest(60_000, 200);
+
+ await vi.advanceTimersByTimeAsync(200);
+ expect(getFullState).toHaveBeenCalledTimes(2);
+
+ await vi.advanceTimersByTimeAsync(200);
+ expect(getFullState).toHaveBeenCalledTimes(3);
+
+ await expect(promise).resolves.toBe(true);
+ });
+});
diff --git a/code/core/src/core-server/utils/wait-for-idle-vitest.ts b/code/core/src/core-server/utils/wait-for-idle-vitest.ts
new file mode 100644
index 000000000000..5e0959bac303
--- /dev/null
+++ b/code/core/src/core-server/utils/wait-for-idle-vitest.ts
@@ -0,0 +1,39 @@
+import { fullTestProviderStore } from '../stores/test-provider.ts';
+/**
+ * Wait for the test provider to be idle (no tests running).
+ * Use this if you intend to run an ad-hoc vitest process to
+ * avoid conflicts with already running component tests.
+ *
+ * The provider state is always checked at least once, and the wait between checks is capped
+ * at the time remaining, so the loop does not sleep past the `maxWaitMs` deadline.
+ *
+ * @param maxWaitMs - Upper bound on the total wait, in milliseconds. Defaults to 30 minutes.
+ * @param pollIntervalMs - Delay between state checks, in milliseconds. Defaults to 1 minute.
+ * @returns `true` as soon as no provider reports a running state (or the store cannot be
+ *   read), `false` if a provider is still running once `maxWaitMs` has elapsed.
+ */
+export async function waitForIdleVitest(
+  maxWaitMs = 30 * 60 * 1000,
+  pollIntervalMs = 60 * 1000
+): Promise<boolean> {
+  const deadline = Date.now() + maxWaitMs;
+  for (;;) {
+    try {
+      const state = fullTestProviderStore.getFullState();
+      const isRunning = Object.values(state).some((s) => s === 'test-provider-state:running');
+      if (!isRunning) {
+        return true;
+      }
+    } catch {
+      // Store not initialized yet — treat as idle
+      return true;
+    }
+
+    const remainingMs = deadline - Date.now();
+    if (remainingMs <= 0) {
+      return false;
+    }
+    // Cap the sleep at the remaining budget so a long poll interval cannot overshoot maxWaitMs.
+    await new Promise((resolve) => setTimeout(resolve, Math.min(pollIntervalMs, remainingMs)));
+  }
+}
diff --git a/code/core/src/core-server/withTelemetry.test.ts b/code/core/src/core-server/withTelemetry.test.ts
index 7cfcf982bbc9..9c372f57f516 100644
--- a/code/core/src/core-server/withTelemetry.test.ts
+++ b/code/core/src/core-server/withTelemetry.test.ts
@@ -4,6 +4,7 @@ import { cache, isCI, loadAllPresets } from 'storybook/internal/common';
import { prompt } from 'storybook/internal/node-logger';
import {
ErrorCollector,
+ collectAiSetupEvidence,
isTelemetryStateResolved,
oneWayHash,
setTelemetryEnabled,
@@ -35,6 +36,7 @@ describe('withTelemetry', () => {
vi.resetAllMocks();
vi.mocked(ErrorCollector.getErrors).mockReturnValue([]);
vi.mocked(telemetry).mockResolvedValue(undefined);
+ vi.mocked(collectAiSetupEvidence).mockResolvedValue(undefined);
});
it('works in happy path', async () => {
const run = vi.fn();
@@ -42,6 +44,7 @@ describe('withTelemetry', () => {
await withTelemetry('dev', { cliOptions }, run);
expect(telemetry).toHaveBeenCalledTimes(1);
+ expect(collectAiSetupEvidence).toHaveBeenCalledTimes(1);
expect(telemetry).toHaveBeenCalledWith('boot', { eventType: 'dev' }, { stripMetadata: true });
});
@@ -66,6 +69,7 @@ describe('withTelemetry', () => {
).rejects.toThrow(error);
expect(telemetry).toHaveBeenCalledWith('boot', { eventType: 'dev' }, { stripMetadata: true });
+ expect(collectAiSetupEvidence).toHaveBeenCalledTimes(1);
});
it('does not send boot when cli option is passed', async () => {
@@ -83,6 +87,7 @@ describe('withTelemetry', () => {
).rejects.toThrow(error);
expect(telemetry).toHaveBeenCalledTimes(2);
+ expect(collectAiSetupEvidence).toHaveBeenCalledTimes(1);
expect(telemetry).toHaveBeenCalledWith(
'error',
expect.objectContaining({
diff --git a/code/core/src/core-server/withTelemetry.ts b/code/core/src/core-server/withTelemetry.ts
index c0eeb7278ded..8d091ba75024 100644
--- a/code/core/src/core-server/withTelemetry.ts
+++ b/code/core/src/core-server/withTelemetry.ts
@@ -7,8 +7,10 @@ import {
} from 'storybook/internal/common';
import { logger, prompt } from 'storybook/internal/node-logger';
import {
+ collectAiSetupEvidence,
ErrorCollector,
getPrecedingUpgrade,
+ isTelemetryModuleEnabled,
isTelemetryStateResolved,
oneWayHash,
onPayloadError,
@@ -227,6 +229,10 @@ export async function withTelemetry(
telemetry('boot', { eventType }, { stripMetadata: true });
+ // Fire-and-forget: don't await, don't block the command
+ const configDir = options.cliOptions.configDir || options.presetOptions?.configDir;
+ collectAiSetupEvidence(eventType, configDir);
+
try {
const result = await run();
diff --git a/code/core/src/manager-api/typings.d.ts b/code/core/src/manager-api/typings.d.ts
index 7bf67c97fc33..e3745a310bac 100644
--- a/code/core/src/manager-api/typings.d.ts
+++ b/code/core/src/manager-api/typings.d.ts
@@ -10,3 +10,8 @@ declare var STORYBOOK_ADDON_STATE: Record;
declare var STORYBOOK_FRAMEWORK: import('storybook/internal/types').SupportedFramework | undefined;
declare var STORYBOOK_RENDERER: import('storybook/internal/types').SupportedRenderer | undefined;
declare var STORYBOOK_BUILDER: import('storybook/internal/types').SupportedBuilder | undefined;
+declare var STORYBOOK_LAST_EVENTS: Record<
+ import('storybook/telemetry').EventType,
+ import('storybook/telemetry').CacheEntry
+>;
+declare var STORYBOOK_SESSION_ID: string | undefined;
diff --git a/code/core/src/manager/components/preview/tools/share.tsx b/code/core/src/manager/components/preview/tools/share.tsx
index a2674113e9b7..c87f2e2c4365 100644
--- a/code/core/src/manager/components/preview/tools/share.tsx
+++ b/code/core/src/manager/components/preview/tools/share.tsx
@@ -1,4 +1,4 @@
-import React, { useEffect, useMemo, useState } from 'react';
+import React, { useEffect, useMemo } from 'react';
import { Button, PopoverProvider, TooltipLinkList } from 'storybook/internal/components';
import {
@@ -11,7 +11,7 @@ import type { Addon_BaseType } from 'storybook/internal/types';
import { global } from '@storybook/global';
import { LinkIcon, ShareAltIcon, ShareIcon } from '@storybook/icons';
-import copy from 'copy-to-clipboard';
+import { useCopyButton } from '../../../../shared/useCopyButton.ts';
import { QRCodeSVG as QRCode } from 'qrcode.react';
import { Consumer, types } from 'storybook/manager-api';
import type { API, Combo } from 'storybook/manager-api';
@@ -72,19 +72,29 @@ const ShareMenu = React.memo(function ShareMenu({
}) {
const shortcutKeys = api.getShortcutKeys();
const enableShortcuts = !!shortcutKeys;
- const [copied, setCopied] = useState(false);
const copyStoryLink = shortcutKeys?.copyStoryLink;
const openInIsolation = shortcutKeys?.openInIsolation;
+ const originHrefs = useMemo(
+ () => api.getStoryHrefs(storyId, { base: 'origin', refId }),
+ [api, storyId, refId]
+ );
+ const networkHrefs = useMemo(
+ () => api.getStoryHrefs(storyId, { base: 'network', refId }),
+ [api, storyId, refId]
+ );
+
+ const { children: copyTitle, buttonProps: copyButtonProps } = useCopyButton({
+ children: 'Copy story link',
+ content: originHrefs.managerHref,
+ onCopy: () => api.emit(SHARE_STORY_LINK, originHrefs.managerHref),
+ });
+
useEffect(() => {
api.emit(SHARE_POPOVER_OPENED);
}, [api]);
const links = useMemo(() => {
- const copyTitle = copied ? 'Copied!' : 'Copy story link';
- const originHrefs = api.getStoryHrefs(storyId, { base: 'origin', refId });
- const networkHrefs = api.getStoryHrefs(storyId, { base: 'network', refId });
-
return [
[
{
@@ -92,12 +102,7 @@ const ShareMenu = React.memo(function ShareMenu({
title: copyTitle,
icon: ,
right: enableShortcuts ? : null,
- onClick: () => {
- api.emit(SHARE_STORY_LINK, originHrefs.managerHref);
- copy(originHrefs.managerHref);
- setCopied(true);
- setTimeout(() => setCopied(false), 2000);
- },
+ onClick: copyButtonProps.onClick,
},
{
id: 'open-new-tab',
@@ -131,7 +136,16 @@ const ShareMenu = React.memo(function ShareMenu({
},
],
];
- }, [api, storyId, refId, copied, enableShortcuts, copyStoryLink, openInIsolation]);
+ }, [
+ api,
+ originHrefs,
+ networkHrefs,
+ copyTitle,
+ copyButtonProps,
+ enableShortcuts,
+ copyStoryLink,
+ openInIsolation,
+ ]);
return ;
});
diff --git a/code/core/src/manager/components/sidebar/ChecklistWidget.stories.tsx b/code/core/src/manager/components/sidebar/ChecklistWidget.stories.tsx
index b0cc0ed80a10..d05556a1d4ef 100644
--- a/code/core/src/manager/components/sidebar/ChecklistWidget.stories.tsx
+++ b/code/core/src/manager/components/sidebar/ChecklistWidget.stories.tsx
@@ -94,3 +94,21 @@ export const Narrow = meta.story({
decorators: [(Story) => {Story()}
],
play,
});
+
+const withAiSetupState = {
+ loaded: true,
+ aiOptIn: true,
+ widget: {},
+ items: {
+ ...initialState.items,
+ // aiSetup is intentionally left 'open' so it appears in the widget's task list
+ controls: { status: 'accepted' as const },
+ renderComponent: { status: 'done' as const },
+ },
+};
+
+export const WithAiSetup = meta.story({
+ beforeEach: async () => {
+ mockStore.setState(withAiSetupState);
+ },
+});
diff --git a/code/core/src/manager/components/sidebar/ChecklistWidget.tsx b/code/core/src/manager/components/sidebar/ChecklistWidget.tsx
index 95676b231fff..5450330ab035 100644
--- a/code/core/src/manager/components/sidebar/ChecklistWidget.tsx
+++ b/code/core/src/manager/components/sidebar/ChecklistWidget.tsx
@@ -1,4 +1,4 @@
-import React, { useEffect, useRef, useState } from 'react';
+import React, { type ReactElement, type SyntheticEvent, useEffect, useRef, useState } from 'react';
import {
ActionList,
@@ -9,6 +9,7 @@ import {
} from 'storybook/internal/components';
import {
+ CheckIcon,
ChevronSmallUpIcon,
EyeCloseIcon,
ListUnorderedIcon,
@@ -25,6 +26,7 @@ import { Particles } from '../Particles/Particles.tsx';
import { TextFlip } from '../TextFlip.tsx';
import type { ChecklistItem } from './useChecklist.ts';
import { useChecklist } from './useChecklist.ts';
+import { useCopyButton } from '../../../shared/useCopyButton.ts';
const fadeScaleIn = keyframes`
from {
@@ -172,6 +174,31 @@ const OpenGuideButton = ({
);
};
+const CopyButton = ({
+ label,
+ copyContent,
+ onClick,
+}: {
+ label: string;
+ copyContent: string;
+ onClick: (e: SyntheticEvent) => void;
+}) => {
+ const { children: copyChildren, buttonProps: copyButtonProps } = useCopyButton<
+ string | ReactElement
+ >({
+ children: label,
+ childrenOnCopy: (
+ <>
+ Copied!
+ >
+ ),
+ onCopy: onClick,
+ content: copyContent,
+ });
+
+ return {copyChildren};
+};
+
export const ChecklistWidget = () => {
const api = useStorybookApi();
const { loaded, ready, allItems, nextItems, progress, accept, mute, items } = useChecklist();
@@ -317,6 +344,8 @@ export const ChecklistWidget = () => {
{item.isCompleted && animated ? (
+ ) : item.icon ? (
+
) : (
)}
@@ -327,21 +356,35 @@ export const ChecklistWidget = () => {
- {item.action && (
- {
- e.stopPropagation();
- item.action?.onClick({
- api,
- accept: () => accept(item.id),
- });
- }}
- >
- {item.action.label}
-
- )}
+ {item.action &&
+ (item.action.copyContent ? (
+ {
+ e.stopPropagation();
+ item.action?.onClick({
+ api,
+ accept: () => accept(item.id),
+ });
+ }}
+ />
+ ) : (
+ {
+ e.stopPropagation();
+ item.action?.onClick({
+ api,
+ accept: () => accept(item.id),
+ });
+ }}
+ >
+ {item.action.label}
+
+ ))}
)
)}
diff --git a/code/core/src/manager/components/sidebar/ContextMenu.tsx b/code/core/src/manager/components/sidebar/ContextMenu.tsx
index 6685c18520ff..99149c4e4204 100644
--- a/code/core/src/manager/components/sidebar/ContextMenu.tsx
+++ b/code/core/src/manager/components/sidebar/ContextMenu.tsx
@@ -1,4 +1,4 @@
-import type { ComponentProps, FC, SyntheticEvent } from 'react';
+import type { ComponentProps, FC, MouseEvent, SyntheticEvent } from 'react';
import React, { useContext, useMemo, useState } from 'react';
import { PopoverProvider, TooltipLinkList } from 'storybook/internal/components';
@@ -12,12 +12,12 @@ import {
import { CopyIcon, EditorIcon, EllipsisIcon } from '@storybook/icons';
-import copy from 'copy-to-clipboard';
import { useStorybookApi } from 'storybook/manager-api';
import type { API } from 'storybook/manager-api';
import { styled } from 'storybook/theming';
import type { Link } from '../../../components/components/tooltip/TooltipLinkList.tsx';
+import { useCopyButton } from '../../../shared/useCopyButton.ts';
import { getMostCriticalStatusValue } from '../../utils/status.tsx';
import { Shortcut } from '../Shortcut.tsx';
import { UseSymbol } from './IconSymbols.tsx';
@@ -44,9 +44,14 @@ const FloatingStatusButton = styled(StatusButton)({
export const useContextMenu = (context: API_HashEntry, links: Link[], api: API) => {
const [hoverCount, setHoverCount] = useState(0);
const [isOpen, setIsOpen] = useState(false);
- const [copyText, setCopyText] = React.useState('Copy story name');
const { allStatuses, groupStatus } = useContext(StatusContext);
+ const exportName = context && 'exportName' in context ? (context.exportName ?? '') : '';
+ const { children: copyText, buttonProps: copyButtonProps } = useCopyButton({
+ children: 'Copy story name',
+ content: exportName,
+ });
+
const shortcutKeys = api.getShortcutKeys();
const enableShortcuts = !!shortcutKeys;
@@ -80,17 +85,13 @@ export const useContextMenu = (context: API_HashEntry, links: Link[], api: API)
// ) : null,
onClick: (e: SyntheticEvent) => {
e.preventDefault();
- copy(context.exportName);
- setCopyText('Copied!');
- setTimeout(() => {
- setCopyText('Copy story name');
- }, 2000);
+ copyButtonProps.onClick(e);
},
});
}
return defaultLinks;
- }, [api, context, copyText, enableShortcuts, shortcutKeys]);
+ }, [api, context, copyText, copyButtonProps, enableShortcuts, shortcutKeys]);
const handlers = useMemo(() => {
return {
diff --git a/code/core/src/manager/components/sidebar/useChecklist.ts b/code/core/src/manager/components/sidebar/useChecklist.ts
index 3d554ed52e24..cb313079157b 100644
--- a/code/core/src/manager/components/sidebar/useChecklist.ts
+++ b/code/core/src/manager/components/sidebar/useChecklist.ts
@@ -56,7 +56,12 @@ const useStoryIndex = () => {
const checkAvailable = (
item: RawItemWithSection,
itemsById: Record,
- context: { api: API; index: API_IndexHash | undefined; item: RawItemWithSection }
+ context: {
+ api: API;
+ index: API_IndexHash | undefined;
+ item: RawItemWithSection;
+ storeState: import('../../../shared/checklist-store/index.ts').StoreState;
+ }
) => {
if (item.available && !item.available(context)) {
return false;
@@ -143,7 +148,7 @@ export const useChecklist = () => {
const isAvailable = isCompleted
? item.afterCompletion !== 'unavailable'
- : checkAvailable(item, itemsById, { api, index, item });
+ : checkAvailable(item, itemsById, { api, index, item, storeState: checklistState });
const isLockedBy = checkLockedBy(item, itemsById, items);
const isImmutable = isCompleted && item.afterCompletion === 'immutable';
const isReady = isOpen && isAvailable && !isMuted && !isLockedBy;
@@ -162,7 +167,7 @@ export const useChecklist = () => {
isMuted,
};
});
- }, [itemsById, items, widget, api, index]);
+ }, [itemsById, items, widget, api, index, checklistState]);
const itemCollections = useMemo(() => {
const availableItems = allItems.filter((item) => item.isAvailable);
diff --git a/code/core/src/manager/globals/exports.ts b/code/core/src/manager/globals/exports.ts
index d48a7284091e..2bad5d4d4b68 100644
--- a/code/core/src/manager/globals/exports.ts
+++ b/code/core/src/manager/globals/exports.ts
@@ -574,6 +574,9 @@ export default {
'withReset',
],
'storybook/internal/core-events': [
+ 'AI_PROMPT_NUDGE',
+ 'AI_SETUP_ANALYTICS_REQUEST',
+ 'AI_SETUP_ANALYTICS_RESPONSE',
'ARGTYPES_INFO_REQUEST',
'ARGTYPES_INFO_RESPONSE',
'CHANNEL_CREATED',
diff --git a/code/core/src/manager/settings/Checklist/AiSetupBlock.stories.tsx b/code/core/src/manager/settings/Checklist/AiSetupBlock.stories.tsx
new file mode 100644
index 000000000000..8cad1b1341d2
--- /dev/null
+++ b/code/core/src/manager/settings/Checklist/AiSetupBlock.stories.tsx
@@ -0,0 +1,88 @@
+import React from 'react';
+
+import { ManagerContext } from 'storybook/manager-api';
+import { fn } from 'storybook/test';
+
+import preview from '../../../../../.storybook/preview.tsx';
+import { checklistData } from '../../../shared/checklist-store/checklistData.tsx';
+import type { ChecklistItem } from '../../components/sidebar/useChecklist.ts';
+import { AiSetupBlock } from './AiSetupBlock.tsx';
+
+const managerContext: any = {
+ state: {},
+ api: {
+ getDocsUrl: fn().mockName('api::getDocsUrl'),
+ getData: fn().mockName('api::getData'),
+ getIndex: fn().mockName('api::getIndex'),
+ getUrlState: fn().mockName('api::getUrlState'),
+ navigate: fn().mockName('api::navigate'),
+ on: fn().mockName('api::on'),
+ off: fn().mockName('api::off'),
+ once: fn().mockName('api::once'),
+ emit: fn().mockName('api::emit'),
+ },
+};
+
+// Get the raw aiSetup item. Cast through unknown to avoid deep discriminated-union issues
+// with the as-const typed checklistData — the shape is correct at runtime.
+const rawAiSetupItem = checklistData.sections
+ .flatMap((s) => s.items as unknown as ChecklistItem[])
+ .find((item) => item.id === 'aiSetup')!;
+
+const makeItem = (overrides: Partial = {}): ChecklistItem => ({
+ ...rawAiSetupItem,
+ itemIndex: 0,
+ sectionId: 'basics',
+ sectionIndex: 0,
+ sectionTitle: 'Storybook basics',
+ isAvailable: true,
+ isLockedBy: undefined,
+ isImmutable: false,
+ isCompleted: false,
+ isReady: true,
+ isOpen: true,
+ isAccepted: false,
+ isDone: false,
+ isSkipped: false,
+ isMuted: false,
+ ...overrides,
+});
+
+const meta = preview.meta({
+ component: AiSetupBlock,
+ decorators: [
+ (Story) => (
+
+
+
+
+
+ ),
+ ],
+});
+
+export const Open = meta.story({
+ args: {
+ item: makeItem(),
+ reset: fn().mockName('reset'),
+ skip: fn().mockName('skip'),
+ },
+});
+
+export const Skipped = meta.story({
+ args: {
+ item: makeItem({ isSkipped: true, isOpen: false }),
+ reset: fn().mockName('reset'),
+ skip: fn().mockName('skip'),
+ },
+});
+
+// Shows the block starting in the Skipped (collapsed) state.
+// Toggle between this story and Open to see the animation.
+export const SkippedToOpen = meta.story({
+ args: {
+ item: makeItem({ isSkipped: true, isOpen: false }),
+ reset: fn().mockName('reset'),
+ skip: fn().mockName('skip'),
+ },
+});
diff --git a/code/core/src/manager/settings/Checklist/AiSetupBlock.tsx b/code/core/src/manager/settings/Checklist/AiSetupBlock.tsx
new file mode 100644
index 000000000000..b4f3e3033214
--- /dev/null
+++ b/code/core/src/manager/settings/Checklist/AiSetupBlock.tsx
@@ -0,0 +1,124 @@
+import React, { type ReactNode } from 'react';
+
+import { Button, Collapsible } from 'storybook/internal/components';
+import { AI_PROMPT_NUDGE } from 'storybook/internal/core-events';
+
+import { CheckIcon, UndoIcon } from '@storybook/icons';
+
+import { type API, useStorybookApi } from 'storybook/manager-api';
+import { styled } from 'storybook/theming';
+
+import { AI_SETUP_PROMPT } from '../../../shared/constants/ai-prompts.ts';
+import { useCopyButton } from '../../../shared/useCopyButton.ts';
+import type { ItemId } from '../../../shared/checklist-store/index.ts';
+import type { ChecklistItem } from '../../components/sidebar/useChecklist.ts';
+import { Skipped, StatusIcon } from './Checklist.tsx';
+
+const AiCtaCard = styled.div(({ theme }) => ({
+ display: 'flex',
+ flexDirection: 'column',
+ padding: '10px 10px 10px 15px',
+ border: `1px solid ${theme.base === 'dark' ? theme.color.darker : theme.color.border}`,
+ borderRadius: 8,
+ background: theme.background.content,
+}));
+
+const AiCtaHeadingRow = styled.div({
+ display: 'flex',
+ alignItems: 'center',
+ gap: 10,
+});
+
+const AiCtaHeading = styled.h2<{ $skipped: boolean }>(({ theme, $skipped }) => ({
+ flex: 1,
+ margin: 0,
+ color: $skipped ? theme.textMutedColor : theme.color.defaultText,
+ fontSize: theme.typography.size.s2,
+ fontWeight: theme.typography.weight.bold,
+ textWrap: 'pretty',
+}));
+
+const AiCtaDescription = styled.p(({ theme }) => ({
+ margin: 0,
+ color: theme.color.defaultText,
+ fontSize: theme.typography.size.s2,
+ fontWeight: theme.typography.weight.regular,
+ marginTop: 8,
+}));
+
+const AiCtaActions = styled.div({
+ display: 'flex',
+ gap: 8,
+ justifyContent: 'flex-end',
+ marginTop: 12,
+});
+
+const CopyButton = ({ api }: { api: API }) => {
+ const { children: buttonChildren, buttonProps } = useCopyButton({
+ children: 'Copy prompt',
+ childrenOnCopy: (
+ <>
+ Copied!
+ >
+ ),
+ content: AI_SETUP_PROMPT,
+ onCopy: () => {
+ api.emit(AI_PROMPT_NUDGE, { id: 'setup', origin: 'onboarding-guide-page' });
+ },
+ });
+
+ return (
+
+ );
+};
+
+export const AiSetupBlock = ({
+ item,
+ reset,
+ skip,
+}: {
+ item: ChecklistItem;
+ reset: (id: ItemId) => void;
+ skip: (id: ItemId) => void;
+}) => {
+ const api = useStorybookApi();
+
+ const showAiCta = !item.isDone && !item.isAccepted && !item.isCompleted;
+
+ if (!showAiCta) {
+ return null;
+ }
+
+ return (
+
+
+ {item.isSkipped && (
+
+ Skipped
+
+ )}
+ Set up Storybook with AI
+ {item.isSkipped && (
+
+ )}
+
+
+
+
+ Run a prompt in your AI agent to analyze your codebase, configure decorators and mocks,
+ write sample stories for your UI components, and verify everything works.
+
+
+
+
+
+
+
+ );
+};
diff --git a/code/core/src/manager/settings/Checklist/Checklist.stories.tsx b/code/core/src/manager/settings/Checklist/Checklist.stories.tsx
index a8ef4a7b6304..c4381c950ec0 100644
--- a/code/core/src/manager/settings/Checklist/Checklist.stories.tsx
+++ b/code/core/src/manager/settings/Checklist/Checklist.stories.tsx
@@ -10,15 +10,8 @@ import { checklistData } from '../../../shared/checklist-store/checklistData.tsx
import type { ChecklistItem } from '../../components/sidebar/useChecklist.ts';
import { Checklist } from './Checklist.tsx';
-const values: Record = {
- controls: 'accepted',
- renderComponent: 'done',
- whatsNewStorybook10: 'done',
- viewports: 'skipped',
-};
-
-const availableItems = checklistData.sections.flatMap(
- ({ id: sectionId, title: sectionTitle, items }, sectionIndex) =>
+const buildItems = (values: Record) =>
+ checklistData.sections.flatMap(({ id: sectionId, title: sectionTitle, items }, sectionIndex) =>
items.map((item, itemIndex) => {
const itemValue = values[item.id];
const isAccepted = itemValue === 'accepted';
@@ -43,7 +36,14 @@ const availableItems = checklistData.sections.flatMap(
isMuted: false,
};
})
-);
+ );
+
+const availableItems = buildItems({
+ controls: 'accepted',
+ renderComponent: 'done',
+ whatsNewStorybook10: 'done',
+ viewports: 'skipped',
+});
const Container = styled.div(({ theme }) => ({
fontSize: theme.typography.size.s2,
@@ -83,3 +83,29 @@ const meta = preview.meta({
export const Default = meta.story({
args: { availableItems, ...checklistStore },
});
+
+export const WithAiSetup = meta.story({
+ args: {
+ availableItems: buildItems({
+ controls: 'accepted',
+ renderComponent: 'done',
+ whatsNewStorybook10: 'done',
+ viewports: 'skipped',
+ // aiSetup is intentionally omitted → status 'open'
+ }),
+ ...checklistStore,
+ },
+});
+
+export const WithAiSetupSkipped = meta.story({
+ args: {
+ availableItems: buildItems({
+ controls: 'accepted',
+ renderComponent: 'done',
+ whatsNewStorybook10: 'done',
+ viewports: 'skipped',
+ aiSetup: 'skipped',
+ }),
+ ...checklistStore,
+ },
+});
diff --git a/code/core/src/manager/settings/Checklist/Checklist.tsx b/code/core/src/manager/settings/Checklist/Checklist.tsx
index 5f6e1eb55733..0d68bbd23af5 100644
--- a/code/core/src/manager/settings/Checklist/Checklist.tsx
+++ b/code/core/src/manager/settings/Checklist/Checklist.tsx
@@ -1,4 +1,4 @@
-import React, { useMemo } from 'react';
+import React, { type SyntheticEvent, useMemo } from 'react';
import { ActionList, Button, Collapsible } from 'storybook/internal/components';
@@ -16,6 +16,7 @@ import { styled } from 'storybook/theming';
import { Focus } from '../../components/Focus/Focus.tsx';
import type { ChecklistItem, useChecklist } from '../../components/sidebar/useChecklist.ts';
import { useLocationHash } from '../../hooks/useLocation.ts';
+import { useCopyButton } from '../../../shared/useCopyButton.ts';
type ChecklistSection = {
id: string;
@@ -164,7 +165,7 @@ const ItemContent = styled.div(({ theme }) => ({
},
}));
-const StatusIcon = styled.div(({ theme }) => ({
+export const StatusIcon = styled.div(({ theme }) => ({
position: 'relative',
flex: '0 0 auto',
minHeight: 16,
@@ -193,7 +194,7 @@ const Checked = styled(StatusPassIcon)<{ 'data-visible'?: boolean }>(
transition: 'all var(--transition-duration, 0.2s)',
})
);
-const Skipped = styled.span<{ visible?: boolean }>(({ theme, visible }) => ({
+export const Skipped = styled.span<{ visible?: boolean }>(({ theme, visible }) => ({
display: 'flex',
alignItems: 'center',
color: theme.textMutedColor,
@@ -228,12 +229,37 @@ const ToggleButton = styled(Button)({
},
});
+const ChecklistCopyButton = ({ // action-button variant that wires its props into the useCopyButton hook
+ label,
+ copyContent,
+ onClick,
+}: {
+ label: string; // default button content (passed to the hook as `children`)
+ copyContent: string; // text handed to the hook as `content`
+ onClick: (e: SyntheticEvent) => void; // forwarded to the hook as its `onCopy` callback
+}) => {
+ const { children: copyChildren, buttonProps: copyButtonProps } = useCopyButton({
+ children: label,
+ onCopy: onClick,
+ content: copyContent,
+ });
+
+ return (
+
+ );
+};
+
export const Checklist = ({
availableItems,
accept,
skip,
reset,
-}: Pick, 'availableItems' | 'accept' | 'skip' | 'reset'>) => {
+ forceCollapsed = false,
+}: Pick, 'availableItems' | 'accept' | 'skip' | 'reset'> & {
+ forceCollapsed?: boolean;
+}) => {
const api = useStorybookApi();
const locationHash = useLocationHash();
@@ -258,7 +284,9 @@ export const Checklist = ({
const sections = useMemo(
() =>
Object.values(sectionsById).map(({ id, title, itemIds }) => {
- const items = itemIds.map((id) => itemsById[id]);
+ const items = itemIds
+ .map((id) => itemsById[id])
+ .filter(({ showOnGuidePage }) => showOnGuidePage !== false);
const progress =
(items.reduce((acc, item) => (item.isOpen ? acc : acc + 1), 0) / items.length) * 100;
return { id, title, items, progress };
@@ -269,7 +297,8 @@ export const Checklist = ({
return (
{sections.map(({ id, title, items, progress }) => {
- const collapsed = progress === 100 && items.every((item) => item.id !== locationHash);
+ const collapsed =
+ forceCollapsed || (progress === 100 && items.every((item) => item.id !== locationHash));
return (
@@ -361,22 +390,36 @@ export const Checklist = ({
)}
- {isOpen && !isLocked && item.action && (
-
- )}
+ {isOpen &&
+ !isLocked &&
+ item.action &&
+ (item.action.copyContent ? (
+
+ item.action!.onClick({
+ api,
+ accept: () => accept(item.id),
+ })
+ }
+ />
+ ) : (
+
+ ))}
{isOpen && !isLocked && !item.action && !item.subscribe && (
-
+ {aiSetupItem && (
+
+ )}
+
{global.FEATURES?.sidebarOnboardingChecklist !== false && (
<>
{checklist.openItems.length === 0 ? (
diff --git a/code/core/src/manager/settings/whats_new.tsx b/code/core/src/manager/settings/whats_new.tsx
index b30573bbf36e..44c8a8c2e55d 100644
--- a/code/core/src/manager/settings/whats_new.tsx
+++ b/code/core/src/manager/settings/whats_new.tsx
@@ -9,6 +9,8 @@ import { AlertIcon as AlertIconSvg, EyeCloseIcon, EyeIcon, HeartIcon } from '@st
import { useStorybookApi, useStorybookState } from 'storybook/manager-api';
import { styled, useTheme } from 'storybook/theming';
+import { useCopyButton } from '../../shared/useCopyButton.ts';
+
const Centered = styled.div({
top: '50%',
position: 'absolute',
@@ -48,27 +50,24 @@ const Container = styled.div(({ theme }) => ({
export const WhatsNewFooter = ({
isNotificationsEnabled,
onToggleNotifications,
- onCopyLink,
+ copyContent,
}: {
isNotificationsEnabled: boolean;
onToggleNotifications?: () => void;
- onCopyLink?: () => void;
+ copyContent: string;
}) => {
const theme = useTheme();
- const [copyText, setCopyText] = useState('Copy Link');
- const copyLink = () => {
- // @ts-expect-error (non strict)
- onCopyLink();
- setCopyText('Copied!');
- setTimeout(() => setCopyText('Copy Link'), 4000);
- };
+ const { children: copyText, buttonProps: copyButtonProps } = useCopyButton({
+ children: 'Copy Link',
+ content: copyContent,
+ });
return (
Share this with your team.
-
@@ -135,7 +134,7 @@ export interface WhatsNewProps {
onLoad: () => void;
url?: string;
isNotificationsEnabled: boolean;
- onCopyLink?: () => void;
+ copyContent: string;
onToggleNotifications?: () => void;
}
@@ -144,7 +143,7 @@ const PureWhatsNewScreen: FC = ({
isLoaded,
onLoad,
url,
- onCopyLink,
+ copyContent,
onToggleNotifications,
isNotificationsEnabled,
}) => (
@@ -158,7 +157,7 @@ const PureWhatsNewScreen: FC = ({
>
)}
@@ -195,10 +194,7 @@ const WhatsNewScreen: FC = () => {
}}
url={whatsNewData.url}
isNotificationsEnabled={isNotificationsEnabled}
- onCopyLink={() => {
- // eslint-disable-next-line compat/compat
- navigator.clipboard?.writeText(whatsNewData.blogUrl ?? whatsNewData.url);
- }}
+ copyContent={whatsNewData.blogUrl ?? whatsNewData.url}
onToggleNotifications={() => {
if (isNotificationsEnabled) {
if (global.confirm('All update notifications will no longer be shown. Are you sure?')) {
diff --git a/code/core/src/manager/settings/whats_new_footer.stories.tsx b/code/core/src/manager/settings/whats_new_footer.stories.tsx
index 74ed7b11fc77..ecbfe05d2511 100644
--- a/code/core/src/manager/settings/whats_new_footer.stories.tsx
+++ b/code/core/src/manager/settings/whats_new_footer.stories.tsx
@@ -13,6 +13,7 @@ type Story = StoryObj;
export const Default: Story = {
args: {
isNotificationsEnabled: false,
+ copyContent: 'https://storybook.js.org/blog',
},
parameters: {
design: {
diff --git a/code/core/src/shared/checklist-store/checklistData.state.ts b/code/core/src/shared/checklist-store/checklistData.state.ts
index 09818f7956fa..0f85b44b26f9 100644
--- a/code/core/src/shared/checklist-store/checklistData.state.ts
+++ b/code/core/src/shared/checklist-store/checklistData.state.ts
@@ -3,6 +3,7 @@ import type { StoreState } from './index.ts';
export const initialState = {
items: {
accessibilityTests: { status: 'open' },
+ aiSetup: { status: 'open' },
autodocs: { status: 'open' },
ciTests: { status: 'open' },
controls: { status: 'open' },
diff --git a/code/core/src/shared/checklist-store/checklistData.tsx b/code/core/src/shared/checklist-store/checklistData.tsx
index da9d00f23a9f..c5c20525b6fc 100644
--- a/code/core/src/shared/checklist-store/checklistData.tsx
+++ b/code/core/src/shared/checklist-store/checklistData.tsx
@@ -3,6 +3,7 @@ import React from 'react';
import { Link, SyntaxHighlighter } from 'storybook/internal/components';
import {
+ AI_PROMPT_NUDGE,
PREVIEW_INITIALIZED,
STORY_ARGS_UPDATED,
STORY_FINISHED,
@@ -18,6 +19,8 @@ import {
import { type API, Tag, addons, internal_universalTestProviderStore } from 'storybook/manager-api';
import { ThemeProvider, convert, styled, themes } from 'storybook/theming';
+import { WandIcon } from '@storybook/icons';
+
import { ADDON_ID as ADDON_A11Y_ID } from '../../../../addons/a11y/src/constants.ts';
import {
ADDON_ONBOARDING_CHANNEL,
@@ -32,6 +35,7 @@ import { ADDON_ID as ADDON_DOCS_ID } from '../../docs-tools/shared.ts';
import { TourGuide } from '../../manager/components/TourGuide/TourGuide.tsx';
import { LocationMonitor } from '../../manager/hooks/useLocation.ts';
import type { initialState } from './checklistData.state.ts';
+import { AI_SETUP_PROMPT } from '../constants/ai-prompts.ts';
const CodeWrapper = styled.div(({ theme }) => ({
alignSelf: 'stretch',
@@ -66,6 +70,9 @@ export interface ChecklistData {
/** Display name. Keep it short and actionable (with a verb). */
label: string;
+ /** Optional custom icon component to display instead of the default status icon. */
+ icon?: React.ComponentType;
+
/** Description of the criteria that must be met to complete the item. */
criteria: string;
@@ -75,6 +82,9 @@ export interface ChecklistData {
/** What to do after the item is completed (prevent undo or hide the item). */
afterCompletion?: 'immutable' | 'unavailable';
+ /** Whether to show the item in the GuidePage. Only set to `false` if the GuidePage has another tailored way to display the item. */
+ showOnGuidePage?: boolean;
+
/**
* Function to check if the item should be available (displayed in the checklist). Called any
* time the index is updated.
@@ -83,6 +93,7 @@ export interface ChecklistData {
api: API;
index: API_IndexHash | undefined;
item: ChecklistData['sections'][number]['items'][number];
+ storeState: import('./index.ts').StoreState;
}) => boolean;
/** Function returning content to display in the checklist item's collapsible area. */
@@ -91,6 +102,8 @@ export interface ChecklistData {
/** Action button to be displayed when item is not completed. */
action?: {
label: string;
+ /** If set, clicking the button copies this text to the clipboard via useCopyButton. */
+ copyContent?: string;
onClick: (args: { api: API; accept: () => void }) => void;
};
@@ -150,6 +163,25 @@ export const checklistData = {
id: 'basics',
title: 'Storybook basics',
items: [
+ {
+ id: 'aiSetup',
+ label: 'Set up with AI',
+ icon: WandIcon, // custom icon for this item
+ available: ({ storeState }) => {
+ // Show only if the user opted into AI during `storybook init` and has not run
+ // `storybook ai setup` yet. Both flags are populated server-side from the event cache.
+ return !!storeState.aiOptIn && storeState.items.aiSetup?.status !== 'done';
+ },
+ criteria: 'ai setup command has been run', // states the completion condition, not the pending state
+ showOnGuidePage: false, // items with `showOnGuidePage === false` are filtered out of the Checklist sections
+ action: {
+ label: 'Copy prompt',
+ copyContent: AI_SETUP_PROMPT, // when set, the action button copies this text to the clipboard
+ onClick: ({ api }) => { // only emits the nudge event; `accept` is not called here
+ api.emit(AI_PROMPT_NUDGE, { id: 'setup', origin: 'onboarding-checklist-side' });
+ },
+ },
+ },
{
id: 'guidedTour',
label: 'Take the guided tour',
diff --git a/code/core/src/shared/checklist-store/index.ts b/code/core/src/shared/checklist-store/index.ts
index 3c0f45239db4..da6b32e39a58 100644
--- a/code/core/src/shared/checklist-store/index.ts
+++ b/code/core/src/shared/checklist-store/index.ts
@@ -12,6 +12,8 @@ export type ChecklistState = NonNullable<
export type StoreState = Required> & {
items: NonNullable>;
loaded?: boolean;
+ /** True when the user opted into AI during `storybook init`. Set by the server from the event cache. */
+ aiOptIn?: boolean;
};
export type ItemId = keyof StoreState['items'];
diff --git a/code/core/src/shared/constants/ai-prompts.ts b/code/core/src/shared/constants/ai-prompts.ts
new file mode 100644
index 000000000000..0c1d6f904bd5
--- /dev/null
+++ b/code/core/src/shared/constants/ai-prompts.ts
@@ -0,0 +1,5 @@
+/**
+ * Source of truth for AI prompts throughout the application.
+ */
+
+export const AI_SETUP_PROMPT = `Run \`npx storybook ai setup\` and follow its instructions precisely.`;
diff --git a/code/core/src/shared/useCopyButton.ts b/code/core/src/shared/useCopyButton.ts
new file mode 100644
index 000000000000..f5c47f6bb6c4
--- /dev/null
+++ b/code/core/src/shared/useCopyButton.ts
@@ -0,0 +1,97 @@
+import {
+ type ReactNode,
+ type SyntheticEvent,
+ useCallback,
+ useEffect,
+ useMemo,
+ useRef,
+ useState,
+} from 'react';
+
+import { announce, clearAnnouncer } from '@react-aria/live-announcer';
+
+export interface UseCopyButtonOptions {
+ /** Content shown in the button by default. The hook falls back to 'Copy' when omitted. */
+ children?: T;
+ /** Content shown in the button for `duration` ms after a successful copy. The hook falls back to 'Copied!' when omitted. */
+ childrenOnCopy?: T;
+ /** Text written to the clipboard when the button is clicked. */
+ content: string;
+ /** Optional side-effect called after the text is successfully written to the clipboard. */
+ onCopy?: (e: SyntheticEvent) => void;
+ /** aria-label for the button in its default state. Pass `false` (the default) to suppress. */
+ ariaLabel?: false | string;
+ /** aria-label for the button while in its "copied" state. Pass `false` (the default) to suppress the label. When a string, it is also the text announced after a copy; otherwise 'Copied to clipboard' is announced. */
+ ariaLabelOnCopy?: false | string;
+ /** Duration in milliseconds to show the copied state. Defaults to 3000. */
+ duration?: number;
+}
+
+export interface UseCopyButtonResult {
+ /** Current label/icon — switches to `childrenOnCopy` for `duration` ms after a successful copy. */
+ children: T;
+ /** Props to spread onto the button component that triggers the copy. */
+ buttonProps: {
+ onClick: (e: SyntheticEvent) => void; // click handler that writes `content` to the clipboard
+ ariaLabel: false | string; // `ariaLabelOnCopy` while in the copied state, otherwise `ariaLabel`
+ };
+}
+
+export function useCopyButton({ // hook: returns the current button content plus props whose onClick copies `content` to the clipboard
+ children = 'Copy',
+ childrenOnCopy = 'Copied!',
+ content,
+ onCopy,
+ ariaLabel = false,
+ ariaLabelOnCopy = false,
+ duration = 3000,
+}: UseCopyButtonOptions): UseCopyButtonResult {
+ const [copied, setCopied] = useState(false); // true while the "copied" state is shown
+ const timerRef = useRef | null>(null); // handle of the pending reset timeout, or null
+
+ useEffect(
+ () => () => { // cleanup only (empty deps): cancel a pending reset timeout on unmount
+ if (timerRef.current) {
+ clearTimeout(timerRef.current);
+ }
+ },
+ []
+ );
+
+ const handleClick = useCallback(
+ (e: SyntheticEvent) => {
+ if (timerRef.current) { // a new click cancels the reset timeout left over from a previous copy
+ clearTimeout(timerRef.current);
+ }
+
+ const announcement =
+ typeof ariaLabelOnCopy === 'string' ? ariaLabelOnCopy : 'Copied to clipboard'; // text for the live-region announcement
+
+ // eslint-disable-next-line compat/compat
+ navigator.clipboard?.writeText(content).then(() => { // nothing happens when `navigator.clipboard` is missing; NOTE(review): a rejected writeText() has no handler here
+ onCopy?.(e); // runs only after the write resolved
+ setCopied(true);
+ announce(announcement, 'polite');
+
+ timerRef.current = setTimeout(() => { // leave the copied state again after `duration` ms
+ setCopied(false);
+ clearAnnouncer('polite');
+ timerRef.current = null;
+ }, duration);
+ });
+ },
+ [content, onCopy, ariaLabelOnCopy, duration]
+ );
+
+ return {
+ // @ts-expect-error - TypeScript is not realising T is constrained identically in both interfaces.
+ children: copied ? childrenOnCopy! : children!,
+ buttonProps: useMemo( // memoized so the props object only changes when one of its inputs changes
+ () => ({
+ onClick: handleClick,
+ ariaLabel: copied ? ariaLabelOnCopy : ariaLabel,
+ }),
+ [handleClick, copied, ariaLabelOnCopy, ariaLabel]
+ ),
+ };
+}
diff --git a/code/core/src/shared/utils/analyze-test-results.test.ts b/code/core/src/shared/utils/analyze-test-results.test.ts
new file mode 100644
index 000000000000..99313e8bf76b
--- /dev/null
+++ b/code/core/src/shared/utils/analyze-test-results.test.ts
@@ -0,0 +1,188 @@
+import { describe, expect, it, vi } from 'vitest';
+
+import { analyzeTestResults, extractCategorizedErrors } from './analyze-test-results.ts';
+import type { StoryTestResult } from './test-result-types.ts';
+
+vi.mock('./categorize-render-errors', { spy: true });
+
+describe('analyze-test-results', () => {
+ describe('extractCategorizedErrors', () => {
+ it('should return empty results for all-passing tests', () => {
+ const results: StoryTestResult[] = [
+ { storyId: 's1', status: 'PASS' },
+ { storyId: 's2', status: 'PASS' },
+ ];
+ const analysis = extractCategorizedErrors(results);
+ expect(analysis.totalErrors).toBe(0);
+ expect(analysis.uniqueErrorCount).toBe(0);
+ expect(analysis.categorizedErrors).toEqual({});
+ });
+
+ it('should categorize errors from failed tests', () => {
+ const results: StoryTestResult[] = [
+ {
+ storyId: 's1',
+ status: 'FAIL',
+ error: 'Error: Cannot read property "x" of undefined',
+ stack: 'at /deps/styled-components.js:1168:14',
+ },
+ {
+ storyId: 's2',
+ status: 'FAIL',
+ error: 'Error: Cannot read property "x" of undefined',
+ stack: 'at /deps/styled-components.js:1168:14',
+ },
+ {
+ storyId: 's3',
+ status: 'FAIL',
+ error: 'Error: Module not found: react-router',
+ stack: 'at import statement',
+ },
+ ];
+ const analysis = extractCategorizedErrors(results);
+ expect(analysis.totalErrors).toBe(3);
+ expect(analysis.uniqueErrorCount).toBe(2);
+ expect(analysis.categorizedErrors['MISSING_THEME_PROVIDER']).toEqual({
+ uniqueCount: 1,
+ count: 2,
+ matchedDependencies: ['styled-components'],
+ });
+ expect(analysis.categorizedErrors['MODULE_IMPORT_ERROR']).toEqual({
+ uniqueCount: 1,
+ count: 1,
+ matchedDependencies: [],
+ });
+ });
+
+ it('should skip failed tests without error messages', () => {
+ const results: StoryTestResult[] = [{ storyId: 's1', status: 'FAIL' }];
+ const analysis = extractCategorizedErrors(results);
+ expect(analysis.totalErrors).toBe(0);
+ });
+ });
+
+ describe('analyzeTestResults', () => {
+ it('should compute correct summary for all-passing tests', () => {
+ const results: StoryTestResult[] = [
+ { storyId: 's1', status: 'PASS' },
+ { storyId: 's2', status: 'PASS' },
+ { storyId: 's3', status: 'PASS' },
+ ];
+ const analysis = analyzeTestResults(results);
+ expect(analysis).toEqual({
+ total: 3,
+ passed: 3,
+ passedButEmptyRender: 0,
+ successRate: 1.0,
+ successRateWithoutEmptyRender: 1.0,
+ uniqueErrorCount: 0,
+ categorizedErrors: {},
+ cssCheck: 'not-run',
+ });
+ });
+
+ it('should compute correct summary with failures', () => {
+ const results: StoryTestResult[] = [
+ { storyId: 's1', status: 'PASS' },
+ { storyId: 's2', status: 'FAIL', error: 'Error: Invalid hook call', stack: '' },
+ { storyId: 's3', status: 'FAIL', error: 'Error: Module not found', stack: '' },
+ ];
+ const analysis = analyzeTestResults(results);
+ expect(analysis.total).toBe(3);
+ expect(analysis.passed).toBe(1);
+ expect(analysis.successRate).toBe(0.33);
+ expect(analysis.uniqueErrorCount).toBe(2);
+ });
+
+ it('should count passedButEmptyRender', () => {
+ const results: StoryTestResult[] = [
+ { storyId: 's1', status: 'PASS' },
+ { storyId: 's2', status: 'PASS', emptyRender: true },
+ { storyId: 's3', status: 'PASS', emptyRender: true },
+ ];
+ const analysis = analyzeTestResults(results);
+ expect(analysis.passedButEmptyRender).toBe(2);
+ expect(analysis.successRate).toBe(1.0);
+ expect(analysis.successRateWithoutEmptyRender).toBe(0.33);
+ });
+
+ it('should handle zero tests', () => {
+ const analysis = analyzeTestResults([]);
+ expect(analysis.total).toBe(0);
+ expect(analysis.successRate).toBe(0);
+ expect(analysis.successRateWithoutEmptyRender).toBe(0);
+ });
+
+ it('should handle PENDING tests by not counting them as passed', () => {
+ const results: StoryTestResult[] = [
+ { storyId: 's1', status: 'PASS' },
+ { storyId: 's2', status: 'PENDING' },
+ ];
+ const analysis = analyzeTestResults(results);
+ expect(analysis.total).toBe(2);
+ expect(analysis.passed).toBe(1);
+ expect(analysis.successRate).toBe(0.5);
+ });
+
+ describe('cssCheck', () => {
+ it("is 'pass' when a --css-check story passed", () => {
+ const results: StoryTestResult[] = [
+ { storyId: 'components-button--primary', status: 'PASS' },
+ { storyId: 'components-button--css-check', status: 'PASS' },
+ ];
+ expect(analyzeTestResults(results).cssCheck).toBe('pass');
+ });
+
+ it("is 'fail' when a --css-check story failed", () => {
+ const results: StoryTestResult[] = [
+ {
+ storyId: 'components-button--css-check',
+ status: 'FAIL',
+ error: 'expected rgb(37, 99, 235) but got rgba(0, 0, 0, 0)',
+ },
+ ];
+ expect(analyzeTestResults(results).cssCheck).toBe('fail');
+ });
+
+ it("is 'not-run' when no --css-check story is present", () => {
+ const results: StoryTestResult[] = [
+ { storyId: 'components-button--primary', status: 'PASS' },
+ ];
+ expect(analyzeTestResults(results).cssCheck).toBe('not-run');
+ });
+
+ it("is 'not-run' when the --css-check story was skipped / pending / todo", () => {
+ // PENDING covers any non-pass / non-fail Vitest status (skipped,
+ // pending, todo, filtered out). No pass/fail signal available →
+ // 'not-run', same bucket as "story wasn't authored at all".
+ const results: StoryTestResult[] = [
+ { storyId: 'components-button--css-check', status: 'PENDING' },
+ ];
+ expect(analyzeTestResults(results).cssCheck).toBe('not-run');
+ });
+
+ it("is 'not-run' for an empty result list", () => {
+ expect(analyzeTestResults([]).cssCheck).toBe('not-run');
+ });
+
+ it('uses the first match when multiple --css-check stories exist', () => {
+ // Prompt violation: the AI setup prompt asks for exactly one.
+ // First match wins; downstream aggregates still reflect all of them.
+ const results: StoryTestResult[] = [
+ { storyId: 'components-button--css-check', status: 'PASS' },
+ { storyId: 'components-card--css-check', status: 'FAIL', error: 'style mismatch' },
+ ];
+ expect(analyzeTestResults(results).cssCheck).toBe('pass');
+ });
+
+ it('is case-insensitive on the suffix (defensive)', () => {
+ // CSF already lowercases storyIds. This keeps the check resilient
+ // to a future upstream change in sanitization.
+ const results: StoryTestResult[] = [
+ { storyId: 'components-button--CSS-CHECK', status: 'PASS' },
+ ];
+ expect(analyzeTestResults(results).cssCheck).toBe('pass');
+ });
+ });
+ });
+});
diff --git a/code/core/src/shared/utils/analyze-test-results.ts b/code/core/src/shared/utils/analyze-test-results.ts
new file mode 100644
index 000000000000..1be681c39af3
--- /dev/null
+++ b/code/core/src/shared/utils/analyze-test-results.ts
@@ -0,0 +1,104 @@
+import type { ErrorCategory } from './categorize-render-errors.ts';
+import { categorizeError } from './categorize-render-errors.ts';
+import type {
+ ErrorCategorizationResult,
+ StoryTestResult,
+ TestRunAnalysis,
+} from './test-result-types.ts';
+
+/**
+ * Groups the errors of failed test results by category (via `categorizeError`) and returns, per category, the failure count, the number of distinct error messages and the sorted matched dependencies.
+ * Only results with status 'FAIL' and a truthy `error` are categorized; `totalErrors` counts exactly those results and `uniqueErrorCount` the distinct messages among them.
+ */
+export function extractCategorizedErrors(
+ testResults: StoryTestResult[]
+): ErrorCategorizationResult {
+ const failed = testResults.filter((r) => r.status === 'FAIL' && r.error); // failures without a message are dropped here
+
+ const map = new Map<
+ ErrorCategory,
+ { count: number; uniqueErrors: Set; matchedDependencies: Set }
+ >();
+
+ const uniqueErrorMessages = new Set(); // distinct error messages across all categories
+
+ for (const r of failed) {
+ const { category, matchedDependencies } = categorizeError(r.error!, r.stack); // `error` is truthy here thanks to the filter above
+
+ if (!map.has(category)) { // first failure seen for this category: start an empty accumulator
+ map.set(category, { count: 0, uniqueErrors: new Set(), matchedDependencies: new Set() });
+ }
+
+ const data = map.get(category)!;
+ data.count++; // every failed result counts, repeated messages included
+ matchedDependencies.forEach((dep) => data.matchedDependencies.add(dep));
+
+ uniqueErrorMessages.add(r.error!);
+ data.uniqueErrors.add(r.error!);
+ }
+
+ const categorizedErrors = Array.from(map.entries()).reduce>( // turn the accumulator map into a plain object keyed by category
+ (acc, [category, data]) => {
+ acc[category] = {
+ uniqueCount: data.uniqueErrors.size,
+ count: data.count,
+ matchedDependencies: Array.from(data.matchedDependencies).sort(), // de-duplicated by the Set, then sorted
+ };
+ return acc;
+ },
+ {}
+ );
+
+ return {
+ totalErrors: failed.length,
+ uniqueErrorCount: uniqueErrorMessages.size,
+ categorizedErrors,
+ };
+}
+
+/**
+ * StoryId suffix for a story named `CssCheck` (after Storybook's CSF
+ * `toStartCaseStr` + `sanitize`: `CssCheck` → `Css Check` → `css-check`).
+ */
+const CSS_CHECK_STORY_ID_SUFFIX = '--css-check';
+
+/**
+ * Analyze a list of story test results and produce a TestRunAnalysis with pass/fail counts, success rates, empty render detection, categorized errors and the `cssCheck` outcome.
+ * Only status 'PASS' counts as passed; both rates are rounded to two decimals and are 0 for an empty list.
+ */
+export function analyzeTestResults(results: StoryTestResult[]): TestRunAnalysis {
+ const total = results.length;
+ const passed = results.filter((r) => r.status === 'PASS').length;
+ const passedButEmptyRender = results.filter((r) => r.status === 'PASS' && r.emptyRender).length;
+
+ const successRate = total > 0 ? parseFloat((passed / total).toFixed(2)) : 0; // guarded against division by zero
+ const successRateWithoutEmptyRender =
+ total > 0 ? parseFloat(((passed - passedButEmptyRender) / total).toFixed(2)) : 0; // passes with an empty render are excluded from the numerator only
+
+ const errorClassification = extractCategorizedErrors(results);
+
+ // `'not-run'` covers both "no CssCheck story in the suite" and "story
+ // existed but wasn't executed" — they're the same signal for consumers
+ // (no pass/fail outcome available). Collapsing them avoids a fourth
+ // state and keeps dashboards from interpreting an absent field.
+ const cssCheckMatch = results.find((r) => // first result whose storyId ends with the suffix, compared case-insensitively
+ r.storyId.toLowerCase().endsWith(CSS_CHECK_STORY_ID_SUFFIX)
+ );
+ const cssCheck: TestRunAnalysis['cssCheck'] =
+ cssCheckMatch?.status === 'PASS'
+ ? 'pass'
+ : cssCheckMatch?.status === 'FAIL'
+ ? 'fail'
+ : 'not-run'; // no match, or a match with any other status (e.g. 'PENDING')
+
+ return {
+ total,
+ passed,
+ passedButEmptyRender,
+ successRate,
+ successRateWithoutEmptyRender,
+ uniqueErrorCount: errorClassification.uniqueErrorCount,
+ categorizedErrors: errorClassification.categorizedErrors,
+ cssCheck,
+ };
+}
diff --git a/code/core/src/shared/utils/categorize-render-errors.test.ts b/code/core/src/shared/utils/categorize-render-errors.test.ts
index 8376317149e9..834267e0c811 100644
--- a/code/core/src/shared/utils/categorize-render-errors.test.ts
+++ b/code/core/src/shared/utils/categorize-render-errors.test.ts
@@ -68,6 +68,23 @@ describe('categorize-render-errors', () => {
expect(
categorizeError('Hooks can only be called inside React function components.').category
).toBe(ERROR_CATEGORIES.HOOK_USAGE_ERROR);
+
+ expect(
+ categorizeError(
+ 'Too many re-renders. React limits the number of renders to prevent an infinite loop.'
+ ).category
+ ).toBe(ERROR_CATEGORIES.HOOK_USAGE_ERROR);
+
+ expect(
+ categorizeError(
+ 'Maximum update depth exceeded. This can happen when a component calls setState inside useEffect.'
+ ).category
+ ).toBe(ERROR_CATEGORIES.HOOK_USAGE_ERROR);
+
+ expect(
+ categorizeError('useMyHook is a hook and must be called inside a function component.')
+ .category
+ ).toBe(ERROR_CATEGORIES.HOOK_USAGE_ERROR);
});
});
@@ -183,6 +200,10 @@ describe('categorize-render-errors', () => {
expect(categorizeError('Portal root not found').category).toBe(
ERROR_CATEGORIES.MISSING_PORTAL_ROOT
);
+
+ expect(categorizeError('Target container is not a DOM element.').category).toBe(
+ ERROR_CATEGORIES.MISSING_PORTAL_ROOT
+ );
});
});
@@ -192,6 +213,24 @@ describe('categorize-render-errors', () => {
ERROR_CATEGORIES.MISSING_PROVIDER
);
});
+
+ it('should categorize context not found errors', () => {
+ expect(categorizeError('context not found').category).toBe(
+ ERROR_CATEGORIES.MISSING_PROVIDER
+ );
+
+ expect(categorizeError('No provider found for context').category).toBe(
+ ERROR_CATEGORIES.MISSING_PROVIDER
+ );
+
+ expect(categorizeError('Component cannot be rendered without a provider').category).toBe(
+ ERROR_CATEGORIES.MISSING_PROVIDER
+ );
+
+ expect(categorizeError('context is null').category).toBe(
+ ERROR_CATEGORIES.MISSING_PROVIDER
+ );
+ });
});
describe('SERVER_COMPONENTS_ERROR', () => {
@@ -223,6 +262,32 @@ describe('categorize-render-errors', () => {
expect(categorizeError('Failed to render component').category).toBe(
ERROR_CATEGORIES.COMPONENT_RENDER_ERROR
);
+
+ expect(categorizeError('MyComponent is not a function').category).toBe(
+ ERROR_CATEGORIES.COMPONENT_RENDER_ERROR
+ );
+
+ expect(categorizeError('null is not an object (evaluating foo.bar)').category).toBe(
+ ERROR_CATEGORIES.COMPONENT_RENDER_ERROR
+ );
+
+ expect(categorizeError('ReferenceError: MyVar is not defined').category).toBe(
+ ERROR_CATEGORIES.COMPONENT_RENDER_ERROR
+ );
+
+ expect(
+ categorizeError('Element type is invalid: expected a string but got: undefined.')
+ .category
+ ).toBe(ERROR_CATEGORIES.COMPONENT_RENDER_ERROR);
+
+ expect(
+ categorizeError('Objects are not valid as a React child (found: object with keys {}).')
+ .category
+ ).toBe(ERROR_CATEGORIES.COMPONENT_RENDER_ERROR);
+
+ expect(categorizeError('Maximum call stack size exceeded').category).toBe(
+ ERROR_CATEGORIES.COMPONENT_RENDER_ERROR
+ );
});
});
diff --git a/code/core/src/shared/utils/categorize-render-errors.ts b/code/core/src/shared/utils/categorize-render-errors.ts
index 2bf36b1086a3..08c6984b666d 100644
--- a/code/core/src/shared/utils/categorize-render-errors.ts
+++ b/code/core/src/shared/utils/categorize-render-errors.ts
@@ -109,8 +109,12 @@ const CATEGORIZATION_RULES: CategorizationRule[] = [
priority: 90,
match: (ctx) =>
ctx.normalizedMessage.includes('invalid hook call') ||
- ctx.normalizedMessage.includes('rendered more hooks than') ||
- ctx.normalizedMessage.includes('hooks can only be called'),
+ ctx.normalizedMessage.includes('rendered more hooks') ||
+ ctx.normalizedMessage.includes('hooks can only be called') ||
+ ctx.normalizedMessage.includes('too many re-renders') ||
+ ctx.normalizedMessage.includes('maximum update depth exceeded') ||
+ (ctx.normalizedMessage.includes('hook') &&
+ ctx.normalizedMessage.includes('function component')),
},
{
@@ -166,9 +170,9 @@ const CATEGORIZATION_RULES: CategorizationRule[] = [
category: ERROR_CATEGORIES.MISSING_PORTAL_ROOT,
priority: 70,
match: (ctx) =>
- ctx.normalizedMessage.includes('portal') &&
- (ctx.normalizedMessage.includes('container') || ctx.normalizedMessage.includes('root')) &&
- (ctx.normalizedMessage.includes('null') || ctx.normalizedMessage.includes('not found')),
+ ctx.normalizedMessage.includes('target container is not a dom element') ||
+ (ctx.normalizedMessage.includes('portal') &&
+ (ctx.normalizedMessage.includes('container') || ctx.normalizedMessage.includes('root'))),
},
{
@@ -177,10 +181,13 @@ const CATEGORIZATION_RULES: CategorizationRule[] = [
match: (ctx) =>
(ctx.normalizedMessage.includes('use') && ctx.normalizedMessage.includes('provider')) ||
ctx.normalizedMessage.includes('') ||
+ ctx.normalizedMessage.includes('no provider') ||
+ ctx.normalizedMessage.includes('without a provider') ||
((ctx.normalizedMessage.includes('could not find') ||
- ctx.normalizedMessage.includes('missing')) &&
+ ctx.normalizedMessage.includes('missing') ||
+ ctx.normalizedMessage.includes('not found')) &&
ctx.normalizedMessage.includes('context')) ||
- (ctx.normalizedMessage.includes('usecontext') &&
+ (ctx.normalizedMessage.includes('context') &&
(ctx.normalizedMessage.includes('null') || ctx.normalizedMessage.includes('undefined'))),
},
@@ -189,7 +196,12 @@ const CATEGORIZATION_RULES: CategorizationRule[] = [
priority: 10,
match: (ctx) =>
ctx.normalizedMessage.includes('cannot read') ||
- ctx.normalizedMessage.includes('undefined is not a function') ||
+ ctx.normalizedMessage.includes('is not a function') ||
+ ctx.normalizedMessage.includes('is not an object') ||
+ ctx.normalizedMessage.includes('is not defined') ||
+ ctx.normalizedMessage.includes('element type is invalid') ||
+ ctx.normalizedMessage.includes('objects are not valid as a react child') ||
+ ctx.normalizedMessage.includes('maximum call stack') ||
ctx.normalizedMessage.includes('render'),
},
];
diff --git a/code/core/src/shared/utils/test-result-types.ts b/code/core/src/shared/utils/test-result-types.ts
new file mode 100644
index 000000000000..097279ac3493
--- /dev/null
+++ b/code/core/src/shared/utils/test-result-types.ts
@@ -0,0 +1,47 @@
+export interface StoryTestResult { // outcome of a single story test, as consumed by analyzeTestResults
+ storyId: string;
+ status: 'PASS' | 'FAIL' | 'PENDING'; // only 'PASS' counts as passed and only 'FAIL' is categorized as an error
+ error?: string; // error message; failed results without one are not categorized
+ stack?: string; // passed to categorizeError together with the message
+ /** Whether the story rendered to an empty/invisible DOM element */
+ emptyRender?: boolean;
+}
+
+export interface CategorizedError { // aggregated figures for one error category
+ category: string;
+ count: number; // number of failed results in the category, repeated messages included
+ uniqueCount: number; // number of distinct error messages in the category
+ matchedDependencies: string[]; // de-duplicated, sorted dependency names returned by categorizeError
+}
+
+export interface ErrorCategorizationResult { // return value of extractCategorizedErrors
+ totalErrors: number; // failed results that carry an error message
+ categorizedErrors: Record; // per-category figures, keyed by error category
+ uniqueErrorCount: number; // distinct error messages across all categories
+}
+
+export interface TestRunAnalysis { // summary produced by analyzeTestResults
+ total: number; // number of results analysed
+ passed: number; // results with status 'PASS'
+ passedButEmptyRender: number; // passed results flagged `emptyRender`
+ successRate: number; // passed / total, rounded to two decimals; 0 for an empty run
+ successRateWithoutEmptyRender: number; // (passed - passedButEmptyRender) / total, same rounding and empty-run value
+ uniqueErrorCount: number; // distinct error messages among categorized failures
+ categorizedErrors: Record; // per-category figures, keyed by error category
+ /**
+ * Outcome of the `CssCheck` story — a story (id suffix `--css-check`)
+ * whose `play` asserts a component-specific computed style via
+ * `getComputedStyle`. Distinguishes "component mounted" from "the
+ * user's CSS actually loaded".
+ *
+ * - `'pass'` — a `CssCheck` story ran and passed.
+ * - `'fail'` — a `CssCheck` story ran and failed.
+ * - `'not-run'` — no pass/fail signal available: either no `CssCheck`
+ * story is in the suite, or the story existed but was
+ * not executed (skipped, pending, todo, filtered out).
+ *
+ * Only the three-valued enum is emitted — no storyId or component
+ * name — so no user-authored data enters telemetry.
+ */
+ cssCheck: 'pass' | 'fail' | 'not-run';
+}
diff --git a/code/core/src/shared/utils/to-story-test-result.test.ts b/code/core/src/shared/utils/to-story-test-result.test.ts
new file mode 100644
index 000000000000..c66d555ec62c
--- /dev/null
+++ b/code/core/src/shared/utils/to-story-test-result.test.ts
@@ -0,0 +1,132 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+ detectEmptyRender,
+ extractErrorMessage,
+ toStoryTestResult,
+} from './to-story-test-result.ts';
+
+describe('extractErrorMessage', () => {
+ it('returns the first line of a plain message', () => {
+ expect(extractErrorMessage('TypeError: foo is not a function\n at bar', undefined)).toBe(
+ 'TypeError: foo is not a function'
+ );
+ });
+
+ it('strips the Storybook debug banner and returns the actual message', () => {
+ const message =
+ '\n\x1B[34mClick to debug the error directly in Storybook: http://localhost:6006/?path=/story/button--primary\x1B[39m\n\nmissing theme context provider';
+ expect(extractErrorMessage(message, undefined)).toBe('missing theme context provider');
+ });
+
+ it('strips the debug banner even when ANSI codes have been removed', () => {
+ const message =
+ '\nClick to debug the error directly in Storybook: http://localhost:6006/?path=/story/button--primary\n\nmissing theme context provider';
+ expect(extractErrorMessage(message, undefined)).toBe('missing theme context provider');
+ });
+
+ it('falls back to the first line of the stack when message is empty', () => {
+ expect(extractErrorMessage('', 'Error: something broke\n at foo')).toBe(
+ 'Error: something broke'
+ );
+ });
+
+ it('falls back to the first line of the stack when message is undefined', () => {
+ expect(extractErrorMessage(undefined, 'Error: something broke\n at foo')).toBe(
+ 'Error: something broke'
+ );
+ });
+
+ it('returns "unknown error" when both message and stack are empty', () => {
+ expect(extractErrorMessage('', undefined)).toBe('unknown error');
+ });
+
+ it('returns "unknown error" when both message and stack are undefined', () => {
+ expect(extractErrorMessage(undefined, undefined)).toBe('unknown error');
+ });
+
+ it('handles a banner where the actual message itself is multi-line', () => {
+ const message =
+ '\n\x1B[34mClick to debug the error directly in Storybook: http://localhost:6006/?path=/story/button--primary\x1B[39m\n\nfirst error line\nsecond line';
+ expect(extractErrorMessage(message, undefined)).toBe('first error line');
+ });
+
+ it('falls back to stack when message starts with a newline but has no banner', () => {
+ expect(extractErrorMessage('\nsome error', 'Error: fallback\n at foo')).toBe(
+ 'Error: fallback'
+ );
+ });
+});
+
+describe('detectEmptyRender', () => {
+ it('returns false for undefined reports', () => {
+ expect(detectEmptyRender(undefined)).toBe(false);
+ });
+
+ it('returns false when no render-analysis report flags emptyRender', () => {
+ expect(detectEmptyRender([{ type: 'render-analysis', result: { emptyRender: false } }])).toBe(
+ false
+ );
+ });
+
+ it('returns true when a render-analysis report flags emptyRender', () => {
+ expect(detectEmptyRender([{ type: 'render-analysis', result: { emptyRender: true } }])).toBe(
+ true
+ );
+ });
+
+ it('ignores non-render-analysis reports', () => {
+ expect(detectEmptyRender([{ type: 'other', result: { emptyRender: true } as any }])).toBe(
+ false
+ );
+ });
+});
+
+describe('toStoryTestResult', () => {
+ it('returns null when storyId is missing', () => {
+ expect(toStoryTestResult({ storyId: undefined, statusRaw: 'passed' })).toBeNull();
+ });
+
+ it('normalizes passed/failed/other statuses', () => {
+ expect(toStoryTestResult({ storyId: 's', statusRaw: 'passed' })?.status).toBe('PASS');
+ expect(toStoryTestResult({ storyId: 's', statusRaw: 'failed' })?.status).toBe('FAIL');
+ expect(toStoryTestResult({ storyId: 's', statusRaw: 'skipped' })?.status).toBe('PENDING');
+ expect(toStoryTestResult({ storyId: 's', statusRaw: undefined })?.status).toBe('PENDING');
+ });
+
+ it('flags emptyRender only when status is PASS', () => {
+ const reports = [{ type: 'render-analysis', result: { emptyRender: true } }];
+ expect(toStoryTestResult({ storyId: 's', statusRaw: 'passed', reports })?.emptyRender).toBe(
+ true
+ );
+ expect(
+ toStoryTestResult({ storyId: 's', statusRaw: 'failed', reports })?.emptyRender
+ ).toBeUndefined();
+ });
+
+ it('extracts error message and stack from runtime-style error objects', () => {
+ const result = toStoryTestResult({
+ storyId: 's',
+ statusRaw: 'failed',
+ errors: [{ message: 'TypeError: boom\n at x', stack: 'at x' }],
+ });
+ expect(result?.error).toBe('TypeError: boom');
+ expect(result?.stack).toBe('at x');
+ });
+
+ it('extracts error message from json-style (stack-only) failure messages', () => {
+ const result = toStoryTestResult({
+ storyId: 's',
+ statusRaw: 'failed',
+ errors: [{ stack: 'Error: something broke\n at foo' }],
+ });
+ expect(result?.error).toBe('Error: something broke');
+ expect(result?.stack).toBe('Error: something broke\n at foo');
+ });
+
+ it('leaves error/stack undefined when there are no errors', () => {
+ const result = toStoryTestResult({ storyId: 's', statusRaw: 'failed' });
+ expect(result?.error).toBeUndefined();
+ expect(result?.stack).toBeUndefined();
+ });
+});
diff --git a/code/core/src/shared/utils/to-story-test-result.ts b/code/core/src/shared/utils/to-story-test-result.ts
new file mode 100644
index 000000000000..b997779a7644
--- /dev/null
+++ b/code/core/src/shared/utils/to-story-test-result.ts
@@ -0,0 +1,84 @@
+import type { StoryTestResult } from './test-result-types.ts';
+
+/** Minimal shape of a report attached to a test result. */
+export interface VitestLikeReport {
+ /** Report discriminator, e.g. `'render-analysis'`. */
+ type: string;
+ // The union with `unknown` collapses to `unknown`, so readers must cast or narrow `result`.
+ // The object member only documents the shape expected for `render-analysis` reports.
+ result?: { emptyRender?: boolean } | unknown;
+}
+
+/** Minimal shape of an error attached to a test result; either field may be missing. */
+export interface VitestLikeError {
+ /** Error message; may start with the Storybook "Click to debug" banner. */
+ message?: string;
+ /** Stack trace; its first line is used when `message` yields nothing. */
+ stack?: string;
+}
+
+/** Source-agnostic input accepted by `toStoryTestResult`. */
+export interface VitestLikeInput {
+ /** Story id of the test; inputs without one are not converted. */
+ storyId: string | undefined;
+ /** Raw vitest status, e.g. 'passed' | 'failed' | 'skipped' | 'pending' | 'running' | ... */
+ statusRaw: string | undefined;
+ /** Errors reported for the test; only the first one is used. */
+ errors?: readonly VitestLikeError[];
+ /** Reports attached to the test; scanned for an empty-render flag. */
+ reports?: readonly VitestLikeReport[];
+}
+
+// Matches the "Click to debug" banner prepended by addons/vitest/src/vitest-plugin/setup-file.ts,
+// with or without the surrounding ANSI color codes — environments that strip ANSI (CI wrappers,
+// NO_COLOR) shouldn't leave the banner as the reported error.
+// Anchored at the start of the message: a leading newline, an optional ANSI color sequence,
+// the rest of the banner line, then the blank line that separates it from the real message.
+const DEBUG_BANNER_RE = /^\n(?:\x1B\[\d+m)?Click to debug\b[^\n]*\n\n/;
+
+/**
+ * Reduces a Vitest error to a single-line message.
+ *
+ * The Storybook "Click to debug" banner is removed from the message first. The result is the
+ * first line of what remains, else the first line of the stack, else `'unknown error'`.
+ *
+ * @param message Raw error message, possibly prefixed with the debug banner.
+ * @param stack Raw stack trace, used only when the message yields an empty first line.
+ * @returns A non-empty single-line description of the error.
+ */
+export function extractErrorMessage(
+  message: string | undefined,
+  stack: string | undefined
+): string {
+  const withoutBanner = (message ?? '').replace(DEBUG_BANNER_RE, '');
+  const messageHead = withoutBanner.split('\n')[0];
+  if (messageHead) {
+    return messageHead;
+  }
+
+  // An empty first line (including a message that merely starts with a newline) defers to the stack.
+  const stackHead = stack?.split('\n')[0];
+  return stackHead ? stackHead : 'unknown error';
+}
+
+/**
+ * Tells whether any `render-analysis` report flags the story as an empty render.
+ *
+ * @param reports Reports attached to the test, if any.
+ * @returns `true` only when a `render-analysis` report has `emptyRender === true`.
+ */
+export function detectEmptyRender(reports: readonly VitestLikeReport[] | undefined): boolean {
+  if (reports == null) {
+    return false;
+  }
+
+  return reports.some((report) => {
+    if (report.type !== 'render-analysis') {
+      return false;
+    }
+    // `result` is effectively `unknown`, so view it through the expected shape.
+    const analysis = report.result as { emptyRender?: boolean } | undefined;
+    return analysis?.emptyRender === true;
+  });
+}
+
+/** Maps a raw vitest status to the normalized status; anything but 'passed'/'failed' is PENDING. */
+function normalizeStatus(statusRaw: string | undefined): StoryTestResult['status'] {
+  switch (statusRaw) {
+    case 'passed':
+      return 'PASS';
+    case 'failed':
+      return 'FAIL';
+    default:
+      return 'PENDING';
+  }
+}
+
+/**
+ * Builds a StoryTestResult from a Vitest-like input (a JSON reporter assertion or a runtime
+ * TestCase).
+ *
+ * @param input Story id, raw status, and optional errors/reports of one test.
+ * @returns The normalized result, or `null` when the input has no storyId so callers can skip
+ *   non-story tests.
+ */
+export function toStoryTestResult(input: VitestLikeInput): StoryTestResult | null {
+  const storyId = input.storyId;
+  if (!storyId) {
+    return null;
+  }
+
+  const status = normalizeStatus(input.statusRaw);
+  // An empty render is only reported for passing stories; otherwise the field stays undefined.
+  const emptyRender = status === 'PASS' && detectEmptyRender(input.reports) ? true : undefined;
+
+  const result: StoryTestResult = {
+    storyId,
+    status,
+    error: undefined,
+    stack: undefined,
+    emptyRender,
+  };
+
+  const errors = input.errors;
+  if (errors && errors.length > 0) {
+    // Only the first error is surfaced.
+    const { message, stack } = errors[0];
+    result.error = extractErrorMessage(message, stack);
+    result.stack = stack ?? message;
+  }
+
+  return result;
+}
diff --git a/code/core/src/telemetry/ai-setup-utils.test.ts b/code/core/src/telemetry/ai-setup-utils.test.ts
new file mode 100644
index 000000000000..5828f805b43d
--- /dev/null
+++ b/code/core/src/telemetry/ai-setup-utils.test.ts
@@ -0,0 +1,323 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+import type { IndexEntry, StoryIndex } from 'storybook/internal/types';
+
+import {
+ checkPreviewChanged,
+ collectAiSetupEvidence,
+ countAiAuthoredStories,
+ isStoryCreatedByAISetup,
+} from './ai-setup-utils.ts';
+
+// Mock modules with spy pattern
+vi.mock('storybook/internal/common', async (importOriginal) => {
+  // Keep the real module and replace only `findConfigFile` with a spy. The type argument is
+  // needed because an untyped `importOriginal()` resolves to `unknown`, which cannot be spread.
+  const actual = await importOriginal<typeof import('storybook/internal/common')>();
+  return {
+    ...actual,
+    findConfigFile: vi.fn(),
+  };
+});
+
+vi.mock('./detect-agent.ts', () => ({
+ detectAgent: vi.fn(() => undefined),
+}));
+
+vi.mock('./event-cache.ts', async (importOriginal) => {
+  // Keep the real module and stub only `getAiSetupPending`. The type argument is needed
+  // because an untyped `importOriginal()` resolves to `unknown`, which cannot be spread.
+  const actual = await importOriginal<typeof import('./event-cache.ts')>();
+  return {
+    ...actual,
+    getAiSetupPending: vi.fn(() => undefined),
+  };
+});
+
+vi.mock('./index.ts', async (importOriginal) => {
+  // Keep the real module and replace only `telemetry` with a spy. The type argument is needed
+  // because an untyped `importOriginal()` resolves to `unknown`, which cannot be spread.
+  const actual = await importOriginal<typeof import('./index.ts')>();
+  return {
+    ...actual,
+    telemetry: vi.fn(),
+  };
+});
+
+// Import mocked modules for spy access
+import { findConfigFile } from 'storybook/internal/common';
+import { readFile } from 'node:fs/promises';
+import { detectAgent } from './detect-agent.ts';
+import { getAiSetupPending } from './event-cache.ts';
+import { SESSION_TIMEOUT } from './session-id.ts';
+import { telemetry } from './index.ts';
+
+vi.mock('node:fs/promises', async (importOriginal) => {
+  // Keep the real module and replace only `readFile` with a spy. The type argument is needed
+  // because an untyped `importOriginal()` resolves to `unknown`, which cannot be spread.
+  const actual = await importOriginal<typeof import('node:fs/promises')>();
+  return {
+    ...actual,
+    readFile: vi.fn(),
+  };
+});
+
+const makePendingRecord = (overrides = {}) => ({
+ timestamp: Date.now() - 60_000, // 1 minute ago
+ sessionId: 'test-session-id',
+ configDir: '/test/config',
+ previewPath: '/test/config/preview.ts',
+ previewHash: 'abc123',
+ ...overrides,
+});
+
+/** Builds a v5 story index fixture from the given entries (empty by default). */
+const makeStoryIndex = (entries: Record<string, IndexEntry> = {}): StoryIndex => ({
+  v: 5,
+  entries,
+});
+
+beforeEach(() => {
+ vi.resetAllMocks();
+ vi.mocked(telemetry).mockImplementation(async (_eventType, payloadOrFactory) => {
+ if (typeof payloadOrFactory === 'function') {
+ return payloadOrFactory();
+ }
+ return payloadOrFactory;
+ });
+});
+
+describe('isStoryCreatedByAISetup', () => {
+ it('returns true for stories with the ai-generated tag', () => {
+ expect(
+ isStoryCreatedByAISetup({
+ type: 'story',
+ title: 'Foo',
+ tags: ['ai-generated', 'dev', 'play-fn'],
+ } as IndexEntry)
+ ).toBe(true);
+ });
+
+ it('returns false for regular stories', () => {
+ expect(isStoryCreatedByAISetup({ type: 'story', title: 'Foo' } as IndexEntry)).toBe(false);
+ });
+});
+
+describe('countAiAuthoredStories', () => {
+ it('counts correctly with mixed entries', () => {
+ const index = makeStoryIndex({
+ 'ai-1': {
+ type: 'story',
+ title: 'AI Generated/Button',
+ tags: ['ai-generated'],
+ id: 'ai-1',
+ name: 'Default',
+ importPath: './ai.stories.ts',
+ },
+ 'ai-2': {
+ type: 'story',
+ title: 'AI Generated/Card',
+ tags: ['ai-generated'],
+ id: 'ai-2',
+ name: 'Default',
+ importPath: './ai2.stories.ts',
+ },
+ regular: {
+ type: 'story',
+ title: 'Components/Input',
+ id: 'regular',
+ name: 'Default',
+ importPath: './input.stories.ts',
+ },
+ docs: {
+ type: 'docs',
+ title: 'AI Generated/Docs',
+ tags: ['ai-generated'],
+ id: 'docs',
+ name: 'Docs',
+ importPath: './docs.mdx',
+ storiesImports: [],
+ },
+ });
+ // Only type: 'story' entries are counted, not docs
+ expect(countAiAuthoredStories(index)).toBe(2);
+ });
+
+ it('returns 0 when no AI stories exist', () => {
+ const index = makeStoryIndex({
+ regular: {
+ type: 'story',
+ title: 'Components/Button',
+ id: 'regular',
+ name: 'Default',
+ importPath: './button.stories.ts',
+ },
+ });
+ expect(countAiAuthoredStories(index)).toBe(0);
+ });
+});
+
+describe('checkPreviewChanged', () => {
+ it('returns false when hash matches snapshot', async () => {
+ vi.mocked(findConfigFile).mockReturnValue('/test/config/preview.ts');
+ vi.mocked(readFile).mockResolvedValue('file content');
+
+ // Pre-compute the expected hash
+ const { createHash } = await import('node:crypto');
+ const expectedHash = createHash('sha256').update('file content').digest('hex');
+
+ const result = await checkPreviewChanged('/test/config', {
+ previewPath: '/test/config/preview.ts',
+ previewHash: expectedHash,
+ });
+ expect(result).toBe(false);
+ });
+
+ it('returns true when hash differs from snapshot', async () => {
+ vi.mocked(findConfigFile).mockReturnValue('/test/config/preview.ts');
+ vi.mocked(readFile).mockResolvedValue('modified content');
+
+ const result = await checkPreviewChanged('/test/config', {
+ previewPath: '/test/config/preview.ts',
+ previewHash: 'some-old-hash',
+ });
+ expect(result).toBe(true);
+ });
+
+ it('returns true when preview file is missing or unreadable', async () => {
+ vi.mocked(findConfigFile).mockReturnValue('/test/config/preview.ts');
+ vi.mocked(readFile).mockRejectedValue(new Error('ENOENT'));
+
+ const result = await checkPreviewChanged('/test/config', {
+ previewPath: '/test/config/preview.ts',
+ previewHash: 'some-hash',
+ });
+ expect(result).toBe(true);
+ });
+
+ it('returns true when file path changed', async () => {
+ vi.mocked(findConfigFile).mockReturnValue('/test/config/preview.tsx');
+
+ const result = await checkPreviewChanged('/test/config', {
+ previewPath: '/test/config/preview.ts',
+ previewHash: 'hash',
+ });
+ expect(result).toBe(true);
+ });
+});
+
+describe('collectAiSetupEvidence', () => {
+ it('does not fire when no agent detected', async () => {
+ vi.mocked(detectAgent).mockReturnValue(undefined);
+
+ await collectAiSetupEvidence('dev', '/test/config');
+ expect(telemetry).not.toHaveBeenCalled();
+ });
+
+ it('does not fire when no pending record', async () => {
+ vi.mocked(detectAgent).mockReturnValue({ name: 'claude' });
+ vi.mocked(getAiSetupPending).mockResolvedValue(undefined);
+
+ await collectAiSetupEvidence('dev', '/test/config');
+ expect(telemetry).not.toHaveBeenCalled();
+ });
+
+ it('does not fire when pending record is expired', async () => {
+ vi.mocked(detectAgent).mockReturnValue({ name: 'claude' });
+ vi.mocked(getAiSetupPending).mockResolvedValue(
+ makePendingRecord({ timestamp: Date.now() - SESSION_TIMEOUT - 1000 })
+ );
+
+ await collectAiSetupEvidence('dev', '/test/config');
+ expect(telemetry).not.toHaveBeenCalled();
+ });
+
+ it('does not fire when configDir does not match', async () => {
+ vi.mocked(detectAgent).mockReturnValue({ name: 'claude' });
+ vi.mocked(getAiSetupPending).mockResolvedValue(
+ makePendingRecord({ configDir: '/other/project/.storybook' })
+ );
+
+ await collectAiSetupEvidence('dev', '/test/config');
+ expect(telemetry).not.toHaveBeenCalled();
+ });
+
+ it('fires event with correct payload when all gates pass', async () => {
+ vi.mocked(detectAgent).mockReturnValue({ name: 'claude' });
+ const pending = makePendingRecord({ configDir: '/test/config' });
+ vi.mocked(getAiSetupPending).mockResolvedValue(pending);
+ vi.mocked(findConfigFile).mockReturnValue(pending.previewPath);
+ // An unreadable preview file counts as changed.
+ vi.mocked(readFile).mockRejectedValue(new Error('ENOENT'));
+
+ await collectAiSetupEvidence('dev', '/test/config');
+
+ expect(telemetry).toHaveBeenCalledWith(
+ 'ai-setup-evidence',
+ expect.any(Function),
+ expect.objectContaining({
+ immediate: true,
+ configDir: '/test/config',
+ })
+ );
+
+ // The payload is produced lazily by the factory passed as the second argument.
+ const factory = vi.mocked(telemetry).mock.calls[0][1] as () => Promise<unknown>;
+ await expect(factory()).resolves.toMatchObject({
+ previewChanged: true,
+ aiAuthoredStories: undefined,
+ sessionId: 'test-session-id',
+ });
+ });
+
+ it('reports aiAuthoredStories as undefined when no story index provided', async () => {
+ vi.mocked(detectAgent).mockReturnValue({ name: 'claude' });
+ const pending = makePendingRecord({ configDir: '/test/config' });
+ vi.mocked(getAiSetupPending).mockResolvedValue(pending);
+ vi.mocked(findConfigFile).mockReturnValue(null);
+
+ await collectAiSetupEvidence('dev', '/test/config');
+
+ expect(telemetry).toHaveBeenCalledWith(
+ 'ai-setup-evidence',
+ expect.any(Function),
+ expect.anything()
+ );
+
+ // The payload is produced lazily by the factory passed as the second argument.
+ const factory = vi.mocked(telemetry).mock.calls[0][1] as () => Promise<unknown>;
+ await expect(factory()).resolves.toMatchObject({
+ aiAuthoredStories: undefined,
+ });
+ });
+
+ it('counts aiAuthoredStories when story index provided', async () => {
+ vi.mocked(detectAgent).mockReturnValue({ name: 'claude' });
+ // Baseline without a preview file: the record field is `previewPath`, matching the
+ // `null` returned by `findConfigFile` below.
+ const pending = makePendingRecord({
+ configDir: '/test/config',
+ previewPath: null,
+ previewHash: null,
+ });
+ vi.mocked(getAiSetupPending).mockResolvedValue(pending);
+ vi.mocked(findConfigFile).mockReturnValue(null);
+
+ const storyIndex = makeStoryIndex({
+ 'ai-1': {
+ type: 'story',
+ title: 'AI Generated/Button',
+ tags: ['ai-generated'],
+ id: 'ai-1',
+ name: 'Default',
+ importPath: './ai.stories.ts',
+ },
+ regular: {
+ type: 'story',
+ title: 'Components/Input',
+ id: 'regular',
+ name: 'Default',
+ importPath: './input.stories.ts',
+ },
+ });
+
+ await collectAiSetupEvidence('dev', '/test/config', storyIndex);
+
+ expect(telemetry).toHaveBeenCalledWith(
+ 'ai-setup-evidence',
+ expect.any(Function),
+ expect.anything()
+ );
+
+ // The payload is produced lazily by the factory passed as the second argument.
+ const factory = vi.mocked(telemetry).mock.calls[0][1] as () => Promise<unknown>;
+ await expect(factory()).resolves.toMatchObject({
+ aiAuthoredStories: 1,
+ });
+ });
+});
diff --git a/code/core/src/telemetry/ai-setup-utils.ts b/code/core/src/telemetry/ai-setup-utils.ts
new file mode 100644
index 000000000000..198a6221492e
--- /dev/null
+++ b/code/core/src/telemetry/ai-setup-utils.ts
@@ -0,0 +1,146 @@
+import { flushAiSetupPending, getAiSetupPending } from './event-cache.ts';
+import { SESSION_TIMEOUT } from './session-id.ts';
+import { createHash } from 'node:crypto';
+import { readFile } from 'node:fs/promises';
+
+import { findConfigFile } from 'storybook/internal/common';
+import { detectAgent } from './detect-agent.ts';
+import { isTelemetryModuleEnabled, telemetry } from './index.ts';
+import type { EventType } from './types.ts';
+import type { IndexEntry, StoryIndex } from 'storybook/internal/types';
+
+/**
+ * Determines whether a story index entry was authored by the `sb ai setup` flow.
+ * An entry qualifies when it is a `story` entry (docs entries never count) that carries the
+ * `ai-generated` tag. This is the single place to change if the marker ever changes.
+ *
+ * @param entry Story index entry to inspect.
+ * @returns `true` for tagged story entries; `false` otherwise, including entries without tags.
+ */
+export function isStoryCreatedByAISetup(entry: IndexEntry): boolean {
+ return entry.type === 'story' && (entry.tags?.includes('ai-generated') ?? false);
+}
+
+/**
+ * Counts the story index entries that `isStoryCreatedByAISetup` recognizes as created by
+ * `sb ai setup`.
+ *
+ * @param storyIndex Story index whose entries are inspected.
+ * @returns Number of matching entries; `0` when there are none.
+ */
+export function countAiAuthoredStories(storyIndex: StoryIndex): number {
+  let aiAuthored = 0;
+  for (const entry of Object.values(storyIndex.entries)) {
+    if (isStoryCreatedByAISetup(entry)) {
+      aiAuthored += 1;
+    }
+  }
+  return aiAuthored;
+}
+
+/**
+ * Captures the current state of the preview config file as a baseline for later comparison.
+ *
+ * @param configDir Directory searched for the `preview` config file.
+ * @returns The preview file path and the SHA-256 hex digest of its content. Both are `null`
+ *   when no preview file is found; the path is kept with a `null` hash when the file cannot
+ *   be read.
+ */
+export async function snapshotPreviewFile(
+  configDir: string
+): Promise<{ previewPath: string | null; previewHash: string | null }> {
+  const previewPath = findConfigFile('preview', configDir);
+  if (!previewPath) {
+    return { previewPath: null, previewHash: null };
+  }
+
+  let previewHash: string | null = null;
+  try {
+    const fileContent = await readFile(previewPath, 'utf-8');
+    previewHash = createHash('sha256').update(fileContent).digest('hex');
+  } catch {
+    // File found by findConfigFile but unreadable — keep the path and leave the hash as null
+  }
+  return { previewPath, previewHash };
+}
+
+/**
+ * Check whether the preview file has changed from an ai-setup baseline.
+ *
+ * @param configDir Directory searched for the current `preview` config file.
+ * @param baseline Path and SHA-256 hex digest recorded at ai-setup time.
+ * @returns `true` if the hash differs, the file appeared, disappeared, moved to a different
+ *   path, or is unreadable. `false` if it is unchanged or absent both then and now.
+ */
+export async function checkPreviewChanged(
+  configDir: string,
+  baseline: { previewPath: string | null; previewHash: string | null }
+): Promise<boolean> {
+  const currentPath = findConfigFile('preview', configDir);
+  if (currentPath !== baseline.previewPath) {
+    return true;
+  }
+  if (!currentPath) {
+    // No preview file at baseline time and none now.
+    return false;
+  }
+  try {
+    const content = await readFile(currentPath, 'utf-8');
+    const hash = createHash('sha256').update(content).digest('hex');
+    return hash !== baseline.previewHash;
+  } catch {
+    // File unreadable — treat as changed because we expected it to be readable post init
+    return true;
+  }
+}
+
+/**
+ * Check for a pending ai-setup record and fire an `ai-setup-evidence` event if one applies.
+ *
+ * Called from:
+ * - `withTelemetry` after the boot event for non-dev/build CLI commands (no story index)
+ * - `doTelemetry` for dev/build commands (story index available)
+ *
+ * Gates, in the order they are evaluated: agent detected → pending record exists →
+ * configDir matches (only checked when a configDir is passed) → within session window →
+ * triggering event is not `ai-setup` itself.
+ *
+ * @param eventType Event that triggered the check; `'ai-setup'` never produces evidence.
+ * @param configDir Config directory of the current command, if known.
+ * @param storyIndex Story index used to count AI-authored stories; omitted when unavailable.
+ * @returns Resolves once the check is done. Never rejects: every failure is swallowed.
+ */
+export async function collectAiSetupEvidence(
+  eventType: EventType,
+  configDir: string | undefined,
+  storyIndex?: StoryIndex
+): Promise<void> {
+  try {
+    // Gate 1: Is this an agent? (cheapest check)
+    const agent = detectAgent();
+    if (!agent) {
+      return;
+    }
+
+    // Gate 2: Is there a pending ai-setup record?
+    const pending = await getAiSetupPending();
+    if (!pending) {
+      return;
+    }
+
+    // Gate 3: Does the configDir match? (cross-project guard)
+    if (configDir && pending.configDir !== configDir) {
+      return;
+    }
+
+    // Gate 4: Is it within the session window?
+    const timeSinceSetup = Date.now() - pending.timestamp;
+    if (timeSinceSetup > SESSION_TIMEOUT) {
+      // Session expired, clean up pending record.
+      await flushAiSetupPending();
+      return;
+    }
+
+    // Don't fire evidence for ai-setup itself — the setup command gives the
+    // prompt to the agent and exits, so we only expect changes after the agent
+    // has started processing it.
+    if (eventType === 'ai-setup') {
+      return;
+    }
+
+    await telemetry(
+      'ai-setup-evidence',
+      async () => {
+        // Check if preview file changed from baseline
+        const previewChanged = await checkPreviewChanged(pending.configDir, pending);
+
+        // Count AI-authored stories if story index is available
+        const aiAuthoredStories = storyIndex ? countAiAuthoredStories(storyIndex) : undefined;
+
+        return {
+          previewChanged,
+          aiAuthoredStories,
+          sessionId: pending.sessionId,
+          timeSinceSetup,
+        };
+      },
+      {
+        immediate: true,
+        configDir,
+      }
+    );
+  } catch {
+    // Evidence collection is best-effort — never block the actual command
+  }
+}
diff --git a/code/core/src/telemetry/detect-agent.test.ts b/code/core/src/telemetry/detect-agent.test.ts
index df49df7e9143..ef162f2931e4 100644
--- a/code/core/src/telemetry/detect-agent.test.ts
+++ b/code/core/src/telemetry/detect-agent.test.ts
@@ -1,10 +1,39 @@
-import { afterEach, describe, expect, it, vi } from 'vitest';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { detectAgent } from './detect-agent.ts';
describe('detectAgent', () => {
+ // Save any ambient agent env vars that might be set by the host environment
+ // (e.g. OPENCODE when running inside OpenCode CLI) so we can restore them.
+ const agentEnvVars = [
+ 'OPENCODE',
+ 'CLAUDECODE',
+ 'CLAUDE_CODE',
+ 'GEMINI_CLI',
+ 'CODEX_SANDBOX',
+ 'CODEX_THREAD_ID',
+ 'CURSOR_AGENT',
+ 'AI_AGENT',
+ ];
+ // Original values of the agent env vars, keyed by name (`undefined` when a var was unset).
+ const savedEnv: Record<string, string | undefined> = {};
+
+ beforeEach(() => {
+ for (const key of agentEnvVars) {
+ savedEnv[key] = process.env[key];
+ delete process.env[key];
+ }
+ });
+
afterEach(() => {
vi.unstubAllEnvs();
+ // Restore ambient env vars
+ for (const key of agentEnvVars) {
+ if (savedEnv[key] !== undefined) {
+ process.env[key] = savedEnv[key];
+ } else {
+ delete process.env[key];
+ }
+ }
});
it('detects claude via CLAUDECODE', () => {
diff --git a/code/core/src/telemetry/event-cache.test.ts b/code/core/src/telemetry/event-cache.test.ts
index 4ab2c6a0fbc3..bd0f9494c7c9 100644
--- a/code/core/src/telemetry/event-cache.test.ts
+++ b/code/core/src/telemetry/event-cache.test.ts
@@ -4,7 +4,13 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';
import { cache } from 'storybook/internal/common';
import type { CacheEntry } from './event-cache.ts';
-import { getLastEvents, getPrecedingUpgrade, set } from './event-cache.ts';
+import {
+ flushAiSetupPending,
+ getAiSetupPending,
+ getLastEvents,
+ getPrecedingUpgrade,
+ set,
+} from './event-cache.ts';
import type { TelemetryEvent } from './types.ts';
vi.mock('storybook/internal/common', { spy: true });
@@ -347,4 +353,40 @@ describe('event-cache', () => {
expect(result).toEqual(afterDev);
});
});
+
+ describe('ai-setup pending cache', () => {
+ let cacheGetMock: MockInstance;
+ let cacheRemoveMock: MockInstance;
+
+ beforeEach(() => {
+ vi.clearAllMocks();
+ cacheGetMock = vi.mocked(cache.get);
+ cacheRemoveMock = vi.mocked(cache.remove);
+ });
+
+ it('returns cached ai-setup pending record when present', async () => {
+ const pending = {
+ timestamp: 123,
+ sessionId: 'session-1',
+ configDir: '/tmp/.storybook',
+ previewPath: '/tmp/.storybook/preview.ts',
+ previewHash: 'abc123',
+ };
+
+ cacheGetMock.mockResolvedValueOnce(pending);
+
+ await expect(getAiSetupPending()).resolves.toEqual(pending);
+ expect(cacheGetMock).toHaveBeenCalledWith('ai-setup-pending');
+ });
+
+ it('removes the cached ai-setup pending record and returns undefined', async () => {
+ cacheRemoveMock.mockResolvedValueOnce(undefined);
+ cacheGetMock.mockResolvedValueOnce(undefined);
+
+ await expect(flushAiSetupPending()).resolves.toBeUndefined();
+ expect(cacheRemoveMock).toHaveBeenCalledWith('ai-setup-pending');
+ await expect(getAiSetupPending()).resolves.toBeUndefined();
+ expect(cacheGetMock).toHaveBeenCalledWith('ai-setup-pending');
+ });
+ });
});
diff --git a/code/core/src/telemetry/event-cache.ts b/code/core/src/telemetry/event-cache.ts
index 55768ea1088f..95c5b9559aab 100644
--- a/code/core/src/telemetry/event-cache.ts
+++ b/code/core/src/telemetry/event-cache.ts
@@ -81,3 +81,60 @@ export const getPrecedingUpgrade = async (
? upgradeFields(lastUpgradeEvent)
: undefined;
};
+/**
+ * Record cached at ai-setup time.
+ * Read by subsequent CLI entry points for evidence collection.
+ * Canonical definition — declared here in event-cache.ts; other modules import it from here.
+ * NOTE(review): the previous wording named prepare-requirements.ts as a consumer — confirm.
+ */
+export interface AiSetupPendingRecord {
+ /** When the record was created, in epoch milliseconds (compared against `Date.now()`). */
+ timestamp: number;
+ /** Telemetry session id captured at ai-setup time. */
+ sessionId: string;
+ /** Config directory the setup ran against; guards against cross-project matches. */
+ configDir: string;
+ /** Path of the preview config file at setup time, or `null` if there was none. */
+ previewPath: string | null;
+ /** SHA-256 hex digest of the preview file at setup time, or `null` if none could be computed. */
+ previewHash: string | null;
+}
+
+/**
+ * Reads the cached `ai-setup-pending` record.
+ *
+ * @returns The cached record, or `undefined` when nothing is cached.
+ */
+export const getAiSetupPending = async (): Promise<AiSetupPendingRecord | undefined> => {
+  // Wait for any pending set operations to complete before reading
+  await processingPromise;
+  return (await cache.get('ai-setup-pending')) ?? undefined;
+};
+
+/**
+ * Removes the cached `ai-setup-pending` record.
+ *
+ * @returns Resolves with `undefined` once the record has been removed.
+ */
+export const flushAiSetupPending = async (): Promise<void> => {
+  // Wait for any pending set operations to complete before removing
+  await processingPromise;
+  await cache.remove('ai-setup-pending');
+  return undefined;
+};
+
+/**
+ * Returns true when the current session falls within the 2-hour window opened by the most
+ * recent occurrence of one of the given event types.
+ *
+ * Used to gate telemetry that should only be captured during a single session window of a
+ * given event (e.g. init).
+ *
+ * @param events One event type, or a list of event types, to look for among the last events.
+ * @returns `false` when no matching event is cached, when the matching event carries a
+ *   sessionId different from the current one, or when anything throws; `true` otherwise
+ *   (including when the matching event carries no sessionId).
+ */
+export async function isWithinInitialSession(events: EventType | EventType[]): Promise<boolean> {
+  try {
+    const eventTypes = Array.isArray(events) ? events : [events];
+    const lastEvents = await getLastEvents();
+
+    const lastRelevantEvent = lastEvent(lastEvents, eventTypes);
+
+    if (!lastRelevantEvent) {
+      return false;
+    }
+
+    const { getSessionId } = await import('./session-id.ts');
+    const sessionId = await getSessionId();
+
+    // If the stored event carries a sessionId that differs from the current one the 2h window
+    // has expired and a new session was started.
+    if (lastRelevantEvent.body?.sessionId && lastRelevantEvent.body.sessionId !== sessionId) {
+      return false;
+    }
+
+    return true;
+  } catch {
+    return false;
+  }
+}
diff --git a/code/core/src/telemetry/index.ts b/code/core/src/telemetry/index.ts
index bca390ba971b..29acd39a0b6d 100644
--- a/code/core/src/telemetry/index.ts
+++ b/code/core/src/telemetry/index.ts
@@ -23,12 +23,23 @@ export * from './sanitize.ts';
export * from './error-collector.ts';
-export { getPrecedingUpgrade, getLastEvents, type CacheEntry } from './event-cache.ts';
+export * from './ai-setup-utils.ts';
-export { getSessionId } from './session-id.ts';
+export {
+ getPrecedingUpgrade,
+ getLastEvents,
+ isWithinInitialSession,
+ type CacheEntry,
+ getAiSetupPending,
+ type AiSetupPendingRecord,
+} from './event-cache.ts';
+
+export { getSessionId, SESSION_TIMEOUT } from './session-id.ts';
export { addToGlobalContext } from './telemetry.ts';
+export { detectAgent, type AgentInfo } from './detect-agent.ts';
+
/** Is this story part of the CLI generated examples, including user-created stories in those files */
export const isExampleStoryId = (storyId: string) =>
storyId.startsWith('example-button--') ||
@@ -168,7 +179,6 @@ async function _processAndSend(
}
} finally {
const { error } = payload;
-
// make sure to anonymise possible paths from error messages
if (error) {
payload.error = sanitizeError(error);
diff --git a/code/core/src/telemetry/storybook-metadata.test.ts b/code/core/src/telemetry/storybook-metadata.test.ts
index 595029ebb043..8429bfbb329e 100644
--- a/code/core/src/telemetry/storybook-metadata.test.ts
+++ b/code/core/src/telemetry/storybook-metadata.test.ts
@@ -15,6 +15,7 @@ import {
import { detect } from 'package-manager-detector';
import { type Settings, globalSettings } from '../cli/globalSettings.ts';
+import { detectAgent } from './detect-agent.ts';
import { getApplicationFileCount } from '../telemetry/get-application-file-count.ts';
import { analyzeEcosystemPackages } from '../telemetry/get-known-packages.ts';
import { getMonorepoType } from '../telemetry/get-monorepo-type.ts';
@@ -32,6 +33,9 @@ import {
} from './storybook-metadata.ts';
vi.mock(import('../cli/globalSettings.ts'), { spy: true });
+vi.mock('./detect-agent.ts', () => ({
+ detectAgent: vi.fn().mockReturnValue(undefined),
+}));
vi.mock(import('./package-json.ts'), { spy: true });
vi.mock(import('./get-monorepo-type.ts'), { spy: true });
vi.mock(import('./get-framework-info.ts'), { spy: true });
@@ -562,7 +566,7 @@ describe('storybook-metadata', () => {
expect(res.userSince).toEqual(1717334400000);
});
- it('should not detect userSince info in CI', async () => {
+ it('should not detect userSince info in CI when agent is not detected', async () => {
vi.mocked(isCI).mockImplementation(() => true);
vi.mocked(globalSettings).mockResolvedValue({} as Settings);
@@ -577,6 +581,26 @@ describe('storybook-metadata', () => {
expect(res.userSince).not.toBeDefined();
});
+ it('should detect userSince info in CI when agent is detected', async () => {
+ vi.mocked(isCI).mockImplementation(() => true);
+ vi.mocked(detectAgent).mockReturnValue({ name: 'claude' });
+ vi.mocked(globalSettings).mockResolvedValue({
+ value: {
+ userSince: 1717334400000,
+ },
+ } as Settings);
+
+ const res = await computeStorybookMetadata({
+ configDir: '.storybook',
+ packageJson: packageJsonMock,
+ packageJsonPath,
+ mainConfig: mainJsMock,
+ });
+
+ expect(globalSettings).toHaveBeenCalled();
+ expect(res.userSince).toEqual(1717334400000);
+ });
+
it('should include knownPackages in metadata', async () => {
const res = await computeStorybookMetadata({
configDir: '.storybook',
diff --git a/code/core/src/telemetry/storybook-metadata.ts b/code/core/src/telemetry/storybook-metadata.ts
index 9f58597f3ce3..3cbe341bb270 100644
--- a/code/core/src/telemetry/storybook-metadata.ts
+++ b/code/core/src/telemetry/storybook-metadata.ts
@@ -18,6 +18,7 @@ import * as pkg from 'empathic/package';
import { version } from '../../package.json';
import { globalSettings } from '../cli/globalSettings.ts';
+import { detectAgent } from './detect-agent.ts';
import { getApplicationFileCount } from './get-application-file-count.ts';
import { getChromaticVersionSpecifier } from './get-chromatic-version.ts';
import { getFrameworkInfo } from './get-framework-info.ts';
@@ -113,7 +114,7 @@ export const computeStorybookMetadata = async ({
mainConfig?: StorybookConfig & Record;
configDir: string;
}): Promise => {
- const settings = isCI() ? undefined : await globalSettings();
+ const settings = isCI() && !detectAgent() ? undefined : await globalSettings();
const metadata: Partial = {
generatedAt: new Date().getTime(),
userSince: settings?.value.userSince,
diff --git a/code/core/src/telemetry/types.ts b/code/core/src/telemetry/types.ts
index 90d82f18d388..37b0d372a7de 100644
--- a/code/core/src/telemetry/types.ts
+++ b/code/core/src/telemetry/types.ts
@@ -44,8 +44,14 @@ export type EventType =
| 'preview-first-load'
| 'doctor'
| 'share'
+ | 'ghost-stories'
| 'sidebar-filter'
- | 'ghost-stories';
+ | 'ai-setup'
+ | 'ai-setup-evidence'
+ | 'ai-setup-final-scoring'
+ | 'ai-prompt-nudge'
+ | 'ai-init-opt-in'
+ | 'ai-setup-self-healing-scoring';
export interface Dependency {
version: string | undefined;
versionSpecifier?: string;
@@ -139,7 +145,7 @@ export interface TelemetryEvent extends TelemetryData {
export interface InitPayload {
projectType: string;
- features: { dev: boolean; docs: boolean; test: boolean; onboarding: boolean };
+ features: { dev: boolean; docs: boolean; test: boolean; onboarding: boolean; ai: boolean };
newUser: boolean;
versionSpecifier: string | undefined;
cliIntegration: string | undefined;
diff --git a/code/core/src/types/modules/features.ts b/code/core/src/types/modules/features.ts
index 3c289d2e5b0b..07af4bfb04f2 100644
--- a/code/core/src/types/modules/features.ts
+++ b/code/core/src/types/modules/features.ts
@@ -3,4 +3,5 @@ export enum Feature {
TEST = 'test',
ONBOARDING = 'onboarding',
A11Y = 'a11y',
+ AI = 'ai',
}
diff --git a/code/core/src/typings.d.ts b/code/core/src/typings.d.ts
index 1ecd785ebef0..373feef0b51a 100644
--- a/code/core/src/typings.d.ts
+++ b/code/core/src/typings.d.ts
@@ -19,6 +19,11 @@ declare var STORYBOOK_CURRENT_TASK_LOG: undefined | null | Array;
declare var SB_TELEMETRY_STATE: 'enabled' | 'disabled' | undefined;
declare var PAYLOAD_ERROR_HANDLER: PayloadErrorHandler | undefined;
+declare var STORYBOOK_LAST_EVENTS: Record<
+ import('./telemetry').EventType,
+ import('./telemetry').CacheEntry
+>;
+declare var STORYBOOK_SESSION_ID: string | undefined;
declare var STORYBOOK_NETWORK_ADDRESS: string | undefined;
declare var PREVIEW_URL: string | undefined;
diff --git a/code/lib/cli-storybook/src/ai/index.test.ts b/code/lib/cli-storybook/src/ai/index.test.ts
new file mode 100644
index 000000000000..d483b5833cef
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/index.test.ts
@@ -0,0 +1,96 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+// Keep the real `storybook/internal/common` exports but replace `cache` with spies so the
+// tests below can assert on cache writes.
+vi.mock('storybook/internal/common', async () => {
+  const actual = await vi.importActual(
+    'storybook/internal/common'
+  );
+  return {
+    ...actual,
+    cache: { set: vi.fn(), get: vi.fn(), remove: vi.fn() },
+  };
+});
+
+// Telemetry module replaced wholesale: fixed session id, fixed preview snapshot, and
+// telemetry reported as enabled unless a test overrides it.
+vi.mock('storybook/internal/telemetry', () => ({
+  telemetry: vi.fn(),
+  getSessionId: vi.fn().mockResolvedValue('session-xyz'),
+  snapshotPreviewFile: vi
+    .fn()
+    .mockResolvedValue({ previewPath: '/proj/.storybook/preview.ts', previewHash: 'abc' }),
+  isTelemetryModuleEnabled: vi.fn(() => true),
+}));
+
+// Silence logger output during the tests.
+vi.mock('storybook/internal/node-logger', () => ({
+  logger: { log: vi.fn(), error: vi.fn(), warn: vi.fn(), debug: vi.fn() },
+}));
+
+// Stub language detection with a fixed answer instead of inspecting a real project.
+vi.mock('../../../create-storybook/src/services/ProjectTypeService.ts', () => ({
+  ProjectTypeService: class {
+    async detectLanguage() {
+      return 'ts';
+    }
+  },
+}));
+
+// Canned Storybook project data: React renderer with the Vite builder, which is the
+// combination `aiSetup` accepts.
+vi.mock('../automigrate/helpers/mainConfigFile.ts', () => ({
+  getStorybookData: vi.fn().mockResolvedValue({
+    versionInstalled: '10.4.0',
+    frameworkPackage: '@storybook/react-vite',
+    rendererPackage: '@storybook/react',
+    renderer: 'react',
+    builderPackage: '@storybook/builder-vite',
+    addons: [],
+    configDir: '/proj/.storybook',
+    storiesPaths: [],
+    hasCsfFactoryPreview: false,
+    packageManager: {},
+  }),
+}));
+
+import { cache } from 'storybook/internal/common';
+import {
+ isTelemetryModuleEnabled,
+ snapshotPreviewFile,
+ telemetry,
+} from 'storybook/internal/telemetry';
+
+import { aiSetup } from './index.ts';
+
+// Reset recorded calls on the spies the assertions below inspect.
+beforeEach(() => {
+  const inspectedSpies = [
+    vi.mocked(cache.set),
+    vi.mocked(snapshotPreviewFile),
+    vi.mocked(telemetry),
+  ];
+  inspectedSpies.forEach((spy) => spy.mockClear());
+});
+
+// Verifies that the preview snapshot and the `ai-setup-pending` cache record are only
+// produced while the telemetry module reports itself as enabled.
+describe('aiSetup telemetry gating', () => {
+  const configDir = '/proj/.storybook';
+
+  it('records ai-setup-pending + preview snapshot when telemetry is enabled', async () => {
+    await aiSetup({ configDir, disableTelemetry: false });
+
+    const expectedRecord = expect.objectContaining({
+      configDir: expect.stringContaining('.storybook'),
+      sessionId: 'session-xyz',
+      previewPath: '/proj/.storybook/preview.ts',
+      previewHash: 'abc',
+    });
+
+    expect(vi.mocked(snapshotPreviewFile)).toHaveBeenCalledTimes(1);
+    expect(vi.mocked(cache.set)).toHaveBeenCalledWith('ai-setup-pending', expectedRecord);
+    expect(vi.mocked(telemetry)).toHaveBeenCalledWith('ai-setup', expect.any(Object));
+  });
+
+  it('skips snapshot + cache write when telemetry is disabled', async () => {
+    vi.mocked(isTelemetryModuleEnabled).mockReturnValueOnce(false);
+
+    await aiSetup({ configDir, disableTelemetry: true });
+
+    expect(vi.mocked(snapshotPreviewFile)).not.toHaveBeenCalled();
+    expect(vi.mocked(cache.set)).not.toHaveBeenCalled();
+  });
+
+  it('treats missing disableTelemetry as enabled (backwards compatible default)', async () => {
+    await aiSetup({ configDir });
+
+    expect(vi.mocked(snapshotPreviewFile)).toHaveBeenCalledTimes(1);
+    expect(vi.mocked(cache.set)).toHaveBeenCalledWith('ai-setup-pending', expect.any(Object));
+  });
+});
diff --git a/code/lib/cli-storybook/src/ai/index.ts b/code/lib/cli-storybook/src/ai/index.ts
new file mode 100644
index 000000000000..20cd4884ae4e
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/index.ts
@@ -0,0 +1,131 @@
+import { writeFile } from 'node:fs/promises';
+import { resolve } from 'node:path';
+
+import type { PackageManagerName } from 'storybook/internal/common';
+import { cache } from 'storybook/internal/common';
+import { logger } from 'storybook/internal/node-logger';
+import {
+ getSessionId,
+ isTelemetryModuleEnabled,
+ snapshotPreviewFile,
+ telemetry,
+ type AiSetupPendingRecord,
+} from 'storybook/internal/telemetry';
+import { SupportedLanguage } from 'storybook/internal/types';
+
+import { ProjectTypeService } from '../../../create-storybook/src/services/ProjectTypeService.ts';
+
+import { getStorybookData } from '../automigrate/helpers/mainConfigFile.ts';
+import { generateMarkdownOutput } from './prompt.ts';
+import type { ProjectInfo, AiSetupOptions } from './types.ts';
+
+/**
+ * Generates the AI-assisted Storybook setup prompt for the current project.
+ *
+ * Steps, in order:
+ * 1. Read the Storybook configuration; log and return early when it cannot be read or when
+ *    framework, renderer or builder are missing.
+ * 2. Return early (with a log message) unless the project uses `@storybook/react` with
+ *    `@storybook/builder-vite`.
+ * 3. Build the markdown prompt and send the `ai-setup` telemetry event.
+ * 4. When the telemetry module is enabled, snapshot the preview file and cache an
+ *    `ai-setup-pending` record (best-effort).
+ * 5. Write the prompt to `options.output`, or log it when no output path is given.
+ *
+ * @param options CLI options; `configDir`, `packageManager` and `output` are read here.
+ * @returns Resolves once the prompt has been emitted or after an early return.
+ */
+export async function aiSetup(options: AiSetupOptions): Promise<void> {
+  const { configDir: userConfigDir, packageManager: packageManagerName, output } = options;
+
+  let projectInfo: ProjectInfo;
+
+  try {
+    const data = await getStorybookData({
+      configDir: userConfigDir,
+      packageManagerName: packageManagerName as PackageManagerName | undefined,
+    });
+    const majorVersion = data.versionInstalled
+      ? parseMajorVersion(data.versionInstalled)
+      : undefined;
+
+    if (!data.frameworkPackage || !data.rendererPackage || !data.builderPackage) {
+      logger.error(
+        'Could not detect framework, renderer, or builder from your Storybook config. Make sure you are running this command from your project root, or specify --config-dir.'
+      );
+      return;
+    }
+
+    const projectTypeService = new ProjectTypeService(data.packageManager);
+    const detectedLanguage = await projectTypeService.detectLanguage();
+    const language = detectedLanguage === SupportedLanguage.TYPESCRIPT ? 'ts' : 'js';
+
+    projectInfo = {
+      storybookVersion: data.versionInstalled,
+      majorVersion,
+      framework: data.frameworkPackage,
+      rendererPackage: data.rendererPackage,
+      renderer: data.renderer,
+      builderPackage: data.builderPackage,
+      addons: data.addons ?? [],
+      configDir: data.configDir,
+      storiesPaths: data.storiesPaths,
+      hasCsfFactoryPreview: data.hasCsfFactoryPreview,
+      language,
+    };
+  } catch (err: unknown) {
+    logger.error(
+      `Failed to read Storybook configuration: ${err instanceof Error ? err.message : String(err)}`
+    );
+    logger.log(
+      'Make sure you are running this command from your project root, or specify --config-dir.'
+    );
+    return;
+  }
+
+  if (
+    projectInfo.rendererPackage !== '@storybook/react' ||
+    projectInfo.builderPackage !== '@storybook/builder-vite'
+  ) {
+    logger.log(
+      'AI-assisted setup is currently only available for projects using the React renderer with Vite builder. Detected renderer: ' +
+        projectInfo.rendererPackage +
+        ', builder: ' +
+        projectInfo.builderPackage
+    );
+    return;
+  }
+
+  const result = await generateMarkdownOutput(projectInfo);
+  const markdownOutput = result.markdown;
+
+  await telemetry('ai-setup', {
+    cliOptions: {
+      output: output ? 'file' : undefined,
+      configDir: projectInfo.configDir,
+      packageManager: packageManagerName,
+    },
+    project: {
+      framework: projectInfo.framework,
+      renderer: projectInfo.rendererPackage,
+      builder: projectInfo.builderPackage,
+      language: projectInfo.language,
+      hasCsfFactoryPreview: projectInfo.hasCsfFactoryPreview,
+    },
+  });
+
+  // Snapshot the preview file baseline and cache the pending setup record.
+  // Subsequent CLI entry points (dev, build, doctor, etc.) read this to
+  // collect evidence of what the agent accomplished — but only via telemetry
+  // (the `ai-setup-evidence` event). Skip the snapshot + cache write when
+  // telemetry is disabled so there's nobody to read it.
+  if (isTelemetryModuleEnabled()) {
+    try {
+      const resolvedConfigDir = resolve(projectInfo.configDir);
+      const previewSnapshot = await snapshotPreviewFile(resolvedConfigDir);
+      const sessionId = await getSessionId();
+      const pendingRecord: AiSetupPendingRecord = {
+        timestamp: Date.now(),
+        sessionId,
+        configDir: resolvedConfigDir,
+        ...previewSnapshot,
+      };
+      await cache.set('ai-setup-pending', pendingRecord);
+    } catch (err: unknown) {
+      // The record only feeds telemetry; failing to store it must not stop the prompt
+      // (the actual purpose of this command) from being emitted below.
+      logger.debug(
+        `Failed to record pending AI setup: ${err instanceof Error ? err.message : String(err)}`
+      );
+    }
+  }
+
+  if (output) {
+    const outputPath = resolve(output);
+    await writeFile(outputPath, markdownOutput, 'utf-8');
+    logger.log(`Prompt written to ${outputPath}`);
+  } else {
+    logger.log(markdownOutput);
+  }
+}
+
+/**
+ * Reads the leading run of digits of a version string as the major version.
+ *
+ * @param version Version string such as `10.4.0`.
+ * @returns The major version, or `undefined` when the string does not start with a digit.
+ */
+function parseMajorVersion(version: string): number | undefined {
+  const leadingDigits = /^\d+/.exec(version);
+  if (leadingDigits === null) {
+    return undefined;
+  }
+  return Number.parseInt(leadingDigits[0], 10);
+}
diff --git a/code/lib/cli-storybook/src/ai/prompt.ts b/code/lib/cli-storybook/src/ai/prompt.ts
new file mode 100644
index 000000000000..55d9a5a2b52b
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/prompt.ts
@@ -0,0 +1,40 @@
+import { dedent } from 'ts-dedent';
+
+import type { ProjectInfo } from './types.ts';
+import { getPrompts } from './setup-prompts/index.ts';
+
+/**
+ * Renders the "Project Info" markdown table shown at the top of the generated prompt.
+ *
+ * Empty version/renderer/framework/builder values are displayed as `unknown`; an empty
+ * addon list is displayed as `none`.
+ */
+function getProjectOverview(projectInfo: ProjectInfo): string {
+  const { storybookVersion, rendererPackage, framework, builderPackage, configDir, addons } =
+    projectInfo;
+  const csfFormat = projectInfo.hasCsfFactoryPreview ? 'CSF Factory' : 'CSF3';
+  const addonList = addons.length > 0 ? addons.join(', ') : 'none';
+
+  return dedent`
+    ## Project Info
+
+    | Property | Value |
+    |----------|-------|
+    | Version | ${storybookVersion || 'unknown'} |
+    | Renderer | ${rendererPackage || 'unknown'} |
+    | Framework | ${framework || 'unknown'} |
+    | Builder | ${builderPackage || 'unknown'} |
+    | Config Dir | \`${configDir}\` |
+    | CSF Format | ${csfFormat} |
+    | Addons | ${addonList} |
+  `;
+}
+
+/**
+ * Assembles the full markdown prompt: a title, the project overview table, and the
+ * instructions of every prompt returned by `getPrompts`, separated by blank lines.
+ *
+ * @param projectInfo Detected project details used by the overview and the prompts.
+ * @returns An object whose `markdown` property holds the assembled document.
+ */
+export async function generateMarkdownOutput(projectInfo: ProjectInfo): Promise<{
+  markdown: string;
+}> {
+  const { prompts } = await getPrompts(projectInfo);
+
+  const title = dedent`
+    # Storybook Setup
+  `;
+  const sections: string[] = [
+    title,
+    getProjectOverview(projectInfo),
+    ...prompts.map(({ instructions }) => instructions),
+  ];
+
+  return { markdown: sections.join('\n\n') };
+}
diff --git a/code/lib/cli-storybook/src/ai/setup-prompts/index.ts b/code/lib/cli-storybook/src/ai/setup-prompts/index.ts
new file mode 100644
index 000000000000..ab8d1a7c3905
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/setup-prompts/index.ts
@@ -0,0 +1,68 @@
+import type { AiPrompt, ProjectInfo } from '../types.ts';
+
+import * as patternCopyPlay from './pattern-copy-play.ts';
+
+/**
+ * Main prompt used currently in `npx storybook ai setup` command. If you promote a new prompt
+ * to be default, move this to the FORMERLY_USED_PROMPTS object below.
+ *
+ * Maps a prompt name to the function that builds its instructions from the project info.
+ */
+const CURRENTLY_USED_PROMPT: Record<string, (projectInfo: ProjectInfo) => string> = {
+  'pattern-copy-play': patternCopyPlay.instructions,
+};
+
+/**
+ * Names of variants registered behind `EVAL_SETUP_PROMPT`. Loaded on demand
+ * from sibling files so the bundler can code-split them away from the
+ * default-only path that real users hit.
+ *
+ * Each entry is a loader that dynamically imports the variant and resolves to its
+ * instruction builder.
+ */
+const FORMERLY_USED_PROMPTS: Record<
+  string,
+  () => Promise<(projectInfo: ProjectInfo) => string>
+> = {
+  setup: async () => (await import('./setup.ts')).instructions,
+};
+
+/** Identifier of a registered prompt variant. */
+export type PromptName = string;
+
+/**
+ * Names available to the eval harness: the keys of `CURRENTLY_USED_PROMPT` followed by the
+ * keys of `FORMERLY_USED_PROMPTS`.
+ */
+export const PROMPT_NAMES: PromptName[] = Object.keys(CURRENTLY_USED_PROMPT).concat(
+  Object.keys(FORMERLY_USED_PROMPTS)
+);
+
+/**
+ * The one prompt variant real users receive: `npx storybook ai setup` run without
+ * overrides always produces it.
+ */
+export const DEFAULT_PROMPT_NAME: PromptName = 'pattern-copy-play';
+
+/**
+ * Internal environment variable, read only by `getPrompts`. The eval harness sets it before
+ * spawning `ai setup` to pick a non-default variant for A/B comparison. Values that match
+ * no registered prompt fall back to the default, so a typo never breaks the CLI.
+ */
+const EVAL_SETUP_PROMPT_ENV = 'EVAL_SETUP_PROMPT';
+
+/**
+ * Picks the prompt variant to use: the trimmed value of the `EVAL_SETUP_PROMPT` environment
+ * variable when it names a registered prompt, otherwise `DEFAULT_PROMPT_NAME`.
+ */
+function resolvePromptName(): PromptName {
+  const requested = process.env[EVAL_SETUP_PROMPT_ENV]?.trim();
+  if (!requested) {
+    return DEFAULT_PROMPT_NAME;
+  }
+
+  // Own-property checks only, so inherited keys such as `toString` never count as prompts.
+  const isRegistered =
+    Object.hasOwn(CURRENTLY_USED_PROMPT, requested) ||
+    Object.hasOwn(FORMERLY_USED_PROMPTS, requested);
+
+  return isRegistered ? requested : DEFAULT_PROMPT_NAME;
+}
+
+/**
+ * Resolves the active prompt variant and renders its instructions for the given project.
+ *
+ * The builder is taken from `CURRENTLY_USED_PROMPT` when present; otherwise the matching
+ * loader in `FORMERLY_USED_PROMPTS` is awaited.
+ *
+ * @param projectInfo Detected project details passed to the instruction builder.
+ * @returns A single-element `prompts` array describing the selected variant.
+ * @throws Error when the resolved name has no builder in either registry (for example when
+ *   `DEFAULT_PROMPT_NAME` is out of sync with the registries).
+ */
+export async function getPrompts(projectInfo: ProjectInfo): Promise<{ prompts: AiPrompt[] }> {
+  const name = resolvePromptName();
+  const loadFormerBuilder = FORMERLY_USED_PROMPTS[name];
+  const builder = CURRENTLY_USED_PROMPT[name] ?? (await loadFormerBuilder?.());
+
+  if (!builder) {
+    // Fail with a readable message instead of "… is not a function".
+    throw new Error(`No AI setup prompt is registered under the name "${name}".`);
+  }
+
+  return {
+    prompts: [
+      {
+        name,
+        description: 'Set up Storybook for success',
+        instructions: builder(projectInfo),
+      },
+    ],
+  };
+}
diff --git a/code/lib/cli-storybook/src/ai/setup-prompts/pattern-copy-play.ts b/code/lib/cli-storybook/src/ai/setup-prompts/pattern-copy-play.ts
new file mode 100644
index 000000000000..a8abe0493f15
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/setup-prompts/pattern-copy-play.ts
@@ -0,0 +1,870 @@
+/**
+ * Prompt variant: `pattern-copy-play` (current default for `npx storybook ai setup`)
+ *
+ * - Created: 2026-04-22 (eval iteration 2, default since this PR)
+ * - Status: shipping default — produced by every `ai setup` invocation
+ * without `EVAL_SETUP_PROMPT` set.
+ * - Reference eval results:
+ *   https://github.com/search?q=is:pr+label:"prompt:pattern-copy-play"+org:storybook-tmp&type=pullrequests
+ *
+ * Update this header when iterating: bump the iteration number and link the
+ * latest eval run so reviewers can compare variants without spelunking git.
+ */
+import { dedent } from 'ts-dedent';
+
+import type { ProjectInfo } from '../types.ts';
+
+/**
+ * Builds a markdown-format docs URL with renderer and language query parameters.
+ * Appending .md to any Storybook docs URL returns clean markdown with code examples.
+ *
+ * @param path Docs path below `/docs`, without leading slash or extension
+ *   (e.g. `writing-stories/args`).
+ * @param projectInfo Optional project details. `majorVersion` adds a `/<major>` path segment;
+ *   `renderer` defaults to `react` and `language` to `ts` when undefined.
+ * @returns The absolute `https://storybook.js.org/docs/...md?...` URL.
+ */
+function getDocsMarkdownUrl(
+  path: string,
+  projectInfo?: Pick<ProjectInfo, 'majorVersion' | 'renderer' | 'language'>
+): string {
+  const { majorVersion, renderer = 'react', language = 'ts' } = projectInfo ?? {};
+  const versionSegment = majorVersion ? `/${majorVersion}` : '';
+  const params = new URLSearchParams();
+  // The default only covers `undefined`; an empty renderer is left out of the query.
+  if (renderer) {
+    params.set('renderer', renderer);
+  }
+  // `language` is always set, so the query string is never empty.
+  params.set('language', language);
+  return `https://storybook.js.org/docs${versionSegment}/${path}.md?${params.toString()}`;
+}
+
+/**
+ * Chooses the package that Storybook types are imported from in the generated examples:
+ * the framework package, else the renderer package, else `@storybook/react`.
+ */
+function getTypeImportSource(projectInfo: ProjectInfo): string {
+  const { framework, rendererPackage } = projectInfo;
+  if (framework) {
+    return framework;
+  }
+  return rendererPackage || '@storybook/react';
+}
+
+/**
+ * Renders the "Storybook Documentation Reference" section of the prompt, with every docs
+ * link built by `getDocsMarkdownUrl` for the given project.
+ */
+function getDocsReferenceSection(projectInfo: ProjectInfo): string {
+  const writingStoriesUrl = getDocsMarkdownUrl('writing-stories', projectInfo);
+  const decoratorsUrl = getDocsMarkdownUrl('writing-stories/decorators', projectInfo);
+  const argsUrl = getDocsMarkdownUrl('writing-stories/args', projectInfo);
+  const playFunctionUrl = getDocsMarkdownUrl('writing-stories/play-function', projectInfo);
+  const vitestPluginUrl = getDocsMarkdownUrl('writing-tests/vitest-plugin', projectInfo);
+
+  return dedent`
+    ### Storybook Documentation Reference
+
+    Use the following references to look up Storybook APIs, concepts, or examples:
+
+    - Full docs index: https://storybook.js.org/llms.txt
+    - See code snippets only with codeOnly=true param e.g. ${writingStoriesUrl}&codeOnly=true
+
+    Key documentation pages for this task:
+    - Writing stories: ${writingStoriesUrl}
+    - Decorators: ${decoratorsUrl}
+    - Args: ${argsUrl}
+    - Play functions: ${playFunctionUrl}
+    - Vitest integration: ${vitestPluginUrl}
+
+    Fetch these URLs directly when you need guidance on Storybook APIs or patterns.
+  `;
+}
+
+/**
+ * Returns a fenced `tsx` example of a shared `preview.tsx` that wraps stories in the app's
+ * provider, imports global CSS, seeds `localStorage` and pins the date with `mockdate`.
+ *
+ * Emits the `definePreview` form when the project has a CSF Factory preview, otherwise the
+ * `Preview`-typed object form importing its type from `getTypeImportSource`.
+ */
+function getPreviewConfigExample(projectInfo: ProjectInfo): string {
+  const configDir = projectInfo.configDir;
+  const typeImport = getTypeImportSource(projectInfo);
+
+  if (projectInfo.hasCsfFactoryPreview) {
+    return dedent`
+      \`\`\`tsx
+      // ${configDir}/preview.tsx
+      import '../src/index.css'; // import global styles
+      import MockDate from 'mockdate';
+
+      import { definePreview } from 'storybook/preview';
+      import { SessionProvider } from '../src/contexts/SessionContext';
+
+      export default definePreview({
+        decorators: [
+          (Story) => (
+            <SessionProvider>
+              <Story />
+            </SessionProvider>
+          ),
+        ],
+        async beforeEach() {
+          localStorage.setItem('theme', 'dark');
+          localStorage.setItem('sidebar:open', 'true');
+          MockDate.set('2024-04-01T12:00:00Z');
+        },
+      });
+      \`\`\`
+    `;
+  }
+
+  return dedent`
+    \`\`\`tsx
+    // ${configDir}/preview.tsx
+    import type { Preview } from '${typeImport}';
+    import MockDate from 'mockdate';
+    import '../src/index.css'; // import global styles
+    import { SessionProvider } from '../src/contexts/SessionContext';
+
+    const preview: Preview = {
+      decorators: [
+        (Story) => (
+          <SessionProvider>
+            <Story />
+          </SessionProvider>
+        ),
+      ],
+      async beforeEach() {
+        localStorage.setItem('theme', 'dark');
+        localStorage.setItem('sidebar:open', 'true');
+        MockDate.set('2024-04-01T12:00:00Z');
+      },
+    };
+
+    export default preview;
+    \`\`\`
+  `;
+}
+
+/**
+ * Returns a fenced `tsx` example showing how to pin the current date with `mockdate` in the
+ * preview's `beforeEach`, in CSF Factory or classic `Preview` form depending on the project.
+ */
+function getMockDateExample(projectInfo: ProjectInfo): string {
+  const typeImport = getTypeImportSource(projectInfo);
+
+  if (projectInfo.hasCsfFactoryPreview) {
+    return dedent`
+      \`\`\`tsx
+      import MockDate from 'mockdate';
+      import { definePreview } from 'storybook/preview';
+
+      export default definePreview({
+        async beforeEach() {
+          MockDate.set('2024-04-01T12:00:00Z');
+        },
+      });
+      \`\`\`
+    `;
+  }
+
+  return dedent`
+    \`\`\`tsx
+    import type { Preview } from '${typeImport}';
+    import MockDate from 'mockdate';
+
+    const preview: Preview = {
+      async beforeEach() {
+        MockDate.set('2024-04-01T12:00:00Z');
+      },
+    };
+
+    export default preview;
+    \`\`\`
+  `;
+}
+
+/**
+ * Returns a fenced `tsx` example of a `preview.tsx` that initializes `msw-storybook-addon`,
+ * registers `mswLoader` and wires the shared `mswHandlers` as global parameters.
+ *
+ * Emits the `definePreview` form for CSF Factory projects, otherwise the `Preview` object form.
+ */
+function getMswPreviewExample(projectInfo: ProjectInfo): string {
+  const configDir = projectInfo.configDir;
+  const typeImport = getTypeImportSource(projectInfo);
+
+  if (projectInfo.hasCsfFactoryPreview) {
+    return dedent`
+      \`\`\`tsx
+      // ${configDir}/preview.tsx
+      import { definePreview } from 'storybook/preview';
+      import { initialize, mswLoader } from 'msw-storybook-addon';
+      import { mswHandlers } from './msw-handlers';
+
+      initialize({
+        onUnhandledRequest: 'bypass',
+      });
+
+      export default definePreview({
+        loaders: [mswLoader],
+        parameters: {
+          msw: {
+            handlers: mswHandlers,
+          },
+        },
+      });
+      \`\`\`
+    `;
+  }
+
+  return dedent`
+    \`\`\`tsx
+    // ${configDir}/preview.tsx
+    import type { Preview } from '${typeImport}';
+    import { initialize, mswLoader } from 'msw-storybook-addon';
+    import { mswHandlers } from './msw-handlers';
+
+    initialize({
+      onUnhandledRequest: 'bypass',
+    });
+
+    const preview: Preview = {
+      loaders: [mswLoader],
+      parameters: {
+        msw: {
+          handlers: mswHandlers,
+        },
+      },
+    };
+
+    export default preview;
+    \`\`\`
+  `;
+}
+
+/**
+ * Returns a fenced `tsx` example of a minimal story file tagged `ai-generated`, with a
+ * `render` function and a `play` function asserting that a button is visible.
+ *
+ * Emits the `preview.meta` / `meta.story` form for CSF Factory projects, otherwise the
+ * `Meta` / `StoryObj` form.
+ */
+function getStoryExample(projectInfo: ProjectInfo): string {
+  if (projectInfo.hasCsfFactoryPreview) {
+    return dedent`
+      \`\`\`tsx
+      import preview from '#.storybook/preview';
+      import { expect } from 'storybook/test';
+      import { SomeComponent } from './SomeComponent';
+
+      const meta = preview.meta({
+        component: SomeComponent,
+        tags: ['ai-generated'],
+      });
+
+      export const Default = meta.story({
+        render: () => <SomeComponent />,
+        play: async ({ canvas }) => {
+          await expect(canvas.getByRole('button')).toBeVisible();
+        },
+      });
+      \`\`\`
+    `;
+  }
+
+  const typeImport = getTypeImportSource(projectInfo);
+
+  return dedent`
+    \`\`\`tsx
+    import type { Meta, StoryObj } from '${typeImport}';
+    import { expect } from 'storybook/test';
+    import { SomeComponent } from './SomeComponent';
+
+    const meta = {
+      component: SomeComponent,
+      tags: ['ai-generated'],
+    } satisfies Meta<typeof SomeComponent>;
+
+    export default meta;
+    type Story = StoryObj<typeof meta>;
+
+    export const Default: Story = {
+      render: () => <SomeComponent />,
+      play: async ({ canvas }) => {
+        await expect(canvas.getByRole('button')).toBeVisible();
+      },
+    };
+    \`\`\`
+  `;
+}
+
+/**
+ * Returns a fenced `ts` example of a story meta carrying both the `ai-generated` and
+ * `needs-work` tags, in CSF Factory or classic form depending on the project.
+ */
+function getNeedsWorkTagExample(projectInfo: ProjectInfo): string {
+  if (projectInfo.hasCsfFactoryPreview) {
+    return dedent`
+      \`\`\`ts
+      const meta = preview.meta({
+        component: SomeComponent,
+        tags: ['ai-generated', 'needs-work'],
+      });
+      \`\`\`
+    `;
+  }
+
+  return dedent`
+    \`\`\`ts
+    const meta = {
+      component: SomeComponent,
+      tags: ['ai-generated', 'needs-work'],
+    } satisfies Meta<typeof SomeComponent>;
+    \`\`\`
+  `;
+}
+
+/**
+ * Returns a fenced `tsx` example of an args-driven story file for a `Button` with `Primary`
+ * and `Disabled` stories, each with its own `play` assertion.
+ *
+ * Emits the `preview.meta` / `meta.story` form for CSF Factory projects, otherwise the
+ * `Meta` / `StoryObj` form.
+ */
+function getArgsStoryExample(projectInfo: ProjectInfo): string {
+  if (projectInfo.hasCsfFactoryPreview) {
+    return dedent`
+      \`\`\`tsx
+      import preview from '#.storybook/preview';
+      import { expect } from 'storybook/test';
+      import { Button } from './Button';
+
+      const meta = preview.meta({
+        component: Button,
+        tags: ['ai-generated'],
+      });
+
+      export const Primary = meta.story({
+        args: {
+          variant: 'primary',
+          children: 'Save',
+        },
+        play: async ({ canvas }) => {
+          await expect(canvas.getByRole('button', { name: /save/i })).toBeVisible();
+        },
+      });
+
+      export const Disabled = meta.story({
+        args: {
+          variant: 'primary',
+          disabled: true,
+          children: 'Save',
+        },
+        play: async ({ canvas }) => {
+          await expect(canvas.getByRole('button')).toBeDisabled();
+        },
+      });
+      \`\`\`
+    `;
+  }
+
+  const typeImport = getTypeImportSource(projectInfo);
+
+  return dedent`
+    \`\`\`tsx
+    import type { Meta, StoryObj } from '${typeImport}';
+    import { expect } from 'storybook/test';
+    import { Button } from './Button';
+
+    const meta = {
+      component: Button,
+      tags: ['ai-generated'],
+    } satisfies Meta<typeof Button>;
+
+    export default meta;
+    type Story = StoryObj<typeof meta>;
+
+    export const Primary: Story = {
+      args: {
+        variant: 'primary',
+        children: 'Save',
+      },
+      play: async ({ canvas }) => {
+        await expect(canvas.getByRole('button', { name: /save/i })).toBeVisible();
+      },
+    };
+
+    export const Disabled: Story = {
+      args: {
+        variant: 'primary',
+        disabled: true,
+        children: 'Save',
+      },
+      play: async ({ canvas }) => {
+        await expect(canvas.getByRole('button')).toBeDisabled();
+      },
+    };
+    \`\`\`
+  `;
+}
+
+/**
+ * Returns a fenced `tsx` example of a story that uses `render` for composition: a `Button`
+ * rendered inside a `Card`, with a `play` function that checks and clicks the button.
+ *
+ * Emits the `preview.meta` / `meta.story` form for CSF Factory projects, otherwise the
+ * `Meta` / `StoryObj` form.
+ */
+function getRenderCompositionExample(projectInfo: ProjectInfo): string {
+  if (projectInfo.hasCsfFactoryPreview) {
+    return dedent`
+      \`\`\`tsx
+      import preview from '#.storybook/preview';
+      import { expect } from 'storybook/test';
+      import { Button } from './Button';
+      import { Card } from './Card';
+
+      const meta = preview.meta({
+        component: Button,
+        tags: ['ai-generated'],
+      });
+
+      export const InsideCard = meta.story({
+        render: () => (
+          <Card>
+            <Button>Save</Button>
+          </Card>
+        ),
+        play: async ({ canvas, userEvent }) => {
+          await expect(canvas.getByRole('button', { name: /save/i })).toBeVisible();
+          await userEvent.click(canvas.getByRole('button', { name: /save/i }));
+        },
+      });
+      \`\`\`
+    `;
+  }
+
+  const typeImport = getTypeImportSource(projectInfo);
+
+  return dedent`
+    \`\`\`tsx
+    import type { Meta, StoryObj } from '${typeImport}';
+    import { expect } from 'storybook/test';
+    import { Button } from './Button';
+    import { Card } from './Card';
+
+    const meta = {
+      component: Button,
+      tags: ['ai-generated'],
+    } satisfies Meta<typeof Button>;
+
+    export default meta;
+    type Story = StoryObj<typeof meta>;
+
+    export const InsideCard: Story = {
+      render: () => (
+        <Card>
+          <Button>Save</Button>
+        </Card>
+      ),
+      play: async ({ canvas, userEvent }) => {
+        await expect(canvas.getByRole('button', { name: /save/i })).toBeVisible();
+        await userEvent.click(canvas.getByRole('button', { name: /save/i }));
+      },
+    };
+    \`\`\`
+  `;
+}
+
+/**
+ * Returns a fenced `tsx` example of a page-level story (`ProductPage`) whose `play` function
+ * asserts that the "products" heading is visible.
+ *
+ * Emits the `preview.meta` / `meta.story` form for CSF Factory projects, otherwise the
+ * `Meta` / `StoryObj` form.
+ */
+function getPageStoryExample(projectInfo: ProjectInfo): string {
+  if (projectInfo.hasCsfFactoryPreview) {
+    return dedent`
+      \`\`\`tsx
+      import preview from '#.storybook/preview';
+      import { expect } from 'storybook/test';
+      import { ProductPage } from './ProductPage';
+
+      const meta = preview.meta({
+        component: ProductPage,
+        tags: ['ai-generated'],
+      });
+
+      export const Default = meta.story({
+        render: () => <ProductPage />,
+        play: async ({ canvas }) => {
+          await expect(
+            canvas.getByRole('heading', { name: /products/i }),
+          ).toBeVisible();
+        },
+      });
+      \`\`\`
+    `;
+  }
+
+  const typeImport = getTypeImportSource(projectInfo);
+
+  return dedent`
+    \`\`\`tsx
+    import type { Meta, StoryObj } from '${typeImport}';
+    import { expect } from 'storybook/test';
+    import { ProductPage } from './ProductPage';
+
+    const meta = {
+      component: ProductPage,
+      tags: ['ai-generated'],
+    } satisfies Meta<typeof ProductPage>;
+
+    export default meta;
+    type Story = StoryObj<typeof meta>;
+
+    export const Default: Story = {
+      render: () => <ProductPage />,
+      play: async ({ canvas }) => {
+        await expect(
+          canvas.getByRole('heading', { name: /products/i }),
+        ).toBeVisible();
+      },
+    };
+    \`\`\`
+  `;
+}
+
+export function instructions(projectInfo: ProjectInfo): string {
+ const configDir = projectInfo.configDir;
+ const typeImport = getTypeImportSource(projectInfo);
+
+ return dedent`
+ Attention: The following instructions must be followed in order to successfully set up Storybook in this project. Do not skip steps or attempt to do them out of order.
+
+ Your goal is to make Storybook fully functional in this project by analyzing the codebase,
+ configuring the preview with the right decorators, and writing stories for some components.
+
+ The end state should be a Storybook where any component — from a small button to a full page — can be added without story-specific workarounds. All necessary providers, CSS, browser state, and network mocks should live in the shared preview so that just rendering the component in the story is enough.
+
+ After each created story, run Vitest to verify it renders.
+ If the test fails, read the error, fix the issue, and re-run until it passes before moving on.
+
+ - Copy real patterns from the codebase
+ - Keep the app code unchanged
+ - Put the default setup in \`${configDir}/preview.tsx\`
+ - Keep app mocking and runtime setup in \`${configDir}/preview.tsx\`, not in the stories
+
+ ${getDocsReferenceSection(projectInfo)}
+
+ ### Step 1: Analyze the codebase
+
+ Read enough of the app to understand the full runtime environment before writing any stories.
+
+ Do not stop at \`main.tsx\` or \`App.tsx\`.
+ Follow imports into providers, pages, hooks, and shared components until you know:
+
+ - which providers exist
+ - which CSS files are injected
+ - which queries fetch data
+ - which browser-state reads happen
+ - which portals and portal roots exist
+ - which pages and components show the real usage patterns
+
+ Example of what to copy:
+
+ \`\`\`tsx
+ // src/main.tsx
+ import "./index.css";
+ import App from "./App";
+ import { SessionProvider } from "./contexts/SessionContext";
+
+ createRoot(document.getElementById("root")!).render(
+
+
+ ,
+ );
+ \`\`\`
+
+ That means Storybook should copy:
+
+ - the \`index.css\` import
+ - the \`SessionProvider\`
+ - the same provider order
+
+ Example of tracing the app deeper:
+
+ \`\`\`tsx
+ // src/App.tsx
+ function App() {
+ const { products, loadMoreProducts } = useProducts();
+ const { currentUser, signOut } = useSession();
+ // ...
+ }
+ \`\`\`
+
+ \`\`\`ts
+ // src/hooks/useProducts.ts
+ const response = await fetch(apiBaseUrl + "/products?page=1");
+ \`\`\`
+
+ \`\`\`ts
+ // src/hooks/useTheme.ts
+ const savedTheme = localStorage.getItem("theme");
+ \`\`\`
+
+ That means the default Storybook setup should discover and prepare:
+
+ - provider state
+ - MSW handlers for queries
+ - browser-state values that are actually read during render
+
+ ### Step 2: Build one default app environment in preview
+
+ Set up Storybook once so most stories work without story-specific setup.
+
+ Start with the smallest faithful environment:
+
+ - the real provider tree
+ - the real root CSS
+ - seeded browser state if the app reads it during render
+ - MSW for network/data queries
+
+ It is fine to seed browser state such as \`localStorage\`, \`sessionStorage\`, and cookies when the app reads them during render.
+ Seed only the specific app-owned keys and values you need.
+ Do not clear all \`localStorage\`, \`sessionStorage\`, or cookies, and do not reset Storybook's own state.
+ Do not mock or redefine the browser runtime itself.
+ The stories run in Vitest browser mode, so the real browser environment should already exist.
+
+ ${getPreviewConfigExample(projectInfo)}
+
+ Use this same idea for:
+
+ - providers
+ - root CSS
+ - browser state
+ - dates, and if the app logic depends on them during render then always use \`mockdate\`
+
+ Example with the \`mockdate\` package:
+
+ ${getMockDateExample(projectInfo)}
+
+ ### Step 3: Support portals with preview-body.html
+
+ If the app uses portals, copy that setup into Storybook too.
+
+ Look for patterns like:
+
+ - \`createPortal(...)\`
+ - modal, dialog, drawer, popover, tooltip, toast, or dropdown portal components
+ - hard-coded roots such as \`#portal-root\`, \`#modal-root\`, \`#drawer-root\`, or \`#toast-root\`
+
+ Example of what to copy:
+
+ \`\`\`tsx
+ // real component
+ return createPortal(, document.getElementById("portal-root")!);
+ \`\`\`
+
+ That means Storybook should create the same portal root in \`${configDir}/preview-body.html\`:
+
+ \`\`\`html
+
+
+ \`\`\`
+
+ If the app uses multiple portal roots, create all of them there:
+
+ \`\`\`html
+
+
+
+
+ \`\`\`
+
+ If a library portals directly to \`document.body\`, do not add extra roots for it.
+ Make sure the copied page shell, CSS, and layout still allow overlays, fixed positioning, and z-index stacking to render correctly.
+
+ ### Step 4: Mock side effects globally
+
+ All network/data queries should be handled by the default Storybook environment.
+
+ - Always use \`msw-storybook-addon\` for query mocking.
+ - If you introduce MSW, run \`npx msw init ./public --save\` to create the worker file.
+ - Make sure Storybook serves \`./public\` as a static dir so \`mockServiceWorker.js\` is available.
+ - Do not mock \`fetch\` directly.
+ - Network/data queries should return deterministic mock data.
+ - If you need to change dependencies, first check the lockfile and use that package manager for the change.
+
+ Example of copying a real fetch pattern into shared handlers:
+
+ \`\`\`ts
+ // real app hook
+ const response = await fetch(
+ apiBaseUrl +
+ "/products?" +
+ new URLSearchParams({
+ page: "1",
+ sort: "featured",
+ }),
+ );
+ \`\`\`
+
+ \`\`\`ts
+ // ${configDir}/msw-handlers.ts
+ import { http, HttpResponse } from "msw";
+
+ export const mswHandlers = {
+ products: [
+ http.get("https://api.example.com/products", () =>
+ HttpResponse.json({
+ items: [
+ {
+ id: "product-1",
+ name: "Example product",
+ description: "Mock product description",
+ imageUrl: "https://images.example.com/product.jpg",
+ price: 42,
+ },
+ ],
+ }),
+ ),
+ ],
+ };
+ \`\`\`
+
+ ${getMswPreviewExample(projectInfo)}
+
+ \`\`\`ts
+ // ${configDir}/main.ts
+ import type { StorybookConfig } from "${typeImport}";
+
+ const config: StorybookConfig = {
+ staticDirs: ["../public"],
+ };
+
+ export default config;
+ \`\`\`
+
+ Keep these mocks global.
+ Do not put fetch mocks in individual stories.
+ Only add handlers for requests that the shared preview setup or the stories actually use.
+ Do not add catch-all handlers that can hide unrelated failures.
+ If the defaults are not enough, improve the shared default setup instead.
+ Seed browser state when needed, but do not mock \`window\`, \`document\`, \`navigator\`, observers, or similar runtime APIs.
+ The only exception is \`mockdate\` when date-based rendering exists.
+
+ ### Step 5: Write stories
+
+ Try to find around 10 good candidate components for story files.
+ Write colocated stories for top-level components, from low-level reusable components up to page components.
+ Write up to 10 story files, or fewer only if the codebase clearly has fewer meaningful targets.
+
+ The stories should use JSX copied from real usage patterns in:
+
+ - pages
+ - app shells
+ - routes
+ - tests
+ - existing feature code
+
+ As a rule of thumb, each story file should have around 3 story exports when the component or page has enough meaningful states.
+ It can have more when the real usage supports it, up to 10 story exports in one file.
+
+ Always show all imports explicitly in story and preview files.
+ Do not rely on omitted or implied imports in examples or generated code.
+
+ #### Story tags
+
+ Every story meta must include the \`ai-generated\` tag to identify AI-created stories:
+
+ ${getStoryExample(projectInfo)}
+
+ If a story could not be fully fixed after the self-healing loop (the test still fails
+ or the rendering is incomplete), add the \`needs-work\` tag alongside \`ai-generated\`:
+
+ ${getNeedsWorkTagExample(projectInfo)}
+
+ #### Args vs render
+
+ For simple components where props drive the state, prefer \`args\` stories — no \`render\` function needed:
+
+ ${getArgsStoryExample(projectInfo)}
+
+ Use \`render\` when the story needs composition — wrapping the component in layout, combining multiple components, or passing children as JSX:
+
+ ${getRenderCompositionExample(projectInfo)}
+
+ Keep app mocking and runtime setup in preview, not in the stories.
+ Do not build large story-specific harnesses.
+ Do not write story files for subcomponents, hooks, contexts, or helpers.
+ Do not create new application components.
+ Do not add a custom \`title\`.
+ Do not stop after only a few easy targets if the codebase has more meaningful components or pages available.
+
+ ### Step 6: Write a play function for every story
+
+ Every named story export must have a \`play\` function.
+ The \`play\` function is not optional, even for simple stories.
+
+ The purpose of the \`play\` function is to prove that the story actually works in the copied Storybook environment:
+
+ - the story renders something real and non-empty
+ - the decorators provide the needed context
+ - the CSS is applied well enough for the intended state to be visible
+ - the MSW mocks or seeded browser state are actually being used
+ - important interactions, async loading states, and portals behave correctly
+
+ Use \`play\` functions to verify behavior, not just to click around.
+ A story without assertions is incomplete.
+
+ Use tools from \`storybook/test\` such as:
+
+ - \`expect\`
+ - \`waitFor\`
+
+ Prefer \`canvas\` and \`userEvent\` from the \`play\` context.
+ Do not destructure \`canvasElement\` just to create \`const canvas = within(canvasElement)\`.
+ Do not import \`userEvent\` from \`storybook/test\`; use \`userEvent\` from the \`play\` context instead.
+ Only use \`canvasElement.ownerDocument\` when you need to query outside the canvas, such as for portals.
+
+ Example:
+
+ \`\`\`tsx
+ import type { StoryObj } from "${typeImport}";
+
+ export const FilledForm: Story = {
+ play: async ({ canvas, userEvent }) => {
+ const emailInput = canvas.getByLabelText("email", {
+ selector: "input",
+ });
+
+ await userEvent.type(emailInput, "example-email@email.com", {
+ delay: 100,
+ });
+
+ const passwordInput = canvas.getByLabelText("password", {
+ selector: "input",
+ });
+
+ await userEvent.type(passwordInput, "ExamplePassword", {
+ delay: 100,
+ });
+
+ const submitButton = canvas.getByRole("button");
+ await userEvent.click(submitButton);
+ },
+ };
+ \`\`\`
+
+ The assertions should match the real pattern you copied:
+
+ - for provider-backed stories, assert the provider-dependent UI appears correctly
+ - for mocked-data stories, wait for the mocked data to appear and assert on it
+ - for CSS-sensitive states, assert on visibility, text layout, class-driven states, or meaningful computed styles
+ - for routing or navigation stories, assert the routed state or navigation outcome
+ - for portal stories, query from \`canvasElement.ownerDocument\` when the UI renders outside the canvas
+
+ Examples of useful checks:
+
+ - a themed button has the expected label and is visibly enabled or disabled
+ - a modal opened through a decorator or provider is visible in the portal root
+ - mocked API data appears in the page instead of a loading spinner forever
+ - a selected tab actually shows the selected panel
+ - a toast, alert, or badge has the expected accessible text and visual state
+ - a CSS class or computed style confirms the real state that matters
+
+ ### Step 7: Prove CSS is loaded in exactly one story named \`CssCheck\`
+
+ In exactly one story, named \`CssCheck\`, assert a component-specific computed style. \`toBeVisible\` passes on an unstyled component; a concrete style value proves the shared preview loaded the app's CSS.
+
+ Pick a visually distinctive component, read a styling value from its source, and assert it with \`getComputedStyle\`:
+
+ \`\`\`tsx
+ export const CssCheck: Story = {
+ args: { children: "Submit" },
+ play: async ({ canvas }) => {
+ const button = canvas.getByRole("button", { name: /submit/i });
+ // PrimaryButton uses bg-blue-600 — fails if Tailwind / global CSS did not load.
+ await expect(getComputedStyle(button).backgroundColor).toBe("rgb(37, 99, 235)");
+ },
+ };
+ \`\`\`
+
+ ### Step 8: Cover the patterns you found
+
+ Write stories for the real patterns in the codebase, for example:
+
+ - a low-level reusable component in real JSX usage
+ - a provider-backed component
+ - a browser-state-backed component
+ - a fetched-data component
+ - a real page component
+
+ Use \`App.tsx\` to inspect the real provider tree and usage patterns, but do not make a story for \`App\` when the codebase has actual page components.
+
+ Example page story:
+
+ ${getPageStoryExample(projectInfo)}
+
+ ### Step 9: Verify both rendering and types
+
+ As you work, verify the stories with Vitest:
+
+ \`\`\`bash
+ npx vitest --project storybook
+ \`\`\`
+
+ Also verify types so you catch missing required props, broken imports, and preview typing issues. Run the same TypeScript command the project itself uses.
+
+ \`\`\`bash
+ <the project's own typecheck command, e.g. npx tsc --noEmit>
+ \`\`\`
+
+ After verification passes, review every changed file and remove anything that is not needed for the final solution, especially debug fixes, overly broad mocks, unnecessary dependencies, and eval artifacts.
+
+ Keep iterating until:
+
+ - every story you wrote passes
+ - every story you wrote has a meaningful passing \`play\` function
+ - the changed stories and preview setup pass the project's real TypeScript check
+ - the rendered output looks sensible
+ - the default global mocked environment is strong enough that stories do not need manual fetch overrides
+ - stories no longer fail because the shared preview setup and story JSX are fixed
+ - all passing stories have \`tags: ['ai-generated']\` in their meta
+ - any stories that still need work have \`tags: ['ai-generated', 'needs-work']\` in their meta
+ `;
+}
diff --git a/code/lib/cli-storybook/src/ai/setup-prompts/setup.ts b/code/lib/cli-storybook/src/ai/setup-prompts/setup.ts
new file mode 100644
index 000000000000..916028087d38
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/setup-prompts/setup.ts
@@ -0,0 +1,295 @@
+/**
+ * Prompt variant: `setup`
+ *
+ * - Created: 2026-04-15 (eval iteration 1, baseline before `pattern-copy-play`)
+ * - Status: experimental — not the default. Selected only when the eval
+ * harness sets `EVAL_SETUP_PROMPT=setup`.
+ * - Reference eval results:
+ * https://github.com/search?q=is:pr+label:%22prompt:setup%22+org:storybook-tmp&type=pullrequests
+ *
+ * Update this header when iterating: bump the iteration number and link the
+ * latest eval run so reviewers can compare variants without spelunking git.
+ */
+import { dedent } from 'ts-dedent';
+
+import type { ProjectInfo } from '../types.ts';
+
+/**
+ * Picks the package that generated examples import Storybook types from.
+ *
+ * The first truthy value wins: `projectInfo.framework`, then
+ * `projectInfo.rendererPackage`, then the literal `'@storybook/react'`.
+ */
+function getTypeImportSource(projectInfo: ProjectInfo): string {
+ const { framework, rendererPackage } = projectInfo;
+
+ if (framework) {
+ return framework;
+ }
+ return rendererPackage || '@storybook/react';
+}
+
+/**
+ * Builds the fenced `preview.tsx` example that the setup prompt shows in its
+ * "Configure preview with decorators" step.
+ *
+ * Emits the CSF-factory form (`definePreview`) when `projectInfo.hasCsfFactoryPreview`
+ * is set; otherwise emits a typed `Preview` object whose type is imported from
+ * `getTypeImportSource(projectInfo)`. In both forms the decorator wraps `<Story />`
+ * in illustrative providers. `ThemeProvider` and `MemoryRouter` are placeholders for
+ * whatever providers the analysed app really uses, not a required setup.
+ *
+ * @param projectInfo Detected project settings; `configDir` is used for the file-path comment.
+ * @returns A markdown code fence containing the example preview file.
+ */
+function getPreviewDecoratorExample(projectInfo: ProjectInfo): string {
+ const configDir = projectInfo.configDir;
+
+ if (projectInfo.hasCsfFactoryPreview) {
+ return dedent`
+ \`\`\`tsx
+ // ${configDir}/preview.tsx
+ import '../src/index.css'; // import global styles
+
+ import { definePreview } from 'storybook/preview';
+
+ export default definePreview({
+ decorators: [
+ (Story) => (
+ <ThemeProvider>
+ <MemoryRouter>
+ <Story />
+ </MemoryRouter>
+ </ThemeProvider>
+ ),
+ ],
+ });
+ \`\`\`
+ `;
+ }
+
+ const typeImport = getTypeImportSource(projectInfo);
+
+ return dedent`
+ \`\`\`tsx
+ // ${configDir}/preview.tsx
+ import type { Preview } from '${typeImport}';
+ import '../src/index.css'; // import global styles
+
+ const preview: Preview = {
+ decorators: [
+ (Story) => (
+ <ThemeProvider>
+ <MemoryRouter>
+ <Story />
+ </MemoryRouter>
+ </ThemeProvider>
+ ),
+ ],
+ };
+
+ export default preview;
+ \`\`\`
+ `;
+}
+
+/**
+ * Builds the fenced example story file used to illustrate the `ai-generated` tag.
+ *
+ * Emits the CSF-factory form (`preview.meta` / `meta.story`) when
+ * `projectInfo.hasCsfFactoryPreview` is set. Otherwise emits the object form, typed
+ * with `Meta<typeof Button>` and `StoryObj<typeof meta>` so that `args` are checked
+ * against the component's props.
+ *
+ * @param projectInfo Detected project settings; selects the example style and, for the
+ *   object form, the package that `Meta`/`StoryObj` are imported from.
+ * @returns A markdown code fence containing the example story file.
+ */
+function getSimpleStoryExample(projectInfo: ProjectInfo): string {
+ if (projectInfo.hasCsfFactoryPreview) {
+ return dedent`
+ \`\`\`tsx
+ import preview from '#.storybook/preview';
+ import { Button } from './Button';
+
+ const meta = preview.meta({
+ title: 'AI Generated/Simple/Button',
+ component: Button,
+ tags: ['ai-generated'],
+ });
+
+ export const Default = meta.story({
+ args: {
+ label: 'Click me',
+ },
+ });
+
+ export const Disabled = meta.story({
+ args: {
+ label: 'Disabled',
+ disabled: true,
+ },
+ });
+ \`\`\`
+ `;
+ }
+
+ const typeImport = getTypeImportSource(projectInfo);
+
+ return dedent`
+ \`\`\`tsx
+ import type { Meta, StoryObj } from '${typeImport}';
+ import { Button } from './Button';
+
+ const meta = {
+ title: 'AI Generated/Simple/Button',
+ component: Button,
+ tags: ['ai-generated'],
+ } satisfies Meta<typeof Button>;
+
+ export default meta;
+ type Story = StoryObj<typeof meta>;
+
+ export const Default: Story = {
+ args: {
+ label: 'Click me',
+ },
+ };
+
+ export const Disabled: Story = {
+ args: {
+ label: 'Disabled',
+ disabled: true,
+ },
+ };
+ \`\`\`
+ `;
+}
+
+/**
+ * Builds the fenced snippet showing a story meta tagged with both `ai-generated`
+ * and `needs-work`.
+ *
+ * Emits the CSF-factory form (`preview.meta`) when `projectInfo.hasCsfFactoryPreview`
+ * is set; otherwise emits the object form, typed with `Meta<typeof Button>`.
+ *
+ * @param projectInfo Detected project settings; only `hasCsfFactoryPreview` is read.
+ * @returns A markdown code fence containing the meta snippet.
+ */
+function getNeedsWorkTagExample(projectInfo: ProjectInfo): string {
+ if (projectInfo.hasCsfFactoryPreview) {
+ return dedent`
+ \`\`\`ts
+ const meta = preview.meta({
+ title: 'AI Generated/Simple/Button',
+ component: Button,
+ tags: ['ai-generated', 'needs-work'],
+ });
+ \`\`\`
+ `;
+ }
+
+ return dedent`
+ \`\`\`ts
+ const meta = {
+ title: 'AI Generated/Simple/Button',
+ component: Button,
+ tags: ['ai-generated', 'needs-work'],
+ } satisfies Meta<typeof Button>;
+ \`\`\`
+ `;
+}
+
+/**
+ * Builds the full `setup` prompt: analyse the codebase, configure preview decorators,
+ * write tagged stories for nine components, then verify them with Vitest, a Storybook
+ * build, and `storybook doctor`.
+ *
+ * Angle-bracket tokens in the text (`<Complexity>`, `<ComponentName>`,
+ * `<path-to-story-file>`) are literal placeholders for the agent to fill in; they are
+ * not interpolated here.
+ *
+ * @param projectInfo Detected project settings. `configDir` is interpolated into file
+ *   paths, and the whole object is forwarded to the example builders.
+ * @returns The dedented markdown prompt text.
+ */
+export function instructions(projectInfo: ProjectInfo): string {
+ const configDir = projectInfo.configDir;
+
+ return dedent`
+ Attention: The following instructions must be followed in order to successfully set up Storybook in this project. Do not skip steps or attempt to do them out of order.
+
+ Your goal is to make Storybook fully functional in this project by analyzing the codebase,
+ configuring the preview with the right decorators, and writing example stories for 9 components.
+
+ Work through these steps in order. After each story file, run Vitest to verify it renders.
+ If the test fails, read the error, fix the issue, and re-run until it passes before moving on.
+
+ ### Step 1: Analyze the codebase
+
+ Before writing any stories, understand what the components need to render:
+
+ - Scan the project for context providers, theme systems, routers, stores, and i18n setups.
+ Look at the app's entry point (e.g. \`App.tsx\`, \`main.tsx\`, \`layout.tsx\`) to see what
+ providers wrap the component tree.
+ - Identify global CSS or style imports required for components to look correct.
+ - Note any path aliases configured in tsconfig or bundler config.
+ - Read \`${configDir}/main.ts\` (or \`main.js\`) to find the \`stories\` glob patterns.
+ Your story files must match those patterns to be picked up by Storybook.
+
+ ### Step 2: Configure \`${configDir}/preview.tsx\` with decorators
+
+ Add decorators that wrap every story with the providers your components need.
+ Without this, most non-trivial components will crash.
+
+ ${getPreviewDecoratorExample(projectInfo)}
+
+ Common decorators to add:
+
+ - **Theme providers** (e.g. ThemeProvider, MUI ThemeProvider, styled-components, Tailwind)
+ - **Router** (e.g. MemoryRouter, BrowserRouter mock)
+ - **State stores** (e.g. Redux Provider, Zustand, Jotai)
+ - **i18n** (e.g. IntlProvider, I18nextProvider)
+ - **Global CSS** — import global stylesheets at the top of \`preview.tsx\`
+
+ ### Step 3: Write stories for 9 components
+
+ Pick 9 real components from the codebase, 3 of each complexity level.
+ Use the title prefix \`AI Generated/<Complexity>/<ComponentName>\` so they are grouped
+ together in the Storybook sidebar.
+
+ **Simple (3 components)** — Presentational with few props, no internal state.
+ Examples: Button, Badge, Avatar, Icon, Label, Chip.
+ Title format: \`AI Generated/Simple/<ComponentName>\`
+
+ **Medium (3 components)** — Multiple visual variants or composed from simpler components.
+ Examples: Card, Alert, Input, Select, Tooltip, Tabs.
+ Title format: \`AI Generated/Medium/<ComponentName>\`
+
+ **Complex (3 components)** — Internal state, side effects, or deep composition.
+ Examples: Modal, DataTable, Form, Dropdown, Accordion, Sidebar.
+ Title format: \`AI Generated/Complex/<ComponentName>\`
+
+ For each component, create a \`<ComponentName>.stories.tsx\` file next to the component.
+ Each file must have at least 2 story exports covering the component's main states.
+ Make sure the file location and naming matches the \`stories\` patterns in \`${configDir}/main.ts\`.
+
+ #### Story tags
+
+ Every story meta must include the \`ai-generated\` tag to identify AI-created stories:
+
+ ${getSimpleStoryExample(projectInfo)}
+
+ If a story could not be fully fixed after the self-healing loop (the test still fails
+ or the rendering is incomplete), add the \`needs-work\` tag alongside \`ai-generated\`:
+
+ ${getNeedsWorkTagExample(projectInfo)}
+
+ Rules:
+
+ - Every named export is a story. Use \`args\` to set props.
+ - Provide all required props via \`args\` — check the component's types.
+ - If a component needs per-story decorators (beyond the global ones), add them in the meta.
+ - Do NOT use \`any\` types. Use the component's prop types for type safety.
+
+ Reference: https://storybook.js.org/docs/writing-stories
+
+ ### Step 4: Verify each story with Vitest
+
+ After writing each story file, immediately verify it:
+
+ \`\`\`bash
+ npx vitest --project storybook <path-to-story-file>
+ \`\`\`
+
+ **Self-healing loop — repeat for every story file:**
+
+ 1. Write/update the story file
+ 2. Run \`npx vitest --project storybook <path-to-story-file>\`
+ 3. If it fails: read the error output carefully
+ - Missing provider → add a decorator in \`${configDir}/preview.tsx\` or in the story meta
+ - Missing prop → add the required prop to \`args\`
+ - Import error → fix the import path
+ - CSS/asset error → add static dirs or import the stylesheet
+ 4. Fix the issue and go back to step 2
+ 5. Once the test passes, move to the next component
+
+ After all 9 story files pass individually, run the full suite:
+
+ \`\`\`bash
+ npx vitest --project storybook
+ \`\`\`
+
+ Once all stories pass, run a full Storybook build as a final check:
+
+ \`\`\`bash
+ npx storybook build
+ \`\`\`
+
+ If the build fails, fix the issue before finishing.
+
+ Finally, run \`npx storybook doctor\` to check for common issues
+ (version mismatches, duplicated deps, etc.) and fix anything it reports.
+
+ ### Checklist
+
+ - [ ] Analyzed codebase for providers, global styles, and path aliases
+ - [ ] Read story patterns from \`${configDir}/main.ts\`
+ - [ ] Configured \`${configDir}/preview.tsx\` with necessary decorators
+ - [ ] Simple component 1: story written and passing
+ - [ ] Simple component 2: story written and passing
+ - [ ] Simple component 3: story written and passing
+ - [ ] Medium component 1: story written and passing
+ - [ ] Medium component 2: story written and passing
+ - [ ] Medium component 3: story written and passing
+ - [ ] Complex component 1: story written and passing
+ - [ ] Complex component 2: story written and passing
+ - [ ] Complex component 3: story written and passing
+ - [ ] Full Vitest suite passes: \`npx vitest --project storybook\`
+ - [ ] \`npx storybook build\` succeeds
+ - [ ] \`npx storybook doctor\` reports no remaining issues
+ - [ ] All passing stories have \`tags: ['ai-generated']\` in their meta
+ - [ ] Any stories that still need work have \`tags: ['ai-generated', 'needs-work']\` in their meta
+ `;
+}
diff --git a/code/lib/cli-storybook/src/ai/types.ts b/code/lib/cli-storybook/src/ai/types.ts
new file mode 100644
index 000000000000..93a3ea88aabd
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/types.ts
@@ -0,0 +1,36 @@
+import type { SupportedRenderer } from 'storybook/internal/types';
+
+/**
+ * Options bag for the AI setup flow.
+ * NOTE(review): presumably the merged `storybook ai` / `storybook ai setup` CLI options
+ * handed to `aiSetup` — confirm against its signature.
+ */
+export interface AiSetupOptions {
+ /** Storybook configuration directory; presumably set by `-c, --config-dir` — confirm. */
+ configDir?: string;
+ /** Package manager to force; presumably set by `--package-manager` — confirm. */
+ packageManager?: string;
+ /** File to write the prompt to instead of stdout; presumably set by `-o, --output` — confirm. */
+ output?: string;
+ /** Populated from the program-level `--disable-telemetry` flag (defaults from `STORYBOOK_DISABLE_TELEMETRY`). */
+ disableTelemetry?: boolean;
+}
+
+/**
+ * Facts about the target project that the AI setup prompt builders read when
+ * generating their text (for example `configDir`, `framework`, `rendererPackage`
+ * and `hasCsfFactoryPreview`).
+ */
+export interface ProjectInfo {
+ storybookVersion: string | undefined;
+ majorVersion: number | undefined;
+ /**
+ * Preferred source for Storybook type imports in generated examples.
+ * Presumably the framework package name — confirm against the producer.
+ */
+ framework: string | null;
+ /** The full renderer package name, e.g. "@storybook/react" */
+ rendererPackage: string | null;
+ /** The short renderer name for docs URLs, e.g. "react" */
+ renderer?: SupportedRenderer;
+ builderPackage: string | null;
+ addons: string[];
+ /** Storybook config directory; prompts interpolate it into file paths such as the preview file. */
+ configDir: string;
+ storiesPaths: string[];
+ /** When true, prompt examples use the CSF-factory style (`definePreview` / `preview.meta`) instead of `Meta`/`StoryObj` objects. */
+ hasCsfFactoryPreview: boolean;
+ /** Whether the project uses TypeScript ('ts') or JavaScript ('js'), inferred from the main config file extension. */
+ language: 'ts' | 'js';
+}
+
+/**
+ * A named prompt (called a "skill" in earlier wording) that can be offered to agents.
+ * The set of prompts is expected to grow over time.
+ * Each entry carries a name, a description, and the instructions text for the agent.
+ * NOTE(review): `instructions` presumably holds already-rendered prompt text (e.g. the
+ * string returned by a variant's `instructions()` builder) — confirm at the call site.
+ */
+export interface AiPrompt {
+ name: string;
+ description: string;
+ instructions: string;
+}
diff --git a/code/lib/cli-storybook/src/automigrate/helpers/mainConfigFile.ts b/code/lib/cli-storybook/src/automigrate/helpers/mainConfigFile.ts
index acdb3e0f24f1..e9995c6e5cce 100644
--- a/code/lib/cli-storybook/src/automigrate/helpers/mainConfigFile.ts
+++ b/code/lib/cli-storybook/src/automigrate/helpers/mainConfigFile.ts
@@ -118,6 +118,11 @@ export const getStorybookData = async ({
configDir: configDirFromScript,
previewConfigPath,
versionSpecifier,
+ frameworkPackage,
+ rendererPackage,
+ renderer,
+ builderPackage,
+ addons,
} = await getStorybookInfo(
userDefinedConfigDir,
userDefinedConfigDir ? dirname(userDefinedConfigDir) : undefined
@@ -165,6 +170,11 @@ export const getStorybookData = async ({
packageManager,
storiesPaths,
hasCsfFactoryPreview,
+ frameworkPackage,
+ rendererPackage,
+ renderer,
+ builderPackage,
+ addons,
};
};
export type GetStorybookData = typeof getStorybookData;
diff --git a/code/lib/cli-storybook/src/bin/run.ts b/code/lib/cli-storybook/src/bin/run.ts
index 36ebb1a28a7b..5f91b508e4ec 100644
--- a/code/lib/cli-storybook/src/bin/run.ts
+++ b/code/lib/cli-storybook/src/bin/run.ts
@@ -3,7 +3,6 @@ import {
HandledError,
JsPackageManagerFactory,
PackageManagerName,
- isCI,
optionalEnvToBoolean,
removeAddon as remove,
versions,
@@ -24,6 +23,7 @@ import { doctor } from '../doctor/index.ts';
import { link } from '../link.ts';
import { migrate } from '../migrate.ts';
import { sandbox } from '../sandbox.ts';
+import { aiSetup } from '../ai/index.ts';
import { type UpgradeOptions, upgrade } from '../upgrade.ts';
addToGlobalContext('cliVersion', versions.storybook);
@@ -105,14 +105,10 @@ command('init')
.option('-y --yes', 'Answer yes to all prompts')
.option('-b --builder ', 'Builder library')
.option('-l --linkable', 'Prepare installation for link (contributor helper)')
- .option(
- '--dev',
- 'Launch the development server after completing initialization. Enabled by default (default: true)',
- !isCI() && !optionalEnvToBoolean(process.env.IN_STORYBOOK_SANDBOX)
- )
+ .option('--dev', 'Launch the development server after completing initialization')
.option(
'--no-dev',
- 'Complete the initialization of Storybook without launching the Storybook development server'
+ 'Do not launch the Storybook development server after completing initialization (default)'
);
command('add ')
@@ -175,6 +171,10 @@ command('upgrade')
.option('-f --force', 'force the upgrade, skipping autoblockers')
.option('-n --dry-run', 'Only check for upgrades, do not install')
.option('-s --skip-check', 'Skip postinstall version and automigration checks')
+ .option(
+ '--skip-automigrations',
+ 'Skip running automigrations entirely (only update package versions and install)'
+ )
.option(
'-c, --config-dir ',
'Directory(ies) where to load Storybook configurations from'
@@ -299,6 +299,35 @@ command('doctor')
}).catch(handleCommandFailure(options.logfile));
});
+const aiCommand = command('ai')
+ .description('AI agent helpers for Storybook')
+ .option(
+ '-o, --output ',
+ 'Write the prompt output to a file instead of printing it to stdout'
+ );
+
+aiCommand
+ .command('setup')
+ .description('Generate setup instructions to write stories for real components')
+ .addOption(
+ new Option('--package-manager ', 'Force package manager for installing deps').choices(
+ Object.values(PackageManagerName)
+ )
+ )
+ .option('-c, --config-dir ', 'Directory of Storybook configuration')
+ .action(async (options, cmd) => {
+ const parentOptions = cmd.parent?.opts() ?? {};
+ const mergedOptions = { ...parentOptions, ...options };
+ await withTelemetry('ai-setup', { cliOptions: mergedOptions }, async () => {
+ await aiSetup(mergedOptions);
+ }).catch(handleCommandFailure(mergedOptions.logfile));
+ });
+
+// Show available subcommands when `storybook ai` is run without arguments
+aiCommand.action(() => {
+ aiCommand.outputHelp();
+});
+
program.on('command:*', ([invalidCmd]) => {
let errorMessage = ` Invalid command: ${picocolors.bold(invalidCmd)}.\n See --help for a list of available commands.`;
const availableCommands = program.commands.map((cmd) => cmd.name());
diff --git a/code/lib/cli-storybook/src/upgrade.ts b/code/lib/cli-storybook/src/upgrade.ts
index 6a2a7b79ec40..8060fd00b041 100644
--- a/code/lib/cli-storybook/src/upgrade.ts
+++ b/code/lib/cli-storybook/src/upgrade.ts
@@ -116,6 +116,7 @@ export const checkVersionConsistency = () => {
export type UpgradeOptions = {
skipCheck: boolean;
+ skipAutomigrations?: boolean;
packageManager?: PackageManagerName;
dryRun: boolean;
yes: boolean;
@@ -413,11 +414,17 @@ export async function upgrade(options: UpgradeOptions): Promise {
}
}
- // Run automigrations for all projects
- const { automigrationResults, detectedAutomigrations } = await runAutomigrations(
- storybookProjects,
- options
- );
+ // Run automigrations for all projects (unless explicitly skipped)
+ let automigrationResults: Record = {};
+ let detectedAutomigrations: AutomigrationCheckResult[] = [];
+ if (options.skipAutomigrations) {
+ logger.log('Skipping automigrations (--skip-automigrations).');
+ } else {
+ ({ automigrationResults, detectedAutomigrations } = await runAutomigrations(
+ storybookProjects,
+ options
+ ));
+ }
// Install dependencies
const rootPackageManager =
diff --git a/code/lib/create-storybook/src/bin/run.ts b/code/lib/create-storybook/src/bin/run.ts
index 1d7aff8d4fe7..19875786bf7b 100644
--- a/code/lib/create-storybook/src/bin/run.ts
+++ b/code/lib/create-storybook/src/bin/run.ts
@@ -1,5 +1,5 @@
import { ProjectType } from 'storybook/internal/cli';
-import { PackageManagerName, isCI, optionalEnvToBoolean } from 'storybook/internal/common';
+import { PackageManagerName, optionalEnvToBoolean } from 'storybook/internal/common';
import { logTracker, logger } from 'storybook/internal/node-logger';
import { addToGlobalContext } from 'storybook/internal/telemetry';
import { Feature, SupportedBuilder } from 'storybook/internal/types';
@@ -9,6 +9,7 @@ import { Option, program } from 'commander';
import { version } from '../../package.json';
import type { CommandOptions } from '../generators/types.ts';
import { initiate } from '../initiate.ts';
+import { isAgent, detectAgent } from 'std-env';
addToGlobalContext('cliVersion', version);
@@ -73,6 +74,8 @@ const createStorybookProgram = program
'--no-dev',
'Complete the initialization of Storybook without launching the Storybook development server'
)
+ .option('--agent', 'Force agent mode (non-interactive, logs AI setup instructions)')
+ .option('--no-agent', 'Force disable agent mode even when an AI agent is detected')
.option(
'--logfile [path]',
'Write all debug logs to the specified file at the end of the run. Defaults to debug-storybook.log when [path] is not provided'
@@ -89,6 +92,7 @@ const createStorybookProgram = program
)
.hook('preAction', async (self) => {
const options = self.opts();
+ const resolvedAgent = options.agent ?? isAgent;
if (options.debug) {
logger.setLogLevel('debug');
@@ -98,7 +102,7 @@ const createStorybookProgram = program
logger.setLogLevel(options.loglevel);
}
- if (options.logfile) {
+ if (options.logfile || resolvedAgent) {
logTracker.enableLogWriting();
}
})
@@ -112,16 +116,25 @@ const createStorybookProgram = program
createStorybookProgram
.action(async (options) => {
- const isNeitherCiNorSandbox =
- !isCI() && !optionalEnvToBoolean(process.env.IN_STORYBOOK_SANDBOX);
options.debug = options.debug ?? false;
- options.dev = options.dev ?? isNeitherCiNorSandbox;
+ options.dev = options.dev ?? false;
if (options.features === false) {
// Ensure features are treated as empty when --no-features is set
options.features = [];
}
+ const resolvedAgent = options.agent ?? isAgent;
+ options.agent = resolvedAgent;
+
+ if (resolvedAgent) {
+ const agent = detectAgent();
+ const agentName = agent ? agent.name : 'unknown';
+ logger.log(
+ `This command is running via an AI agent: ${agentName}. Proceeding with agentic installation flow.`
+ );
+ }
+
await initiate(options as CommandOptions).catch(() => process.exit(1));
})
.version(String(version))
diff --git a/code/lib/create-storybook/src/commands/FinalizationCommand.test.ts b/code/lib/create-storybook/src/commands/FinalizationCommand.test.ts
index dfb3c87f4465..7eec40c78880 100644
--- a/code/lib/create-storybook/src/commands/FinalizationCommand.test.ts
+++ b/code/lib/create-storybook/src/commands/FinalizationCommand.test.ts
@@ -7,7 +7,7 @@ import { logger } from 'storybook/internal/node-logger';
import * as find from 'empathic/find';
-import { FinalizationCommand } from './FinalizationCommand.ts';
+import { FinalizationCommand, executeFinalization } from './FinalizationCommand.ts';
vi.mock('node:fs/promises', { spy: true });
vi.mock('storybook/internal/common', { spy: true });
@@ -18,7 +18,11 @@ describe('FinalizationCommand', () => {
let command: FinalizationCommand;
beforeEach(() => {
- command = new FinalizationCommand(undefined);
+ command = new FinalizationCommand({
+ logfile: undefined,
+ showAgentFollowUp: false,
+ showAiInstructions: false,
+ });
vi.mocked(getProjectRoot).mockReturnValue('/test/project');
vi.mocked(logger.step).mockImplementation(() => {});
@@ -107,4 +111,182 @@ describe('FinalizationCommand', () => {
expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('ng run my-app:storybook'));
});
});
+
+ describe('agent mode', () => {
+ it('should show agent-specific message when showAgentFollowUp=true', async () => {
+ const agentCommand = new FinalizationCommand({
+ logfile: undefined,
+ showAgentFollowUp: true,
+ showAiInstructions: false,
+ });
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await agentCommand.execute({});
+
+ expect(logger.step).toHaveBeenCalledWith(
+ expect.stringContaining('is not entirely set up yet')
+ );
+ expect(logger.step).toHaveBeenCalledWith(expect.stringContaining('npx storybook ai setup'));
+ });
+
+ it('should show standard success message when showAgentFollowUp=false with AI instructions', async () => {
+ const agentCommand = new FinalizationCommand({
+ logfile: undefined,
+ showAgentFollowUp: false,
+ showAiInstructions: true,
+ });
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await agentCommand.execute({});
+
+ expect(logger.step).toHaveBeenCalledWith(
+ expect.stringContaining('Storybook was successfully installed')
+ );
+ // Ensure the agent message is NOT shown
+ const stepCalls = vi.mocked(logger.step).mock.calls.map((c) => String(c[0]));
+ expect(stepCalls.some((msg) => msg.includes('is not entirely set up yet'))).toBe(false);
+ });
+
+ it('should show standard success message when showAgentFollowUp=false', async () => {
+ const nonAgentCommand = new FinalizationCommand({
+ logfile: undefined,
+ showAgentFollowUp: false,
+ showAiInstructions: false,
+ });
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await nonAgentCommand.execute({});
+
+ expect(logger.step).toHaveBeenCalledWith(
+ expect.stringContaining('Storybook was successfully installed')
+ );
+ // Ensure the agent message is NOT shown
+ const stepCalls = vi.mocked(logger.step).mock.calls.map((c) => String(c[0]));
+ expect(stepCalls.some((msg) => msg.includes('is not entirely set up yet'))).toBe(false);
+ });
+ });
+
+ describe('AI instructions', () => {
+ it('should show AI instructions when showAiInstructions=true', async () => {
+ const aiCommand = new FinalizationCommand({
+ logfile: undefined,
+ showAgentFollowUp: false,
+ showAiInstructions: true,
+ });
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await aiCommand.execute({});
+
+ expect(logger.step).toHaveBeenCalledWith(
+ expect.stringContaining('To finalize setting up with AI')
+ );
+ expect(logger.step).toHaveBeenCalledWith(expect.stringContaining('npx storybook ai setup'));
+ });
+
+ it('should NOT show AI instructions when showAiInstructions=false', async () => {
+ const noAiCommand = new FinalizationCommand({
+ logfile: undefined,
+ showAgentFollowUp: false,
+ showAiInstructions: false,
+ });
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await noAiCommand.execute({});
+
+ const stepCalls = vi.mocked(logger.step).mock.calls.map((c) => String(c[0]));
+ expect(stepCalls.some((msg) => msg.includes('To finalize setting up with AI'))).toBe(false);
+ });
+
+ it('should show both agent message and AI instructions when both are true', async () => {
+ const bothCommand = new FinalizationCommand({
+ logfile: undefined,
+ showAgentFollowUp: true,
+ showAiInstructions: true,
+ });
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await bothCommand.execute({});
+
+ expect(logger.step).toHaveBeenCalledWith(
+ expect.stringContaining('is not entirely set up yet')
+ );
+ expect(logger.step).toHaveBeenCalledWith(
+ expect.stringContaining('To finalize setting up with AI')
+ );
+ });
+ });
+
+ describe('storybookCommand message', () => {
+ it('should print "To run Storybook, run" with the command', async () => {
+ const cmd = new FinalizationCommand({
+ logfile: undefined,
+ showAgentFollowUp: false,
+ showAiInstructions: false,
+ });
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await cmd.execute({ storybookCommand: 'npm run storybook' });
+
+ expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('To run Storybook, run'));
+ expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('npm run storybook'));
+ });
+
+ it('should not print storybook command message when storybookCommand is null', async () => {
+ const cmd = new FinalizationCommand({
+ logfile: undefined,
+ showAgentFollowUp: false,
+ showAiInstructions: false,
+ });
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await cmd.execute({ storybookCommand: null });
+
+ const logCalls = vi.mocked(logger.log).mock.calls.map((c) => String(c[0]));
+ expect(logCalls.some((msg) => msg.includes('To run Storybook, run'))).toBe(false);
+ });
+ });
+
+ describe('executeFinalization helper', () => {
+ it('should show agent follow-up when showAgentFollowUp=true', async () => {
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await executeFinalization({
+ showAgentFollowUp: true,
+ showAiInstructions: false,
+ logfile: undefined,
+ });
+
+ // Agent mode should show agent-specific message
+ expect(logger.step).toHaveBeenCalledWith(
+ expect.stringContaining('is not entirely set up yet')
+ );
+ });
+
+ it('should pass showAiInstructions=true through to the command', async () => {
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await executeFinalization({
+ showAgentFollowUp: false,
+ showAiInstructions: true,
+ logfile: undefined,
+ });
+
+ expect(logger.step).toHaveBeenCalledWith(
+ expect.stringContaining('To finalize setting up with AI')
+ );
+ });
+
+ it('should forward storybookCommand to execute', async () => {
+ vi.mocked(find.up).mockReturnValue(undefined);
+
+ await executeFinalization({
+ showAgentFollowUp: false,
+ showAiInstructions: false,
+ logfile: undefined,
+ storybookCommand: 'yarn storybook',
+ });
+
+ expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('yarn storybook'));
+ });
+ });
});
diff --git a/code/lib/create-storybook/src/commands/FinalizationCommand.ts b/code/lib/create-storybook/src/commands/FinalizationCommand.ts
index f0bb3fbd4265..e8665ca1e907 100644
--- a/code/lib/create-storybook/src/commands/FinalizationCommand.ts
+++ b/code/lib/create-storybook/src/commands/FinalizationCommand.ts
@@ -7,6 +7,14 @@ import { ErrorCollector } from 'storybook/internal/telemetry';
import * as find from 'empathic/find';
import { dedent } from 'ts-dedent';
+/** Options that configure a FinalizationCommand: which messages it prints and the logfile setting. */
+export type FinalizationCommandOptions = {
+ /**
+ * When true, show the agent follow-up message ("run npx storybook ai setup") instead of the
+ * normal success message.
+ */
+ showAgentFollowUp: boolean;
+ /** When true, show the "paste this prompt to your AI agent" instructions. */
+ showAiInstructions: boolean;
+ /** Forwarded as-is to `logTracker.writeToFile` when the debug logs are written. */
+ logfile: string | boolean | undefined;
+};
+
type ExecuteFinalizationParams = {
storybookCommand?: string | null;
};
@@ -22,7 +30,8 @@ type ExecuteFinalizationParams = {
* - Show next steps
*/
export class FinalizationCommand {
- constructor(private logfile: string | boolean | undefined) {}
+ constructor(private options: FinalizationCommandOptions) {}
+
/** Execute finalization steps */
async execute({ storybookCommand }: ExecuteFinalizationParams): Promise {
// Update .gitignore
@@ -69,33 +78,52 @@ export class FinalizationCommand {
this.printNextSteps(storybookCommand);
try {
- const logFile = await logTracker.writeToFile(this.logfile);
+ const logFile = await logTracker.writeToFile(this.options.logfile);
logger.warn(`Debug logs are written to: ${logFile}`);
} catch {}
}
/** Print success message with feature summary */
private printSuccessMessage(storybookCommand?: string | null): void {
- logger.step(CLI_COLORS.success('Storybook was successfully installed in your project!'));
+ if (this.options.showAgentFollowUp) {
+ logger.step(
+ CLI_COLORS.storybook(
+ dedent`Storybook is installed but is not entirely set up yet.
+ To finish setting up, now run \`npx storybook ai setup\` and follow its instructions precisely.`
+ )
+ );
+ } else {
+ logger.step(CLI_COLORS.success('Storybook was successfully installed in your project!'));
+ }
this.printNextSteps(storybookCommand);
}
private printNextSteps(storybookCommand?: string | null): void {
if (storybookCommand) {
- logger.log(
- `To run Storybook manually, run ${CLI_COLORS.cta(storybookCommand)}. CTRL+C to stop.`
- );
+ logger.log(`To run Storybook, run ${CLI_COLORS.cta(storybookCommand)}. CTRL+C to stop.`);
}
logger.log(dedent`
Want to learn more about Storybook? ${CLI_COLORS.cta('https://storybook.js.org/')}
Having trouble or want to chat? ${CLI_COLORS.cta('https://discord.gg/storybook/')}
`);
+
+ if (this.options.showAiInstructions) {
+ logger.step(dedent`To finalize setting up with AI, paste this prompt to your AI agent:
+
+ ${CLI_COLORS.storybook(`Run \`npx storybook ai setup\` and follow its instructions precisely.`)}
+ `);
+ }
}
}
+
export const executeFinalization = ({
logfile,
+ showAgentFollowUp,
+ showAiInstructions,
...params
-}: ExecuteFinalizationParams & { logfile: string | boolean | undefined }) => {
- return new FinalizationCommand(logfile).execute(params);
+}: ExecuteFinalizationParams & FinalizationCommandOptions) => {
+ return new FinalizationCommand({ logfile, showAgentFollowUp, showAiInstructions }).execute(
+ params
+ );
};
diff --git a/code/lib/create-storybook/src/commands/PreflightCheckCommand.test.ts b/code/lib/create-storybook/src/commands/PreflightCheckCommand.test.ts
index 8277831390d1..829e61fc50be 100644
--- a/code/lib/create-storybook/src/commands/PreflightCheckCommand.test.ts
+++ b/code/lib/create-storybook/src/commands/PreflightCheckCommand.test.ts
@@ -107,6 +107,15 @@ describe('PreflightCheckCommand', () => {
});
});
+ it('should log the detected package manager', async () => {
+ mockPackageManager.type = PackageManagerName.YARN2;
+ vi.mocked(scaffoldModule.currentDirectoryIsEmpty).mockReturnValue(false);
+
+ await command.execute({ force: false } as any);
+
+ // YARN2 is expected to be reported under its human-friendly label.
+ const infoSpy = vi.mocked(logger.info);
+ expect(infoSpy).toHaveBeenCalledWith('Package manager: Yarn Berry');
+ });
+
it('should warn when package.json name is "storybook"', async () => {
vi.mocked(scaffoldModule.currentDirectoryIsEmpty).mockReturnValue(false);
mockPackageManager.primaryPackageJson = { packageJson: { name: 'storybook' } };
diff --git a/code/lib/create-storybook/src/commands/PreflightCheckCommand.ts b/code/lib/create-storybook/src/commands/PreflightCheckCommand.ts
index b7ad70e1aa3c..e6d06b5ef091 100644
--- a/code/lib/create-storybook/src/commands/PreflightCheckCommand.ts
+++ b/code/lib/create-storybook/src/commands/PreflightCheckCommand.ts
@@ -18,6 +18,15 @@ export interface PreflightCheckResult {
isEmptyProject: boolean;
}
+/**
+ * Human-friendly labels for each package manager type.
+ *
+ * Typed as a total `Record` over `PackageManagerName` so that a package manager added to the
+ * enum without a label here is reported by the compiler.
+ */
+const PACKAGE_MANAGER_LABEL: Record<PackageManagerName, string> = {
+ [PackageManagerName.NPM]: 'npm',
+ [PackageManagerName.YARN1]: 'Yarn Classic (v1)',
+ [PackageManagerName.YARN2]: 'Yarn Berry',
+ [PackageManagerName.PNPM]: 'pnpm',
+ [PackageManagerName.BUN]: 'Bun',
+};
+
/**
* Command for running preflight checks before Storybook initialization
*
@@ -63,6 +72,10 @@ export class PreflightCheckCommand {
force: options.packageManager,
});
+ logger.info(
+ `Package manager: ${PACKAGE_MANAGER_LABEL[packageManager.type] ?? packageManager.type}`
+ );
+
// Install base project dependencies if we scaffolded a new project
if (isEmptyDirProject && !options.skipInstall) {
await packageManager.installDependencies();
diff --git a/code/lib/create-storybook/src/commands/UserPreferencesCommand.test.ts b/code/lib/create-storybook/src/commands/UserPreferencesCommand.test.ts
index 3c2bcc39955a..74c0205aa8f1 100644
--- a/code/lib/create-storybook/src/commands/UserPreferencesCommand.test.ts
+++ b/code/lib/create-storybook/src/commands/UserPreferencesCommand.test.ts
@@ -1,16 +1,15 @@
-import { beforeEach, describe, expect, it, vi } from 'vitest';
+import { afterAll, beforeEach, describe, expect, it, vi } from 'vitest';
import { AddonVitestService, ProjectType, globalSettings } from 'storybook/internal/cli';
-import type { JsPackageManager } from 'storybook/internal/common';
import { PackageManagerName, isCI } from 'storybook/internal/common';
import { logger, prompt } from 'storybook/internal/node-logger';
-import type { SupportedBuilder } from 'storybook/internal/types';
+import type { SupportedBuilder, SupportedRenderer } from 'storybook/internal/types';
import { Feature } from 'storybook/internal/types';
import type { CommandOptions } from '../generators/types.ts';
import { FeatureCompatibilityService } from '../services/FeatureCompatibilityService.ts';
import { TelemetryService } from '../services/TelemetryService.ts';
-import { UserPreferencesCommand } from './UserPreferencesCommand.ts';
+import { UserPreferencesCommand, executeUserPreferences } from './UserPreferencesCommand.ts';
vi.mock('storybook/internal/cli', { spy: true });
vi.mock('storybook/internal/common', { spy: true });
@@ -22,13 +21,33 @@ interface CommandWithPrivates {
telemetryService: {
trackNewUserCheck: ReturnType;
trackInstallType: ReturnType;
+ trackAiSetupNudge: ReturnType;
};
- featureService: { validateTestFeatureCompatibility: ReturnType };
}
describe('UserPreferencesCommand', () => {
let command: UserPreferencesCommand;
- const mockPackageManager = {} as Partial as JsPackageManager;
+ const originalIsTTYDescriptor = Object.getOwnPropertyDescriptor(process.stdout, 'isTTY');
+
+ const defaultExecuteOptions = {
+ framework: null as null,
+ builder: 'vite' as SupportedBuilder,
+ renderer: 'react' as SupportedRenderer,
+ projectType: ProjectType.REACT,
+ isTestFeatureAvailable: true,
+ isAiSetupAvailable: false,
+ };
+
+ afterAll(() => {
+ if (originalIsTTYDescriptor) {
+ Object.defineProperty(process.stdout, 'isTTY', originalIsTTYDescriptor);
+ } else {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: undefined,
+ configurable: true,
+ });
+ }
+ });
beforeEach(() => {
// Provide required CommandOptions to avoid undefined access
@@ -37,7 +56,7 @@ describe('UserPreferencesCommand', () => {
disableTelemetry: true,
};
- command = new UserPreferencesCommand(commandOptions, mockPackageManager);
+ command = new UserPreferencesCommand(commandOptions);
// Mock AddonVitestService
const mockAddonVitestService = vi.fn().mockImplementation(() => ({
@@ -74,15 +93,11 @@ describe('UserPreferencesCommand', () => {
const mockTelemetryService = {
trackNewUserCheck: vi.fn(),
trackInstallType: vi.fn(),
- };
-
- const mockFeatureService = {
- validateTestFeatureCompatibility: vi.fn().mockResolvedValue({ compatible: true }),
+ trackAiSetupNudge: vi.fn(),
};
// Inject mocked services
(command as unknown as CommandWithPrivates).telemetryService = mockTelemetryService;
- (command as unknown as CommandWithPrivates).featureService = mockFeatureService;
// Mock logger and prompt
vi.mocked(logger.intro).mockImplementation(() => {});
@@ -91,15 +106,28 @@ describe('UserPreferencesCommand', () => {
vi.mocked(logger.log).mockImplementation(() => {});
vi.mocked(isCI).mockReturnValue(false);
+ // Reset isTTY to avoid leaking between tests
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: undefined,
+ configurable: true,
+ });
+
+ // Reset sandbox env to avoid leaking between tests (or from the actual test environment)
+ delete process.env.IN_STORYBOOK_SANDBOX;
+
vi.clearAllMocks();
+
+ // clearAllMocks only clears call history and keeps implementations, so queued
+ // mockResolvedValueOnce values could leak between tests; fully reset the prompt mocks.
+ vi.mocked(prompt.select).mockReset();
+ vi.mocked(prompt.confirm).mockReset();
});
describe('execute', () => {
it('should return recommended config for new users in non-interactive mode', async () => {
const result = await command.execute({
- framework: null,
- builder: 'vite' as SupportedBuilder,
- projectType: ProjectType.REACT,
+ ...defaultExecuteOptions,
+ isTestFeatureAvailable: true,
});
expect(result.newUser).toBe(true);
@@ -110,15 +138,14 @@ describe('UserPreferencesCommand', () => {
it('should prompt for new user in interactive mode', async () => {
// Mock TTY
- Object.defineProperty(process.stdout, 'isTTY', { value: true, configurable: true });
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
vi.mocked(prompt.select).mockResolvedValueOnce(true); // new user
- const result = await command.execute({
- framework: null,
- builder: 'vite' as SupportedBuilder,
- projectType: ProjectType.REACT,
- });
+ const result = await command.execute(defaultExecuteOptions);
expect(prompt.select).toHaveBeenCalledWith(
expect.objectContaining({
@@ -131,84 +158,320 @@ describe('UserPreferencesCommand', () => {
});
it('should prompt for install type when not a new user', async () => {
- Object.defineProperty(process.stdout, 'isTTY', { value: true, configurable: true });
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
vi.mocked(prompt.select)
.mockResolvedValueOnce(false) // not new user
.mockResolvedValueOnce('light'); // minimal install
+ const result = await command.execute(defaultExecuteOptions);
+
+ expect(result.selectedFeatures.has(Feature.TEST)).toBe(false);
+ expect(result.selectedFeatures.has(Feature.DOCS)).toBe(false);
+ expect(result.selectedFeatures.has(Feature.ONBOARDING)).toBe(false);
+ });
+
+ it('should remove test feature if isTestFeatureAvailable is false', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
+
+ vi.mocked(prompt.select).mockResolvedValueOnce(true); // new user
+
const result = await command.execute({
- framework: null,
- builder: 'vite' as SupportedBuilder,
- projectType: ProjectType.REACT,
+ ...defaultExecuteOptions,
+ isTestFeatureAvailable: false,
});
- expect(prompt.select).toHaveBeenCalledTimes(2);
- expect(result.newUser).toBe(false);
- const telemetryService = (command as unknown as CommandWithPrivates).telemetryService;
- expect(telemetryService.trackInstallType).toHaveBeenCalledWith('light');
+ expect(result.selectedFeatures.has(Feature.TEST)).toBe(false);
+ expect(result.selectedFeatures.has(Feature.DOCS)).toBe(true);
+ expect(result.selectedFeatures.has(Feature.ONBOARDING)).toBe(true);
+ });
+ });
+
+ describe('isTestFeatureAvailable option', () => {
+ it('should include test feature when isTestFeatureAvailable=true in recommended install', async () => {
+ const executeOptions = { ...defaultExecuteOptions, isTestFeatureAvailable: true };
+
+ const { selectedFeatures } = await command.execute(executeOptions);
+
+ expect(selectedFeatures.has(Feature.TEST)).toBe(true);
+ });
+
+ it('should NOT include test feature when isTestFeatureAvailable=false in recommended install', async () => {
+ const executeOptions = { ...defaultExecuteOptions, isTestFeatureAvailable: false };
+
+ const { selectedFeatures } = await command.execute(executeOptions);
+
+ expect(selectedFeatures.has(Feature.TEST)).toBe(false);
+ // The remaining recommended features are unaffected by the missing test support
+ expect(selectedFeatures.has(Feature.DOCS)).toBe(true);
+ expect(selectedFeatures.has(Feature.A11Y)).toBe(true);
+ });
+ });
+
+ describe('AI setup prompt', () => {
+ it('should include AI feature when user accepts AI setup in interactive mode', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
+
+ vi.mocked(prompt.select).mockResolvedValueOnce(true); // new user
+ vi.mocked(prompt.confirm).mockResolvedValueOnce(true); // AI setup: yes
+
+ const result = await command.execute({
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
+ });
+
+ expect(prompt.confirm).toHaveBeenCalledWith(
+ expect.objectContaining({
+ message: expect.stringContaining(
+ 'Would you like to install AI features (MCP addon, skills and prompt suggestions)?'
+ ),
+ })
+ );
+ expect(result.selectedFeatures.has(Feature.AI)).toBe(true);
+ });
+
+ it('should not include ONBOARDING feature when user accepts AI setup', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
+
+ vi.mocked(prompt.select).mockResolvedValueOnce(true); // new user
+ vi.mocked(prompt.confirm).mockResolvedValueOnce(true); // AI setup: yes
+
+ const result = await command.execute({
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
+ });
+
+ expect(result.selectedFeatures.has(Feature.AI)).toBe(true);
+ expect(result.selectedFeatures.has(Feature.ONBOARDING)).toBe(false);
+ });
+
+ it('should include ONBOARDING when AI is selected inside a sandbox (IN_STORYBOOK_SANDBOX)', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
+ vi.mocked(prompt.select).mockResolvedValueOnce(true); // new user
+ vi.mocked(prompt.confirm).mockResolvedValueOnce(true); // AI setup: yes
+
+ const oldInSandbox = process.env.IN_STORYBOOK_SANDBOX;
+ process.env.IN_STORYBOOK_SANDBOX = 'true';
+
+ try {
+ const result = await command.execute({
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
+ });
+
+ expect(result.selectedFeatures.has(Feature.AI)).toBe(true);
+ expect(result.selectedFeatures.has(Feature.ONBOARDING)).toBe(true);
+ } finally {
+ if (oldInSandbox !== undefined) {
+ process.env.IN_STORYBOOK_SANDBOX = oldInSandbox;
+ } else {
+ delete process.env.IN_STORYBOOK_SANDBOX;
+ }
+ }
+ });
+
+ it('should not include AI feature when user declines AI setup', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
+
+ vi.mocked(prompt.select).mockResolvedValueOnce(true); // new user
+ vi.mocked(prompt.confirm).mockResolvedValueOnce(false); // AI setup: no
+
+ const result = await command.execute({
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
+ });
+
+ expect(result.selectedFeatures.has(Feature.AI)).toBe(false);
+ });
+
+ it('should default AI to true when prompts are skipped (non-interactive)', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: undefined,
+ configurable: true,
+ });
+
+ const result = await command.execute({
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
+ });
+
+ expect(prompt.confirm).not.toHaveBeenCalled();
+ expect(result.selectedFeatures.has(Feature.AI)).toBe(true);
+ });
+
+ it('should default AI to true when --yes flag is used', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
+
+ const commandOptions: CommandOptions = {
+ packageManager: PackageManagerName.NPM,
+ disableTelemetry: true,
+ yes: true,
+ };
+ const yesCommand = new UserPreferencesCommand(commandOptions);
+
+ // Inject mocked services
+ (yesCommand as unknown as CommandWithPrivates).telemetryService = {
+ trackNewUserCheck: vi.fn(),
+ trackInstallType: vi.fn(),
+ trackAiSetupNudge: vi.fn(),
+ };
+
+ const result = await yesCommand.execute({
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
+ });
+
+ expect(prompt.confirm).not.toHaveBeenCalled();
+ expect(result.selectedFeatures.has(Feature.AI)).toBe(true);
+ });
+
+ it('should not prompt for AI setup when isAiSetupAvailable is false', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
+
+ vi.mocked(prompt.select).mockResolvedValueOnce(true); // new user
+
+ const result = await command.execute({
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: false,
+ });
+
+ expect(prompt.confirm).not.toHaveBeenCalled();
+ expect(result.selectedFeatures.has(Feature.AI)).toBe(false);
});
- it('should not include test feature in minimal install', async () => {
- Object.defineProperty(process.stdout, 'isTTY', { value: true, configurable: true });
+ it('should include test feature in minimal installs when user accepts AI setup', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
+
+ vi.mocked(prompt.select)
+ .mockResolvedValueOnce(false) // not new user
+ .mockResolvedValueOnce('light'); // minimal install
+ vi.mocked(prompt.confirm).mockResolvedValueOnce(true); // AI setup: yes
+
+ const result = await command.execute({
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
+ });
+
+ expect(result.selectedFeatures.has(Feature.AI)).toBe(true);
+ expect(result.selectedFeatures.has(Feature.TEST)).toBe(true);
+ // Other recommended features should NOT be present with light install
+ expect(result.selectedFeatures.has(Feature.DOCS)).toBe(false);
+ });
+
+ it('should not include test feature in minimal installs when user declines AI setup', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
vi.mocked(prompt.select)
.mockResolvedValueOnce(false) // not new user
.mockResolvedValueOnce('light'); // minimal install
+ vi.mocked(prompt.confirm).mockResolvedValueOnce(false); // AI setup: no
const result = await command.execute({
- framework: null,
- builder: 'vite' as SupportedBuilder,
- projectType: ProjectType.REACT,
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
});
+ expect(result.selectedFeatures.has(Feature.AI)).toBe(false);
expect(result.selectedFeatures.has(Feature.TEST)).toBe(false);
expect(result.selectedFeatures.has(Feature.DOCS)).toBe(false);
- expect(result.selectedFeatures.has(Feature.ONBOARDING)).toBe(false);
});
- it('should validate test feature compatibility in interactive mode', async () => {
- Object.defineProperty(process.stdout, 'isTTY', { value: true, configurable: true });
+ it('should track ai-prompt-nudge telemetry when user accepts AI setup', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
vi.mocked(prompt.select).mockResolvedValueOnce(true); // new user
- const featureService = (command as unknown as CommandWithPrivates).featureService;
- vi.mocked(featureService.validateTestFeatureCompatibility).mockResolvedValue({
- compatible: true,
- });
+ vi.mocked(prompt.confirm).mockResolvedValueOnce(true); // AI setup: yes
await command.execute({
- framework: null,
- builder: 'vite' as SupportedBuilder,
- projectType: ProjectType.REACT,
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
});
- expect(featureService.validateTestFeatureCompatibility).toHaveBeenCalledWith(
- null,
- 'vite',
- process.cwd()
- );
+ const telemetryService = (command as unknown as CommandWithPrivates).telemetryService;
+ expect(telemetryService.trackAiSetupNudge).toHaveBeenCalledWith({ skipPrompt: false });
});
- it('should remove test feature if user chooses to continue without it', async () => {
- Object.defineProperty(process.stdout, 'isTTY', { value: true, configurable: true });
+ it('should not track ai-prompt-nudge telemetry when user declines AI setup', async () => {
+ Object.defineProperty(process.stdout, 'isTTY', {
+ value: true,
+ configurable: true,
+ });
vi.mocked(prompt.select).mockResolvedValueOnce(true); // new user
- const featureService = (command as unknown as CommandWithPrivates).featureService;
- vi.mocked(featureService.validateTestFeatureCompatibility).mockResolvedValue({
- compatible: false,
- reasons: ['React version is too old'],
+ vi.mocked(prompt.confirm).mockResolvedValueOnce(false); // AI setup: no
+
+ await command.execute({
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
});
- vi.mocked(prompt.confirm).mockResolvedValueOnce(true); // continue without test
+ const telemetryService = (command as unknown as CommandWithPrivates).telemetryService;
+ expect(telemetryService.trackAiSetupNudge).not.toHaveBeenCalled();
+ });
+
+ it('should track ai-prompt-nudge telemetry when AI is auto-accepted in non-interactive mode', async () => {
+ // Non-interactive (no TTY) with AI available — auto-accepts
const result = await command.execute({
- framework: null,
- builder: 'vite' as SupportedBuilder,
- projectType: ProjectType.REACT,
+ ...defaultExecuteOptions,
+ isAiSetupAvailable: true,
});
- expect(result.selectedFeatures.has(Feature.TEST)).toBe(false);
- expect(result.selectedFeatures.has(Feature.DOCS)).toBe(true);
- expect(result.selectedFeatures.has(Feature.ONBOARDING)).toBe(true);
+ expect(result.selectedFeatures.has(Feature.AI)).toBe(true);
+ const telemetryService = (command as unknown as CommandWithPrivates).telemetryService;
+ expect(telemetryService.trackAiSetupNudge).toHaveBeenCalledWith({ skipPrompt: true });
+ });
+ });
+
+ describe('executeUserPreferences helper', () => {
+ it('should return a valid result', async () => {
+ const options: CommandOptions = {
+ disableTelemetry: true,
+ packageManager: PackageManagerName.NPM,
+ };
+
+ const { newUser, selectedFeatures } = await executeUserPreferences({
+ ...defaultExecuteOptions,
+ options,
+ });
+
+ // Both fields of the result must be populated
+ expect(selectedFeatures).toBeDefined();
+ expect(newUser).toBeDefined();
+ });
+ });
});
diff --git a/code/lib/create-storybook/src/commands/UserPreferencesCommand.ts b/code/lib/create-storybook/src/commands/UserPreferencesCommand.ts
index f135e8a01f1e..1a67b7197ca2 100644
--- a/code/lib/create-storybook/src/commands/UserPreferencesCommand.ts
+++ b/code/lib/create-storybook/src/commands/UserPreferencesCommand.ts
@@ -1,8 +1,12 @@
import type { ProjectType } from 'storybook/internal/cli';
import { globalSettings } from 'storybook/internal/cli';
-import { type JsPackageManager, isCI } from 'storybook/internal/common';
+import { isCI } from 'storybook/internal/common';
import { logger, prompt } from 'storybook/internal/node-logger';
-import type { SupportedBuilder, SupportedFramework } from 'storybook/internal/types';
+import type {
+ SupportedBuilder,
+ SupportedFramework,
+ SupportedRenderer,
+} from 'storybook/internal/types';
import { Feature } from 'storybook/internal/types';
import picocolors from 'picocolors';
@@ -28,7 +32,10 @@ export interface UserPreferencesOptions {
skipPrompt?: boolean;
framework: SupportedFramework | null;
builder: SupportedBuilder;
+ renderer: SupportedRenderer;
projectType: ProjectType;
+ isTestFeatureAvailable: boolean;
+ isAiSetupAvailable: boolean;
}
/**
@@ -45,8 +52,6 @@ export interface UserPreferencesOptions {
export class UserPreferencesCommand {
constructor(
private readonly commandOptions: CommandOptions,
- packageManager: JsPackageManager,
- private readonly featureService = new FeatureCompatibilityService(packageManager),
private readonly telemetryService = new TelemetryService()
) {}
@@ -56,11 +61,6 @@ export class UserPreferencesCommand {
const isInteractive = process.stdout.isTTY && !isCI();
const skipPrompt = !isInteractive || !!this.commandOptions.yes;
- const isTestFeatureAvailable = await this.isTestFeatureAvailable(
- options.framework,
- options.builder
- );
-
// Get new user preference
const newUser = await this.promptNewUser(skipPrompt);
@@ -76,14 +76,18 @@ export class UserPreferencesCommand {
// Get install type
const installType: InstallType =
!newUser && !this.commandOptions.features
- ? await this.promptInstallType(skipPrompt, isTestFeatureAvailable)
+ ? await this.promptInstallType(skipPrompt, options.isTestFeatureAvailable)
: 'recommended';
+ // Ask about AI setup (only available for compatible projects, e.g. React + Vite)
+ const useAiForSetup = options.isAiSetupAvailable ? await this.promptAiSetup(skipPrompt) : false;
+
const selectedFeatures = this.determineFeatures(
installType,
newUser,
- isTestFeatureAvailable,
- options.projectType
+ options.isTestFeatureAvailable,
+ options.projectType,
+ useAiForSetup
);
return { newUser, selectedFeatures };
@@ -184,7 +188,8 @@ export class UserPreferencesCommand {
installType: InstallType,
newUser: boolean,
isTestFeatureAvailable: boolean,
- projectType: ProjectType
+ projectType: ProjectType,
+ useAiForSetup: boolean
): Set {
const features = new Set();
@@ -200,28 +205,44 @@ export class UserPreferencesCommand {
}
}
+ // If user has asked for AI setup, we provide the MCP addon and ensure test is included
+ if (useAiForSetup) {
+ features.add(Feature.AI);
+ if (isTestFeatureAvailable) {
+ features.add(Feature.TEST);
+ }
+
+ // We leave onboarding for sandboxes as we test onboarding in CI
+ if (!process.env.IN_STORYBOOK_SANDBOX) {
+ features.delete(Feature.ONBOARDING);
+ }
+ }
+
return features;
}
- /** Validate test feature compatibility and prompt user if issues found */
- private async isTestFeatureAvailable(
- framework: SupportedFramework | null,
- builder: SupportedBuilder
- ): Promise {
- const result = await this.featureService.validateTestFeatureCompatibility(
- framework,
- builder,
- process.cwd()
- );
+ /**
+ * Decide whether AI features should be installed.
+ *
+ * When `skipPrompt` is true the question is not asked and AI setup is accepted automatically;
+ * otherwise the user is asked to confirm. Every acceptance is reported through
+ * `trackAiSetupNudge`, together with whether the prompt was skipped.
+ *
+ * @param skipPrompt - True when the confirmation prompt must not be shown.
+ * @returns Whether AI features were accepted.
+ */
+ private async promptAiSetup(skipPrompt: boolean): Promise<boolean> {
+ const useAi = skipPrompt
+ ? true
+ : await prompt.confirm({
+ message:
+ 'Would you like to install AI features (MCP addon, skills and prompt suggestions)?',
+ });
+
+ if (useAi) {
+ await this.telemetryService.trackAiSetupNudge({ skipPrompt });
+ }
- return result.compatible;
+ return useAi;
 }
}
export const executeUserPreferences = ({
options,
- packageManager,
...restOptions
-}: UserPreferencesOptions & { options: CommandOptions; packageManager: JsPackageManager }) => {
- return new UserPreferencesCommand(options, packageManager).execute(restOptions);
+}: UserPreferencesOptions & {
+ options: CommandOptions;
+}) => {
+ return new UserPreferencesCommand(options).execute(restOptions);
};
diff --git a/code/lib/create-storybook/src/generators/types.ts b/code/lib/create-storybook/src/generators/types.ts
index 71d43828bc5d..6854a5d8d3c3 100644
--- a/code/lib/create-storybook/src/generators/types.ts
+++ b/code/lib/create-storybook/src/generators/types.ts
@@ -121,11 +121,13 @@ export type CommandOptions = {
features?: Array;
type?: ProjectType;
force?: any;
+ /** Whether this is being run via an ai agent */
+ agent?: boolean;
html?: boolean;
skipInstall?: boolean;
language?: SupportedLanguage;
parser?: string;
- // Automatically answer yes to prompts
+ /** Automatically answer yes to prompts */
yes?: boolean;
builder?: SupportedBuilder;
linkable?: boolean;
diff --git a/code/lib/create-storybook/src/initiate.ts b/code/lib/create-storybook/src/initiate.ts
index 10c5a9725244..c69d205a7eed 100644
--- a/code/lib/create-storybook/src/initiate.ts
+++ b/code/lib/create-storybook/src/initiate.ts
@@ -2,10 +2,18 @@ import { ProjectType } from 'storybook/internal/cli';
import {
type JsPackageManager,
PackageManagerName,
+ cache,
executeCommand,
} from 'storybook/internal/common';
import { getServerPort, withTelemetry } from 'storybook/internal/core-server';
import { logTracker, logger } from 'storybook/internal/node-logger';
+import { telemetry } from 'storybook/internal/telemetry';
+import { Feature } from 'storybook/internal/types';
+import type {
+ SupportedBuilder,
+ SupportedFramework,
+ SupportedRenderer,
+} from 'storybook/internal/types';
import {
executeAddonConfiguration,
@@ -23,6 +31,32 @@ import type { CommandOptions } from './generators/types.ts';
import { FeatureCompatibilityService } from './services/FeatureCompatibilityService.ts';
import { TelemetryService } from './services/TelemetryService.ts';
+/**
+ * Work out which optional features the detected project supports.
+ *
+ * Test-feature compatibility comes from a `FeatureCompatibilityService` built for the given
+ * package manager, called with the framework, the builder and the current working directory.
+ * AI setup support comes from the static `supportsAISetupFeature` check.
+ */
+async function checkFeatureSupport(
+ packageManager: JsPackageManager,
+ framework: SupportedFramework | null,
+ builder: SupportedBuilder,
+ renderer: SupportedRenderer
+): Promise<{
+ isTestFeatureAvailable: boolean;
+ isAiSetupAvailable: boolean;
+}> {
+ const { compatible: isTestFeatureAvailable } = await new FeatureCompatibilityService(
+ packageManager
+ ).validateTestFeatureCompatibility(framework, builder, process.cwd());
+
+ const isAiSetupAvailable = FeatureCompatibilityService.supportsAISetupFeature(
+ renderer,
+ builder,
+ framework
+ );
+
+ return { isTestFeatureAvailable, isAiSetupAvailable };
+}
+
/**
* Main entry point for Storybook initialization
*
@@ -39,6 +73,10 @@ export async function doInitiate(options: CommandOptions): Promise<
}
| { shouldRunDev: false }
> {
+ if (options.agent) {
+ options.yes = true;
+ }
+
// Initialize services
const telemetryService = new TelemetryService();
@@ -48,7 +86,7 @@ export async function doInitiate(options: CommandOptions): Promise<
let dependencyCollector: DependencyCollector | null = new DependencyCollector();
// Step 1: Run preflight checks
- const { packageManager } = await executePreflightCheck(options);
+ const { packageManager, isEmptyProject } = await executePreflightCheck(options);
// Step 2: Detect project type
const { projectType, language } = await executeProjectDetection(packageManager, options);
@@ -61,12 +99,23 @@ export async function doInitiate(options: CommandOptions): Promise<
);
// Step 4: Get user preferences and feature selections (with framework/builder for validation)
- const { newUser, selectedFeatures } = await executeUserPreferences({
+ const { isTestFeatureAvailable, isAiSetupAvailable } = await checkFeatureSupport(
packageManager,
+ framework,
+ builder,
+ renderer
+ );
+
+ const { newUser, selectedFeatures } = await executeUserPreferences({
options,
framework,
builder,
+ renderer,
projectType,
+ isTestFeatureAvailable,
+ // Skip AI feature recommendation when scaffolding into an empty directory,
+ // since the user hasn't yet committed to a project setup where AI tooling adds value.
+ isAiSetupAvailable: isAiSetupAvailable && !isEmptyProject,
});
// Step 5: Execute generator with dependency collector (now with frameworkInfo)
@@ -102,7 +151,15 @@ export async function doInitiate(options: CommandOptions): Promise<
});
// Step 8: Print final summary
+ const hasAiFeature = selectedFeatures.has(Feature.AI);
+ if (hasAiFeature) {
+ // Record the init-time AI opt-in in the telemetry event cache so the server can gate
+ // AI-related UI (checklist item, analytics) via the universal checklist store.
+ await telemetry('ai-init-opt-in', {}).catch(() => {});
+ }
await executeFinalization({
+ showAgentFollowUp: !!options.agent && hasAiFeature,
+ showAiInstructions: hasAiFeature,
logfile: options.logfile,
storybookCommand,
});
@@ -110,6 +167,11 @@ export async function doInitiate(options: CommandOptions): Promise<
// Step 9: Track telemetry
await telemetryService.trackInitWithContext(projectType, selectedFeatures, newUser);
+ // Signal dev to redirect to onboarding on first run
+ if (selectedFeatures.has(Feature.ONBOARDING)) {
+ await cache.set('onboarding-pending', true).catch(() => {});
+ }
+
return {
shouldRunDev:
!!options.dev &&
@@ -154,7 +216,8 @@ export async function initiate(options: CommandOptions): Promise {
handleCommandFailure(options.logfile);
});
- if (initiateResult?.shouldRunDev) {
+ // Launch dev server only if --dev was explicitly passed and we are not running in agent mode
+ if (!options.agent && initiateResult?.shouldRunDev) {
await runStorybookDev(initiateResult);
}
}
@@ -164,17 +227,14 @@ async function runStorybookDev(result: {
projectType: ProjectType;
packageManager: JsPackageManager;
storybookCommand?: string | null;
- shouldOnboard: boolean;
}): Promise {
- const { projectType, packageManager, storybookCommand, shouldOnboard } = result;
+ const { projectType, packageManager, storybookCommand } = result;
if (!storybookCommand) {
return;
}
try {
- const supportsOnboarding = FeatureCompatibilityService.supportsOnboarding(projectType);
-
const parts = storybookCommand.split(' ');
// Angular CLI throws "Unknown argument: silent"
@@ -203,10 +263,6 @@ async function runStorybookDev(result: {
parts.push(`-p`, `${availablePort}`);
}
- if (supportsOnboarding && shouldOnboard) {
- parts.push('--initial-path=/onboarding');
- }
-
parts.push('--quiet');
}
diff --git a/code/lib/create-storybook/src/services/AddonService.ts b/code/lib/create-storybook/src/services/AddonService.ts
index 3d7603795959..b82c64bbee4c 100644
--- a/code/lib/create-storybook/src/services/AddonService.ts
+++ b/code/lib/create-storybook/src/services/AddonService.ts
@@ -43,6 +43,10 @@ export class AddonService {
addons.push('@storybook/addon-onboarding');
}
+ if (features.has(Feature.AI)) {
+ addons.push('@storybook/addon-mcp');
+ }
+
return addons;
}
diff --git a/code/lib/create-storybook/src/services/FeatureCompatibilityService.test.ts b/code/lib/create-storybook/src/services/FeatureCompatibilityService.test.ts
index 71522f775554..6d2affc0988c 100644
--- a/code/lib/create-storybook/src/services/FeatureCompatibilityService.test.ts
+++ b/code/lib/create-storybook/src/services/FeatureCompatibilityService.test.ts
@@ -2,7 +2,7 @@ import { beforeEach, describe, expect, it, vi } from 'vitest';
import { AddonVitestService, ProjectType } from 'storybook/internal/cli';
import type { JsPackageManager } from 'storybook/internal/common';
-import { SupportedBuilder, SupportedFramework } from 'storybook/internal/types';
+import { SupportedBuilder, SupportedFramework, SupportedRenderer } from 'storybook/internal/types';
import { FeatureCompatibilityService } from './FeatureCompatibilityService.ts';
@@ -44,6 +44,66 @@ describe('FeatureCompatibilityService', () => {
});
});
+ describe('supportsAISetupFeature', () => {
+ it('should return true for react renderer with vite builder', () => {
+ expect(
+ FeatureCompatibilityService.supportsAISetupFeature(
+ SupportedRenderer.REACT,
+ SupportedBuilder.VITE,
+ SupportedFramework.REACT_VITE
+ )
+ ).toBe(true);
+ });
+
+ it('should return false for vue3 renderer with vite builder', () => {
+ expect(
+ FeatureCompatibilityService.supportsAISetupFeature(
+ SupportedRenderer.VUE3,
+ SupportedBuilder.VITE,
+ SupportedFramework.VUE3_VITE
+ )
+ ).toBe(false);
+ });
+
+ it('should return false for react renderer with webpack5 builder', () => {
+ expect(
+ FeatureCompatibilityService.supportsAISetupFeature(
+ SupportedRenderer.REACT,
+ SupportedBuilder.WEBPACK5,
+ SupportedFramework.REACT_WEBPACK5
+ )
+ ).toBe(false);
+ });
+
+ it('should return false for non-react renderer with non-vite builder', () => {
+ expect(
+ FeatureCompatibilityService.supportsAISetupFeature(
+ SupportedRenderer.ANGULAR,
+ SupportedBuilder.WEBPACK5,
+ SupportedFramework.ANGULAR
+ )
+ ).toBe(false);
+
+ expect(
+ FeatureCompatibilityService.supportsAISetupFeature(
+ SupportedRenderer.SVELTE,
+ SupportedBuilder.WEBPACK5,
+ null
+ )
+ ).toBe(false);
+ });
+
+ it('should return false for react-native-web-vite framework', () => {
+ expect(
+ FeatureCompatibilityService.supportsAISetupFeature(
+ SupportedRenderer.REACT,
+ SupportedBuilder.VITE,
+ SupportedFramework.REACT_NATIVE_WEB_VITE
+ )
+ ).toBe(false);
+ });
+ });
+
describe('validateTestFeatureCompatibility', () => {
let mockValidateCompatibility: ReturnType;
diff --git a/code/lib/create-storybook/src/services/FeatureCompatibilityService.ts b/code/lib/create-storybook/src/services/FeatureCompatibilityService.ts
index 032d73bcaecf..8e8f2a7b611c 100644
--- a/code/lib/create-storybook/src/services/FeatureCompatibilityService.ts
+++ b/code/lib/create-storybook/src/services/FeatureCompatibilityService.ts
@@ -1,6 +1,6 @@
import { AddonVitestService, ProjectType } from 'storybook/internal/cli';
import type { JsPackageManager } from 'storybook/internal/common';
-import type { SupportedBuilder, SupportedFramework } from 'storybook/internal/types';
+import { SupportedBuilder, SupportedFramework, SupportedRenderer } from 'storybook/internal/types';
/** Project types that support the onboarding feature */
const ONBOARDING_PROJECT_TYPES: ProjectType[] = [
@@ -32,6 +32,18 @@ export class FeatureCompatibilityService {
);
}
+ /** Check if AI-assisted setup (storybook ai setup) is supported for this project configuration */
+ static supportsAISetupFeature(
+ renderer: SupportedRenderer,
+ builder: SupportedBuilder,
+ framework: SupportedFramework | null
+ ): boolean {
+ if (framework === SupportedFramework.REACT_NATIVE_WEB_VITE) {
+ return false;
+ }
+ return renderer === SupportedRenderer.REACT && builder === SupportedBuilder.VITE;
+ }
+
/**
* Validate all compatibility checks for test feature
*
diff --git a/code/lib/create-storybook/src/services/TelemetryService.test.ts b/code/lib/create-storybook/src/services/TelemetryService.test.ts
index d87735ed1087..2608cfafbc36 100644
--- a/code/lib/create-storybook/src/services/TelemetryService.test.ts
+++ b/code/lib/create-storybook/src/services/TelemetryService.test.ts
@@ -50,6 +50,7 @@ describe('TelemetryService', () => {
docs: true,
test: false,
onboarding: true,
+ ai: false,
},
newUser: true,
versionSpecifier: '8.0.0',
@@ -71,6 +72,26 @@ describe('TelemetryService', () => {
expect(telemetry).toHaveBeenCalledWith('scaffolded-empty', data);
});
+
+ it('should track ai-prompt-nudge event with context when prompt was shown', async () => {
+ await telemetryService.trackAiSetupNudge({ skipPrompt: false });
+
+ expect(telemetry).toHaveBeenCalledWith('ai-prompt-nudge', {
+ id: 'setup',
+ origin: 'init',
+ context: { skipPrompt: false },
+ });
+ });
+
+ it('should track ai-prompt-nudge event with context when prompt was skipped', async () => {
+ await telemetryService.trackAiSetupNudge({ skipPrompt: true });
+
+ expect(telemetry).toHaveBeenCalledWith('ai-prompt-nudge', {
+ id: 'setup',
+ origin: 'init',
+ context: { skipPrompt: true },
+ });
+ });
});
describe('trackInitWithContext', () => {
@@ -92,6 +113,7 @@ describe('TelemetryService', () => {
docs: true,
test: true,
onboarding: false,
+ ai: false,
},
newUser: true,
versionSpecifier: '8.0.5',
@@ -116,6 +138,7 @@ describe('TelemetryService', () => {
docs: false,
test: false,
onboarding: false,
+ ai: false,
},
newUser: false,
versionSpecifier: undefined,
@@ -138,5 +161,25 @@ describe('TelemetryService', () => {
})
);
});
+
+ describe('when AI feature is selected', () => {
+ beforeEach(() => {
+ vi.mocked(getProcessAncestry).mockReturnValue([]);
+ });
+
+ it('should set ai: true when AI feature is selected', async () => {
+ const telemetryService = new TelemetryService();
+ const selectedFeatures = new Set([Feature.AI]);
+
+ await telemetryService.trackInitWithContext(ProjectType.REACT, selectedFeatures, true);
+
+ expect(telemetry).toHaveBeenCalledWith(
+ 'init',
+ expect.objectContaining({
+ features: expect.objectContaining({ ai: true }),
+ })
+ );
+ });
+ });
});
});
diff --git a/code/lib/create-storybook/src/services/TelemetryService.ts b/code/lib/create-storybook/src/services/TelemetryService.ts
index 897c4bce1111..3c9733af97fe 100644
--- a/code/lib/create-storybook/src/services/TelemetryService.ts
+++ b/code/lib/create-storybook/src/services/TelemetryService.ts
@@ -30,6 +30,15 @@ export class TelemetryService {
});
}
+ /** Track the AI setup nudge during init, recording whether the prompt was skipped */
+ async trackAiSetupNudge(context: { skipPrompt: boolean }): Promise {
+ await telemetry('ai-prompt-nudge', {
+ id: 'setup',
+ origin: 'init',
+ context,
+ });
+ }
+
/** Track Playwright prompt decision (install | skip | aborted) */
async trackPlaywrightPromptDecision(
result: 'installed' | 'skipped' | 'aborted' | 'failed'
@@ -48,6 +57,7 @@ export class TelemetryService {
docs: boolean;
test: boolean;
onboarding: boolean;
+ ai: boolean;
};
newUser: boolean;
versionSpecifier?: string;
@@ -88,6 +98,7 @@ export class TelemetryService {
docs: selectedFeatures.has(Feature.DOCS),
test: selectedFeatures.has(Feature.TEST),
onboarding: selectedFeatures.has(Feature.ONBOARDING),
+ ai: selectedFeatures.has(Feature.AI),
};
await telemetry('init', {
diff --git a/code/tsconfig.json b/code/tsconfig.json
index db4b6c217e44..870835c74b2a 100644
--- a/code/tsconfig.json
+++ b/code/tsconfig.json
@@ -9,11 +9,12 @@
"ignoreDeprecations": "5.0",
"incremental": false,
"isolatedModules": true,
- "allowImportingTsExtensions": true,
"jsx": "preserve",
"lib": ["dom", "dom.iterable", "esnext"],
"module": "Preserve",
"moduleResolution": "bundler",
+ // Required for explicit .ts import extensions — migrating toward native Node TS execution
+ "allowImportingTsExtensions": true,
"noImplicitAny": true,
"noUnusedLocals": false,
"skipLibCheck": true,
diff --git a/docs/_snippets/ai-setup-output.md b/docs/_snippets/ai-setup-output.md
new file mode 100644
index 000000000000..3f4e43400e07
--- /dev/null
+++ b/docs/_snippets/ai-setup-output.md
@@ -0,0 +1,11 @@
+```shell renderer="common" language="js" packageManager="npm"
+npx storybook ai setup --output storybook-setup.md
+```
+
+```shell renderer="common" language="js" packageManager="pnpm"
+pnpm exec storybook ai setup --output storybook-setup.md
+```
+
+```shell renderer="common" language="js" packageManager="yarn"
+yarn exec storybook ai setup --output storybook-setup.md
+```
diff --git a/docs/ai/best-practices.mdx b/docs/ai/best-practices.mdx
index efa101fa403e..5a9a98a397a1 100644
--- a/docs/ai/best-practices.mdx
+++ b/docs/ai/best-practices.mdx
@@ -137,3 +137,4 @@ import { Meta } from '@storybook/addon-docs/blocks';
- [MCP server API](./mcp/api.mdx)
- [Sharing your MCP server](./mcp/sharing.mdx)
- [Manifests](./manifests.mdx)
+- [Agentic setup](./setup.mdx)
diff --git a/docs/ai/index.mdx b/docs/ai/index.mdx
index 5e6efb14d5ba..2e8899484c2a 100644
--- a/docs/ai/index.mdx
+++ b/docs/ai/index.mdx
@@ -90,3 +90,4 @@ These manifests are automatically generated and updated as you work on your Stor
- [Sharing your MCP server](./mcp/sharing.mdx)
- [Best practices for using Storybook with AI](./best-practices.mdx)
- [Manifests](./manifests.mdx)
+- [Agentic setup](./setup.mdx)
diff --git a/docs/ai/manifests.mdx b/docs/ai/manifests.mdx
index 5dad6f60c9a9..5cc4e7d18822 100644
--- a/docs/ai/manifests.mdx
+++ b/docs/ai/manifests.mdx
@@ -298,3 +298,4 @@ import { Meta } from '@storybook/addon-docs/blocks';
- [MCP server API](./mcp/api.mdx)
- [Sharing your MCP server](./mcp/sharing.mdx)
- [Best practices for using Storybook with AI](./best-practices.mdx)
+- [Agentic setup](./setup.mdx)
diff --git a/docs/ai/mcp/api.mdx b/docs/ai/mcp/api.mdx
index dc94a2efaf0e..7a41c2e123ff 100644
--- a/docs/ai/mcp/api.mdx
+++ b/docs/ai/mcp/api.mdx
@@ -78,3 +78,4 @@ The testing toolset includes the [`run-story-tests`](./overview.mdx#run-story-te
- [Sharing your MCP server](./sharing.mdx)
- [Best practices for using Storybook with AI](../best-practices.mdx)
- [Manifests](../manifests.mdx)
+- [Agentic setup](../setup.mdx)
diff --git a/docs/ai/mcp/overview.mdx b/docs/ai/mcp/overview.mdx
index f487fc94e7d4..5688ebc5bb4f 100644
--- a/docs/ai/mcp/overview.mdx
+++ b/docs/ai/mcp/overview.mdx
@@ -175,3 +175,4 @@ The docs toolset relies on the [manifests](../manifests.mdx) generated by Storyb
- [Sharing your MCP server](./sharing.mdx)
- [Best practices for using Storybook with AI](../best-practices.mdx)
- [Manifests](../manifests.mdx)
+- [Agentic setup](../setup.mdx)
diff --git a/docs/ai/mcp/sharing.mdx b/docs/ai/mcp/sharing.mdx
index 633787a464c7..822913624dce 100644
--- a/docs/ai/mcp/sharing.mdx
+++ b/docs/ai/mcp/sharing.mdx
@@ -75,3 +75,4 @@ export async function handleRequest(request: Request): Promise {
- [MCP server API](./api.mdx)
- [Best practices for using Storybook with AI](../best-practices.mdx)
- [Manifests](../manifests.mdx)
+- [Agentic setup](../setup.mdx)
diff --git a/docs/ai/setup.mdx b/docs/ai/setup.mdx
new file mode 100644
index 000000000000..c9955e441ad9
--- /dev/null
+++ b/docs/ai/setup.mdx
@@ -0,0 +1,83 @@
+---
+title: Agentic setup
+sidebar:
+ order: 4
+---
+
+
+
+Storybook's agentic setup is currently only available for projects using the [React](?renderer=react) renderer with the [Vite](../builders/vite.mdx) builder. Support for additional renderers and builders will follow.
+
+The API may change in future releases. We welcome feedback and contributions to help improve this feature.
+
+
+
+
+
+Getting Storybook wired up to an existing application (configuring providers, mocking side effects, and writing the first few real stories) is the kind of repetitive, project-specific work that AI agents are well suited for. The **agentic setup** flow uses the [`storybook ai setup`](../api/cli-options.mdx#ai) command to generate a detailed, project-aware instruction set that an AI agent can follow to make Storybook fully functional in your project.
+
+The command analyzes your Storybook configuration (framework, renderer, builder, language, addons) and produces a Markdown prompt containing step-by-step instructions tailored to your project, covering everything from configuring your preview file to writing and verifying stories. See [what the generated prompt covers](#what-the-generated-prompt-covers) for the full list of steps.
+
+## Starting from `storybook init`
+
+When an agent runs [`storybook init`](../api/cli-options.mdx#init) to add Storybook to a new project, the output instructs the agent to continue with `storybook ai setup`. No extra prompting is needed; the agent will pick up the agentic setup flow automatically.
+
+If Storybook is already installed, you can kick off the flow yourself using one of the two approaches below.
+
+## Agent-initiated setup
+
+In this flow, you ask your agent to run `storybook ai setup`. For example:
+
+```txt
+Use Storybook's agentic setup command to configure Storybook for this project and write some initial stories.
+```
+
+The agent will run the command from your project root, read the generated prompt from stdout, and follow the steps in order: analyzing your codebase, updating `preview.tsx`, and writing stories. After each story, it runs Vitest to verify that the story renders and fixes any failures before moving on.
+
+This flow works best when your agent has access to a terminal in your project (most modern coding agents do). No flags or additional configuration is required. The generated instructions are self-contained.
+
+## User-initiated setup
+
+If you'd rather drive the process yourself, you can run the command manually and hand the output to any agent, even one that can't execute shell commands.
+
+1. From your project root, run:
+
+   <CodeSnippets path="ai-setup-output.md" />
+
+ This writes the instructions to `storybook-setup.md` instead of printing them to your terminal. Omit `--output` to print to stdout and pipe it elsewhere.
+
+2. Open the generated file and paste its contents into your agent's chat, or attach it as context. The prompt is designed to be self-contained: it references your specific `configDir`, framework, and renderer, and links back to the relevant Storybook docs in Markdown form.
+
+3. Let the agent work through the steps. You can review each change as the agent applies it.
+
+Use this flow when you want tighter control over what the agent does, when you're working with an agent that doesn't have shell access, or when you want to save the prompt to reuse across projects.
+
+## What the generated prompt covers
+
+Regardless of how the flow is initiated, the generated Markdown prompt walks the agent through the same ordered steps:
+
+1. **Analyze the codebase:** read providers, global CSS, portals, and data-fetching patterns.
+2. **Configure the [preview](../configure/story-rendering.mdx):** set up [decorators](../writing-stories/decorators.mdx), global styles, and any framework-level providers in `preview.tsx`.
+3. **Support portals:** ensure portal roots exist in the Storybook preview DOM.
+4. **Mock side effects:** intercept [network requests](../writing-stories/mocking-data-and-modules/mocking-network-requests.mdx) (via MSW), storage, timers, and navigation at the preview level rather than per-story.
+5. **Write [stories](../writing-stories/index.mdx):** copy real usage patterns from the app, [tagging](../writing-stories/tags.mdx) generated stories with `ai-generated` and `needs-work` so you can review them later.
+6. **Add [play functions](../writing-stories/play-function.mdx):** implement interaction tests for the most important flows.
+7. **Cover additional patterns:** expand coverage across the components the agent has already touched.
+8. **Verify:** run [Vitest](../writing-tests/integrations/vitest-addon/index.mdx) against every new story to confirm it renders, and run the type checker.
+
+
+
+The command snapshots your `preview` file so that subsequent runs of `storybook dev`, `storybook build`, and `storybook doctor` can detect progress the agent made and report it via telemetry. If you'd prefer not to share this data, pass `--disable-telemetry` (see [telemetry](../configure/telemetry.mdx)).
+
+
+
+## Next steps
+
+Once the agent has completed the setup:
+
+- Connect the [Storybook MCP server](./mcp/overview.mdx) to your agent so it can continue reading manifests, generating stories, and running tests against your live Storybook.
+- Review the stories tagged `ai-generated` and `needs-work`, and remove those tags once you've validated each one.
+- Follow the [best practices](./best-practices.mdx) to make your stories and documentation maximally useful to both humans and agents.
+
+
+{/* End supported renderers */}
diff --git a/docs/api/cli-options.mdx b/docs/api/cli-options.mdx
index 9257662b9ca8..64d1830b5421 100644
--- a/docs/api/cli-options.mdx
+++ b/docs/api/cli-options.mdx
@@ -280,6 +280,49 @@ Options include:
| `--package-manager` | Sets the package manager to use when running the health check.<br />Available package managers include `npm`, `yarn`, and `pnpm`.<br />`storybook doctor --package-manager pnpm` |
| `--debug` | Outputs more logs in the CLI to assist debugging.<br />`storybook doctor --debug` |
+### `ai`
+
+Helpers for AI agents. The `ai` command exposes subcommands that generate AI-friendly instructions for automating Storybook tasks. See the [agentic setup docs](../ai/setup.mdx) for a full walkthrough.
+
+```shell
+storybook ai [options] [command]
+```
+
+Options include:
+
+| Option | Description |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `-h`, `--help` | Output usage information.<br />`storybook ai --help` |
+| `-o`, `--output <path>` | Write the prompt output to a file instead of printing it to stdout.<br />`storybook ai setup --output storybook-setup.md` |
+
+#### `ai setup`
+
+Generates a detailed, project-aware Markdown prompt that instructs an AI agent to configure Storybook in your project and write initial stories for real components. The prompt is built from your detected Storybook configuration (framework, renderer, builder, language, addons) and covers analyzing the codebase, configuring the preview, mocking side effects, writing stories, and verifying them with Vitest.
+
+
+
+`storybook ai setup` is currently only available for projects using the React renderer with the Vite builder.
+
+
+
+```shell
+storybook ai setup [options]
+```
+
+Options include:
+
+| Option | Description |
+| ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `-h`, `--help` | Output usage information.<br />`storybook ai setup --help` |
+| `-c`, `--config-dir [dir-name]` | Storybook configuration directory.<br />`storybook ai setup -c .storybook` |
+| `--package-manager <name>` | Force package manager to use when detecting project information.<br />Available package managers include `npm`, `yarn`, and `pnpm`.<br />`storybook ai setup --package-manager pnpm` |
+| `--disable-telemetry` | Disables Storybook's telemetry. Learn more about it [here](../configure/telemetry.mdx#how-to-opt-out).<br />`storybook ai setup --disable-telemetry` |
+| `--debug` | Outputs more logs in the CLI to assist debugging.<br />`storybook ai setup --debug` |
+| `--loglevel [level]` | Controls level of logging.<br />Available options: `trace`, `debug`, `info` (default), `warn`, `error`, `silent`.<br />`storybook ai setup --loglevel warn` |
+| `--logfile [path]` | Write debug logs to a file.<br />`storybook ai setup --logfile ./sb.log` |
+
+When run without `--output`, the generated prompt is printed to stdout. This is how AI agents typically consume it, by running the command directly and reading the result. When run with `--output`, the prompt is written to the given file path so you can paste or attach it to an agent that doesn't have shell access. See the [agentic setup](../ai/setup.mdx#user-initiated-setup) docs for details on both flows.
+
### `info`
Reports useful debugging information about your environment. Helpful in providing information when opening an issue or a discussion.
diff --git a/scripts/ci/common-jobs.ts b/scripts/ci/common-jobs.ts
index 774ef3012ce4..75ce07d0b364 100644
--- a/scripts/ci/common-jobs.ts
+++ b/scripts/ci/common-jobs.ts
@@ -67,7 +67,7 @@ export const build_linux = defineJob('Build (linux)', (workflowName) => ({
export const fmt = defineJob('Format check', () => ({
executor: {
name: 'sb_node_22_classic',
- class: 'medium+',
+ class: 'xlarge',
},
steps: [
git.checkout(),
diff --git a/scripts/eval/README.md b/scripts/eval/README.md
new file mode 100644
index 000000000000..7e95ad2307fb
--- /dev/null
+++ b/scripts/eval/README.md
@@ -0,0 +1,306 @@
+# Eval Harness
+
+The eval harness benchmarks how well AI coding agents (Claude, Codex) can set up Storybook and write stories for real-world projects. It runs agents against benchmark repos, grades the results, publishes them as draft PRs, and collects the data into a SQLite database for analysis.
+
+## Prerequisites
+
+- **`gh` CLI** — installed and authenticated (`gh auth login`)
+- **Claude Code CLI** and/or **Codex CLI** — installed with an active subscription
+
+## How it works
+
+
+
+The system forms a cycle:
+
+1. **`sync-baselines.ts`** pushes a canonical `.storybook` config to each benchmark repo so every trial starts from the same known-good baseline.
+2. **`eval.ts`** (single trial) or **`run-batch.ts`** (batch) creates a git worktree from a benchmark repo, runs an agent inside it, grades the output, and publishes a draft PR with structured result data.
+3. **`collect-pr-data.ts`** scrapes those draft PRs via the GitHub API and loads the results into a local SQLite database for analysis.
+
+Each trial follows this lifecycle:
+
+1. Clone the benchmark repo (once) and create a lightweight git worktree for the trial
+2. Install dependencies
+3. Run the agent with the selected prompt
+4. Grade the result (build check, TypeScript check, story render pass rate, ghost story coverage)
+5. Compute a quality score (normalized preview gain)
+6. Commit, push, and open a draft PR on the benchmark repo
+
+## Running a single trial
+
+All commands run from the repo root.
+
+```sh
+# Prompt variant is required. Example: pattern-copy-play (the CLI default)
+node scripts/eval/eval.ts -p mealdrop --prompt pattern-copy-play
+
+# Specific agent
+node scripts/eval/eval.ts -p mealdrop --prompt pattern-copy-play -a codex
+
+# Specific model (agent is inferred)
+node scripts/eval/eval.ts -p mealdrop --prompt pattern-copy-play -m opus-4.6
+
+# Specific effort level
+node scripts/eval/eval.ts -p mealdrop --prompt pattern-copy-play -a claude -e max
+
+# Different prompt
+node scripts/eval/eval.ts -p mealdrop --prompt setup
+
+# Manual mode — prepare workspace, print the command to run yourself
+node scripts/eval/eval.ts -p mealdrop --prompt pattern-copy-play --manual
+
+# Verbose output
+node scripts/eval/eval.ts -p mealdrop --prompt pattern-copy-play -v
+
+# List available projects, models, or prompts
+node scripts/eval/eval.ts --list-projects
+node scripts/eval/eval.ts --list-models
+node scripts/eval/eval.ts --list-prompts
+```
+
+When a trial completes, it prints a summary:
+
+```text
+Result
+ Build: PASS
+ Stories: 8/12 (67%) -> 11/12 (92%)
+ Ghost: 5/8 (63%) -> 7/8 (88%)
+ TS Err: 0
+ Score: 75% (normalized preview gain)
+ Cost: $1.23
+ Time: 4m32s
+ Turns: 18
+ PR: https://github.com/storybook-tmp/mealdrop/pull/42
+```
+
+## Running a batch
+
+By default, a batch runs **10 repetitions per (project × variant)** across all benchmark projects. With the default Claude-only matrix that's `7 projects × 1 variant × 10 reps = 70 trials`. Use `--repetitions` to shrink that — e.g. `--repetitions 2` gives 14 trials.
+
+```sh
+# Prompt is required. Confirms interactively unless you pass --yes (CI / automation).
+node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes
+
+# Smaller batch — 2 reps per project (14 trials with the default Claude-only matrix)
+node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --repetitions 2
+
+# Claude only
+node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --agents claude
+
+# Specific effort levels
+node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --claude-effort max
+node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --claude-efforts max,high
+node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --agents codex --codex-effort xhigh
+
+# Different prompt or concurrency
+node scripts/eval/run-batch.ts --prompt setup --yes
+node scripts/eval/run-batch.ts --prompt pattern-copy-play --yes --concurrency 4
+```
+
+Batch results are written to `storybook-eval/batches/<batch-id>/`, with per-run log files and a `summary.json`. If any trials fail, the batch prints a `Failures:` footer with each failed label, the last error line from its log, and the log path.
+
+## Syncing baselines
+
+Before running evals, the benchmark repos need a consistent `.storybook` baseline. `sync-baselines.ts` pushes the canonical baseline config to every benchmark repo.
+
+```sh
+# Sync all projects
+node scripts/eval/sync-baselines.ts
+
+# Sync specific projects
+node scripts/eval/sync-baselines.ts --project mealdrop --project edgy
+
+# Dry run (commit locally but don't push)
+node scripts/eval/sync-baselines.ts --skip-push
+```
+
+The script ensures each repo is on its default branch with no local changes, fetches the latest from origin, replaces the `.storybook` directory with the canonical baseline, and commits/pushes if anything changed.
+
+## Syncing the Storybook version
+
+`sync-storybook-version.ts` bumps every benchmark repo to a specific Storybook version. It mirrors the shape of `sync-baselines.ts`: for each project it ensures the source clone is present and clean, checks out and fast-forwards the default branch, runs `npx storybook@<version> upgrade --yes --force --skip-check --skip-automigrations -c <projectDir>/.storybook` from the **repo root**, then commits and pushes any resulting changes. Running from the repo root (with `-c` pointing at the project's `.storybook` dir) lets the Storybook CLI discover the correct workspace `package.json` in pnpm/yarn monorepos where the Storybook deps live at the workspace root and the config lives in a sub-package.
+
+```sh
+# Upgrade every benchmark repo to a stable version
+node scripts/eval/sync-storybook-version.ts --version 9.1.0
+
+# Upgrade to a canary published from a Storybook PR
+node scripts/eval/sync-storybook-version.ts --version 0.0.0-pr-34297-sha-abcdef12
+
+# Upgrade a subset of projects
+node scripts/eval/sync-storybook-version.ts --version latest --project mealdrop --project edgy
+
+# Commit locally without pushing yet
+node scripts/eval/sync-storybook-version.ts --version 9.1.0 --skip-push
+```
+
+The upgrade passes the following flags:
+
+- `--yes` — auto-accepts prompts.
+- `--force` — skips the autoblocker gate (useful for canary or major-version bumps).
+- `--skip-check` — skips the postinstall self-check.
+- `--skip-automigrations` — prevents the CLI from rewriting source files (e.g. the `wrap-getAbsolutePath` migration).
+
+The commit message defaults to `Eval: upgrade Storybook to <version>`. If you review a `--skip-push` run first, rerun the same command without `--skip-push` to push the existing local upgrade commits. Run `sync-baselines.ts` afterwards if you also need to refresh the canonical `.storybook` config in every repo.
+
+## Collecting results
+
+After running trials, `collect-pr-data.ts` scrapes the published draft PRs and loads the data into a local SQLite database.
+
+```sh
+# Collect from all projects
+node scripts/eval/collect-pr-data.ts
+
+# Collect from a specific project
+node scripts/eval/collect-pr-data.ts --project mealdrop
+
+# Limit PRs fetched or filter by state
+node scripts/eval/collect-pr-data.ts --limit 50
+node scripts/eval/collect-pr-data.ts --state open
+
+# Custom database path (default: scripts/eval/.cache/eval-pr-data.sqlite)
+node scripts/eval/collect-pr-data.ts --db-path ./my-eval-data.sqlite
+```
+
+## Querying results
+
+Open the database with any SQLite client:
+
+```sh
+sqlite3 scripts/eval/.cache/eval-pr-data.sqlite
+```
+
+The database includes four built-in views:
+
+- **`story_render_summary_by_project_model_effort`** — the go-to view for comparing models. Shows `project`, `model`, `effort`, `trials`, `before`/`after` pass rates, `gain` (normalized preview gain), `avg_cost_usd`, `avg_duration_m_s`, and `avg_turns`.
+- **`story_render_scores_by_trial`** — per-trial breakdown with before/after rates, absolute gain, normalized preview gain, and `score`. Useful for inspecting individual results or computing variance.
+- **`story_render_rate_by_project_model_effort`** — detailed aggregate view, like the summary but with additional columns (empty render failures, raw counts).
+- **`ghost_story_rate_by_project_model_effort`** — ghost story coverage rates with `before_rate`, `after_rate`, `absolute_rate_gain`, and `normalized_rate_gain`.
+
+Common queries:
+
+```sql
+-- Compare model performance across all projects
+SELECT * FROM story_render_summary_by_project_model_effort;
+
+-- Find the best and worst trials for a project
+SELECT project, trial_id, model, score
+FROM story_render_scores_by_trial
+WHERE project = 'mealdrop'
+ORDER BY score DESC;
+
+-- Compare normalized preview gain across models
+SELECT project, model, effort, trials, normalized_preview_gain
+FROM story_render_rate_by_project_model_effort
+ORDER BY normalized_preview_gain DESC;
+
+-- Ghost story coverage
+SELECT project, model, effort, before_rate, after_rate, normalized_rate_gain
+FROM ghost_story_rate_by_project_model_effort;
+```
+
+### Using LLMs to explore the database
+
+The SQLite database is a great target for LLM-assisted analysis. Point Claude or any coding agent at the database file and ask natural language questions like "which model scores best per dollar?" or "what's the score variance for mealdrop?".
+
+## Understanding scores
+
+
+
+Each trial produces several metrics:
+
+- **Build** — whether `storybook build` succeeds (pass/fail)
+- **TypeScript errors** — number of errors from `tsc --noEmit`
+- **Story render (before/after)** — how many stories pass Vitest rendering. The "before" measurement temporarily restores the baseline preview config to isolate the agent's contribution.
+- **Ghost stories (before/after)** — auto-generated tests that check whether components render without crashing.
+
+The headline metric is **normalized preview gain** — how much of the remaining room for improvement did the agent capture. It is stored in `data.json` as a **0–1 index**; the CLI, draft PR, and eval summary UI show the same value as a **percentage** for readability.
+
+If the baseline already passes every story (**before_rate = 100%**), there is no remaining gap — the gain and headline score are **0**.
+
+```text
+gain = (after_rate - before_rate) / (1 - before_rate)
+```
+
+For example, if the baseline pass rate is 60% and the agent achieves 80%:
+
+```text
+gain = (0.80 - 0.60) / (1 - 0.60) = 0.50
+```
+
+The agent captured 50% of the possible improvement. A score of 1.0 means the agent achieved a 100% pass rate. A score of 0 means no improvement.
+
+## Projects
+
+Benchmark apps live in repos under the `storybook-tmp` GitHub org. The authoritative list is in `scripts/eval/lib/projects.ts` — use `node scripts/eval/eval.ts --list-projects` to see names and descriptions.
+
+## Adding a new benchmark project
+
+To benchmark a new app, register it in the harness and sync baselines. Follow these steps in order:
+
+1. Create a repo under `storybook-tmp` on GitHub with the app you want to benchmark.
+2. Install Storybook with a **fresh** init (for example `npx storybook@latest init`). The repo must not include custom stories yet—only the base example stories that the Storybook CLI creates. Remove or avoid any extra story files beyond that scaffold.
+3. Add an entry to `scripts/eval/lib/projects.ts`:
+
+```ts
+{
+ name: 'my-project',
+ repo: 'https://github.com/storybook-tmp/my-project',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/my-project',
+ projectDir: 'packages/app', // if the app lives in a subdirectory
+ description: 'Short description of the tech stack',
+}
+```
+
+4. Add a `vitest:storybook` script to the project's `package.json` that runs the storybook vitest project. The grading harness calls this script and appends `--reporter=json --outputFile=...`. The exact command depends on the repo's vitest setup:
+
+```jsonc
+// Most repos (inline vitest project):
+"vitest:storybook": "vitest run --project=storybook"
+
+// Repos with a dedicated config file (e.g. excalidraw):
+"vitest:storybook": "vitest run --config vitest.storybook.config.mts"
+```
+
+5. Run `node scripts/eval/sync-baselines.ts --project my-project` to push the eval baseline `.storybook` config (this replaces the init scaffold in the benchmark repo).
+6. Run a trial to verify: `node scripts/eval/eval.ts -p my-project --prompt pattern-copy-play`
+
+## Prompts
+
+The eval mirrors the real user flow exactly:
+
+1. A real user copies the "Set up Storybook with AI" prompt from the Storybook UI — a one-line nudge (`AI_SETUP_PROMPT`) that just says _"Run `npx storybook ai setup` and follow its instructions precisely."_
+2. The user pastes that into their AI agent.
+3. The **agent** runs `npx storybook ai setup` itself as a tool call.
+4. The agent reads the resulting project-aware markdown and follows it.
+
+The harness hands steps (1) and (2) to the trial agent as its task. Eval starts at step (3).
+
+### How variant selection works
+
+Prompt variants live in [`code/lib/cli-storybook/src/ai/setup-prompts/`](../../code/lib/cli-storybook/src/ai/setup-prompts/). Each variant is a self-contained `.ts` file that exports an `instructions(projectInfo)` function. The registry in `setup-prompts/index.ts` lists every variant.
+
+The eval selects a variant by injecting the `EVAL_SETUP_PROMPT` env var into the agent's spawn environment. When the agent later runs `npx storybook ai setup`, the CLI reads that env var and returns the matching variant. Real users never set this env var, so they always get the default (`pattern-copy-play`).
+
+```text
+eval.ts --prompt setup
+ → run-trial.ts calls driver.execute({ env: { EVAL_SETUP_PROMPT: 'setup' } })
+ → agent spawns with that env
+ → agent's `npx storybook ai setup` tool call inherits EVAL_SETUP_PROMPT
+ → CLI's getPrompts() picks the 'setup' variant
+```
+
+### Available prompts
+
+- **`pattern-copy-play`** _(default)_ — analyze the codebase, copy real usage patterns, configure preview with providers and MSW mocks, write ~10 story files with play functions, verify each with Vitest. This is the only prompt users ever see when they run `npx storybook ai setup`.
+- **`setup`** — structured step-by-step: analyze, configure preview, write 9 stories (3 simple / 3 medium / 3 complex), verify each with Vitest. Available only to the eval harness for A/B comparison against the default.
+
+### Adding a new prompt variant
+
+1. Create `code/lib/cli-storybook/src/ai/setup-prompts/<variant-name>.ts`. Make it fully self-contained — keep its own `getTypeImportSource`, code-example helpers, and any other private utilities so changing one variant can never accidentally change another. Duplication is deliberate here.
+2. Export an `instructions(projectInfo: ProjectInfo): string` function.
+3. Register it in `code/lib/cli-storybook/src/ai/setup-prompts/index.ts` by adding an entry to `CURRENTLY_USED_PROMPT` and moving the existing one to `FORMERLY_USED_PROMPTS`.
+4. Use it from the eval: `node scripts/eval/eval.ts -p mealdrop --prompt <variant-name>`.
+
+To promote a variant to be the default users see, change `DEFAULT_PROMPT_NAME` in the same registry file.
diff --git a/scripts/eval/collect-pr-data.test.ts b/scripts/eval/collect-pr-data.test.ts
new file mode 100644
index 000000000000..30f876b4adc5
--- /dev/null
+++ b/scripts/eval/collect-pr-data.test.ts
@@ -0,0 +1,291 @@
+import { DatabaseSync } from 'node:sqlite';
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+// Shared mock standing in for `execFileSync`, so no test shells out to a real process.
+const execFileSyncMock = vi.fn();
+// In-memory databases opened via createInMemoryDb(); they are closed again in afterEach.
+const openDatabases: DatabaseSync[] = [];
+
+// Replace node:child_process with a stub that exposes only the mocked execFileSync.
+vi.mock('node:child_process', () => ({
+ execFileSync: execFileSyncMock,
+}));
+
+// Clear recorded mock calls so call assertions in one test cannot see another test's calls.
+beforeEach(() => {
+ vi.clearAllMocks();
+});
+
+// Close every database a test opened, most recently opened first.
+afterEach(() => {
+ while (openDatabases.length > 0) {
+ openDatabases.pop()?.close();
+ }
+});
+
+// Covers listEvalPullRequests: parsing the mocked `gh` output, the `--state` argument it
+// passes, and the error raised when the mocked command throws.
+describe('listEvalPullRequests', () => {
+ it('parses GitHub CLI JSON output on success', async () => {
+ execFileSyncMock.mockReturnValueOnce(
+ JSON.stringify([
+ {
+ number: 123,
+ title: '[eval] mealdrop trial-123',
+ },
+ ])
+ );
+
+ // Imported inside the test, after the node:child_process mock has been registered.
+ const { listEvalPullRequests } = await import('./collect-pr-data.ts');
+
+ await expect(listEvalPullRequests('storybook-tmp/mealdrop', 10)).resolves.toMatchObject([
+ {
+ number: 123,
+ title: '[eval] mealdrop trial-123',
+ },
+ ]);
+
+ // With no explicit state argument, `gh pr list` is expected to be asked for `--state all`.
+ expect(execFileSyncMock).toHaveBeenCalledWith(
+ 'gh',
+ expect.arrayContaining(['pr', 'list', '--state', 'all']),
+ expect.any(Object)
+ );
+ });
+
+ it('passes through an explicit PR state override', async () => {
+ execFileSyncMock.mockReturnValueOnce('[]');
+
+ const { listEvalPullRequests } = await import('./collect-pr-data.ts');
+
+ await expect(listEvalPullRequests('storybook-tmp/mealdrop', 10, 'open')).resolves.toEqual([]);
+
+ expect(execFileSyncMock).toHaveBeenCalledWith(
+ 'gh',
+ expect.arrayContaining(['pr', 'list', '--state', 'open']),
+ expect.any(Object)
+ );
+ });
+
+ it('throws a clear error when GitHub CLI cannot list PRs', async () => {
+ // Simulate a failed child process: an Error carrying an exit status and a stderr buffer.
+ execFileSyncMock.mockImplementationOnce(() => {
+ throw Object.assign(new Error('Command failed: gh'), {
+ status: 1,
+ stderr: Buffer.from('authentication required\n'),
+ });
+ });
+
+ const { listEvalPullRequests } = await import('./collect-pr-data.ts');
+
+ // The rejection message must name the repository and include the stderr text.
+ await expect(listEvalPullRequests('storybook-tmp/mealdrop', 10)).rejects.toThrow(
+ /Failed to list eval PRs for storybook-tmp\/mealdrop: .*stderr: authentication required/
+ );
+ });
+});
+
+// Covers parseCliArgs: the default PR state and the `--state open` override.
+describe('parseCliArgs', () => {
+ it('defaults PR state to all', async () => {
+ const { parseCliArgs } = await import('./collect-pr-data.ts');
+
+ // An explicit empty argv keeps the parser from reading the test runner's process.argv.
+ expect(parseCliArgs([])).toMatchObject({
+ prState: 'all',
+ });
+ });
+
+ it('parses --state open', async () => {
+ const { parseCliArgs } = await import('./collect-pr-data.ts');
+
+ expect(parseCliArgs(['--state', 'open'])).toMatchObject({
+ prState: 'open',
+ });
+ });
+});
+
+// Covers normalizeTrialData's handling of screenshot-era fields: schemaVersion 3 payloads
+// are accepted with those fields dropped, schemaVersion 4 payloads carrying them are rejected.
+describe('normalizeTrialData', () => {
+ it('ingests v3 payloads while ignoring screenshot-era fields', async () => {
+ const { normalizeTrialData } = await import('./collect-pr-data.ts');
+
+ // v3 payload that still carries `screenshots` and `artifacts.screenshotOutput`.
+ const normalized = normalizeTrialData({
+ trialId: 'trial-123',
+ data: createEvalDataPayload({
+ schemaVersion: 3,
+ screenshots: [
+ {
+ storyFilePath: 'src/Button.stories.tsx',
+ exportName: 'Primary',
+ imagePath: 'src/Button.stories.Primary.chromium.png',
+ },
+ ],
+ artifacts: {
+ buildOutput: {
+ path: '.storybook/eval-results/build-output.txt',
+ },
+ typecheckOutput: {
+ path: '.storybook/eval-results/typecheck-output.txt',
+ },
+ screenshotOutput: {
+ path: '.storybook/eval-results/screenshot-output.txt',
+ },
+ },
+ }),
+ });
+
+ // Ghost-story counts come from the fixture's `grade.baselineGhostStories` / `grade.ghostStories`.
+ expect(normalized).toMatchObject({
+ dataSchemaVersion: 3,
+ ghostBefore: {
+ candidateCount: 4,
+ total: 2,
+ passed: 1,
+ },
+ ghostAfter: {
+ candidateCount: 4,
+ total: 2,
+ passed: 2,
+ },
+ buildOutputPath: '.storybook/eval-results/build-output.txt',
+ typecheckOutputPath: '.storybook/eval-results/typecheck-output.txt',
+ });
+ // Screenshot data must not survive normalization.
+ expect(normalized).not.toHaveProperty('screenshots');
+ expect(normalized).not.toHaveProperty('screenshotOutputPath');
+ });
+
+ it('rejects v4 payloads that still include screenshots', async () => {
+ const { normalizeTrialData } = await import('./collect-pr-data.ts');
+
+ // Even an empty `screenshots` array counts as a screenshot-era field for v4.
+ expect(() =>
+ normalizeTrialData({
+ trialId: 'trial-123',
+ data: createEvalDataPayload({
+ schemaVersion: 4,
+ screenshots: [],
+ }),
+ })
+ ).toThrow(
+ /data\.json\.schemaVersion 4 must not include screenshot-era fields: data\.json\.screenshots/
+ );
+ });
+
+ it('rejects v4 payloads that still include screenshot artifacts', async () => {
+ const { normalizeTrialData } = await import('./collect-pr-data.ts');
+
+ expect(() =>
+ normalizeTrialData({
+ trialId: 'trial-123',
+ data: createEvalDataPayload({
+ schemaVersion: 4,
+ artifacts: {
+ buildOutput: {
+ path: '.storybook/eval-results/build-output.txt',
+ },
+ typecheckOutput: {
+ path: '.storybook/eval-results/typecheck-output.txt',
+ },
+ screenshotOutput: {
+ path: '.storybook/eval-results/screenshot-output.txt',
+ },
+ },
+ }),
+ })
+ ).toThrow(
+ /data\.json\.schemaVersion 4 must not include screenshot-era fields: data\.json\.artifacts\.screenshotOutput/
+ );
+ });
+});
+
+// Covers ensureSchema: a fresh database gets no screenshot storage, and a database that
+// already has a screenshot-era `trials` table is rejected.
+describe('ensureSchema', () => {
+ it('creates the rebuilt schema without screenshot storage', async () => {
+ const { ensureSchema } = await import('./collect-pr-data.ts');
+ const db = createInMemoryDb();
+
+ ensureSchema(db, '/tmp/eval-pr-data.sqlite');
+
+ // Inspect the resulting schema: columns of `trials` and the names of all tables.
+ const trialColumns = db.prepare('PRAGMA table_info(trials)').all() as Array<{ name: string }>;
+ const tableNames = db
+ .prepare(`
+ SELECT name
+ FROM sqlite_master
+ WHERE type = 'table'
+ `)
+ .all() as Array<{ name: string }>;
+
+ expect(trialColumns.map((column) => column.name)).not.toContain('screenshot_output_path');
+ expect(tableNames.map((table) => table.name)).not.toContain('trial_screenshots');
+ });
+
+ it('fails fast on a legacy screenshot-era cache DB', async () => {
+ const { ensureSchema } = await import('./collect-pr-data.ts');
+ const db = createInMemoryDb();
+
+ // Pre-create a `trials` table that still has the legacy `screenshot_output_path` column.
+ db.exec(`
+ CREATE TABLE trials (
+ trial_id TEXT PRIMARY KEY,
+ screenshot_output_path TEXT
+ );
+ `);
+
+ // The error must tell the user to delete the cache file and rerun the collector.
+ expect(() => ensureSchema(db, '/tmp/eval-pr-data.sqlite')).toThrow(
+ /Delete \.cache\/eval-pr-data\.sqlite .* rerun scripts\/eval\/collect-pr-data\.ts/
+ );
+ });
+});
+
+/** Opens a fresh in-memory SQLite database and registers it so `afterEach` closes it. */
+function createInMemoryDb() {
+  const database = new DatabaseSync(':memory:');
+  openDatabases.push(database);
+
+  return database;
+}
+
+/**
+ * Builds a schemaVersion 4 `data.json` fixture for the normalizeTrialData tests.
+ *
+ * @param overrides Top-level keys to replace. The merge is shallow: overriding `artifacts`
+ *   (or any other nested object) replaces that whole object rather than merging into it.
+ * @returns The default payload with `overrides` spread on top.
+ */
+function createEvalDataPayload(overrides: Record<string, unknown>) {
+  return {
+    schemaVersion: 4,
+    id: 'trial-123',
+    timestamp: '2026-04-14T01:02:03.000Z',
+    prompt: {
+      name: 'setup',
+      content: 'prompt body',
+    },
+    baselineCommit: 'deadbeef',
+    variant: {
+      agent: 'codex',
+      model: 'gpt-5.4',
+      effort: 'high',
+    },
+    environment: {
+      nodeVersion: 'v22.22.1',
+      evalBranch: 'trial/test-branch',
+      evalCommit: 'abc123',
+    },
+    execution: {
+      cost: 0.12,
+      duration: 30,
+      durationApi: 20,
+      turns: 3,
+      terminalResultSubtype: 'success',
+    },
+    grade: {
+      buildSuccess: true,
+      typeCheckErrors: 0,
+      baselineGhostStories: {
+        candidateCount: 4,
+        total: 2,
+        passed: 1,
+      },
+      ghostStories: {
+        candidateCount: 4,
+        total: 2,
+        passed: 2,
+      },
+      baselinePreviewStories: {
+        total: 4,
+        passed: 1,
+      },
+      storyRender: {
+        total: 4,
+        passed: 3,
+      },
+      // Typed empty array so the literal is not inferred as `never[]`.
+      fileChanges: [] as Array<Record<string, unknown>>,
+    },
+    transcript: [] as unknown[],
+    artifacts: {
+      buildOutput: {
+        path: '.storybook/eval-results/build-output.txt',
+      },
+      typecheckOutput: {
+        path: '.storybook/eval-results/typecheck-output.txt',
+      },
+    },
+    // Spread last so callers can replace any top-level key, including `schemaVersion`.
+    ...overrides,
+  };
+}
diff --git a/scripts/eval/collect-pr-data.ts b/scripts/eval/collect-pr-data.ts
new file mode 100644
index 000000000000..401071b382f6
--- /dev/null
+++ b/scripts/eval/collect-pr-data.ts
@@ -0,0 +1,1596 @@
+/**
+ * Ingests eval draft PRs into SQLite for analysis (queries, exports, dashboards).
+ *
+ * **Maintenance:** This file mixes SQL, GitHub API shapes, and `data.json` schema assumptions.
+ * Prefer updating it with AI assistance and keep changes aligned with `scripts/eval/lib/result-docs.ts`
+ * (`EvalData` / `schemaVersion`). When agent SDKs or transcript formats change, revisit the
+ * parsers and migrations together.
+ */
+import { execFileSync } from 'node:child_process';
+import { mkdirSync } from 'node:fs';
+import { dirname, resolve } from 'node:path';
+import { DatabaseSync } from 'node:sqlite';
+import { fileURLToPath } from 'node:url';
+import { parseArgs } from 'node:util';
+import pLimit from 'p-limit';
+import type { Project } from './lib/projects.ts';
+import { PROJECTS } from './lib/projects.ts';
+import { createLogger, formatHelp, NODE_EVAL_COLLECT_PR_DATA_SCRIPT } from './lib/utils.ts';
+
+// Default SQLite location: `.cache/eval-pr-data.sqlite` inside this module's directory.
+const DEFAULT_DB_PATH = resolve(import.meta.dirname, '.cache', 'eval-pr-data.sqlite');
+// Used by parseCliArgs as the `--limit` value when the flag is omitted.
+const GH_PAGE_SIZE = 200;
+
+/**
+ * One pull request entry as handed to `collectPullRequest`. Every field is optional at the
+ * type level; `collectPullRequest` checks the ones it needs (number, title, head ref, ...)
+ * through the `getRequired*` helpers before using them.
+ */
+interface PullRequestListItem {
+ number?: number;
+ title?: string;
+ body?: string;
+ state?: string;
+ isDraft?: boolean;
+ createdAt?: string;
+ headRefName?: string;
+ headRefOid?: string;
+ // Changed files; searched for the eval `data.json` path by `findEvalDataJsonPath`.
+ files?: Array<{
+ path?: string;
+ }>;
+ headRepository?: {
+ name?: string;
+ } | null;
+ headRepositoryOwner?: {
+ login?: string;
+ } | null;
+ statusCheckRollup?: StatusCheck[];
+}
+
+/** One `statusCheckRollup` entry; `__typename` discriminates the two shapes. */
+type StatusCheck = CheckRun | StatusContext;
+
+/** Status entry tagged `CheckRun`. */
+interface CheckRun {
+ __typename: 'CheckRun';
+ name?: string;
+ workflowName?: string;
+ status?: string;
+ conclusion?: string;
+ startedAt?: string;
+ completedAt?: string;
+ detailsUrl?: string;
+}
+
+/** Status entry tagged `StatusContext`. */
+interface StatusContext {
+ __typename: 'StatusContext';
+ context?: string;
+ state?: string;
+ startedAt?: string;
+ targetUrl?: string;
+}
+
+/**
+ * Loosely typed shape of an eval `data.json` document before validation. Every field is
+ * optional because the document comes from a PR branch and has not been checked yet.
+ */
+interface EvalDataPayload {
+ schemaVersion?: number;
+ id?: string;
+ timestamp?: string;
+ prompt?: {
+ name?: string;
+ content?: string;
+ };
+ baselineCommit?: string;
+ variant?: {
+ agent?: string;
+ model?: string;
+ effort?: string;
+ };
+ environment?: {
+ nodeVersion?: string;
+ evalBranch?: string;
+ evalCommit?: string;
+ };
+ execution?: {
+ cost?: number;
+ duration?: number;
+ durationApi?: number;
+ turns?: number;
+ terminalResultSubtype?: string;
+ };
+ grade?: {
+ buildSuccess?: boolean;
+ typeCheckErrors?: number;
+ baselinePreviewStories?: StoryRenderSummaryPayload;
+ storyRender?: StoryRenderSummaryPayload;
+ baselineGhostStories?: GhostSummaryPayload;
+ ghostStories?: GhostSummaryPayload;
+ fileChanges?: FileChangePayload[];
+ };
+ // Screenshot-era field; NormalizedTrialData has no counterpart for it.
+ screenshots?: unknown[];
+ transcript?: unknown[];
+ artifacts?: {
+ buildOutput?: {
+ path?: string;
+ };
+ typecheckOutput?: {
+ path?: string;
+ };
+ // Screenshot-era field; NormalizedTrialData has no counterpart for it.
+ screenshotOutput?: {
+ path?: string;
+ };
+ };
+}
+
+/** Raw ghost-story counts as they appear under `grade` in `data.json`. */
+interface GhostSummaryPayload {
+ candidateCount?: number;
+ total?: number;
+ passed?: number;
+}
+
+/** Raw story-render counts as they appear under `grade` in `data.json`. */
+interface StoryRenderSummaryPayload {
+ total?: number;
+ passed?: number;
+ emptyRenderFailures?: number;
+}
+
+/** Raw entry of `grade.fileChanges` in `data.json`. */
+interface FileChangePayload {
+ path?: string;
+ previousPath?: string;
+ gitStatus?: string;
+}
+
+/** Ghost-story counts with every field present. */
+interface NormalizedGhostSummary {
+ candidateCount: number;
+ total: number;
+ passed: number;
+}
+
+/** Story-render counts; `emptyRenderFailures` is null when the value is not available. */
+interface NormalizedStoryRenderSummary {
+ total: number;
+ passed: number;
+ emptyRenderFailures: number | null;
+}
+
+/** One changed file with its git status narrowed to the `GitStatus` union. */
+interface NormalizedFileChange {
+ path: string;
+ previousPath: string | null;
+ gitStatus: GitStatus;
+}
+
+/**
+ * Result of `normalizeTrialData`: the `data.json` content reshaped into flat, non-optional
+ * fields. Nullable members mark values that may be absent.
+ */
+interface NormalizedTrialData {
+ promptName: string;
+ promptContent: string;
+ trialTimestamp: string;
+ dataSchemaVersion: number;
+ baselineCommit: string;
+ agent: string;
+ model: string;
+ effort: string;
+ // Stored as 0 | 1 rather than boolean, matching the `build_success IN (0, 1)` column check.
+ buildSuccess: 0 | 1;
+ typecheckErrors: number;
+ costUsd: number | null;
+ durationS: number;
+ durationApiS: number | null;
+ turns: number;
+ terminalResultSubtype: string | null;
+ ghostBefore: NormalizedGhostSummary | null;
+ ghostAfter: NormalizedGhostSummary | null;
+ storyBefore: NormalizedStoryRenderSummary | null;
+ storyAfter: NormalizedStoryRenderSummary | null;
+ nodeVersion: string;
+ evalBranch: string;
+ evalCommit: string;
+ buildOutputPath: string | null;
+ typecheckOutputPath: string | null;
+ fileChanges: NormalizedFileChange[];
+ transcriptJson: string;
+}
+
+/** Counters accumulated by `main` over one collector run and reported in its final log line. */
+interface CollectorSummary {
+ insertedTrials: number;
+ skippedTrials: number;
+ skippedWithoutDataJson: number;
+ failedTrials: number;
+ backfilledTrialCosts: number;
+ failedTrialCostBackfills: number;
+}
+
+/** Arguments for `collectPullRequest`: the open database, the project row, and one PR. */
+interface CollectPullRequestOptions {
+ db: DatabaseSync;
+ project: Project;
+ // Value returned by `upsertProject` for `project`.
+ projectId: number;
+ pullRequest: PullRequestListItem;
+}
+
+/** PR state filters accepted by `--state`. */
+type PullRequestState = 'all' | 'open';
+
+/** Outcome of processing one PR; `main` tallies each value into `CollectorSummary`. */
+type CollectPullRequestResult =
+ | 'inserted'
+ | 'skipped-existing'
+ | 'skipped-without-data-json'
+ | 'failed';
+
+/** File statuses allowed by the `trial_file_changes.git_status` column check. */
+type GitStatus = 'A' | 'M' | 'D' | 'R';
+
+const logger = createLogger('eval-collect');
+
+/**
+ * CLI entry point. Opens (creating the parent directory if needed) the SQLite database,
+ * ensures the schema, ingests the eval PRs of every selected project one at a time,
+ * backfills missing trial costs, and logs a one-line summary. The database handle is
+ * closed even when a step throws.
+ */
+export async function main() {
+  const cliArgs = parseCliArgs();
+  const selectedProjects = resolveProjects(cliArgs.project);
+  const databasePath = resolve(cliArgs.dbPath);
+
+  mkdirSync(dirname(databasePath), { recursive: true });
+  logger.logStep(`Opening SQLite database at ${databasePath}`);
+
+  const db = new DatabaseSync(databasePath);
+
+  try {
+    configureDatabase(db);
+    ensureSchema(db, databasePath);
+
+    const summary: CollectorSummary = {
+      insertedTrials: 0,
+      skippedTrials: 0,
+      skippedWithoutDataJson: 0,
+      failedTrials: 0,
+      backfilledTrialCosts: 0,
+      failedTrialCostBackfills: 0,
+    };
+
+    for (const project of selectedProjects) {
+      logger.logStep(`Collecting ${project.name} (${project.githubSlug})`);
+      const projectId = upsertProject(db, project);
+      const pullRequests = await listEvalPullRequests(
+        project.githubSlug,
+        cliArgs.limit,
+        cliArgs.prState
+      );
+
+      for (const pullRequest of pullRequests) {
+        const outcome = await collectPullRequest({ db, project, projectId, pullRequest });
+
+        // Anything that is not one of the three named outcomes counts as a failure.
+        switch (outcome) {
+          case 'inserted':
+            summary.insertedTrials += 1;
+            break;
+          case 'skipped-existing':
+            summary.skippedTrials += 1;
+            break;
+          case 'skipped-without-data-json':
+            summary.skippedWithoutDataJson += 1;
+            break;
+          default:
+            summary.failedTrials += 1;
+        }
+      }
+    }
+
+    const costBackfill = await backfillMissingTrialCosts(db);
+    summary.backfilledTrialCosts = costBackfill.backfilled;
+    summary.failedTrialCostBackfills = costBackfill.failed;
+
+    logger.logSuccess(
+      `Inserted ${summary.insertedTrials} trials, skipped ${summary.skippedTrials} existing trials, skipped ${summary.skippedWithoutDataJson} PRs without usable data.json, failed ${summary.failedTrials}, backfilled ${summary.backfilledTrialCosts} trial costs, failed ${summary.failedTrialCostBackfills} trial cost backfills`
+    );
+  } finally {
+    db.close();
+  }
+}
+
+/**
+ * Option table used twice in `parseCliArgs`: as the `options` for `parseArgs` and as the
+ * source for `formatHelp`. `limit` is declared as a string and converted to a number by
+ * `parseCliArgs`; the defaults quoted in the descriptions are plain text, not computed.
+ */
+const collectOptions = {
+ 'db-path': {
+ type: 'string' as const,
+ description: 'SQLite database path (default: .cache/eval-pr-data.sqlite)',
+ },
+ project: { type: 'string' as const, description: 'Collect from a specific project only' },
+ limit: { type: 'string' as const, description: 'Max PRs to fetch (default: 200)' },
+ state: { type: 'string' as const, description: 'PR state filter: all or open (default: all)' },
+ help: { type: 'boolean' as const, short: 'h', description: 'Show this help and exit' },
+};
+
+/**
+ * Parses the collector's command-line flags.
+ *
+ * @param argv Arguments to parse; defaults to the process arguments after the script path.
+ * @returns The database path, optional project name, PR limit, and PR state filter.
+ * @throws Error when `--limit` is not a positive integer or `--state` is neither "all" nor
+ *   "open". Unknown flags and positionals are rejected by `parseArgs` (strict mode).
+ *
+ * With `--help` the usage text is printed and the process exits with status 0.
+ */
+export function parseCliArgs(argv = process.argv.slice(2)) {
+  const { values } = parseArgs({
+    args: argv,
+    options: collectOptions,
+    strict: true,
+    allowPositionals: false,
+  });
+
+  if (values.help) {
+    const helpText = formatHelp(
+      `node ${NODE_EVAL_COLLECT_PR_DATA_SCRIPT} [options]`,
+      'Scrape eval draft PRs and load results into a local SQLite database.',
+      collectOptions
+    );
+    console.log(helpText);
+    process.exit(0);
+  }
+
+  // `--limit` arrives as a string; fall back to the page size when it is omitted.
+  const limit = values.limit == null ? GH_PAGE_SIZE : Number(values.limit);
+  const isPositiveInteger = Number.isInteger(limit) && limit > 0;
+  if (!isPositiveInteger) {
+    throw new Error(`--limit must be a positive integer. Received: ${values.limit}`);
+  }
+
+  const stateFlag = values.state;
+  const isKnownState = stateFlag == null || stateFlag === 'all' || stateFlag === 'open';
+  if (!isKnownState) {
+    throw new Error(`--state must be "all" or "open". Received: ${stateFlag}`);
+  }
+  const prState: PullRequestState = stateFlag === 'open' ? 'open' : 'all';
+
+  return {
+    dbPath: values['db-path'] ?? DEFAULT_DB_PATH,
+    project: values.project ?? undefined,
+    limit,
+    prState,
+  };
+}
+
+/**
+ * Selects the projects to collect from.
+ *
+ * @param projectName Optional project name; when empty or omitted every project is used.
+ * @returns All of `PROJECTS`, or a one-element array holding the named project.
+ * @throws Error listing the available names when `projectName` matches no project.
+ */
+function resolveProjects(projectName?: string) {
+  if (!projectName) {
+    return PROJECTS;
+  }
+
+  const match = PROJECTS.find(({ name }) => name === projectName);
+  if (match) {
+    return [match];
+  }
+
+  throw new Error(
+    `Unknown project "${projectName}". Available: ${PROJECTS.map((entry) => entry.name).join(', ')}`
+  );
+}
+
+/**
+ * Applies connection-level PRAGMAs to a freshly opened database: turns foreign-key
+ * enforcement on, switches the journal to WAL mode, and sets `synchronous` to NORMAL.
+ */
+function configureDatabase(db: DatabaseSync) {
+ db.exec(`
+ PRAGMA foreign_keys = ON;
+ PRAGMA journal_mode = WAL;
+ PRAGMA synchronous = NORMAL;
+ `);
+}
+
+/**
+ * Brings the database to the current collector schema.
+ *
+ * Steps, in order: reject a legacy screenshot-era database, create any missing tables,
+ * add columns that may be absent from an already-existing `trials` table, then rebuild
+ * the analysis views.
+ *
+ * @param db Open database to prepare.
+ * @param dbPath Path passed on to the legacy-schema check; defaults to `DEFAULT_DB_PATH`.
+ */
+export function ensureSchema(db: DatabaseSync, dbPath = DEFAULT_DB_PATH) {
+ // Must run first, before any table is created or altered.
+ failIfLegacyScreenshotSchema(db, dbPath);
+
+ // All statements use IF NOT EXISTS, so existing tables are left untouched here.
+ db.exec(`
+ CREATE TABLE IF NOT EXISTS projects (
+ id INTEGER PRIMARY KEY,
+ github_slug TEXT NOT NULL UNIQUE,
+ name TEXT NOT NULL,
+ default_branch TEXT NOT NULL,
+ project_dir TEXT
+ );
+
+ CREATE TABLE IF NOT EXISTS prompts (
+ id INTEGER PRIMARY KEY,
+ name TEXT NOT NULL,
+ content TEXT NOT NULL,
+ UNIQUE(name, content)
+ );
+
+ CREATE TABLE IF NOT EXISTS trials (
+ trial_id TEXT PRIMARY KEY,
+ project_id INTEGER NOT NULL REFERENCES projects(id),
+ prompt_id INTEGER NOT NULL REFERENCES prompts(id),
+ trial_timestamp TEXT NOT NULL,
+ data_schema_version INTEGER NOT NULL,
+ pr_number INTEGER NOT NULL,
+ pr_title TEXT NOT NULL,
+ pr_created_at TEXT NOT NULL,
+ pr_state TEXT NOT NULL,
+ pr_is_draft INTEGER NOT NULL CHECK (pr_is_draft IN (0, 1)),
+ head_ref_name TEXT NOT NULL,
+ head_ref_oid TEXT NOT NULL,
+ baseline_commit TEXT NOT NULL,
+ agent TEXT NOT NULL,
+ model TEXT NOT NULL,
+ effort TEXT NOT NULL,
+ build_success INTEGER NOT NULL CHECK (build_success IN (0, 1)),
+ typecheck_errors INTEGER NOT NULL,
+ cost_usd REAL,
+ duration_s REAL NOT NULL,
+ duration_api_s REAL,
+ turns INTEGER NOT NULL,
+ terminal_result_subtype TEXT,
+ ghost_before_candidate_count INTEGER,
+ ghost_before_total INTEGER,
+ ghost_before_passed INTEGER,
+ ghost_after_candidate_count INTEGER,
+ ghost_after_total INTEGER,
+ ghost_after_passed INTEGER,
+ story_before_total INTEGER,
+ story_before_passed INTEGER,
+ story_before_empty INTEGER,
+ story_after_total INTEGER,
+ story_after_passed INTEGER,
+ story_after_empty INTEGER,
+ node_version TEXT NOT NULL,
+ eval_branch TEXT NOT NULL,
+ eval_commit TEXT NOT NULL,
+ data_json_path TEXT NOT NULL,
+ build_output_path TEXT,
+ typecheck_output_path TEXT,
+ ingested_at TEXT NOT NULL,
+ UNIQUE(project_id, pr_number),
+ UNIQUE(project_id, head_ref_oid)
+ );
+
+ CREATE TABLE IF NOT EXISTS trial_checks (
+ trial_id TEXT NOT NULL REFERENCES trials(trial_id) ON DELETE CASCADE,
+ seq INTEGER NOT NULL,
+ check_type TEXT NOT NULL,
+ name_or_context TEXT NOT NULL,
+ workflow_name TEXT,
+ status TEXT,
+ conclusion_or_state TEXT,
+ started_at TEXT,
+ completed_at TEXT,
+ details_url TEXT,
+ target_url TEXT,
+ PRIMARY KEY (trial_id, seq)
+ );
+
+ CREATE TABLE IF NOT EXISTS trial_file_changes (
+ trial_id TEXT NOT NULL REFERENCES trials(trial_id) ON DELETE CASCADE,
+ seq INTEGER NOT NULL,
+ path TEXT NOT NULL,
+ previous_path TEXT,
+ git_status TEXT NOT NULL CHECK (git_status IN ('A', 'M', 'D', 'R')),
+ PRIMARY KEY (trial_id, seq)
+ );
+
+ CREATE TABLE IF NOT EXISTS trial_transcripts (
+ trial_id TEXT PRIMARY KEY REFERENCES trials(trial_id) ON DELETE CASCADE,
+ transcript_json TEXT NOT NULL
+ );
+ `);
+
+ // These columns are also part of the CREATE TABLE above; the calls below cover a
+ // `trials` table that already existed and was therefore skipped by IF NOT EXISTS.
+ ensureTableColumn(db, 'trials', 'story_before_total', 'INTEGER');
+ ensureTableColumn(db, 'trials', 'story_before_passed', 'INTEGER');
+ ensureTableColumn(db, 'trials', 'story_before_empty', 'INTEGER');
+ ensureTableColumn(db, 'trials', 'story_after_total', 'INTEGER');
+ ensureTableColumn(db, 'trials', 'story_after_passed', 'INTEGER');
+ ensureTableColumn(db, 'trials', 'story_after_empty', 'INTEGER');
+ ensureTableColumn(db, 'trials', 'cost_usd', 'REAL');
+ // Views read the columns ensured above, so they are rebuilt last.
+ ensureViews(db);
+}
+
+/**
+ * Drops and recreates the four analysis views, so their definitions always match this
+ * file regardless of what an older run left in the database.
+ *
+ * Views created:
+ * - `ghost_story_rate_by_project_model_effort`: ghost-story rates per project/model/effort.
+ * - `story_render_rate_by_project_model_effort`: story-render rates per project/model/effort.
+ * - `story_render_scores_by_trial`: per-trial rates, gains, and `score`.
+ * - `story_render_summary_by_project_model_effort`: compact projection of the rate view.
+ *
+ * Both aggregate views only include trials whose `effort` is 'high', 'max', or 'xhigh'.
+ *
+ * NOTE(review): when the baseline already passes everything, the story views yield a
+ * normalized gain of 1.0 if the "after" run is also perfect (0 otherwise), while the ghost
+ * view yields 0 in both cases. Confirm this asymmetry is intended.
+ */
+function ensureViews(db: DatabaseSync) {
+ // Drop all four views first, then create the ghost-story aggregate. Trials with a zero
+ // or NULL total contribute a rate/gain of 0 to the averages (the ELSE 0 branches).
+ db.exec(`
+ DROP VIEW IF EXISTS ghost_story_rate_by_project_model_effort;
+ DROP VIEW IF EXISTS story_render_rate_by_project_model_effort;
+ DROP VIEW IF EXISTS story_render_summary_by_project_model_effort;
+ DROP VIEW IF EXISTS story_render_scores_by_trial;
+
+ CREATE VIEW ghost_story_rate_by_project_model_effort AS
+ SELECT
+ p.name AS project,
+ t.model AS model,
+ t.effort AS effort,
+ COUNT(*) AS trials,
+ AVG(t.ghost_before_passed) AS avg_before_passed,
+ AVG(t.ghost_before_total) AS avg_before_total,
+ AVG(t.ghost_after_passed) AS avg_after_passed,
+ AVG(t.ghost_after_total) AS avg_after_total,
+ AVG(
+ CASE
+ WHEN t.ghost_before_total > 0
+ THEN 1.0 * t.ghost_before_passed / t.ghost_before_total
+ ELSE 0
+ END
+ ) AS before_rate,
+ AVG(
+ CASE
+ WHEN t.ghost_after_total > 0
+ THEN 1.0 * t.ghost_after_passed / t.ghost_after_total
+ ELSE 0
+ END
+ ) AS after_rate,
+ AVG(
+ CASE
+ WHEN t.ghost_before_total > 0 AND t.ghost_after_total > 0
+ THEN (1.0 * t.ghost_after_passed / t.ghost_after_total) -
+ (1.0 * t.ghost_before_passed / t.ghost_before_total)
+ ELSE 0
+ END
+ ) AS absolute_rate_gain,
+ AVG(
+ CASE
+ WHEN t.ghost_before_total > 0
+ AND t.ghost_after_total > 0
+ AND t.ghost_before_passed < t.ghost_before_total
+ THEN (
+ (1.0 * t.ghost_after_passed / t.ghost_after_total) -
+ (1.0 * t.ghost_before_passed / t.ghost_before_total)
+ ) / (1.0 - (1.0 * t.ghost_before_passed / t.ghost_before_total))
+ ELSE 0
+ END
+ ) AS normalized_rate_gain
+ FROM trials t
+ JOIN projects p ON p.id = t.project_id
+ WHERE t.effort IN ('high', 'max', 'xhigh')
+ GROUP BY p.name, t.model, t.effort
+ `);
+
+ // Story-render aggregate. Unlike the ghost view it skips trials whose story totals are
+ // NULL. normalized_preview_gain = (after_rate - before_rate) / (1 - before_rate) per trial,
+ // averaged over the group.
+ db.exec(`
+ CREATE VIEW story_render_rate_by_project_model_effort AS
+ SELECT
+ p.name AS project,
+ t.model AS model,
+ t.effort AS effort,
+ COUNT(*) AS trials,
+ AVG(t.story_before_passed) AS avg_before_passed,
+ AVG(t.story_before_total) AS avg_before_total,
+ AVG(t.story_before_empty) AS avg_before_empty,
+ AVG(t.story_after_passed) AS avg_after_passed,
+ AVG(t.story_after_total) AS avg_after_total,
+ AVG(t.story_after_empty) AS avg_after_empty,
+ AVG(t.cost_usd) AS avg_cost_usd,
+ AVG(t.duration_s) AS avg_duration_s,
+ AVG(t.turns) AS avg_turns,
+ AVG(
+ CASE
+ WHEN t.story_before_total > 0
+ THEN 1.0 * t.story_before_passed / t.story_before_total
+ ELSE 0
+ END
+ ) AS before_rate,
+ AVG(
+ CASE
+ WHEN t.story_after_total > 0
+ THEN 1.0 * t.story_after_passed / t.story_after_total
+ ELSE 0
+ END
+ ) AS after_rate,
+ AVG(
+ CASE
+ WHEN t.story_before_total > 0
+ AND t.story_after_total > 0
+ AND t.story_before_passed < t.story_before_total
+ THEN (
+ (1.0 * t.story_after_passed / t.story_after_total) -
+ (1.0 * t.story_before_passed / t.story_before_total)
+ ) / (1.0 - (1.0 * t.story_before_passed / t.story_before_total))
+ WHEN t.story_before_total > 0
+ AND t.story_after_total > 0
+ AND t.story_before_passed = t.story_before_total
+ AND t.story_after_passed = t.story_after_total
+ THEN 1.0
+ ELSE 0
+ END
+ ) AS normalized_preview_gain
+ FROM trials t
+ JOIN projects p ON p.id = t.project_id
+ WHERE t.effort IN ('high', 'max', 'xhigh')
+ AND t.story_before_total IS NOT NULL
+ AND t.story_after_total IS NOT NULL
+ GROUP BY p.name, t.model, t.effort
+ `);
+
+ // Per-trial view with no effort filter. Rates and absolute gains are NULL when a total is
+ // zero, whereas the normalized gain columns and `score` fall back to 0. `score` repeats
+ // the normalized_preview_gain expression.
+ db.exec(`
+ CREATE VIEW story_render_scores_by_trial AS
+ SELECT
+ p.name AS project,
+ t.pr_number AS pr_number,
+ t.trial_id AS trial_id,
+ t.trial_timestamp AS trial_timestamp,
+ t.model AS model,
+ t.effort AS effort,
+ t.story_before_passed AS before_passed,
+ t.story_before_total AS before_total,
+ printf('%d/%d', t.story_before_passed, t.story_before_total) AS before_quotient,
+ CASE
+ WHEN t.story_before_total > 0
+ THEN 1.0 * t.story_before_passed / t.story_before_total
+ ELSE NULL
+ END AS before_rate,
+ CASE
+ WHEN t.story_before_total > 0
+ THEN 100.0 * t.story_before_passed / t.story_before_total
+ ELSE NULL
+ END AS before_percent,
+ t.story_after_passed AS after_passed,
+ t.story_after_total AS after_total,
+ printf('%d/%d', t.story_after_passed, t.story_after_total) AS after_quotient,
+ CASE
+ WHEN t.story_after_total > 0
+ THEN 1.0 * t.story_after_passed / t.story_after_total
+ ELSE NULL
+ END AS after_rate,
+ CASE
+ WHEN t.story_after_total > 0
+ THEN 100.0 * t.story_after_passed / t.story_after_total
+ ELSE NULL
+ END AS after_percent,
+ CASE
+ WHEN t.story_before_total > 0 AND t.story_after_total > 0
+ THEN
+ (1.0 * t.story_after_passed / t.story_after_total) -
+ (1.0 * t.story_before_passed / t.story_before_total)
+ ELSE NULL
+ END AS absolute_rate_gain,
+ CASE
+ WHEN t.story_before_total > 0 AND t.story_after_total > 0
+ THEN
+ 100.0 * (
+ (1.0 * t.story_after_passed / t.story_after_total) -
+ (1.0 * t.story_before_passed / t.story_before_total)
+ )
+ ELSE NULL
+ END AS absolute_gain_percent,
+ CASE
+ WHEN t.story_before_total > 0
+ AND t.story_after_total > 0
+ AND t.story_before_passed < t.story_before_total
+ THEN (
+ (1.0 * t.story_after_passed / t.story_after_total) -
+ (1.0 * t.story_before_passed / t.story_before_total)
+ ) / (1.0 - (1.0 * t.story_before_passed / t.story_before_total))
+ WHEN t.story_before_total > 0
+ AND t.story_after_total > 0
+ AND t.story_before_passed = t.story_before_total
+ AND t.story_after_passed = t.story_after_total
+ THEN 1.0
+ ELSE 0
+ END AS normalized_preview_gain,
+ CASE
+ WHEN t.story_before_total > 0
+ AND t.story_after_total > 0
+ AND t.story_before_passed < t.story_before_total
+ THEN 100.0 * (
+ (
+ (1.0 * t.story_after_passed / t.story_after_total) -
+ (1.0 * t.story_before_passed / t.story_before_total)
+ ) / (1.0 - (1.0 * t.story_before_passed / t.story_before_total))
+ )
+ WHEN t.story_before_total > 0
+ AND t.story_after_total > 0
+ AND t.story_before_passed = t.story_before_total
+ AND t.story_after_passed = t.story_after_total
+ THEN 100.0
+ ELSE 0
+ END AS normalized_preview_gain_percent,
+ CASE
+ WHEN t.story_before_total > 0
+ AND t.story_after_total > 0
+ AND t.story_before_passed < t.story_before_total
+ THEN (
+ (1.0 * t.story_after_passed / t.story_after_total) -
+ (1.0 * t.story_before_passed / t.story_before_total)
+ ) / (1.0 - (1.0 * t.story_before_passed / t.story_before_total))
+ WHEN t.story_before_total > 0
+ AND t.story_after_total > 0
+ AND t.story_before_passed = t.story_before_total
+ AND t.story_after_passed = t.story_after_total
+ THEN 1.0
+ ELSE 0
+ END AS score
+ FROM trials t
+ JOIN projects p ON p.id = t.project_id
+ WHERE t.story_before_total IS NOT NULL
+ AND t.story_after_total IS NOT NULL
+ `);
+
+ // Compact projection of the rate view, created after it because it selects from it.
+ // Adds a "Xm YYs" rendering of the rounded average duration and leaves out the project
+ // named 'baklava'.
+ db.exec(`
+ CREATE VIEW story_render_summary_by_project_model_effort AS
+ SELECT
+ project,
+ model,
+ effort,
+ trials,
+ before_rate AS before,
+ after_rate AS after,
+ normalized_preview_gain AS gain,
+ avg_cost_usd,
+ avg_duration_s,
+ printf(
+ '%dm %02ds',
+ CAST(ROUND(avg_duration_s) / 60 AS INTEGER),
+ CAST(ROUND(avg_duration_s) AS INTEGER) % 60
+ ) AS avg_duration_m_s,
+ avg_turns
+ FROM story_render_rate_by_project_model_effort
+ WHERE project <> 'baklava'
+ `);
+}
+
+/**
+ * Ingests one eval pull request into the collector database.
+ *
+ * The PR is skipped when it has no eval `data.json` change, when no trial id can be inferred,
+ * when a matching trial is already stored, or when the `data.json` at the PR head cannot be
+ * fetched or normalized. Otherwise a trial row is inserted via `insertTrial`. Any unexpected
+ * error is logged and reported as `'failed'` instead of being rethrown.
+ *
+ * @param opts Database handle, project metadata and the pull request to ingest.
+ * @returns Outcome tag describing what happened to this pull request.
+ */
+async function collectPullRequest(
+  opts: CollectPullRequestOptions
+): Promise<'inserted' | 'skipped-existing' | 'skipped-without-data-json' | 'failed'> {
+  try {
+    const prNumber = getRequiredInteger(opts.pullRequest.number, 'pullRequest.number');
+    const dataJsonPath = findEvalDataJsonPath(opts.pullRequest.files);
+    // The data.json lives on the PR head, which may belong to a fork rather than the project repo.
+    const headRepositorySlug = resolveHeadRepositorySlug(opts.pullRequest, opts.project.githubSlug);
+
+    if (!dataJsonPath) {
+      return logSkippedWithoutUsableDataJson(
+        opts.project.githubSlug,
+        prNumber,
+        'missing data.json change'
+      );
+    }
+
+    const trialId = extractTrialId(opts.pullRequest);
+    if (!trialId) {
+      return logSkippedWithoutUsableDataJson(
+        opts.project.githubSlug,
+        prNumber,
+        'could not infer trial id'
+      );
+    }
+
+    const headRefName = getRequiredString(opts.pullRequest.headRefName, 'pullRequest.headRefName');
+    const headRefOid = getRequiredString(opts.pullRequest.headRefOid, 'pullRequest.headRefOid');
+
+    // Check for an already-stored trial before doing the (slow) data.json fetch.
+    const existingTrialId = findExistingTrialId(
+      opts.db,
+      opts.projectId,
+      trialId,
+      prNumber,
+      headRefOid
+    );
+
+    if (existingTrialId) {
+      logger.logStep(
+        `Skipped existing ${opts.project.githubSlug}#${prNumber} (${existingTrialId})`
+      );
+      return 'skipped-existing';
+    }
+
+    const rawData = fetchDataJson(headRepositorySlug, dataJsonPath, headRefOid);
+    if (!rawData) {
+      return logSkippedWithoutUsableDataJson(
+        opts.project.githubSlug,
+        prNumber,
+        'could not read data.json from PR head'
+      );
+    }
+
+    let normalized: NormalizedTrialData;
+    try {
+      normalized = normalizeTrialData({
+        data: rawData,
+        trialId,
+      });
+    } catch (error) {
+      // A malformed payload is a skip, not a failure: the PR simply has no usable data.
+      return logSkippedWithoutUsableDataJson(
+        opts.project.githubSlug,
+        prNumber,
+        `invalid data.json: ${formatError(error)}`
+      );
+    }
+
+    insertTrial(opts.db, {
+      projectId: opts.projectId,
+      trialId,
+      prNumber,
+      prTitle: getRequiredString(opts.pullRequest.title, 'pullRequest.title'),
+      prCreatedAt: getRequiredString(opts.pullRequest.createdAt, 'pullRequest.createdAt'),
+      prState: getRequiredString(opts.pullRequest.state, 'pullRequest.state'),
+      prIsDraft: getRequiredBoolean(opts.pullRequest.isDraft, 'pullRequest.isDraft') ? 1 : 0,
+      headRefName,
+      headRefOid,
+      dataJsonPath,
+      ingestedAt: new Date().toISOString(),
+      pullRequest: opts.pullRequest,
+      normalized,
+    });
+
+    logger.logSuccess(`Inserted ${opts.project.githubSlug}#${prNumber} (${trialId})`);
+    return 'inserted';
+  } catch (error) {
+    logger.logError(
+      `Failed ${opts.project.githubSlug}#${formatPullRequestNumber(opts.pullRequest.number)}: ${formatError(error)}`
+    );
+    return 'failed';
+  }
+}
+
+/**
+ * Inserts the project keyed by `github_slug`, or refreshes its name, default branch and project
+ * directory when that slug already exists, then returns the row's integer id.
+ *
+ * @throws When the follow-up lookup does not yield an integer id.
+ */
+function upsertProject(db: DatabaseSync, project: Project) {
+  const upsertSql = `
+ INSERT INTO projects (
+ github_slug,
+ name,
+ default_branch,
+ project_dir
+ ) VALUES (?, ?, ?, ?)
+ ON CONFLICT(github_slug) DO UPDATE SET
+ name = excluded.name,
+ default_branch = excluded.default_branch,
+ project_dir = excluded.project_dir
+ `;
+  db.prepare(upsertSql).run(
+    project.githubSlug,
+    project.name,
+    project.branch,
+    project.projectDir ?? null
+  );
+
+  // Read the id back by slug: the row may have been inserted or merely updated.
+  const lookup = db.prepare('SELECT id FROM projects WHERE github_slug = ?');
+  const found = lookup.get(project.githubSlug) as { id?: unknown } | undefined;
+
+  return getRequiredInteger(found?.id, `projects.id for ${project.githubSlug}`);
+}
+
+/**
+ * Stores the (name, content) prompt pair unless it is already present and returns the id of the
+ * matching row.
+ *
+ * @throws When the follow-up lookup does not yield an integer id.
+ */
+function upsertPrompt(db: DatabaseSync, promptName: string, promptContent: string) {
+  const insertSql = `
+ INSERT INTO prompts (name, content)
+ VALUES (?, ?)
+ ON CONFLICT(name, content) DO NOTHING
+ `;
+  db.prepare(insertSql).run(promptName, promptContent);
+
+  const lookup = db.prepare('SELECT id FROM prompts WHERE name = ? AND content = ?');
+  const found = lookup.get(promptName, promptContent) as { id?: unknown } | undefined;
+
+  return getRequiredInteger(found?.id, `prompts.id for ${promptName}`);
+}
+
+/**
+ * Lists pull requests labelled `eval` in `repoSlug` through `gh pr list`.
+ *
+ * @param repoSlug Repository passed to `--repo`.
+ * @param limit Maximum number of pull requests to request.
+ * @param state Pull request state filter; defaults to `'all'`.
+ * @returns The parsed JSON output of the `gh` command.
+ * @throws Error prefixed with the repository slug when the `gh` call or JSON parsing fails.
+ */
+export async function listEvalPullRequests(
+  repoSlug: string,
+  limit: number,
+  state: PullRequestState = 'all'
+) {
+  const jsonFields = [
+    'number',
+    'title',
+    'body',
+    'state',
+    'isDraft',
+    'createdAt',
+    'headRefName',
+    'headRefOid',
+    'files',
+    'headRepository',
+    'headRepositoryOwner',
+    'statusCheckRollup',
+  ].join(',');
+  const ghArgs = ['pr', 'list', '--repo', repoSlug, '--state', state];
+  ghArgs.push('--search', 'label:eval', '--limit', String(limit), '--json', jsonFields);
+
+  try {
+    return runGhJsonOrThrow(ghArgs);
+  } catch (error) {
+    throw new Error(`Failed to list eval PRs for ${repoSlug}: ${formatError(error)}`);
+  }
+}
+
+/**
+ * Downloads `dataJsonPath` at commit `headRefOid` from `repoSlug` and parses it as JSON.
+ *
+ * @returns The parsed payload, or `null` when the blob could not be fetched or is not valid JSON
+ *   (the parse failure is logged).
+ */
+function fetchDataJson(repoSlug: string, dataJsonPath: string, headRefOid: string) {
+  const blob = fetchRepositoryBlob(repoSlug, dataJsonPath, headRefOid);
+
+  if (blob) {
+    try {
+      const text = blob.toString('utf8');
+      return JSON.parse(text) as EvalDataPayload;
+    } catch (error) {
+      logger.logError(
+        `Failed to decode ${repoSlug}:${dataJsonPath}@${headRefOid}: ${formatError(error)}`
+      );
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Fetches the raw bytes of `filePath` at `ref` from `repoSlug` using the GitHub contents API
+ * (`gh api` with the raw media type).
+ *
+ * Each path segment and the ref are percent-encoded so that names containing spaces, `#`, `?`
+ * or other reserved characters do not corrupt the request URL; ordinary paths are unchanged.
+ *
+ * @returns The file bytes, or `null` when the `gh` call fails.
+ */
+function fetchRepositoryBlob(repoSlug: string, filePath: string, ref: string) {
+  // Encode per segment so the `/` separators of the path survive.
+  const encodedPath = filePath.split('/').map(encodeURIComponent).join('/');
+  const encodedRef = encodeURIComponent(ref);
+
+  return runGhBytes(
+    [
+      'api',
+      '-H',
+      'Accept: application/vnd.github.raw',
+      `repos/${repoSlug}/contents/${encodedPath}?ref=${encodedRef}`,
+    ],
+    null
+  );
+}
+
+/**
+ * Runs `gh` with `args`, parses its trimmed stdout as JSON and returns it typed as `T`.
+ *
+ * The result is cast, not validated: callers choose `T` and are responsible for checking the
+ * shape they receive.
+ *
+ * @throws Error containing the full command line when the process fails or stdout is not JSON.
+ */
+function runGhJsonOrThrow<T>(args: string[]): T {
+  try {
+    const stdout = execFileSync('gh', args, {
+      encoding: 'utf8',
+      maxBuffer: 50 * 1024 * 1024,
+    }).trim();
+    return JSON.parse(stdout) as T;
+  } catch (error) {
+    throw new Error(`gh ${args.join(' ')} failed: ${formatError(error)}`);
+  }
+}
+
+/**
+ * Runs `gh` with `args` and returns its stdout as a Buffer. On failure the error is logged and
+ * `fallback` is returned instead.
+ */
+function runGhBytes(args: string[], fallback: Buffer | null) {
+  let output: Buffer | null;
+
+  try {
+    output = execFileSync('gh', args, { encoding: 'buffer', maxBuffer: 50 * 1024 * 1024 });
+  } catch (error) {
+    logger.logError(`gh ${args.join(' ')} failed: ${formatError(error)}`);
+    output = fallback;
+  }
+
+  return output;
+}
+
+/**
+ * Adds `columnName` (declared as `columnDefinition`) to `tableName` unless the table already has
+ * a column with that name. Names are interpolated into the SQL text as given.
+ */
+function ensureTableColumn(
+  db: DatabaseSync,
+  tableName: string,
+  columnName: string,
+  columnDefinition: string
+) {
+  const existing = db.prepare(`PRAGMA table_info(${tableName})`).all() as Array<{ name?: unknown }>;
+
+  for (const column of existing) {
+    if (column.name === columnName) {
+      return;
+    }
+  }
+
+  db.exec(`ALTER TABLE ${tableName} ADD COLUMN ${columnName} ${columnDefinition}`);
+}
+
+/**
+ * Refuses to continue when the database still has the `trial_screenshots` table or the
+ * `trials.screenshot_output_path` column.
+ *
+ * @throws Error telling the user to delete the database file and rerun the collector.
+ */
+function failIfLegacyScreenshotSchema(db: DatabaseSync, dbPath: string) {
+  // Both probes are always evaluated, as in a plain pair of declarations.
+  const legacyMarkers = [
+    tableExists(db, 'trial_screenshots'),
+    tableHasColumn(db, 'trials', 'screenshot_output_path'),
+  ];
+
+  if (legacyMarkers.includes(true)) {
+    throw new Error(
+      `Legacy screenshot-era eval collector DB schema detected at ${dbPath}. Delete .cache/eval-pr-data.sqlite (or the custom DB file you passed in) and rerun scripts/eval/collect-pr-data.ts to regenerate it.`
+    );
+  }
+}
+
+/** Reports whether `sqlite_master` lists a table named `tableName`. */
+function tableExists(db: DatabaseSync, tableName: string) {
+  const lookup = db.prepare(
+ `
+ SELECT name
+ FROM sqlite_master
+ WHERE type = 'table' AND name = ?
+ `
+  );
+  const match = lookup.get(tableName) as { name?: unknown } | undefined;
+
+  return typeof match?.name === 'string';
+}
+
+/** Reports whether `tableName` exists and has a column called `columnName`. */
+function tableHasColumn(db: DatabaseSync, tableName: string, columnName: string) {
+  if (tableExists(db, tableName)) {
+    const info = db.prepare(`PRAGMA table_info(${tableName})`).all() as Array<{ name?: unknown }>;
+    return info.some((entry) => entry.name === columnName);
+  }
+
+  return false;
+}
+
+/**
+ * Looks for a stored trial that matches by trial id, or by PR number within the project, or by
+ * head commit within the project.
+ *
+ * @returns The `trial_id` of one matching row, or `null` when none matches.
+ */
+function findExistingTrialId(
+  db: DatabaseSync,
+  projectId: number,
+  trialId: string,
+  prNumber: number,
+  headRefOid: string
+) {
+  const lookup = db.prepare(`
+ SELECT trial_id
+ FROM trials
+ WHERE trial_id = ?
+ OR (project_id = ? AND pr_number = ?)
+ OR (project_id = ? AND head_ref_oid = ?)
+ LIMIT 1
+ `);
+  // projectId is bound twice, once for each project-scoped alternative.
+  const match = lookup.get(trialId, projectId, prNumber, projectId, headRefOid) as
+    | { trial_id?: unknown }
+    | undefined;
+
+  return getOptionalString(match?.trial_id);
+}
+
+/**
+ * Writes one trial and its dependent rows (prompt, checks, file changes, transcript) inside a
+ * single transaction.
+ *
+ * On any failure the transaction is rolled back and the original error is rethrown. The rollback
+ * itself is guarded: SQLite can already have ended the transaction when certain errors occur, in
+ * which case `ROLLBACK` throws, and that secondary error must not replace the real cause.
+ *
+ * @param db Open database handle.
+ * @param input Pull request metadata plus the normalized `data.json` content to store.
+ * @throws Whatever error interrupted the inserts.
+ */
+function insertTrial(
+  db: DatabaseSync,
+  input: {
+    projectId: number;
+    trialId: string;
+    prNumber: number;
+    prTitle: string;
+    prCreatedAt: string;
+    prState: string;
+    prIsDraft: 0 | 1;
+    headRefName: string;
+    headRefOid: string;
+    dataJsonPath: string;
+    ingestedAt: string;
+    pullRequest: PullRequestListItem;
+    normalized: NormalizedTrialData;
+  }
+) {
+  db.exec('BEGIN');
+
+  try {
+    const promptId = upsertPrompt(db, input.normalized.promptName, input.normalized.promptContent);
+
+    // The bound values below follow the column list one-to-one, in the same order.
+    db.prepare(`
+ INSERT INTO trials (
+ trial_id,
+ project_id,
+ prompt_id,
+ trial_timestamp,
+ data_schema_version,
+ pr_number,
+ pr_title,
+ pr_created_at,
+ pr_state,
+ pr_is_draft,
+ head_ref_name,
+ head_ref_oid,
+ baseline_commit,
+ agent,
+ model,
+ effort,
+ build_success,
+ typecheck_errors,
+ cost_usd,
+ duration_s,
+ duration_api_s,
+ turns,
+ terminal_result_subtype,
+ ghost_before_candidate_count,
+ ghost_before_total,
+ ghost_before_passed,
+ ghost_after_candidate_count,
+ ghost_after_total,
+ ghost_after_passed,
+ story_before_total,
+ story_before_passed,
+ story_before_empty,
+ story_after_total,
+ story_after_passed,
+ story_after_empty,
+ node_version,
+ eval_branch,
+ eval_commit,
+ data_json_path,
+ build_output_path,
+ typecheck_output_path,
+ ingested_at
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ `).run(
+      input.trialId,
+      input.projectId,
+      promptId,
+      input.normalized.trialTimestamp,
+      input.normalized.dataSchemaVersion,
+      input.prNumber,
+      input.prTitle,
+      input.prCreatedAt,
+      input.prState,
+      input.prIsDraft,
+      input.headRefName,
+      input.headRefOid,
+      input.normalized.baselineCommit,
+      input.normalized.agent,
+      input.normalized.model,
+      input.normalized.effort,
+      input.normalized.buildSuccess,
+      input.normalized.typecheckErrors,
+      input.normalized.costUsd,
+      input.normalized.durationS,
+      input.normalized.durationApiS,
+      input.normalized.turns,
+      input.normalized.terminalResultSubtype,
+      input.normalized.ghostBefore?.candidateCount ?? null,
+      input.normalized.ghostBefore?.total ?? null,
+      input.normalized.ghostBefore?.passed ?? null,
+      input.normalized.ghostAfter?.candidateCount ?? null,
+      input.normalized.ghostAfter?.total ?? null,
+      input.normalized.ghostAfter?.passed ?? null,
+      input.normalized.storyBefore?.total ?? null,
+      input.normalized.storyBefore?.passed ?? null,
+      input.normalized.storyBefore?.emptyRenderFailures ?? null,
+      input.normalized.storyAfter?.total ?? null,
+      input.normalized.storyAfter?.passed ?? null,
+      input.normalized.storyAfter?.emptyRenderFailures ?? null,
+      input.normalized.nodeVersion,
+      input.normalized.evalBranch,
+      input.normalized.evalCommit,
+      input.dataJsonPath,
+      input.normalized.buildOutputPath,
+      input.normalized.typecheckOutputPath,
+      input.ingestedAt
+    );
+
+    insertTrialChecks(db, input.trialId, input.pullRequest.statusCheckRollup);
+    insertTrialFileChanges(db, input.trialId, input.normalized.fileChanges);
+    insertTrialTranscript(db, input.trialId, input.normalized.transcriptJson);
+
+    db.exec('COMMIT');
+  } catch (error) {
+    try {
+      db.exec('ROLLBACK');
+    } catch {
+      // Ignore: the transaction may already be gone; the original error is the one to report.
+    }
+    throw error;
+  }
+}
+
+/**
+ * Stores every entry of the PR's status check rollup for `trialId`, numbered from 1 in rollup
+ * order. `CheckRun` entries and all other entries fill different subsets of the columns; the
+ * columns an entry kind does not provide are written as `null`.
+ */
+function insertTrialChecks(
+  db: DatabaseSync,
+  trialId: string,
+  statusCheckRollup: PullRequestListItem['statusCheckRollup']
+) {
+  const insertCheck = db.prepare(`
+ INSERT INTO trial_checks (
+ trial_id,
+ seq,
+ check_type,
+ name_or_context,
+ workflow_name,
+ status,
+ conclusion_or_state,
+ started_at,
+ completed_at,
+ details_url,
+ target_url
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ `);
+
+  for (const [position, check] of (statusCheckRollup ?? []).entries()) {
+    const seq = position + 1;
+
+    if (check.__typename === 'CheckRun') {
+      insertCheck.run(
+        trialId,
+        seq,
+        check.__typename,
+        check.name ?? 'unknown',
+        getOptionalString(check.workflowName),
+        getOptionalString(check.status),
+        getOptionalString(check.conclusion),
+        getOptionalString(check.startedAt),
+        getOptionalString(check.completedAt),
+        getOptionalString(check.detailsUrl),
+        null
+      );
+    } else {
+      insertCheck.run(
+        trialId,
+        seq,
+        check.__typename,
+        check.context ?? 'unknown',
+        null,
+        null,
+        getOptionalString(check.state),
+        getOptionalString(check.startedAt),
+        null,
+        null,
+        getOptionalString(check.targetUrl)
+      );
+    }
+  }
+}
+
+/** Stores each file change of `trialId`, numbered from 1 in array order. */
+function insertTrialFileChanges(
+  db: DatabaseSync,
+  trialId: string,
+  fileChanges: NormalizedFileChange[]
+) {
+  const insertChange = db.prepare(`
+ INSERT INTO trial_file_changes (
+ trial_id,
+ seq,
+ path,
+ previous_path,
+ git_status
+ ) VALUES (?, ?, ?, ?, ?)
+ `);
+
+  fileChanges.forEach((change, position) => {
+    insertChange.run(trialId, position + 1, change.path, change.previousPath, change.gitStatus);
+  });
+}
+
+/** Stores the serialized transcript of `trialId` as a single row. */
+function insertTrialTranscript(db: DatabaseSync, trialId: string, transcriptJson: string) {
+  const insertTranscript = db.prepare(`
+ INSERT INTO trial_transcripts (
+ trial_id,
+ transcript_json
+ ) VALUES (?, ?)
+ `);
+
+  insertTranscript.run(trialId, transcriptJson);
+}
+
+/**
+ * Re-reads `data.json` for every stored trial whose `cost_usd` is NULL and writes the cost back
+ * when the payload provides one.
+ *
+ * A row counts as failed when its `data.json` cannot be fetched or when processing throws. A row
+ * whose payload simply has no cost is left untouched and counted as neither.
+ *
+ * @returns Number of rows updated and number of rows that failed.
+ */
+async function backfillMissingTrialCosts(db: DatabaseSync) {
+  type CostRow = {
+    trial_id?: unknown;
+    data_json_path?: unknown;
+    head_ref_oid?: unknown;
+    github_slug?: unknown;
+  };
+
+  const pending = db
+    .prepare(`
+ SELECT
+ t.trial_id,
+ t.data_json_path,
+ t.head_ref_oid,
+ p.github_slug
+ FROM trials t
+ INNER JOIN projects p ON p.id = t.project_id
+ WHERE t.cost_usd IS NULL
+ ORDER BY t.trial_timestamp DESC
+ `)
+    .all() as Array<CostRow>;
+
+  if (pending.length === 0) {
+    logger.logStep('No trial costs need backfill.');
+    return { backfilled: 0, failed: 0 };
+  }
+
+  logger.logStep(`Backfilling ${pending.length} trial cost(s)...`);
+  const limit = pLimit(8);
+  const updateCost = db.prepare(`
+ UPDATE trials
+ SET cost_usd = ?
+ WHERE trial_id = ?
+ `);
+
+  let backfilled = 0;
+  let failed = 0;
+
+  // Handles a single row; never rejects, failures are tallied instead.
+  const backfillRow = async (row: CostRow) => {
+    try {
+      const payload = fetchDataJson(
+        getRequiredString(row.github_slug, 'trials.github_slug'),
+        getRequiredString(row.data_json_path, 'trials.data_json_path'),
+        getRequiredString(row.head_ref_oid, 'trials.head_ref_oid')
+      );
+
+      if (!payload) {
+        failed += 1;
+        return;
+      }
+
+      const execution = getOptionalObject(payload.execution);
+      const costUsd = getOptionalNumber(execution?.cost, 'data.json.execution.cost');
+
+      if (costUsd != null) {
+        updateCost.run(costUsd, getRequiredString(row.trial_id, 'trials.trial_id'));
+        backfilled += 1;
+      }
+    } catch (error) {
+      logger.logError(`Failed to backfill trial cost: ${formatError(error)}`);
+      failed += 1;
+    }
+  };
+
+  await Promise.all(pending.map((row) => limit(() => backfillRow(row))));
+
+  logger.logSuccess(
+    `Backfilled ${backfilled} trial cost(s), failed ${failed} trial cost fetch(es)`
+  );
+  return { backfilled, failed };
+}
+
+/**
+ * Validates a raw `data.json` payload and flattens it into the shape stored in the database.
+ *
+ * Checks run top to bottom and the first violation throws, so the order of the statements and of
+ * the returned properties determines which error a payload with several problems reports.
+ *
+ * @param opts.data Parsed `data.json` content.
+ * @param opts.trialId Trial id inferred from the pull request; must equal `data.id`.
+ * @returns The normalized trial fields.
+ * @throws When the id does not match, the schema version is unsupported, or a required field is
+ *   missing or has the wrong type.
+ */
+export function normalizeTrialData(opts: {
+ data: EvalDataPayload;
+ trialId: string;
+}): NormalizedTrialData {
+ const dataId = getRequiredString(opts.data.id, 'data.json.id');
+ if (dataId !== opts.trialId) {
+ throw new Error(`data.json.id ${dataId} does not match inferred trial_id ${opts.trialId}`);
+ }
+
+ const dataSchemaVersion = getEvalDataSchemaVersion(opts.data.schemaVersion);
+ assertSupportedScreenshotFields(opts.data, dataSchemaVersion);
+ const prompt = getRequiredObject(opts.data.prompt, 'data.json.prompt');
+ const variant = getRequiredObject(opts.data.variant, 'data.json.variant');
+ const environment = getRequiredObject(opts.data.environment, 'data.json.environment');
+ const execution = getRequiredObject(opts.data.execution, 'data.json.execution');
+ const grade = getRequiredObject(opts.data.grade, 'data.json.grade');
+ // `artifacts` is the only optional section: a missing or non-object value yields null paths.
+ const artifacts = getOptionalObject(opts.data.artifacts);
+
+ return {
+ promptName: getRequiredString(prompt.name, 'data.json.prompt.name'),
+ promptContent: getRequiredString(prompt.content, 'data.json.prompt.content'),
+ trialTimestamp: getRequiredString(opts.data.timestamp, 'data.json.timestamp'),
+ dataSchemaVersion,
+ baselineCommit: getRequiredString(opts.data.baselineCommit, 'data.json.baselineCommit'),
+ agent: getRequiredString(variant.agent, 'data.json.variant.agent'),
+ model: getRequiredString(variant.model, 'data.json.variant.model'),
+ effort: getRequiredString(variant.effort, 'data.json.variant.effort'),
+ // Booleans are stored as 0/1.
+ buildSuccess: getRequiredBoolean(grade.buildSuccess, 'data.json.grade.buildSuccess') ? 1 : 0,
+ typecheckErrors: getRequiredInteger(grade.typeCheckErrors, 'data.json.grade.typeCheckErrors'),
+ costUsd: getOptionalNumber(execution.cost, 'data.json.execution.cost'),
+ durationS: getRequiredNumber(execution.duration, 'data.json.execution.duration'),
+ durationApiS: getOptionalNumber(execution.durationApi, 'data.json.execution.durationApi'),
+ turns: getRequiredInteger(execution.turns, 'data.json.execution.turns'),
+ terminalResultSubtype: getOptionalString(execution.terminalResultSubtype),
+ // "Before" summaries come from the baseline* fields, "after" summaries from the plain ones.
+ ghostBefore: normalizeGhostSummary(
+ grade.baselineGhostStories,
+ 'data.json.grade.baselineGhostStories'
+ ),
+ ghostAfter: normalizeGhostSummary(grade.ghostStories, 'data.json.grade.ghostStories'),
+ storyBefore: normalizeStoryRenderSummary(
+ grade.baselinePreviewStories,
+ 'data.json.grade.baselinePreviewStories'
+ ),
+ storyAfter: normalizeStoryRenderSummary(grade.storyRender, 'data.json.grade.storyRender'),
+ nodeVersion: getRequiredString(environment.nodeVersion, 'data.json.environment.nodeVersion'),
+ evalBranch: getRequiredString(environment.evalBranch, 'data.json.environment.evalBranch'),
+ evalCommit: getRequiredString(environment.evalCommit, 'data.json.environment.evalCommit'),
+ buildOutputPath: getOptionalArtifactPath(
+ artifacts?.buildOutput,
+ 'data.json.artifacts.buildOutput'
+ ),
+ typecheckOutputPath: getOptionalArtifactPath(
+ artifacts?.typecheckOutput,
+ 'data.json.artifacts.typecheckOutput'
+ ),
+ fileChanges: normalizeFileChanges(grade.fileChanges),
+ transcriptJson: stringifyTranscript(opts.data.transcript),
+ };
+}
+
+/**
+ * Reads the payload's schema version and accepts only the supported values 3 and 4.
+ *
+ * @throws When the value is not an integer or is any other number.
+ */
+function getEvalDataSchemaVersion(value: unknown): 3 | 4 {
+  const version = getRequiredInteger(value, 'data.json.schemaVersion');
+
+  if (version === 3 || version === 4) {
+    return version;
+  }
+
+  throw new Error(`data.json.schemaVersion must be 3 or 4`);
+}
+
+/**
+ * For schema version 4 payloads, rejects the screenshot-era fields `screenshots` and
+ * `artifacts.screenshotOutput`. Other schema versions are not checked.
+ *
+ * @throws Error listing every offending field.
+ */
+function assertSupportedScreenshotFields(data: EvalDataPayload, schemaVersion: 3 | 4) {
+  if (schemaVersion === 4) {
+    const offending: string[] = [];
+
+    if (hasOwn(data, 'screenshots')) {
+      offending.push('data.json.screenshots');
+    }
+
+    const artifacts = getOptionalObject(data.artifacts);
+    if (artifacts && hasOwn(artifacts, 'screenshotOutput')) {
+      offending.push('data.json.artifacts.screenshotOutput');
+    }
+
+    if (offending.length > 0) {
+      throw new Error(
+        `data.json.schemaVersion 4 must not include screenshot-era fields: ${offending.join(', ')}`
+      );
+    }
+  }
+}
+
+/**
+ * Converts an optional ghost-story summary into its normalized form.
+ *
+ * @returns `null` for a null/undefined input, otherwise the three validated integer counters.
+ * @throws When the value is present but not an object, or a counter is not an integer.
+ */
+function normalizeGhostSummary(value: unknown, label: string): NormalizedGhostSummary | null {
+  if (value != null) {
+    const raw = getRequiredObject(value, label);
+    const candidateCount = getRequiredInteger(raw.candidateCount, `${label}.candidateCount`);
+    const total = getRequiredInteger(raw.total, `${label}.total`);
+    const passed = getRequiredInteger(raw.passed, `${label}.passed`);
+    return { candidateCount, total, passed };
+  }
+
+  return null;
+}
+
+/**
+ * Converts an optional story-render summary into its normalized form.
+ *
+ * @returns `null` for a null/undefined input, otherwise `total` and `passed` as integers plus
+ *   `emptyRenderFailures`, which may itself be `null` when absent.
+ * @throws When the value is present but not an object, or a counter has the wrong type.
+ */
+function normalizeStoryRenderSummary(
+  value: unknown,
+  label: string
+): NormalizedStoryRenderSummary | null {
+  if (value != null) {
+    const raw = getRequiredObject(value, label);
+    const total = getRequiredInteger(raw.total, `${label}.total`);
+    const passed = getRequiredInteger(raw.passed, `${label}.passed`);
+    const emptyRenderFailures = getOptionalInteger(
+      raw.emptyRenderFailures,
+      `${label}.emptyRenderFailures`
+    );
+    return { total, passed, emptyRenderFailures };
+  }
+
+  return null;
+}
+
+/**
+ * Validates `grade.fileChanges` and maps each entry to `{ path, previousPath, gitStatus }`.
+ *
+ * @throws When the value is not an array, an entry is not an object, `gitStatus` is not one of
+ *   A, M, D, R, or `path` is not a non-empty string.
+ */
+function normalizeFileChanges(value: unknown): NormalizedFileChange[] {
+  const entries = getRequiredArray(value, 'data.json.grade.fileChanges');
+  const normalized: NormalizedFileChange[] = [];
+
+  for (const [index, entry] of entries.entries()) {
+    const raw = getRequiredObject(entry, `data.json.grade.fileChanges[${index}]`);
+    const gitStatus = getRequiredString(
+      raw.gitStatus,
+      `data.json.grade.fileChanges[${index}].gitStatus`
+    );
+
+    if (!isGitStatus(gitStatus)) {
+      throw new Error(`data.json.grade.fileChanges[${index}].gitStatus must be one of A, M, D, R`);
+    }
+
+    const path = getRequiredString(raw.path, `data.json.grade.fileChanges[${index}].path`);
+    const previousPath = getOptionalString(raw.previousPath);
+    normalized.push({ path, previousPath, gitStatus });
+  }
+
+  return normalized;
+}
+
+/** Serializes the transcript to JSON text; throws when it is not an array. */
+function stringifyTranscript(value: unknown) {
+  const transcript = getRequiredArray(value, 'data.json.transcript');
+  return JSON.stringify(transcript);
+}
+
+/**
+ * Returns the `path` of an artifact entry, or `null` when the entry is missing or not an object.
+ * An entry that is present must carry a non-empty string `path`.
+ */
+function getOptionalArtifactPath(value: unknown, label: string) {
+  const entry = getOptionalObject(value);
+  if (!entry) {
+    return null;
+  }
+  return getRequiredString(entry.path, `${label}.path`);
+}
+
+/**
+ * Infers the trial id of a pull request. Sources, in priority order: the `- ID:` bullet of the
+ * body, the `- Trial ID:` bullet of the body, then the trailing part of a `[eval] <word> <id>`
+ * title.
+ *
+ * @returns The id, or an empty string when none of the sources yields one.
+ */
+function extractTrialId(pullRequest: PullRequestListItem) {
+  const body = getOptionalString(pullRequest.body) ?? '';
+
+  const fromBody = extractBacktickValue(body, 'ID') || extractBacktickValue(body, 'Trial ID');
+  if (fromBody) {
+    return fromBody;
+  }
+
+  const title = getOptionalString(pullRequest.title) ?? '';
+  const titleMatch = title.match(/^\[eval\]\s+\S+\s+(.+)$/);
+  return titleMatch?.[1] ?? '';
+}
+
+/**
+ * Finds a line of the form "- <label>: `value`" in `body` and returns the text between the
+ * backticks, or an empty string when there is no such line.
+ */
+function extractBacktickValue(body: string, label: string) {
+  const source = ['^- ', escapeRegExp(label), ': ', '`([^`]*)`'].join('');
+  const found = new RegExp(source, 'm').exec(body);
+  return found?.[1] ?? '';
+}
+
+/**
+ * Returns the path of the first changed file that ends with `data.json` and contains
+ * `eval-results`, or `undefined` when the PR has no such file.
+ */
+function findEvalDataJsonPath(files: PullRequestListItem['files']) {
+  const dataJsonFile = files?.find((file) => {
+    const isDataJson = file.path?.endsWith('data.json');
+    return isDataJson && file.path.includes('eval-results');
+  });
+  return dataJsonFile?.path;
+}
+
+/**
+ * Builds the `owner/name` slug of the PR's head repository, falling back to `fallbackSlug` when
+ * either part is unavailable.
+ */
+function resolveHeadRepositorySlug(pullRequest: PullRequestListItem, fallbackSlug: string) {
+  const owner = getOptionalString(pullRequest.headRepositoryOwner?.login);
+  const repo = getOptionalString(pullRequest.headRepository?.name);
+
+  if (owner && repo) {
+    return `${owner}/${repo}`;
+  }
+  return fallbackSlug;
+}
+
+/**
+ * Returns `value` as a string-keyed record when it is a non-null, non-array object.
+ *
+ * @param label Name used in the error message.
+ * @throws When `value` is falsy, not an object, or an array.
+ */
+function getRequiredObject(value: unknown, label: string) {
+  if (!value || typeof value !== 'object' || Array.isArray(value)) {
+    throw new Error(`${label} must be an object`);
+  }
+
+  // Property values stay `unknown` so every field still has to be validated by the caller.
+  return value as Record<string, unknown>;
+}
+
+/**
+ * Returns `value` as a string-keyed record when it is a non-null, non-array object, and `null`
+ * for anything else.
+ */
+function getOptionalObject(value: unknown) {
+  if (!value || typeof value !== 'object' || Array.isArray(value)) {
+    return null;
+  }
+
+  // Property values stay `unknown` so every field still has to be validated by the caller.
+  return value as Record<string, unknown>;
+}
+
+/** Reports whether `key` is an own (non-inherited) property of `value`. */
+function hasOwn(value: object, key: string) {
+  const { hasOwnProperty } = Object.prototype;
+  return hasOwnProperty.call(value, key);
+}
+
+/** Returns `value` when it is an array; throws `<label> must be an array` otherwise. */
+function getRequiredArray(value: unknown, label: string) {
+  if (Array.isArray(value)) {
+    return value;
+  }
+
+  throw new Error(`${label} must be an array`);
+}
+
+/** Returns `value` when it is a string with at least one character; throws otherwise. */
+function getRequiredString(value: unknown, label: string) {
+  if (typeof value === 'string' && value.length > 0) {
+    return value;
+  }
+
+  throw new Error(`${label} must be a non-empty string`);
+}
+
+/**
+ * Returns a non-empty string as is, converts numbers and booleans with `String()`, and yields
+ * `null` for everything else (including the empty string).
+ */
+function getOptionalString(value: unknown) {
+  switch (typeof value) {
+    case 'number':
+    case 'boolean':
+      return String(value);
+    case 'string':
+      return value.length > 0 ? value : null;
+    default:
+      return null;
+  }
+}
+
+/** Returns `value` when it is a finite number; throws otherwise (NaN and ±Infinity included). */
+function getRequiredNumber(value: unknown, label: string) {
+  if (typeof value === 'number' && Number.isFinite(value)) {
+    return value;
+  }
+
+  throw new Error(`${label} must be a finite number`);
+}
+
+/**
+ * Returns `null` for a null/undefined input and the value itself for a finite number.
+ *
+ * @throws When a value is present but is not a finite number.
+ */
+function getOptionalNumber(value: unknown, label: string) {
+  if (value == null) {
+    return null;
+  }
+
+  if (typeof value === 'number' && Number.isFinite(value)) {
+    return value;
+  }
+
+  throw new Error(`${label} must be a finite number when present`);
+}
+
+/** Returns `value` when it is an integer-valued number; throws otherwise. */
+function getRequiredInteger(value: unknown, label: string) {
+  if (typeof value === 'number' && Number.isInteger(value)) {
+    return value;
+  }
+
+  throw new Error(`${label} must be an integer`);
+}
+
+/**
+ * Returns `null` for a null/undefined input and the value itself for an integer.
+ *
+ * @throws When a value is present but is not an integer.
+ */
+function getOptionalInteger(value: unknown, label: string) {
+  if (value == null) {
+    return null;
+  }
+
+  if (typeof value === 'number' && Number.isInteger(value)) {
+    return value;
+  }
+
+  throw new Error(`${label} must be an integer when present`);
+}
+
+/** Returns `value` when it is `true` or `false`; throws for any other type. */
+function getRequiredBoolean(value: unknown, label: string) {
+  if (typeof value === 'boolean') {
+    return value;
+  }
+
+  throw new Error(`${label} must be a boolean`);
+}
+
+/** Type guard: accepts exactly the single-letter statuses `A`, `M`, `D` and `R`. */
+function isGitStatus(value: string): value is GitStatus {
+  switch (value) {
+    case 'A':
+    case 'M':
+    case 'D':
+    case 'R':
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ * Logs that a pull request was skipped for lack of a usable `data.json`, with the reason, and
+ * returns the matching outcome tag so callers can `return` the call directly.
+ */
+function logSkippedWithoutUsableDataJson(repoSlug: string, prNumber: number, reason: string) {
+  const outcome = 'skipped-without-data-json' as const;
+  logger.logStep(`Skipped without usable data.json ${repoSlug}#${prNumber}: ${reason}`);
+  return outcome;
+}
+
+/** Renders an integer PR number as text; anything else becomes the literal `unknown`. */
+function formatPullRequestNumber(value: unknown) {
+  if (typeof value === 'number' && Number.isInteger(value)) {
+    return String(value);
+  }
+  return 'unknown';
+}
+
+/** Backslash-escapes every regular-expression metacharacter in `value`. */
+function escapeRegExp(value: string) {
+  const metacharacters = /[.*+?^${}()|[\]\\]/g;
+  return value.replace(metacharacters, (character) => `\\${character}`);
+}
+
+/**
+ * Turns a caught value into a one-line description.
+ *
+ * Non-Error values are stringified. For Error objects the message is followed, when available,
+ * by ` | `-separated details: a string `code` (or else a numeric `status`), the captured stderr,
+ * and the captured stdout only when there is no stderr text.
+ */
+function formatError(error: unknown) {
+  if (!(error instanceof Error)) {
+    return String(error);
+  }
+
+  let code: string | null = null;
+  if ('code' in error && typeof error.code === 'string') {
+    code = error.code;
+  } else if ('status' in error && typeof error.status === 'number') {
+    code = String(error.status);
+  }
+
+  const stderr = 'stderr' in error ? formatCommandOutput(error.stderr) : null;
+  const stdout = 'stdout' in error ? formatCommandOutput(error.stdout) : null;
+
+  const parts: string[] = [];
+  if (code) {
+    parts.push(`code ${code}`);
+  }
+  if (stderr) {
+    parts.push(`stderr: ${stderr}`);
+  } else if (stdout) {
+    parts.push(`stdout: ${stdout}`);
+  }
+
+  if (parts.length === 0) {
+    return error.message;
+  }
+  return `${error.message} | ${parts.join(' | ')}`;
+}
+
+/**
+ * Condenses captured process output for logging: accepts a string or Buffer, collapses every run
+ * of whitespace to one space, trims, and cuts text longer than 300 characters to 297 plus `...`.
+ *
+ * @returns The condensed text, or `null` for empty or non-text input.
+ */
+function formatCommandOutput(value: unknown) {
+  let text: string | null = null;
+  if (typeof value === 'string') {
+    text = value;
+  } else if (value instanceof Buffer) {
+    text = value.toString('utf8');
+  }
+
+  if (!text) {
+    return null;
+  }
+
+  const collapsed = text.replace(/\s+/g, ' ').trim();
+  if (collapsed.length <= 300) {
+    return collapsed;
+  }
+  return `${collapsed.slice(0, 297)}...`;
+}
+
+// Run the collector only when this file is the script Node was started with (the resolved
+// argv[1] equals this module's own path), not when it is imported by another module.
+if (process.argv[1] && resolve(process.argv[1]) === fileURLToPath(import.meta.url)) {
+ try {
+ await main();
+ } catch (error) {
+ logger.logError(`Collector failed: ${formatError(error)}`);
+ // Set the exit code rather than calling process.exit().
+ process.exitCode = 1;
+ }
+}
diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts
new file mode 100644
index 000000000000..c5e5230fcf0f
--- /dev/null
+++ b/scripts/eval/eval.ts
@@ -0,0 +1,293 @@
+/**
+ * Eval harness entry point.
+ *
+ * Runs with `node scripts/eval/eval.ts` (no jiti). Node 22+ supports .ts natively
+ * via type stripping. Import specifiers use explicit .ts extensions.
+ *
+ * Usage:
+ * node scripts/eval/eval.ts -p mealdrop --prompt pattern-copy-play # claude defaults
+ * node scripts/eval/eval.ts -p mealdrop --prompt setup -a codex # codex defaults
+ * node scripts/eval/eval.ts -p mealdrop --prompt setup -m gpt-5.4 # codex (inferred)
+ * node scripts/eval/eval.ts -p mealdrop --prompt setup -a claude -e max
+ * node scripts/eval/eval.ts -p mealdrop --prompt setup --manual # prepare only, print instructions
+ * node scripts/eval/eval.ts --list-projects
+ * node scripts/eval/eval.ts --list-models
+ * node scripts/eval/eval.ts --list-prompts
+ */
+import { writeFile } from 'node:fs/promises';
+import { join } from 'node:path';
+import { parseArgs } from 'node:util';
+import { z } from 'zod';
+import pc from 'picocolors';
+import {
+ AGENT_IDS,
+ AGENTS,
+ CLAUDE_EFFORTS,
+ CLAUDE_MODELS,
+ CODEX_EFFORTS,
+ CODEX_MODELS,
+ type AgentId,
+ type AgentVariant,
+} from './lib/agents/config.ts';
+import { prepareTrial } from './lib/prepare-trial.ts';
+import { PROJECTS } from './lib/projects.ts';
+import { captureAiSetupMarkdown, runTrial, type TrialConfig } from './lib/run-trial.ts';
+import {
+ captureEnvironment,
+ createLogger,
+ formatCost,
+ formatDuration,
+ formatHelp,
+ formatScorePercent,
+ generateTrialId,
+ listPrompts,
+ loadPrompt,
+ EXAMPLE_PROMPT_BASENAME,
+ NODE_EVAL_TRIAL_SCRIPT,
+} from './lib/utils.ts';
+
+// Project names as a non-empty tuple, the shape `z.enum` requires. The cast asserts that
+// PROJECTS has at least one entry.
+const PROJECT_NAMES = PROJECTS.map((p) => p.name) as [string, ...string[]];
+
+// Fields shared by every agent-specific variant of the argument schema.
+const base = {
+ project: z.enum(PROJECT_NAMES).optional(),
+ prompt: z.string().optional(),
+ verbose: z.boolean().default(false),
+ manual: z.boolean().default(false),
+ listProjects: z.boolean().default(false),
+ listModels: z.boolean().default(false),
+ listPrompts: z.boolean().default(false),
+};
+
+// `parseArgs` cannot require `--prompt` only when `-p` is used; Zod `superRefine` applies that rule after parse.
+//
+// The schema is a union discriminated on `agent`, so `model` and `effort` are validated against
+// (and defaulted from) the chosen agent's own lists.
+const argsSchema = z
+  .discriminatedUnion('agent', [
+    z.object({
+      ...base,
+      agent: z.literal('claude'),
+      model: z.enum(CLAUDE_MODELS).default(AGENTS.claude.defaultModel),
+      effort: z.enum(CLAUDE_EFFORTS).default(AGENTS.claude.defaultEffort),
+    }),
+    z.object({
+      ...base,
+      agent: z.literal('codex'),
+      model: z.enum(CODEX_MODELS).default(AGENTS.codex.defaultModel),
+      effort: z.enum(CODEX_EFFORTS).default(AGENTS.codex.defaultEffort),
+    }),
+  ])
+  .superRefine((data, ctx) => {
+    // A prompt is only needed when a trial will actually run: a project is given and no
+    // list flag short-circuits the run.
+    const needsPromptForTrial =
+      data.project != null && !data.listProjects && !data.listModels && !data.listPrompts;
+    if (!needsPromptForTrial) {
+      return;
+    }
+    const prompt = data.prompt?.trim() ?? '';
+    if (prompt === '') {
+      ctx.addIssue({
+        code: z.ZodIssueCode.custom,
+        message: `Specify --prompt <name>. Example: --prompt ${EXAMPLE_PROMPT_BASENAME}. Run with --list-prompts to see available names.`,
+        path: ['prompt'],
+      });
+    }
+  });
+
+// Option table handed to `parseArgs` and reused by `formatHelp` for the --help output, so each
+// entry carries a `description` next to the parsing fields.
+const evalOptions = {
+ project: { type: 'string' as const, short: 'p', description: 'Project to evaluate' },
+ agent: { type: 'string' as const, short: 'a', description: 'Agent to use (claude or codex)' },
+ model: {
+ type: 'string' as const,
+ short: 'm',
+ description: 'Model to use (agent inferred if omitted)',
+ },
+ effort: { type: 'string' as const, short: 'e', description: 'Effort level' },
+ prompt: {
+ type: 'string' as const,
+ description: `Prompt variant name — required with -p (e.g. ${EXAMPLE_PROMPT_BASENAME}). Use --list-prompts to see available names.`,
+ },
+ verbose: { type: 'boolean' as const, short: 'v', description: 'Enable verbose output' },
+ manual: {
+ type: 'boolean' as const,
+ description: 'Prepare workspace only, print instructions',
+ },
+ 'list-projects': { type: 'boolean' as const, description: 'List available projects' },
+ 'list-models': { type: 'boolean' as const, description: 'List available models' },
+ 'list-prompts': { type: 'boolean' as const, description: 'List available prompts' },
+ help: { type: 'boolean' as const, short: 'h', description: 'Show this help and exit' },
+};
+
+// Parse the command line. `strict: true` makes parseArgs throw on unknown options.
+const { values } = parseArgs({
+ options: evalOptions,
+ args: process.argv.slice(2),
+ strict: true,
+});
+
+// --help is handled before any validation so it works with otherwise invalid arguments.
+if (values.help) {
+ console.log(
+ formatHelp(
+ `node ${NODE_EVAL_TRIAL_SCRIPT} [options]`,
+ 'Run a single eval trial against a benchmark project.',
+ evalOptions
+ )
+ );
+ process.exit(0);
+}
+
+// Resolve the discriminator: explicit --agent, inferred from --model, or default to claude.
+// `inferAgent` throws for a model that no agent lists; report that the same way as any other
+// argument problem (red message, exit code 1) instead of letting the exception escape with a
+// stack trace.
+let agent: string;
+try {
+  agent = values.agent ?? (values.model ? inferAgent(values.model) : 'claude');
+} catch (error) {
+  console.error(pc.red(` model: ${error instanceof Error ? error.message : String(error)}`));
+  process.exit(1);
+}
+
+// parseArgs yields kebab-case keys for the list flags; the schema expects camelCase.
+const parsed = argsSchema.safeParse({
+  ...values,
+  agent,
+  listProjects: values['list-projects'],
+  listModels: values['list-models'],
+  listPrompts: values['list-prompts'],
+});
+
+if (!parsed.success) {
+  for (const issue of parsed.error.issues) {
+    console.error(pc.red(` ${issue.path.join('.')}: ${issue.message}`));
+  }
+  process.exit(1);
+}
+
+const args = parsed.data;
+const logger = createLogger();
+
+// The --list-* flags print their listing and exit without running a trial.
+if (args.listProjects) {
+ for (const project of PROJECTS) {
+ logger.log(` ${pc.bold(project.name)} — ${project.description}`);
+ }
+ process.exit(0);
+}
+if (args.listModels) {
+ for (const [name, { models }] of Object.entries(AGENTS)) {
+ logger.log(`\n ${pc.bold(name)}`);
+ for (const model of models) logger.log(` ${model}`);
+ }
+ process.exit(0);
+}
+if (args.listPrompts) {
+ for (const name of listPrompts()) logger.log(` ${pc.bold(name)}`);
+ process.exit(0);
+}
+
+if (!args.project) {
+ logger.log(pc.red(`Specify a project with -p. Available: ${PROJECT_NAMES.join(', ')}`));
+ process.exit(1);
+}
+// Non-null assertions: `project` was validated against PROJECT_NAMES by the schema, and the
+// schema's superRefine rejects an empty prompt whenever a project is given without a list flag.
+const project = PROJECTS.find((p) => p.name === args.project)!;
+const variant = toVariant(args);
+const promptName = args.prompt!.trim();
+
+logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`));
+logger.log(
+ `Agent: ${variant.agent} | Model: ${variant.model} | Effort: ${variant.effort} | Prompt: ${promptName}\n`
+);
+
+// Manual mode prepares the workspace and prompt files and prints the command to run by hand;
+// otherwise the trial runs end to end and a result summary is printed.
+if (args.manual) {
+ const trialId = generateTrialId();
+ const workspace = await prepareTrial(project, trialId, logger);
+ // NOTE(review): the return value is discarded here; confirm the call is needed for a side effect.
+ await captureEnvironment();
+
+ const prompt = loadPrompt(promptName);
+ const promptPath = join(workspace.resultsDir, 'prompt.md');
+ await writeFile(promptPath, prompt);
+
+ const setupPromptPath = join(workspace.resultsDir, 'setup-prompt.md');
+ const setupPromptContent = await captureAiSetupMarkdown(
+ workspace.projectPath,
+ promptName,
+ logger
+ );
+ await writeFile(setupPromptPath, setupPromptContent);
+
+ const cliCommand = buildManualCommand(variant, promptPath, promptName);
+
+ logger.log(pc.bold('\n── Manual mode ──'));
+ logger.log(`\n Trial dir: ${pc.cyan(workspace.trialDir)}`);
+ logger.log(` Project dir: ${pc.cyan(workspace.projectPath)}`);
+ logger.log(` Prompt file: ${pc.cyan(promptPath)}`);
+ logger.log(` Setup prompt: ${pc.cyan(setupPromptPath)}`);
+ logger.log(pc.bold('\nRun the agent yourself:\n'));
+ logger.log(` ${pc.green('cd')} ${workspace.projectPath}`);
+ logger.log(` ${pc.green(cliCommand)}\n`);
+} else {
+ const result = await runTrial(
+ {
+ project,
+ variant,
+ prompt: promptName,
+ verbose: args.verbose,
+ } satisfies TrialConfig,
+ logger
+ );
+
+ // Each summary reads "before -> after", comparing the baseline counts with the final counts.
+ const storyRenderStr = formatPassedTotalSummary(
+ result.grade.baselinePreviewStories,
+ result.grade.storyRender
+ );
+ const ghostStoriesStr = formatPassedTotalSummary(
+ result.grade.baselineGhostStories,
+ result.grade.ghostStories
+ );
+ logger.log(pc.bold('\nResult'));
+ logger.log(` Build: ${result.grade.buildSuccess ? pc.green('PASS') : pc.red('FAIL')}`);
+ logger.log(` Stories: ${storyRenderStr}`);
+ logger.log(` Ghost: ${ghostStoriesStr}`);
+ logger.log(` TS Err: ${result.grade.typeCheckErrors}`);
+ logger.log(` Score: ${formatScorePercent(result.score.score)} (normalized preview gain)`);
+ logger.log(` Cost: ${formatCost(result.execution.cost)}`);
+ logger.log(` Time: ${formatDuration(result.execution.duration)}`);
+ logger.log(` Turns: ${result.execution.turns}`);
+ logger.log(` PR: ${result.publish.url}`);
+
+ logger.log('\nDone.');
+}
+
+/**
+ * Finds the first agent (in `AGENT_IDS` order) whose model list contains `model`.
+ *
+ * @throws When no agent lists the model.
+ */
+function inferAgent(model: string): AgentId {
+  const owner = AGENT_IDS.find((id) => AGENTS[id].models.some((candidate) => candidate === model));
+
+  if (owner === undefined) {
+    throw new Error(`No agent found for model: ${model}`);
+  }
+  return owner;
+}
+
+/**
+ * Builds the shell command printed in manual mode for running the agent by hand.
+ *
+ * The command reads the prompt from `promptPath` via `cat` and sets `EVAL_SETUP_PROMPT` to
+ * `promptName`. The path is always single-quoted; the prompt name is single-quoted only when it
+ * contains characters outside `[A-Za-z0-9_.-]`, so the output for ordinary names is unchanged
+ * while names with spaces or shell metacharacters no longer break the command.
+ *
+ * @param variant Agent, model and effort to run with.
+ * @param promptPath File containing the prompt text.
+ * @param promptName Prompt variant name exported to the agent's environment.
+ * @returns A single-line command for a POSIX shell.
+ */
+function buildManualCommand(variant: AgentVariant, promptPath: string, promptName: string): string {
+  // Wrap in single quotes, closing and reopening the quotes around any embedded single quote.
+  const shellQuote = (text: string) => `'${text.replace(/'/g, `'\\''`)}'`;
+
+  // EVAL_SETUP_PROMPT must be in the env the agent inherits, so that the
+  // agent's own `npx storybook ai setup` tool call picks the right variant.
+  const safePromptName = /^[\w.-]+$/.test(promptName) ? promptName : shellQuote(promptName);
+  const envPrefix = `EVAL_SETUP_PROMPT=${safePromptName} `;
+  const promptArg = `"$(cat ${shellQuote(promptPath)})"`;
+
+  if (variant.agent === 'claude') {
+    // Prefer the mapped SDK model id when one exists for the configured model name.
+    const sdkModel = AGENTS.claude.sdkModelIds[variant.model] ?? variant.model;
+    return `${envPrefix}claude --model ${sdkModel} ${promptArg}`;
+  }
+  return `${envPrefix}codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`;
+}
+
+/**
+ * Extracts the agent variant (agent, model, effort) from the validated arguments.
+ *
+ * Both branches build the same shape; they are kept separate so that `model` and `effort` are
+ * taken from the narrowed member of the discriminated union.
+ */
+function toVariant(args: z.infer<typeof argsSchema>): AgentVariant {
+  return args.agent === 'claude'
+    ? { agent: 'claude', model: args.model, effort: args.effort }
+    : { agent: 'codex', model: args.model, effort: args.effort };
+}
+
+/**
+ * Formats a "before -> after" comparison of two passed/total summaries. Collapses to a single
+ * `-` when both sides are missing.
+ */
+function formatPassedTotalSummary(
+  before?: { passed: number; total: number },
+  after?: { passed: number; total: number }
+) {
+  const beforeText = formatPassedTotal(before);
+  const afterText = formatPassedTotal(after);
+  const bothMissing = beforeText === '-' && afterText === '-';
+
+  return bothMissing ? '-' : `${beforeText} -> ${afterText}`;
+}
+
+/**
+ * Formats one summary as `passed/total (NN%)`, or `-` when there is no summary. A total of zero
+ * is shown as 0%.
+ */
+function formatPassedTotal(summary?: { passed: number; total: number }) {
+  if (!summary) {
+    return '-';
+  }
+
+  const { passed, total } = summary;
+  const percent = Math.round((total > 0 ? passed / total : 0) * 100);
+  return `${passed}/${total} (${percent}%)`;
+}
diff --git a/scripts/eval/lib/agents/claude-code.test.ts b/scripts/eval/lib/agents/claude-code.test.ts
new file mode 100644
index 000000000000..0efe830d1496
--- /dev/null
+++ b/scripts/eval/lib/agents/claude-code.test.ts
@@ -0,0 +1,164 @@
+import { describe, expect, it, vi, beforeEach } from 'vitest';
+
+const { queryMock } = vi.hoisted(() => ({ // Created via vi.hoisted so the vi.mock factory below can reference it.
+ queryMock: vi.fn(),
+}));
+
+vi.mock('@anthropic-ai/claude-agent-sdk', () => ({ // Replaces the SDK's `query` export with queryMock for this test file.
+ query: queryMock,
+}));
+
+import { claudeAgent } from './claude-code.ts';
+
+const logger = { // Logger stub: every method is a vi.fn() so tests can assert on what was logged.
+ log: vi.fn(),
+ logStep: vi.fn(),
+ logSuccess: vi.fn(),
+ logError: vi.fn(),
+};
+
+describe('claudeAgent.execute', () => { // Each test feeds canned SDK messages through the mocked query() and inspects the call or the result.
+ beforeEach(() => {
+ vi.clearAllMocks(); // Reset call history so assertions only see the current test's calls.
+ });
+
+ it('does not pass maxTurns to the Claude SDK query', async () => {
+ queryMock.mockImplementation(async function* () {
+ yield {
+ type: 'result',
+ subtype: 'success',
+ num_turns: 2,
+ total_cost_usd: 0.42,
+ duration_api_ms: 4000,
+ };
+ });
+
+ await claudeAgent.execute({
+ prompt: 'prompt',
+ projectPath: '/repo',
+ variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'medium' },
+ resultsDir: '/results',
+ logger,
+ });
+
+ expect(queryMock).toHaveBeenCalledWith(
+ expect.objectContaining({
+ options: expect.not.objectContaining({ // Fails if options carries any non-null maxTurns.
+ maxTurns: expect.anything(),
+ }),
+ })
+ );
+ });
+
+ it('passes STORYBOOK_DISABLE_TELEMETRY through the Claude SDK environment', async () => {
+ queryMock.mockImplementation(async function* () {
+ yield {
+ type: 'result',
+ subtype: 'success',
+ num_turns: 1,
+ total_cost_usd: 0.1,
+ duration_api_ms: 1000,
+ };
+ });
+
+ await claudeAgent.execute({
+ prompt: 'prompt',
+ projectPath: '/repo',
+ variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'medium' },
+ resultsDir: '/results',
+ logger,
+ });
+
+ expect(queryMock).toHaveBeenCalledWith(
+ expect.objectContaining({
+ options: expect.objectContaining({
+ env: expect.objectContaining({
+ STORYBOOK_DISABLE_TELEMETRY: '1',
+ }),
+ }),
+ })
+ );
+ });
+
+ it('preserves terminal result metadata for non-success Claude results', async () => {
+ queryMock.mockImplementation(async function* () {
+ yield {
+ type: 'result',
+ subtype: 'error_max_turns',
+ num_turns: 51,
+ total_cost_usd: 1.3491844,
+ duration_api_ms: 490424,
+ };
+ });
+
+ const result = await claudeAgent.execute({
+ prompt: 'prompt',
+ projectPath: '/repo',
+ variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'medium' },
+ resultsDir: '/results',
+ logger,
+ });
+
+ expect(result.execution).toMatchObject({
+ turns: 51,
+ cost: 1.3491844,
+ durationApi: 490.424, // duration_api_ms (490424) expressed in seconds.
+ terminalResultSubtype: 'error_max_turns',
+ });
+ expect(result.transcript).toEqual([
+ expect.objectContaining({
+ type: 'result',
+ subtype: 'error_max_turns',
+ }),
+ ]);
+ });
+
+ it('shows line count for tool results and trims tool input', async () => {
+ const longText = Array.from({ length: 40 }, (_, index) => `line ${index + 1}`).join('\n'); // 40 lines: "line 1" .. "line 40".
+
+ queryMock.mockImplementation(async function* () {
+ yield {
+ type: 'assistant',
+ message: {
+ content: [
+ {
+ type: 'tool_use',
+ name: 'Bash',
+ input: { command: longText },
+ },
+ ],
+ },
+ };
+ yield {
+ type: 'user',
+ message: {
+ content: [
+ {
+ type: 'tool_result',
+ tool_use_id: 'tool_12345678',
+ content: longText,
+ },
+ ],
+ },
+ };
+ yield {
+ type: 'result',
+ subtype: 'success',
+ num_turns: 1,
+ total_cost_usd: 0.1,
+ duration_api_ms: 1000,
+ };
+ });
+
+ await claudeAgent.execute({
+ prompt: 'prompt',
+ projectPath: '/repo',
+ variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'medium' },
+ resultsDir: '/results',
+ logger,
+ });
+
+ expect(logger.log).toHaveBeenCalledWith(expect.stringContaining('🔧 Bash('));
+ expect(logger.log).toHaveBeenCalledWith('📎 tool_result(12345678): 40 lines'); // The label is the last 8 characters of tool_use_id.
+ });
+});
diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts
new file mode 100644
index 000000000000..7113a43d4416
--- /dev/null
+++ b/scripts/eval/lib/agents/claude-code.ts
@@ -0,0 +1,184 @@
+import type { SDKMessage } from '@anthropic-ai/claude-agent-sdk';
+import { query } from '@anthropic-ai/claude-agent-sdk';
+import {
+ AGENTS,
+ resolveClaudeSdkModel,
+ type AgentDriver,
+ type AgentExecutionResult,
+ type Execution,
+} from './config.ts';
+import { countLines, trimNonChatOutput } from '../output-preview.ts';
+import type { Logger } from '../utils.ts';
+
+export const claudeAgent: AgentDriver = { // Driver that runs a prompt through the Claude Agent SDK's query() and collects cost, turn, and timing metadata.
+ name: 'claude',
+
+ async execute({
+ prompt,
+ projectPath,
+ variant,
+ logger,
+ verbose,
+ env,
+ }): Promise {
+ if (variant.agent !== 'claude') { // Guard: this driver only accepts the 'claude' member of AgentVariant.
+ throw new Error(`Claude driver received unsupported variant: ${variant.agent}`);
+ }
+
+ const startTime = Date.now();
+ const settings = AGENTS.claude.execution;
+ const { model } = variant;
+ const effort = variant.effort as 'low' | 'medium' | 'high' | 'max'; // NOTE(review): looks redundant if variant.effort already narrows to ClaudeEffort here — confirm.
+ const sdkModel = resolveClaudeSdkModel(model);
+
+ let cost: number | undefined;
+ let turns = 0;
+ let durationApi: number | undefined;
+ const messages: unknown[] = []; // Every streamed message is kept and returned as the transcript.
+
+ try {
+ for await (const message of query({
+ prompt,
+ options: {
+ model: sdkModel,
+ cwd: projectPath,
+ env: {
+ ...process.env,
+ ...env,
+ STORYBOOK_DISABLE_TELEMETRY: '1', // Listed last so it overrides both process.env and the caller-supplied env.
+ },
+ allowedTools: [...settings.allowedTools],
+ effort,
+ debug: settings.debug,
+ systemPrompt: settings.systemPrompt,
+ },
+ })) {
+ logMessage(message, logger, verbose);
+ messages.push(message);
+
+ if (message.type === 'result') { // Totals are read from result messages; if several arrive, the last one wins.
+ cost = message.total_cost_usd as number | undefined;
+ turns = (message.num_turns as number) ?? 0;
+ durationApi =
+ typeof message.duration_api_ms === 'number'
+ ? message.duration_api_ms / 1000 // Milliseconds to seconds.
+ : undefined;
+ }
+ }
+ } catch (error) {
+ logger.logError(
+ `Claude execution failed: ${error instanceof Error ? error.message : String(error)}`
+ );
+ throw error; // Re-thrown after logging so the caller still sees the failure.
+ }
+
+ const execution: Execution = {
+ cost,
+ duration: (Date.now() - startTime) / 1000, // Wall-clock seconds since just before the SDK call.
+ durationApi,
+ turns,
+ terminalResultSubtype: getLastResultSubtype(messages),
+ };
+
+ return { execution, transcript: messages };
+ },
+};
+
+function logMessage(message: SDKMessage, logger: Logger, verbose?: boolean) { // Writes a human-readable log entry for one SDK message; message types not handled below are ignored.
+ switch (message.type) {
+ case 'assistant': {
+ for (const block of message.message.content) {
+ if (block.type === 'text') {
+ logger.log(`💬 ${block.text}`);
+ } else if (block.type === 'tool_use') {
+ logger.log(`🔧 ${block.name}(${formatToolInput(block.input)})`);
+ }
+ }
+ if (message.error) {
+ logger.logError(`Assistant error: ${message.error}`);
+ }
+ break;
+ }
+ case 'user': {
+ const content = message.message.content;
+ if (!Array.isArray(content)) break; // Only array content is scanned for tool_result blocks.
+ for (const block of content) {
+ if (block.type === 'tool_result') {
+ const text = // Flatten the result to one string: text parts verbatim, other parts as a [type] placeholder.
+ typeof block.content === 'string'
+ ? block.content
+ : Array.isArray(block.content)
+ ? block.content
+ .map((b: { type: string; text?: string }) =>
+ 'text' in b ? (b.text ?? '') : `[${b.type}]`
+ )
+ .join('')
+ : '[no content]';
+ if (verbose) { // Verbose mode prints the full tool output; otherwise only its line count.
+ logger.log(`📎 tool_result(${block.tool_use_id?.slice(-8)}): ${text}`);
+ } else {
+ const lines = countLines(text);
+ logger.log(
+ `📎 tool_result(${block.tool_use_id?.slice(-8)}): ${lines > 0 ? `${lines} lines` : '(empty)'}`
+ );
+ }
+ }
+ }
+ break;
+ }
+ case 'result':
+ if (message.subtype === 'success') {
+ logger.logSuccess(
+ `Done — ${message.num_turns} turns, $${message.total_cost_usd?.toFixed(4)}`
+ );
+ } else { // Fallback text avoids logging "undefined" or an empty tail when no error strings were reported.
+ logger.logError(`Error (${message.subtype}): ${message.errors?.join(', ') || 'no error details reported'}`);
+ }
+ break;
+ case 'system':
+ if (message.subtype === 'init') {
+ logger.log(`🚀 Session started — model: ${message.model}`);
+ } else if (message.subtype === 'api_retry') {
+ logger.log(`🔄 API retry: attempt ${message.attempt}/${message.max_retries}`);
+ } else if (message.subtype === 'status') {
+ logger.log(`📊 status: ${message.status ?? 'unknown'}`);
+ }
+ break;
+ case 'tool_use_summary':
+ logger.log(`📋 ${message.summary}`);
+ break;
+ case 'rate_limit_event':
+ logger.log(
+ `⏳ Rate limited — status: ${message.rate_limit_info?.status}, resets at: ${message.rate_limit_info?.resetsAt}`
+ );
+ break;
+ default:
+ break;
+ }
+}
+
+function formatToolInput(value: unknown) { // Renders a tool input as pretty-printed JSON, passed through trimNonChatOutput.
+ try {
+ return trimNonChatOutput(JSON.stringify(value, null, 2));
+ } catch { // JSON.stringify can throw (e.g. circular references, BigInt); fall back to String().
+ return trimNonChatOutput(String(value));
+ }
+}
+
+function getLastResultSubtype(messages: unknown[]): string | undefined { // Returns the string `subtype` of the last message whose type is 'result', or undefined if there is none.
+ for (let index = messages.length - 1; index >= 0; index -= 1) { // Scan from the end so the most recent result wins.
+ const message = messages[index];
+ if ( // Narrow the unknown value step by step before reading its fields.
+ message &&
+ typeof message === 'object' &&
+ 'type' in message &&
+ message.type === 'result' &&
+ 'subtype' in message &&
+ typeof message.subtype === 'string'
+ ) {
+ return message.subtype;
+ }
+ }
+
+ return undefined;
+}
diff --git a/scripts/eval/lib/agents/codex.test.ts b/scripts/eval/lib/agents/codex.test.ts
new file mode 100644
index 000000000000..2117a5ace3c0
--- /dev/null
+++ b/scripts/eval/lib/agents/codex.test.ts
@@ -0,0 +1,172 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+const { codexCtorMock, startThreadMock, runStreamedMock } = vi.hoisted(() => ({ // Created via vi.hoisted so the vi.mock factory below can reference them.
+ codexCtorMock: vi.fn(),
+ startThreadMock: vi.fn(),
+ runStreamedMock: vi.fn(),
+}));
+
+vi.mock('@openai/codex-sdk', () => ({
+ Codex: class MockCodex { // Stand-in class that forwards constructor and startThread arguments to the mocks above.
+ constructor(opts?: unknown) {
+ codexCtorMock(opts);
+ }
+
+ startThread(opts: unknown) {
+ return startThreadMock(opts);
+ }
+ },
+}));
+
+import { codexAgent } from './codex.ts';
+
+const logger = { // Logger stub: every method is a vi.fn() so tests can assert on what was logged.
+ log: vi.fn(),
+ logStep: vi.fn(),
+ logSuccess: vi.fn(),
+ logError: vi.fn(),
+};
+
+describe('codexAgent.execute', () => { // Each test drives the driver with a canned event stream from the mocked Codex SDK.
+ beforeEach(() => {
+ vi.clearAllMocks();
+
+ runStreamedMock.mockResolvedValue({ // Default stream: a single turn.completed event; individual tests override it as needed.
+ events: (async function* () {
+ yield {
+ type: 'turn.completed',
+ usage: {
+ input_tokens: 10,
+ cached_input_tokens: 2,
+ output_tokens: 4,
+ },
+ };
+ })(),
+ });
+
+ startThreadMock.mockReturnValue({
+ runStreamed: runStreamedMock,
+ });
+ });
+
+ it('passes STORYBOOK_DISABLE_TELEMETRY through the Codex CLI environment', async () => {
+ await codexAgent.execute({
+ prompt: 'prompt',
+ projectPath: '/repo',
+ variant: { agent: 'codex', model: 'gpt-5.4', effort: 'medium' },
+ resultsDir: '/results',
+ logger,
+ });
+
+ expect(codexCtorMock).toHaveBeenCalledWith(
+ expect.objectContaining({
+ env: expect.objectContaining({
+ STORYBOOK_DISABLE_TELEMETRY: '1',
+ }),
+ })
+ );
+ });
+
+ it('starts the thread with the expected working directory and approval policy', async () => {
+ await codexAgent.execute({
+ prompt: 'prompt',
+ projectPath: '/repo',
+ variant: { agent: 'codex', model: 'gpt-5.4', effort: 'medium' },
+ resultsDir: '/results',
+ logger,
+ });
+
+ expect(startThreadMock).toHaveBeenCalledWith(
+ expect.objectContaining({
+ model: 'gpt-5.4',
+ modelReasoningEffort: 'medium',
+ workingDirectory: '/repo',
+ approvalPolicy: 'never',
+ })
+ );
+ });
+
+ it('uses completed agent messages as the effective turn count when available', async () => {
+ runStreamedMock.mockResolvedValue({
+ events: (async function* () {
+ yield {
+ type: 'item.completed',
+ item: {
+ type: 'agent_message',
+ text: 'First response',
+ },
+ };
+ yield {
+ type: 'item.completed',
+ item: {
+ type: 'command_execution',
+ command: 'echo test',
+ exit_code: 0,
+ },
+ };
+ yield {
+ type: 'item.completed',
+ item: {
+ type: 'agent_message',
+ text: 'Second response',
+ },
+ };
+ yield {
+ type: 'turn.completed',
+ usage: {
+ input_tokens: 10,
+ cached_input_tokens: 2,
+ output_tokens: 4,
+ },
+ };
+ })(),
+ });
+
+ const result = await codexAgent.execute({
+ prompt: 'prompt',
+ projectPath: '/repo',
+ variant: { agent: 'codex', model: 'gpt-5.4', effort: 'medium' },
+ resultsDir: '/results',
+ logger,
+ });
+
+ expect(result.execution.turns).toBe(2); // Two agent_message items, although only one turn.completed event was emitted.
+ });
+
+ it('shows line count instead of command output content', async () => {
+ const longOutput = Array.from({ length: 40 }, (_, index) => `line ${index + 1}`).join('\n'); // 40 lines: "line 1" .. "line 40".
+
+ runStreamedMock.mockResolvedValue({
+ events: (async function* () {
+ yield {
+ type: 'item.completed',
+ item: {
+ type: 'command_execution',
+ command: 'cat huge.log',
+ exit_code: 0,
+ aggregated_output: longOutput,
+ },
+ };
+ yield {
+ type: 'turn.completed',
+ usage: {
+ input_tokens: 10,
+ cached_input_tokens: 2,
+ output_tokens: 4,
+ },
+ };
+ })(),
+ });
+
+ await codexAgent.execute({
+ prompt: 'prompt',
+ projectPath: '/repo',
+ variant: { agent: 'codex', model: 'gpt-5.4', effort: 'medium' },
+ resultsDir: '/results',
+ logger,
+ });
+
+ expect(logger.log).toHaveBeenCalledWith('🔧 $ cat huge.log → exit 0 (40 lines)');
+ expect(logger.log).not.toHaveBeenCalledWith(expect.stringContaining('line 1')); // Non-verbose runs must not log the raw command output.
+ });
+});
diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts
new file mode 100644
index 000000000000..c3a75c00e488
--- /dev/null
+++ b/scripts/eval/lib/agents/codex.ts
@@ -0,0 +1,121 @@
+import { Codex, type ModelReasoningEffort } from '@openai/codex-sdk';
+import {
+ AGENTS,
+ estimateCost,
+ type AgentDriver,
+ type AgentExecutionResult,
+ type Execution,
+} from './config.ts';
+import { countLines } from '../output-preview.ts';
+
+export const codexAgent: AgentDriver = { // Driver that runs a prompt on a Codex SDK thread and derives turns and an estimated cost from the event stream.
+ name: 'codex',
+
+ async execute({
+ prompt,
+ projectPath,
+ variant,
+ logger,
+ verbose,
+ env,
+ }): Promise {
+ if (variant.agent !== 'codex') { // Guard: this driver only accepts the 'codex' member of AgentVariant.
+ throw new Error(`Codex driver received unsupported variant: ${variant.agent}`);
+ }
+
+ const startTime = Date.now();
+ const settings = AGENTS.codex.execution;
+ const { model, effort } = variant;
+
+ const codex = new Codex({
+ env: {
+ ...process.env,
+ ...env,
+ STORYBOOK_DISABLE_TELEMETRY: '1', // Listed last so it overrides both process.env and the caller-supplied env.
+ },
+ });
+ const thread = codex.startThread({
+ model,
+ modelReasoningEffort: effort as ModelReasoningEffort,
+ workingDirectory: projectPath,
+ approvalPolicy: settings.approvalPolicy,
+ });
+
+ const items: unknown[] = []; // Completed items, returned as the transcript.
+ let totalInput = 0;
+ let totalCached = 0;
+ let totalOutput = 0;
+ let sdkTurns = 0;
+ let agentMessageTurns = 0;
+
+ const { events } = await thread.runStreamed(prompt);
+ for await (const event of events) {
+ switch (event.type) {
+ case 'item.completed': {
+ const item = event.item;
+ items.push(item);
+ switch (item.type) {
+ case 'agent_message':
+ agentMessageTurns += 1;
+ logger.log(`💬 ${item.text}`);
+ break;
+ case 'command_execution': {
+ const lines = countLines(item.aggregated_output);
+ logger.log(
+ `🔧 $ ${item.command} → exit ${item.exit_code ?? '?'}${lines > 0 ? ` (${lines} lines)` : ''}`
+ );
+ if (verbose && item.aggregated_output) { // The raw command output is logged only in verbose mode.
+ logger.log(` ${item.aggregated_output}`);
+ }
+ break;
+ }
+ case 'file_change':
+ for (const c of item.changes) logger.log(`📝 ${c.kind} ${c.path}`);
+ break;
+ case 'reasoning':
+ logger.log(`🧠 ${item.text}`);
+ break;
+ case 'error':
+ logger.logError(item.message);
+ break;
+ }
+ break;
+ }
+ case 'turn.completed': // Token usage is summed across turns for the cost estimate below.
+ totalInput += event.usage.input_tokens;
+ totalCached += event.usage.cached_input_tokens;
+ totalOutput += event.usage.output_tokens;
+ sdkTurns += 1;
+ logger.log(
+ `📊 tokens: ${event.usage.input_tokens}in / ${event.usage.output_tokens}out (${event.usage.cached_input_tokens} cached)`
+ );
+ break;
+ case 'turn.failed': // NOTE(review): a failed turn is only logged; the run still ends with logSuccess below — confirm this is intended.
+ logger.logError(`Turn failed: ${event.error.message}`);
+ break;
+ case 'error':
+ logger.logError(`Error: ${event.message}`);
+ break;
+ }
+ }
+
+ // Codex often reports a single SDK turn for the whole autonomous run.
+ // Counting completed assistant messages gives a more useful "effective turns" metric.
+ const turns = agentMessageTurns || sdkTurns; // Falls back to the SDK turn count when no agent_message item completed.
+
+ const execution: Execution = {
+ cost: estimateCost('codex', model, { // Estimated from the summed token counts; undefined when the model has no pricing entry.
+ inputTokens: totalInput,
+ cachedInputTokens: totalCached,
+ outputTokens: totalOutput,
+ }),
+ duration: (Date.now() - startTime) / 1000, // Wall-clock seconds.
+ turns,
+ };
+ logger.logSuccess(
+ `Done — ${turns} turns, ${Math.round(execution.duration)}s, ${totalInput}in/${totalOutput}out tokens${execution.cost != null ? `, $${execution.cost.toFixed(4)}` : ''}`
+ );
+
+ return { execution, transcript: items };
+ },
+};
diff --git a/scripts/eval/lib/agents/config.test.ts b/scripts/eval/lib/agents/config.test.ts
new file mode 100644
index 000000000000..3aec24872780
--- /dev/null
+++ b/scripts/eval/lib/agents/config.test.ts
@@ -0,0 +1,61 @@
+import { describe, expect, it } from 'vitest';
+
+import { AGENTS, getDefaultVariant } from './config.ts';
+
+describe('AGENTS', () => { // Checks the AGENTS table and getDefaultVariant against the expected defaults and shapes.
+ it('keeps each agent default inside its supported model and effort lists', () => {
+ for (const config of Object.values(AGENTS)) {
+ expect(config).toMatchObject({
+ defaultModel: expect.any(String),
+ defaultEffort: expect.any(String),
+ });
+ expect(config.models).toContain(config.defaultModel);
+ expect(config.efforts).toContain(config.defaultEffort);
+ }
+ });
+
+ it('keeps Claude models fully remappable to SDK model ids', () => {
+ expect(AGENTS.claude).toMatchObject({
+ defaultModel: 'sonnet-4.6',
+ defaultEffort: 'medium',
+ execution: {
+ allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep'],
+ permissionModel: 'tool-allowlist',
+ },
+ sdkModelIds: Object.fromEntries( // Requires a string SDK id for every entry in AGENTS.claude.models.
+ AGENTS.claude.models.map((model) => [model, expect.any(String)])
+ ),
+ });
+ });
+
+ it('keeps Codex models fully priceable from token usage', () => {
+ expect(AGENTS.codex).toMatchObject({
+ defaultModel: 'gpt-5.4',
+ defaultEffort: 'medium',
+ execution: {
+ approvalPolicy: 'never',
+ permissionModel: 'approval-policy-never',
+ },
+ pricing: {
+ 'gpt-5.4': {
+ input: 2.5,
+ cachedInput: 0.25,
+ output: 15,
+ },
+ },
+ });
+ });
+
+ it('derives default variants from the central agent definitions', () => {
+ expect(getDefaultVariant('claude')).toEqual({
+ agent: 'claude',
+ model: 'sonnet-4.6',
+ effort: 'medium',
+ });
+ expect(getDefaultVariant('codex')).toEqual({
+ agent: 'codex',
+ model: 'gpt-5.4',
+ effort: 'medium',
+ });
+ });
+});
diff --git a/scripts/eval/lib/agents/config.ts b/scripts/eval/lib/agents/config.ts
new file mode 100644
index 000000000000..71ca3986b125
--- /dev/null
+++ b/scripts/eval/lib/agents/config.ts
@@ -0,0 +1,179 @@
+/**
+ * Agent definitions, model mappings, pricing, and cost estimation.
+ */
+
+import type { Logger } from '../utils.ts';
+
+export const CLAUDE_MODELS = ['sonnet-4.6', 'opus-4.6', 'haiku-4.5'] as const; // Friendly model names; AGENTS.claude.sdkModelIds maps them to SDK ids.
+export const CODEX_MODELS = ['gpt-5.4'] as const;
+export const ALL_MODELS = [...CLAUDE_MODELS, ...CODEX_MODELS] as const;
+
+export const CLAUDE_EFFORTS = ['low', 'medium', 'high', 'max'] as const;
+export const CODEX_EFFORTS = ['low', 'medium', 'high', 'xhigh'] as const;
+export const ALL_EFFORTS = ['low', 'medium', 'high', 'max', 'xhigh'] as const; // Hand-written union of the two effort lists above; keep in sync when either changes.
+
+export const AGENT_IDS = ['claude', 'codex'] as const;
+
+export type ClaudeModel = (typeof CLAUDE_MODELS)[number]; // Element types derived from the `as const` arrays, so the lists and the types cannot drift apart.
+export type CodexModel = (typeof CODEX_MODELS)[number];
+export type ClaudeEffort = (typeof CLAUDE_EFFORTS)[number];
+export type CodexEffort = (typeof CODEX_EFFORTS)[number];
+
+/** Agent + model + effort — validated as a discriminated union at the CLI boundary. */
+export type AgentVariant =
+ | { agent: 'claude'; model: ClaudeModel; effort: ClaudeEffort }
+ | { agent: 'codex'; model: CodexModel; effort: CodexEffort };
+
+export type AgentId = AgentVariant['agent']; // 'claude' | 'codex', taken from the union's discriminant.
+
+export interface Execution { // Metrics for one agent run, as filled in by the agent drivers.
+ cost?: number; // Claude: the SDK's total_cost_usd; Codex: an estimate from token usage. May be undefined.
+ duration: number; // Wall-clock seconds (drivers compute elapsed ms / 1000).
+ durationApi?: number; // Seconds; the Claude driver derives it from duration_api_ms, the Codex driver leaves it unset.
+ turns: number;
+ terminalResultSubtype?: string; // Subtype of the last 'result' message; only the Claude driver sets it.
+}
+
+export interface AgentExecutionResult { // What a driver's execute() resolves to.
+ execution: Execution;
+ transcript: unknown[]; // Raw SDK messages (Claude) or completed items (Codex); the shape depends on the driver.
+}
+
+export interface AgentExecuteParams { // Input to AgentDriver.execute.
+ prompt: string;
+ projectPath: string; // Passed to the SDKs as the working directory (cwd / workingDirectory).
+ variant: AgentVariant;
+ resultsDir: string;
+ logger: Logger;
+ verbose?: boolean; // When true, drivers log full tool/command output instead of only a line count.
+ /**
+ * Extra env vars to forward to the agent's spawn. Merged on top of
+ * `process.env` and under the driver's fixed entries (e.g.
+ * `STORYBOOK_DISABLE_TELEMETRY`). Used by the harness to inject
+ * `EVAL_SETUP_PROMPT` so that the agent's own `npx storybook ai setup`
+ * tool call resolves to the selected prompt variant.
+ */
+ env?: Record;
+}
+
+export interface AgentDriver { // Contract implemented by each agent driver (claudeAgent, codexAgent).
+ name: AgentId;
+ execute(params: AgentExecuteParams): Promise;
+}
+
+export interface TokenPricing { // Prices per one million tokens, as consumed by estimateCost.
+ input: number;
+ cachedInput: number;
+ output: number;
+}
+
+export interface TokenUsage { // Token counts for a run, as consumed by estimateCost.
+ inputTokens: number;
+ cachedInputTokens: number; // estimateCost subtracts this from inputTokens, i.e. treats it as included in inputTokens.
+ outputTokens: number;
+}
+
+export type ClaudeTool = 'Read' | 'Write' | 'Edit' | 'Bash' | 'Glob' | 'Grep'; // Tool names that may appear in ClaudeExecutionConfig.allowedTools.
+
+export interface ClaudeExecutionConfig { // Fixed execution settings the Claude driver passes to the SDK.
+ /**
+ * Bash is toggled here at the harness level, but individual shell commands still execute through
+ * Claude's Bash tool rather than through a separate command allowlist.
+ */
+ allowedTools: readonly ClaudeTool[];
+ debug: boolean;
+ systemPrompt: { type: 'preset'; preset: 'claude_code' };
+ /** Claude access is controlled through the explicit tool allowlist above. */
+ permissionModel: 'tool-allowlist';
+}
+
+export interface CodexExecutionConfig { // Fixed execution settings for the Codex driver.
+ /** Codex runs non-interactively so benchmark runs never block on approval prompts. */
+ approvalPolicy: 'never';
+ permissionModel: 'approval-policy-never';
+}
+
+export interface AgentDefinition { // Per-agent table entry: supported models and efforts, their defaults, and execution settings.
+ models: readonly TModel[];
+ defaultModel: TModel;
+ /** Map friendly model names to SDK-specific model IDs (e.g. "sonnet-4.6" → "claude-sonnet-4-6"). */
+ sdkModelIds: Partial>;
+ /** Per-million-token pricing for manual cost estimation (agents that don't report cost natively). */
+ pricing: Partial>;
+ efforts: readonly TEffort[];
+ defaultEffort: TEffort;
+ execution: TExecution;
+}
+
+export type ClaudeDefinition = AgentDefinition;
+export type CodexDefinition = AgentDefinition;
+
+export interface AgentDefinitions { // Shape of the AGENTS table: one definition per agent id.
+ claude: ClaudeDefinition;
+ codex: CodexDefinition;
+}
+
+export const AGENTS: AgentDefinitions = { // Central table of agent settings read by the drivers, getDefaultVariant, and estimateCost.
+ claude: {
+ models: CLAUDE_MODELS,
+ defaultModel: 'sonnet-4.6',
+ sdkModelIds: {
+ 'sonnet-4.6': 'claude-sonnet-4-6',
+ 'opus-4.6': 'claude-opus-4-6',
+ 'haiku-4.5': 'claude-haiku-4-5',
+ },
+ pricing: {}, // No entries, so estimateCost returns undefined for Claude models; the Claude driver takes cost from the SDK result.
+ efforts: CLAUDE_EFFORTS,
+ defaultEffort: 'medium',
+ execution: {
+ allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep'],
+ debug: true,
+ systemPrompt: { type: 'preset', preset: 'claude_code' },
+ permissionModel: 'tool-allowlist',
+ },
+ },
+ codex: {
+ models: CODEX_MODELS,
+ defaultModel: 'gpt-5.4',
+ sdkModelIds: {}, // No remapping: the Codex driver passes the model name through unchanged.
+ pricing: {
+ 'gpt-5.4': { input: 2.5, cachedInput: 0.25, output: 15.0 }, // Per one million tokens.
+ },
+ efforts: CODEX_EFFORTS,
+ defaultEffort: 'medium',
+ execution: {
+ approvalPolicy: 'never',
+ permissionModel: 'approval-policy-never',
+ },
+ },
+};
+
+export function getDefaultVariant( // Builds the default variant for `agent` from AGENTS[agent].defaultModel and defaultEffort.
+ agent: T
+): Extract {
+ const definition = AGENTS[agent];
+ return {
+ agent,
+ model: definition.defaultModel,
+ effort: definition.defaultEffort,
+ } as Extract; // NOTE(review): assertion presumably needed because the generic lookup is not narrowed per agent — confirm.
+}
+
+export function resolveClaudeSdkModel(model: ClaudeModel): string { // Maps a friendly Claude model name to its SDK id; returns the input unchanged when no mapping exists.
+ return AGENTS.claude.sdkModelIds[model] ?? model;
+}
+
+/** Estimate cost from token usage using the pricing table. */
+export function estimateCost(agent: AgentId, model: string, usage: TokenUsage): number | undefined { // Returns undefined when the agent/model pair has no pricing entry.
+ const pricing =
+ agent === 'claude'
+ ? AGENTS.claude.pricing[model as ClaudeModel]
+ : AGENTS.codex.pricing[model as CodexModel];
+ if (!pricing) return undefined;
+ const freshInput = usage.inputTokens - usage.cachedInputTokens; // Assumes cachedInputTokens is a subset of inputTokens; the result is negative otherwise — TODO confirm.
+ return ( // Prices are per one million tokens, hence the 1_000_000 divisors.
+ (freshInput / 1_000_000) * pricing.input +
+ (usage.cachedInputTokens / 1_000_000) * pricing.cachedInput +
+ (usage.outputTokens / 1_000_000) * pricing.output
+ );
+}
diff --git a/scripts/eval/lib/baseline-template-files.ts b/scripts/eval/lib/baseline-template-files.ts
new file mode 100644
index 000000000000..9853f877363a
--- /dev/null
+++ b/scripts/eval/lib/baseline-template-files.ts
@@ -0,0 +1,1147 @@
+const ts = String.raw; // Template tag alias: literals tagged with `ts` keep backslashes verbatim (String.raw semantics).
+
+const MAIN_TS = ts`import type { StorybookConfig } from '@storybook/react-vite';
+
+const config: StorybookConfig = {
+ stories: [
+ '../src/**/*.mdx',
+ '../src/**/*.stories.@(js|jsx|mjs|ts|tsx)',
+ './eval-support/*.mdx',
+ ],
+ addons: [
+ '@chromatic-com/storybook',
+ '@storybook/addon-vitest',
+ '@storybook/addon-a11y',
+ '@storybook/addon-docs',
+ '@storybook/addon-onboarding',
+ ],
+ framework: '@storybook/react-vite',
+};
+
+export default config;
+`; // Source text of a Storybook main config (react-vite) whose stories globs also include ./eval-support/*.mdx.
+
+const PREVIEW_TSX = ts`import type { Preview } from '@storybook/react-vite';
+
+const preview: Preview = {
+ parameters: {
+ controls: {
+ matchers: {
+ color: /(background|color)$/i,
+ date: /Date$/i,
+ },
+ },
+ a11y: {
+ test: 'todo',
+ },
+ },
+};
+
+export default preview;
+`; // Source text of a Storybook preview config with control matchers and the a11y `test` parameter set to 'todo'.
+
+const EVAL_SUPPORT_SUMMARY_MDX = `import data from '../eval-results/data.json';
+
+# Eval Summary
+
+
+
+ | Project | {data.project?.name ?? '-'} |
+ | ID | {data.id ?? '-'} |
+ | Prompt | {data.prompt?.name ?? '-'} |
+ | Agent | {data.variant?.agent ?? '-'} |
+ | Model | {data.variant?.model ?? '-'} |
+ | Effort | {data.variant?.effort ?? '-'} |
+ | Score | {typeof data.score?.score === 'number' ? \`\${Math.round(data.score.score * 100)}%\` : '-'} |
+ | Build | {data.grade?.buildSuccess === true ? 'PASS' : data.grade?.buildSuccess === false ? 'FAIL' : '-'} |
+ | TypeScript errors | {data.grade?.typeCheckErrors ?? '-'} |
+ | Ghost stories | {data.grade?.ghostStories ? \`\${data.grade.ghostStories.passed}/\${data.grade.ghostStories.total} (\${Math.round(data.grade.ghostStories.successRate * 100)}%)\` : '-'} |
+ | Duration | {typeof data.execution?.duration === 'number' ? \`\${Math.round(data.execution.duration)}s\` : '-'} |
+ | Cost | {typeof data.execution?.cost === 'number' ? \`$\${data.execution.cost.toFixed(2)}\` : '-'} |
+
+
+
+## Changed Files
+
+
+ {(data.grade?.fileChanges ?? []).map((change) => (
+ -
+
{change.gitStatus} {change.path}
+
+ ))}
+
+`; // Untagged template literal: the \` and \${ sequences above are escapes that yield literal backticks and ${...} in the emitted MDX.
+
+const EVAL_SUPPORT_TRANSCRIPT_MDX = `{/* Transcript renderer copied directly from https://github.com/storybookjs/mcp/tree/main/eval/templates/result-docs/transcript.tsx and transcript.types.ts */}
+import data from '../eval-results/data.json';
+import { Transcript } from './transcript';
+
+# Transcript
+
+
+`; // MDX page text that imports ../eval-results/data.json and the Transcript component from ./transcript.
+
+const EVAL_SUPPORT_TRANSCRIPT_TSX = `/*
+ * Keep the baseline copies exact to https://github.com/storybookjs/mcp/blob/main/eval/templates/result-docs/transcript.tsx.
+ * This repo-local template keeps only the minimum lint shims required by Storybook's monorepo.
+ */
+import { useEffect, useRef, useState } from 'react';
+import type {
+ AssistantMessage,
+ TranscriptMessage,
+ TranscriptProps,
+ ResultMessage,
+ SystemMessage,
+ TextContent,
+ ToolResultContent,
+ ToolUseContent,
+ UserMessage,
+} from './transcript.types.ts';
+
+const formatJsonWithPreservedWhitespace = (obj: any): string => {
+ return JSON.stringify(obj, null, 2)
+ .replace(/\\\\\\\\n/g, '\\\\n')
+ .replace(/\\\\n/g, '\\n')
+ .replace(/\\\\\\\\t/g, '\\\\t')
+ .replace(/\\\\t/g, '\\t');
+};
+
+const truncateText = (text: string, maxLength: number): string => {
+ return text.length <= maxLength ? text : text.substring(0, maxLength) + '...';
+};
+
+const getPercentageStyle = (percentage: number): React.CSSProperties => {
+ const adjustedPercentage = Math.max(0, percentage - 5);
+ const intensity = Math.min(adjustedPercentage / 25, 1);
+ const red = Math.round(245 + (248 - 245) * intensity);
+ const green = Math.round(245 * (1 - intensity));
+ const blue = Math.round(245 * (1 - intensity));
+ const bgColor = \`rgb(\${red}, \${green}, \${blue})\`;
+ const textColor = intensity > 0.4 ? '#ffffff' : '#666';
+ return { background: bgColor, color: textColor };
+};
+
+const MetadataCard = ({
+ title,
+ value,
+ subvalue,
+ html,
+}: {
+ title: string;
+ value?: string | number;
+ subvalue?: string;
+ html?: string;
+}) => (
+
+
+ {title}
+
+ {html ? (
+
+ ) : (
+ <>
+
{value}
+ {subvalue && (
+
+ {subvalue}
+
+ )}
+ >
+ )}
+
+);
+
+const CodeBlock = ({
+ content,
+ language = '',
+ isError = false,
+}: {
+ content: string;
+ language?: string;
+ isError?: boolean;
+}) => {
+ const [isTruncated, setIsTruncated] = useState(content.length > 500);
+ const codeRef = useRef(null);
+
+ useEffect(() => {
+ if (codeRef.current && (globalThis as any).hljs) {
+ (globalThis as any).hljs.highlightElement(codeRef.current);
+ }
+ }, [content, isTruncated]);
+
+ return (
+
+
+
+ {content}
+
+ {isTruncated && content.length > 500 && (
+
+ )}
+
+ {content.length > 500 && (
+
setIsTruncated(!isTruncated)}
+ style={{
+ marginTop: '0.5rem',
+ padding: '0.5rem 1rem',
+ backgroundColor: '#3b82f6',
+ color: 'white',
+ border: 'none',
+ borderRadius: '4px',
+ cursor: 'pointer',
+ fontSize: '0.875rem',
+ }}
+ >
+ {isTruncated ? 'Show more' : 'Show less'}
+
+ )}
+
+ );
+};
+
+const ContentSection = ({ label, children }: { label: string; children: React.ReactNode }) => (
+
+
+ {label}
+
+ {children}
+
+);
+
+const FileContent = ({
+ filePath,
+ content,
+ language = 'typescript',
+}: {
+ filePath: string;
+ content: string;
+ language?: string;
+}) => (
+ <>
+
+
+ {filePath}
+
+
+
+
+
+ >
+);
+
+const FileDiff = ({
+ filePath,
+ oldString,
+ newString,
+}: {
+ filePath: string;
+ oldString: string;
+ newString: string;
+}) => {
+ const diff = \`--- a/\${filePath}
++++ b/\${filePath}
+@@ -1,\${oldString.split('\\n').length} +1,\${newString.split('\\n').length} @@
+\${oldString
+ .split('\\n')
+ .map((line) => '-' + line)
+ .join('\\n')}
+\${newString
+ .split('\\n')
+ .map((line) => '+' + line)
+ .join('\\n')}\`;
+
+ return (
+ <>
+
+
+ {filePath}
+
+
+
+
+
+ >
+ );
+};
+
+const TodoList = ({ todos }: { todos: Array<{ content: string; status?: string }> }) => (
+
+ {todos.map((todo, i) => {
+ const status = todo.status || 'pending';
+ const checked = status === 'completed';
+ const indeterminate = status === 'in_progress';
+
+ return (
+
+ {
+ if (el) el.indeterminate = indeterminate;
+ }}
+ readOnly
+ style={{ marginRight: '0.5rem' }}
+ />
+ {todo.content}
+
+ );
+ })}
+
+);
+
+const ElapsedTime = ({ elapsedMs, percentage }: { elapsedMs: number; percentage: number }) => (
+
+
+ {(elapsedMs / 1000).toFixed(1)}s
+
+ {percentage.toFixed(1)}%
+
+
+
+);
+
+const TYPE_COLORS = {
+ assistant: { bg: '#dbeafe', text: '#1e40af' },
+ user: { bg: '#f3e8ff', text: '#6b21a8' },
+ system: { bg: '#e0e7ff', text: '#3730a3' },
+ result: { bg: '#dcfce7', text: '#166534' },
+ tool: { bg: '#fef3c7', text: '#92400e' },
+ prompt: { bg: '#fce7f3', text: '#9f1239' },
+} as const;
+
+const Turn = ({
+ children,
+ type,
+ title,
+ subtitle,
+ tokenCount,
+ percentage,
+ isMCP = false,
+}: {
+ children: React.ReactNode;
+ type: keyof typeof TYPE_COLORS;
+ title: string;
+ subtitle?: string;
+ tokenCount?: string;
+ percentage?: number;
+ isMCP?: boolean;
+}) => {
+ const [isExpanded, setIsExpanded] = useState(false);
+
+ const colors = TYPE_COLORS[type] ?? TYPE_COLORS.assistant;
+
+ return (
+
+
setIsExpanded(!isExpanded)}
+ style={{
+ display: 'flex',
+ alignItems: 'center',
+ gap: '0.75rem',
+ padding: '1rem',
+ backgroundColor: '#f9fafb',
+ cursor: 'pointer',
+ userSelect: 'none',
+ }}
+ >
+
+ ▶
+
+
+ {type}
+
+ {isMCP && (
+
+ MCP
+
+ )}
+
{title}
+ {subtitle && (
+
+ {subtitle}
+
+ )}
+ {!subtitle &&
}
+ {tokenCount && (
+ <>
+
+
{tokenCount}
+ {percentage !== undefined && (
+
+ {percentage.toFixed(1)}%
+
+ )}
+ >
+ )}
+
+ {isExpanded &&
{children}
}
+
+ );
+};
+
+export const Transcript = (props: TranscriptProps) => {
+ const { prompt, promptTokenCount, messages } = props;
+
+ useEffect(() => {
+ const script = document.createElement('script');
+ script.type = 'module';
+ script.textContent = \`import hljs from 'https://esm.sh/highlight.js@11.9.0'; window.hljs = hljs;\`;
+ document.head.appendChild(script);
+
+ const style = document.createElement('link');
+ style.rel = 'stylesheet';
+ style.href = 'https://esm.sh/highlight.js@11.9.0/styles/github-dark-dimmed.css';
+ document.head.appendChild(style);
+
+ const codeStyle = document.createElement('style');
+ codeStyle.textContent = 'code * { font-family: monospace !important; }';
+ document.head.appendChild(codeStyle);
+
+ return () => {
+ document.head.removeChild(script);
+ document.head.removeChild(style);
+ document.head.removeChild(codeStyle);
+ };
+ }, []);
+
+ const systemTurn = messages.find((t) => t.type === 'system') as SystemMessage | undefined;
+ const resultTurn = messages.find((t) => t.type === 'result') as ResultMessage | undefined;
+
+ const messageTokens = messages.reduce((sum, turn) => sum + (turn.tokenCount || 0), 0);
+
+ const totalTime = messages.reduce((sum, turn) => sum + (turn.ms || 0), 0);
+ const totalMessageTokens = messageTokens;
+
+ const metadataCards = [];
+
+ if (systemTurn) {
+ metadataCards.push({
+ title: 'Agent',
+ value: systemTurn.agent || 'N/A',
+ });
+
+ metadataCards.push({
+ title: 'Model',
+ value: systemTurn.model || 'N/A',
+ });
+
+ if (systemTurn.tools) {
+ const mcpTools = systemTurn.tools.filter((t) => t.includes('mcp'));
+ metadataCards.push({
+ title: 'Available Tools',
+ value: systemTurn.tools.length,
+ subvalue: mcpTools.length > 0 ? \`\${mcpTools.length} MCP tools\` : 'unknown',
+ });
+ }
+
+ if (systemTurn.mcp_servers && systemTurn.mcp_servers.length > 0) {
+ const mcpServersHtml =
+ '' +
+ systemTurn.mcp_servers
+ .map(
+ (s) =>
+ \`
+
+ \${s.name}
+
\`,
+ )
+ .join('') +
+ '
';
+ metadataCards.push({
+ title: 'MCP Servers',
+ html: mcpServersHtml,
+ });
+ } else {
+ metadataCards.push({
+ title: 'MCP Servers',
+ value: 'None',
+ });
+ }
+ }
+
+ if (resultTurn) {
+ if (resultTurn.num_turns !== undefined) {
+ metadataCards.push({
+ title: 'Turns',
+ value: resultTurn.num_turns,
+ });
+ }
+
+ if (messageTokens > 0) {
+ metadataCards.push({
+ title: 'Total Message Tokens',
+ value: messageTokens.toLocaleString(),
+ });
+ }
+ }
+
+ const turns = messages.filter((t) => ['assistant', 'user', 'system', 'result'].includes(t.type));
+
+ const groupedTurns = groupToolCallsWithResults(turns);
+
+ return (
+
+
Agent Transcript
+
+ {metadataCards.length > 0 && (
+
+
+ Metadata
+
+
+ {metadataCards.map((card, index) => (
+
+ ))}
+
+
+ )}
+
+
+ Turns
+
+
+ {prompt && (
+
+
+
+
+
+ )}
+
+ {groupedTurns.map((group, index) => {
+ const isTodoWrite =
+ group.toolCall &&
+ 'message' in group.toolCall &&
+ group.toolCall.message?.content?.find(
+ (c) => c.type === 'tool_use' && c.name === 'TodoWrite',
+ );
+
+ if (isTodoWrite && 'input' in isTodoWrite && isTodoWrite.input?.todos) {
+ return
;
+ }
+
+ const elements = [];
+
+ if (index > 0) {
+ const currentTurn = group.toolCall || group.turn;
+ if (currentTurn) {
+ const elapsedMs = currentTurn.ms;
+ if (elapsedMs >= 50) {
+ const percentage = (elapsedMs / totalTime) * 100;
+ elements.push(
+
,
+ );
+ }
+ }
+ }
+
+ if (group.toolCall && group.toolResult) {
+ elements.push(
+
,
+ );
+ } else if (group.turn) {
+ elements.push(
+
,
+ );
+ }
+
+ return elements;
+ })}
+
+ );
+};
+
+function groupToolCallsWithResults(turns: TranscriptMessage[]): Array<{
+ toolCall?: TranscriptMessage;
+ toolResult?: TranscriptMessage;
+ turn?: TranscriptMessage;
+}> {
+ const grouped: Array<{
+ toolCall?: TranscriptMessage;
+ toolResult?: TranscriptMessage;
+ turn?: TranscriptMessage;
+ }> = [];
+ const usedResultIndices = new Set();
+
+ for (const [i, turn] of turns.entries()) {
+ if (usedResultIndices.has(i)) continue;
+
+ const toolUseContent =
+ turn.type === 'assistant' &&
+ 'message' in turn &&
+ turn.message?.content?.find((c) => c.type === 'tool_use');
+
+ if (toolUseContent && 'id' in toolUseContent) {
+ const toolUseId = toolUseContent.id;
+ const resultIndex = turns.findIndex(
+ (t, j) =>
+ j > i &&
+ t.type === 'user' &&
+ 'message' in t &&
+ t.message?.content?.some(
+ (c) => c.type === 'tool_result' && 'tool_use_id' in c && c.tool_use_id === toolUseId,
+ ),
+ );
+
+ if (resultIndex !== -1) {
+ grouped.push({ toolCall: turn, toolResult: turns[resultIndex] });
+ usedResultIndices.add(resultIndex);
+ continue;
+ }
+ }
+
+ grouped.push({ turn });
+ }
+
+ return grouped;
+}
+
+const ToolCallGroup = ({
+ toolCall,
+ toolResult,
+ totalMessageTokens,
+ cwd,
+}: {
+ toolCall: AssistantMessage;
+ toolResult: UserMessage;
+ totalMessageTokens: number;
+ cwd: string;
+}) => {
+ const toolUse = toolCall.message.content.find((c) => c.type === 'tool_use') as ToolUseContent;
+ const toolName = toolUse?.name || 'Unknown Tool';
+ const isMCP = toolUse?.isMCP;
+
+ const additionalInfo = extractToolAdditionalInfo(toolUse, toolName, cwd);
+
+ const totalTokens = (toolCall.tokenCount || 0) + (toolResult.tokenCount || 0);
+ const percentage = totalMessageTokens > 0 ? (totalTokens / totalMessageTokens) * 100 : 0;
+
+ const tokenCountStr =
+ toolCall.tokenCount && toolResult.tokenCount
+ ? \`\${toolCall.tokenCount.toLocaleString()} + \${toolResult.tokenCount.toLocaleString()} tokens\`
+ : \`\${totalTokens.toLocaleString()} tokens\`;
+
+ return (
+
+ {renderToolInput(toolUse, toolName)}
+ {renderToolOutput(toolResult, isMCP)}
+
+ );
+};
+
+const TurnRenderer = ({
+ turn,
+ totalMessageTokens,
+}: {
+ turn: TranscriptMessage;
+ totalMessageTokens: number;
+}) => {
+ const isMCP =
+ turn.type === 'assistant' &&
+ 'message' in turn &&
+ turn.message?.content?.some((c) => c.type === 'tool_use' && c.isMCP);
+
+ const title = getTurnTitle(turn);
+ const percentage =
+ turn.tokenCount && totalMessageTokens > 0 ? (turn.tokenCount / totalMessageTokens) * 100 : 0;
+
+ return (
+
+ {'message' in turn && turn.message?.content
+ ? renderTurnContent(turn.message.content)
+ : renderSystemOrResult(turn)}
+
+ );
+};
+
+function getTurnTitle(turn: TranscriptMessage): string {
+ if (turn.type === 'assistant' && 'message' in turn && turn.message?.content) {
+ const toolUse = turn.message.content.find((c) => c.type === 'tool_use');
+ if (toolUse && 'name' in toolUse) return toolUse.name;
+
+ const text = turn.message.content.find((c) => c.type === 'text');
+ if (text && 'text' in text) return truncateText(text.text, 80);
+ }
+
+ if (turn.type === 'user' && 'message' in turn && turn.message?.content) {
+ const toolResult = turn.message.content.find((c) => c.type === 'tool_result');
+ if (toolResult && 'tool_use_id' in toolResult) return \`Result: \${toolResult.tool_use_id}\`;
+ }
+
+ return 'subtype' in turn && turn.subtype ? turn.subtype : turn.type;
+}
+
+function extractToolAdditionalInfo(
+ toolUse: ToolUseContent | undefined,
+ toolName: string,
+ cwd: string,
+): string {
+ if (!toolUse?.input) return '';
+
+ if (['Read', 'Write', 'Edit'].includes(toolName)) {
+ const fullPath = toolUse.input.file_path || toolUse.input.path;
+ if (fullPath) {
+ return cwd && fullPath.startsWith(cwd)
+ ? fullPath.substring(cwd.length).replace(/^\\//, '')
+ : fullPath;
+ }
+ }
+
+ if (toolName === 'Glob' && toolUse.input.pattern) {
+ return toolUse.input.pattern;
+ }
+
+ if (toolName === 'Bash' && toolUse.input.command) {
+ const cmd = toolUse.input.command;
+ return cmd.length > 80 ? cmd.slice(0, 80) + '...' : cmd;
+ }
+
+ return '';
+}
+
+function renderToolInput(toolUse: ToolUseContent | undefined, toolName: string): React.ReactNode {
+ if (!toolUse?.input) return null;
+
+ if (toolName === 'Write' && toolUse.input.file_path && toolUse.input.content) {
+ return (
+
+ );
+ }
+
+ if (
+ toolName === 'Edit' &&
+ toolUse.input.file_path &&
+ toolUse.input.old_string &&
+ toolUse.input.new_string
+ ) {
+ return (
+
+ );
+ }
+
+ return (
+
+
+
+ );
+}
+
+function renderToolOutput(toolResult: UserMessage, isMCP: boolean): React.ReactNode {
+ const toolResultContent = toolResult.message.content.find((c) => c.type === 'tool_result');
+ if (!toolResultContent) return null;
+
+ if (isMCP) {
+ try {
+ const content =
+ typeof toolResultContent.content === 'string'
+ ? JSON.parse(toolResultContent.content)
+ : toolResultContent.content;
+
+ if (Array.isArray(content)) {
+ return (
+
+ <>
+ {content.map((item, index) => {
+ if (item.type !== 'text' || !item.text) return null;
+ return ;
+ })}
+ >
+
+ );
+ }
+ } catch {
+ // Fall through to default rendering
+ }
+ }
+
+ return (
+
+
+
+ );
+}
+
+function renderTurnContent(
+ content: (TextContent | ToolUseContent | ToolResultContent)[],
+): React.ReactNode {
+ return (
+ <>
+ {content.map((item, index) => {
+ if (item.type === 'text' && 'text' in item) {
+ return (
+
+
+
+ );
+ }
+ if (item.type === 'tool_use' && 'name' in item) {
+ return (
+
+
+
+ {item.name}
+
+
+ {'input' in item && item.input && (
+
+
+
+ )}
+
+ );
+ }
+ if (item.type === 'tool_result' && 'content' in item) {
+ return (
+
+
+
+ );
+ }
+ return null;
+ })}
+ >
+ );
+}
+
+function renderSystemOrResult(turn: TranscriptMessage): React.ReactNode {
+ const data = { ...turn };
+ delete (data as any).type;
+ delete (data as any).ms;
+ delete (data as any).uuid;
+ delete (data as any).session_id;
+ delete (data as any).parent_tool_use_id;
+ delete (data as any).tokenCount;
+ delete (data as any).costUSD;
+
+ return (
+
+
+
+ );
+}
+`;
+
+/**
+ * Contents of the generated `eval-support/transcript.types.ts` file: the content, message and
+ * prop shapes used by the transcript viewer.
+ *
+ * `ToolUseContent.input` is deliberately loose (`Record<string, any>`) because the viewer reads
+ * tool-specific keys such as `file_path`, `pattern` or `command` straight off it.
+ */
+const EVAL_SUPPORT_TRANSCRIPT_TYPES_TS = `export interface TextContent {
+  type: 'text';
+  text: string;
+}
+
+export interface ToolUseContent {
+  type: 'tool_use';
+  id: string;
+  name: string;
+  input: Record<string, any>;
+  isMCP: boolean;
+}
+
+export interface ToolResultContent {
+  tool_use_id: string;
+  type: 'tool_result';
+  content: string | Array<{ type: string; text?: string; isError?: boolean }>;
+}
+
+export interface MessageUsage {
+  input_tokens: number;
+  output_tokens: number;
+}
+
+export interface AssistantMessage {
+  type: 'assistant';
+  message: {
+    content: (TextContent | ToolUseContent)[];
+    usage: MessageUsage;
+  };
+  ms: number;
+  tokenCount?: number;
+  costUSD?: number;
+}
+
+export interface UserMessage {
+  type: 'user';
+  message: {
+    content: ToolResultContent[];
+  };
+  ms: number;
+  tokenCount?: number;
+  costUSD?: number;
+}
+
+export interface SystemMessage {
+  type: 'system';
+  subtype: 'init';
+  agent: string;
+  model: string;
+  tools: string[];
+  mcp_servers: Array<{
+    name: string;
+    status: 'connected' | 'disconnected' | 'unknown';
+  }>;
+  cwd: string;
+  ms: number;
+  tokenCount?: number;
+  costUSD?: number;
+}
+
+export interface ResultMessage {
+  type: 'result';
+  subtype: 'success' | 'error';
+  duration_ms: number;
+  duration_api_ms: number;
+  num_turns: number;
+  total_cost_usd: number;
+  ms: number;
+  tokenCount?: number;
+  costUSD?: number;
+}
+
+export type TranscriptMessage = AssistantMessage | UserMessage | SystemMessage | ResultMessage;
+
+export interface TranscriptProps {
+  prompt: string;
+  promptTokenCount: number;
+  promptCost: number;
+  messages: TranscriptMessage[];
+}
+`;
+
+/**
+ * Baseline Storybook files, keyed by relative file path with the file's full text as the value.
+ *
+ * `satisfies` keeps the literal key set on the exported type while still checking that every
+ * entry is a string.
+ * NOTE(review): paths are presumably relative to the `.storybook` directory — confirm at the
+ * call site that writes them.
+ */
+export const BASELINE_STORYBOOK_FILES = {
+  'main.ts': MAIN_TS,
+  'preview.tsx': PREVIEW_TSX,
+  'eval-support/summary.mdx': EVAL_SUPPORT_SUMMARY_MDX,
+  'eval-support/transcript.mdx': EVAL_SUPPORT_TRANSCRIPT_MDX,
+  'eval-support/transcript.tsx': EVAL_SUPPORT_TRANSCRIPT_TSX,
+  'eval-support/transcript.types.ts': EVAL_SUPPORT_TRANSCRIPT_TYPES_TS,
+} satisfies Record<string, string>;
diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts
new file mode 100644
index 000000000000..1d6cf13b1779
--- /dev/null
+++ b/scripts/eval/lib/grade.test.ts
@@ -0,0 +1,154 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+ filterStorybookFiles,
+ computeQualityScore,
+ countTypeCheckErrors,
+ parseChangedFiles,
+} from './grade.ts';
+import type { FileChange } from './grade.ts';
+
+// filterStorybookFiles keeps `.storybook/` config files and `*.stories.*` / `*.story.*` files.
+describe('filterStorybookFiles', () => {
+  it('matches files in .storybook/ directory', () => {
+    const mainConfig: FileChange = { path: '.storybook/main.ts', gitStatus: 'M' };
+    const previewConfig: FileChange = { path: '.storybook/preview.tsx', gitStatus: 'A' };
+    const appSource: FileChange = { path: 'src/App.tsx', gitStatus: 'M' };
+
+    const kept = filterStorybookFiles([mainConfig, previewConfig, appSource]);
+
+    expect(kept).toMatchObject([mainConfig, previewConfig]);
+  });
+
+  it('matches story files with various extensions', () => {
+    const storyChanges: FileChange[] = [
+      { path: 'src/Button.stories.tsx', gitStatus: 'A' },
+      { path: 'src/Header.stories.ts', gitStatus: 'A' },
+      { path: 'src/Page.story.jsx', gitStatus: 'A' },
+      { path: 'src/utils.stories.js', gitStatus: 'A' },
+    ];
+    const unrelatedChanges: FileChange[] = [
+      { path: 'src/Button.tsx', gitStatus: 'M' },
+      { path: 'src/Button.test.tsx', gitStatus: 'M' },
+    ];
+
+    const kept = filterStorybookFiles([...storyChanges, ...unrelatedChanges]);
+
+    expect(kept).toMatchObject(storyChanges);
+  });
+
+  it('returns empty for no storybook files', () => {
+    const unrelatedChanges: FileChange[] = [
+      { path: 'src/App.tsx', gitStatus: 'M' },
+      { path: 'package.json', gitStatus: 'M' },
+    ];
+
+    expect(filterStorybookFiles(unrelatedChanges)).toHaveLength(0);
+  });
+
+  it('handles empty input', () => {
+    const nothingChanged: FileChange[] = [];
+
+    expect(filterStorybookFiles(nothingChanged)).toHaveLength(0);
+  });
+
+  it('matches renamed files using either side of the rename', () => {
+    // First entry matches on its old path, second on its new path, third on neither.
+    const renamedAwayFromStory: FileChange = {
+      path: 'src/Button.tsx',
+      previousPath: 'src/Button.stories.tsx',
+      gitStatus: 'R',
+    };
+    const renamedIntoConfigDir: FileChange = {
+      path: '.storybook/preview.tsx',
+      previousPath: 'config/preview.tsx',
+      gitStatus: 'R',
+    };
+    const unrelatedRename: FileChange = {
+      path: 'src/App.tsx',
+      previousPath: 'src/Main.tsx',
+      gitStatus: 'R',
+    };
+
+    const kept = filterStorybookFiles([renamedAwayFromStory, renamedIntoConfigDir, unrelatedRename]);
+
+    expect(kept).toMatchObject([renamedAwayFromStory, renamedIntoConfigDir]);
+  });
+});
+
+// computeQualityScore: score is the share of the remaining gap to a 100% render rate that was closed.
+describe('computeQualityScore', () => {
+  it('uses normalized preview gain as the score', () => {
+    const { score, breakdown } = computeQualityScore({
+      baselinePreviewStories: { passed: 1, total: 4 },
+      storyRender: { passed: 4, total: 4 },
+    });
+
+    expect(score).toBe(1);
+    expect(breakdown.beforeRate).toBeCloseTo(0.25);
+    expect(breakdown.afterRate).toBeCloseTo(1);
+    expect(breakdown.gain).toBe(1);
+  });
+
+  it('returns 0 when either baseline or post-run story data is missing', () => {
+    const baselineOnly = computeQualityScore({ baselinePreviewStories: { passed: 1, total: 5 } });
+    const postRunOnly = computeQualityScore({ storyRender: { passed: 4, total: 5 } });
+    const neither = computeQualityScore({});
+
+    expect(baselineOnly.score).toBe(0);
+    expect(postRunOnly.score).toBe(0);
+    expect(neither.score).toBe(0);
+  });
+
+  it('returns 0 when story render coverage regresses', () => {
+    const { score, breakdown } = computeQualityScore({
+      baselinePreviewStories: { passed: 3, total: 4 },
+      storyRender: { passed: 2, total: 4 },
+    });
+
+    expect(score).toBe(0);
+    expect(breakdown.gain).toBe(0);
+  });
+
+  it('uses normalized gain for partial improvements', () => {
+    const { score, breakdown } = computeQualityScore({
+      baselinePreviewStories: { passed: 2, total: 6 },
+      storyRender: { passed: 4, total: 6 },
+    });
+
+    expect(score).toBeCloseTo(0.5);
+    expect(breakdown.beforeRate).toBe(2 / 6);
+    expect(breakdown.afterRate).toBe(4 / 6);
+    expect(breakdown.gain).toBeCloseTo(0.5);
+  });
+
+  it('returns 0 when baseline and final are both perfect (no remaining gap to improve)', () => {
+    const alreadyPerfect = { passed: 4, total: 4 };
+    const { score, breakdown } = computeQualityScore({
+      baselinePreviewStories: alreadyPerfect,
+      storyRender: alreadyPerfect,
+    });
+
+    expect(score).toBe(0);
+    expect(breakdown.gain).toBe(0);
+  });
+});
+
+// countTypeCheckErrors counts `error TS<digits>` occurrences, not lines.
+describe('countTypeCheckErrors', () => {
+  it('counts zero for clean output', () => {
+    for (const cleanOutput of ['', 'All good\nNo issues']) {
+      expect(countTypeCheckErrors(cleanOutput)).toBe(0);
+    }
+  });
+
+  it('counts TypeScript error codes', () => {
+    const tscOutput =
+      "src/App.tsx(3,1): error TS2304: Cannot find name 'foo'.\n" +
+      "src/App.tsx(5,1): error TS2322: Type 'string' is not assignable.\n" +
+      'Found 2 errors.';
+
+    expect(countTypeCheckErrors(tscOutput)).toBe(2);
+  });
+
+  it('counts multiple errors on the same line', () => {
+    const singleLine = 'error TS1234 and error TS5678 on same line';
+
+    expect(countTypeCheckErrors(singleLine)).toBe(2);
+  });
+
+  it('does not count non-error TS references', () => {
+    for (const harmless of ['TS2304 without error prefix', 'warning TS1234']) {
+      expect(countTypeCheckErrors(harmless)).toBe(0);
+    }
+  });
+});
+
+// parseChangedFiles turns tab-separated `git diff --name-status` lines into FileChange objects.
+describe('parseChangedFiles', () => {
+  it('parses added, modified, deleted, and renamed files', () => {
+    const nameStatusLines = [
+      'A\tsrc/new-file.ts',
+      'M\tsrc/existing.ts',
+      'D\tsrc/removed.ts',
+      'R100\told.ts\tnew.ts',
+    ];
+
+    const parsed = parseChangedFiles(nameStatusLines.join('\n'));
+
+    expect(parsed).toMatchObject([
+      { path: 'src/new-file.ts', gitStatus: 'A' },
+      { path: 'src/existing.ts', gitStatus: 'M' },
+      { path: 'src/removed.ts', gitStatus: 'D' },
+      { path: 'new.ts', previousPath: 'old.ts', gitStatus: 'R' },
+    ]);
+  });
+
+  it('handles empty output', () => {
+    for (const blankOutput of ['', '\n']) {
+      expect(parseChangedFiles(blankOutput)).toEqual([]);
+    }
+  });
+
+  it('handles single file', () => {
+    const parsed = parseChangedFiles('M\tpackage.json');
+
+    expect(parsed).toEqual([{ path: 'package.json', gitStatus: 'M' }]);
+  });
+});
diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts
new file mode 100644
index 000000000000..09b82dcecfea
--- /dev/null
+++ b/scripts/eval/lib/grade.ts
@@ -0,0 +1,374 @@
+import { existsSync } from 'node:fs';
+import { readFile, writeFile } from 'node:fs/promises';
+import { join } from 'node:path';
+import { x } from 'tinyexec';
+import { getComponentCandidates } from '../../../code/core/src/core-server/utils/ghost-stories/get-candidates.ts';
+import { parseVitestResults } from '../../../code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts';
+import { detectPackageManager, resolveInstallRoot } from './package-manager.ts';
+import { capitalizeFirst, type Logger } from './utils.ts';
+import type { TrialWorkspace } from './prepare-trial.ts';
+import {
+ getGeneratedStoryFiles,
+ getScriptRunCommand,
+ runStoryRenderPass,
+ type StoryRenderGrade,
+ withBaselinePreviewEnvironment,
+} from './story-render.ts';
+
+/**
+ * Change kinds tracked per file, named after the leading letter of a git `--name-status` code:
+ * A=added, M=modified, D=deleted, R=renamed.
+ */
+export type GitDiffStatus = 'A' | 'M' | 'D' | 'R';
+
+/** One entry of a `git diff --name-status` listing. */
+export interface FileChange {
+  /** Path of the changed file; for renames this is the path after the move. */
+  path: string;
+  /** Kind of change (see GitDiffStatus). */
+  gitStatus: GitDiffStatus;
+  /** For renames, the original path before the move. */
+  previousPath?: string;
+}
+
+/** Outcome of one ghost-stories run, as produced by collectGhostStoriesGrade. */
+export interface GhostStoryGrade {
+  /** Number of candidate component files handed to the test run. */
+  candidateCount: number;
+  /** Total test suites in the JSON report (`numTotalTestSuites`). */
+  total: number;
+  /** Passed test suites, not counting renders reported as passed-but-empty. */
+  passed: number;
+  /** `passed / total`. */
+  successRate: number;
+}
+
+/** Result of computeQualityScore. */
+export interface QualityScore {
+  /** Overall score; equal to `breakdown.gain`. */
+  score: number;
+  breakdown: {
+    /** Story pass rate under the baseline preview, or 0 when it could not be determined. */
+    beforeRate: number;
+    /** Story pass rate after the run, or 0 when it could not be determined. */
+    afterRate: number;
+    /** Normalized gain between the two rates, in the range [0, 1]. */
+    gain: number;
+  };
+}
+
+/** Raw signals collected by grade() for one trial. */
+export interface Grade {
+  /** True when `storybook build` exited with code 0. */
+  buildSuccess: boolean;
+  /** Tail of the combined build stdout/stderr; only set when the build failed. */
+  buildError?: string;
+  /** Number of `error TS<code>` occurrences in the `tsc --noEmit` output. */
+  typeCheckErrors: number;
+  /** Tail of the combined tsc stdout/stderr; only set when there is at least one error. */
+  typeCheckOutput?: string;
+  /** Every file changed relative to the baseline commit. */
+  fileChanges: FileChange[];
+  /** The subset of `fileChanges` selected by filterStorybookFiles. */
+  storybookChanges: FileChange[];
+  /** Ghost-stories result supplied by the caller of grade(), passed through unchanged. */
+  baselineGhostStories?: GhostStoryGrade;
+  /** Ghost-stories result measured during grading. */
+  ghostStories?: GhostStoryGrade;
+  /** Story-render summary from the pass executed inside withBaselinePreviewEnvironment. */
+  baselinePreviewStories?: StoryRenderGrade;
+  /** Story-render summary from the pass executed on the workspace as the agent left it. */
+  storyRender?: StoryRenderGrade;
+}
+
+/**
+ * Keep only the changes that touch Storybook: any path containing `.storybook/`, or a
+ * `*.stories.*` / `*.story.*` file with a js, jsx, ts or tsx extension. For entries that carry
+ * a `previousPath`, a match on either the old or the new path is enough.
+ */
+export function filterStorybookFiles(fileChanges: FileChange[]): FileChange[] {
+  const storyFilePattern = /\.(stories|story)\.[tj]sx?$/;
+
+  function touchesStorybook(candidate: string | undefined): boolean {
+    if (candidate == null) return false;
+    return candidate.includes('.storybook/') || storyFilePattern.test(candidate);
+  }
+
+  return fileChanges.filter(
+    (change) => touchesStorybook(change.path) || touchesStorybook(change.previousPath)
+  );
+}
+
+/**
+ * Compute the eval score from normalized preview gain on generated stories.
+ *
+ * Build, typecheck, runtime, and ghost stories are still recorded in the eval output,
+ * but they no longer contribute to the score itself.
+ *
+ * When the baseline is already at 100% story coverage, the score is **0** (no remaining gap).
+ *
+ * @param opts.baselinePreviewStories Pass/total counts of the story run under the baseline preview.
+ * @param opts.storyRender Pass/total counts of the story run after the agent's changes.
+ * @returns The gain as `score`, plus the two rates it was derived from. A rate that cannot be
+ *   determined is reported as 0 in the breakdown and makes the score 0.
+ */
+export function computeQualityScore(opts: {
+  baselinePreviewStories?: Pick<StoryRenderGrade, 'passed' | 'total'>;
+  storyRender?: Pick<StoryRenderGrade, 'passed' | 'total'>;
+}): QualityScore {
+  const beforeRate = getStoryRenderRate(opts.baselinePreviewStories);
+  const afterRate = getStoryRenderRate(opts.storyRender);
+  const gain = computeNormalizedGain(beforeRate, afterRate);
+
+  return {
+    score: gain,
+    breakdown: {
+      beforeRate: beforeRate ?? 0,
+      afterRate: afterRate ?? 0,
+      gain,
+    },
+  };
+}
+
+/**
+ * Count the occurrences of `error TS<digits>` in compiler output. Every occurrence counts, so
+ * two errors on one line are counted twice; summary lines such as `Found 2 errors.` do not match.
+ */
+export function countTypeCheckErrors(tscOutput: string): number {
+  const errorCodePattern = /error TS\d+/g;
+  const occurrences = tscOutput.match(errorCodePattern);
+  return occurrences === null ? 0 : occurrences.length;
+}
+
+/**
+ * Parse `git diff --name-status` output into FileChange objects.
+ *
+ * Each non-empty line is `<status>\t<path>`; renames and copies carry two paths
+ * (`R<score>\t<old>\t<new>`, `C<score>\t<source>\t<destination>`).
+ *
+ * - Renames yield the new path as `path` and the old one as `previousPath`.
+ * - Copies leave their source untouched, so they are reported as an addition (`A`) of the
+ *   destination path rather than as a single entry whose path contains both names.
+ * - Any other status is reduced by parseGitDiffStatus.
+ *
+ * @param gitOutput Raw stdout of `git diff --name-status`; may be empty.
+ * @returns One entry per non-empty line, in input order.
+ */
+export function parseChangedFiles(gitOutput: string): FileChange[] {
+  return gitOutput
+    .trim()
+    .split('\n')
+    .filter(Boolean)
+    .map((line): FileChange => {
+      const [status, ...parts] = line.split('\t');
+
+      if (status?.startsWith('C') && parts.length >= 2) {
+        const [, copiedTo] = parts;
+        return { path: copiedTo, gitStatus: 'A' };
+      }
+
+      const gitStatus = parseGitDiffStatus(status);
+
+      if (gitStatus === 'R' && parts.length >= 2) {
+        const [previousPath, path] = parts;
+        return { path, previousPath, gitStatus };
+      }
+
+      return { path: parts.join('\t'), gitStatus };
+    });
+}
+
+/**
+ * Grade one trial workspace after the agent has finished.
+ *
+ * Steps, in order:
+ * 1. Collect the files changed relative to `workspace.baselineCommit`.
+ * 2. Run `storybook build` and `tsc --noEmit` concurrently in the project directory and write
+ *    their combined output to `build-output.txt` / `typecheck-output.txt` in `resultsDir`.
+ * 3. Run ghost stories, then a story-render pass over the generated story files.
+ * 4. Run the same story-render pass again inside `withBaselinePreviewEnvironment`.
+ * 5. Score the run from the two story-render summaries only.
+ *
+ * Neither command failure throws here (`throwOnError: false`); failures are logged and recorded.
+ *
+ * @param workspace Trial workspace paths and baseline commit.
+ * @param logger Receives step, success and error messages.
+ * @param baselineGhostStories Optional ghost-stories result measured elsewhere; stored as-is.
+ * @returns The collected signals (`grade`) and the derived `score`.
+ */
+export async function grade(
+  workspace: TrialWorkspace,
+  logger: Logger,
+  baselineGhostStories?: GhostStoryGrade
+): Promise<{ grade: Grade; score: QualityScore }> {
+  const { repoRoot, projectPath, resultsDir, baselineCommit } = workspace;
+
+  // Changed files (this stages everything in the trial repo — see getChangedFiles)
+  logger.logStep('Collecting agent changes...');
+  const fileChanges = await getChangedFiles(repoRoot, baselineCommit);
+  const storybookChanges = filterStorybookFiles(fileChanges);
+  logger.logSuccess(
+    `${fileChanges.length} files changed (${storybookChanges.length} storybook-related)`
+  );
+
+  // Storybook build + TypeScript check in parallel
+  logger.logStep('Running storybook build + typecheck...');
+  const [build, tsc] = await Promise.all([
+    x('npx', ['storybook', 'build', '--quiet'], {
+      throwOnError: false,
+      timeout: 300_000,
+      nodeOptions: {
+        cwd: projectPath,
+        env: {
+          ...process.env,
+          STORYBOOK_DISABLE_TELEMETRY: '1',
+          NODE_OPTIONS: '--max_old_space_size=4096',
+        },
+      },
+    }),
+    x('npx', ['tsc', '--noEmit'], {
+      throwOnError: false,
+      timeout: 120_000,
+      nodeOptions: { cwd: projectPath },
+    }),
+  ]);
+
+  // The full build output goes to disk; only a truncated tail ends up in the grade.
+  const buildSuccess = build.exitCode === 0;
+  const buildOutput = build.stdout + '\n' + build.stderr;
+  await writeFile(join(resultsDir, 'build-output.txt'), buildOutput);
+  if (buildSuccess) {
+    logger.logSuccess('Storybook build succeeded');
+  } else {
+    logger.logError(`Storybook build failed (exit ${build.exitCode})`);
+  }
+
+  // Type errors are counted from the output text; the tsc exit code is not consulted.
+  const tscOutput = tsc.stdout + '\n' + tsc.stderr;
+  await writeFile(join(resultsDir, 'typecheck-output.txt'), tscOutput);
+  const typeCheckErrors = countTypeCheckErrors(tscOutput);
+  if (typeCheckErrors === 0) {
+    logger.logSuccess('No TypeScript errors');
+  } else {
+    logger.logError(`${typeCheckErrors} TypeScript error(s)`);
+  }
+
+  const generatedStoryFiles = getGeneratedStoryFiles(repoRoot, projectPath, fileChanges);
+
+  const ghostStories = await collectGhostStoriesGrade(projectPath, logger);
+
+  // Post-run pass: generated stories against the workspace as the agent left it.
+  const storyRenderRun = await runStoryRenderPass({
+    projectPath,
+    resultsDir,
+    storyFiles: generatedStoryFiles,
+    outputBaseName: 'story-render',
+    label: 'story tests',
+    logger,
+  });
+
+  // Without a summary the CSS check is reported as not run.
+  const cssCheck = storyRenderRun.summary?.cssCheck ?? 'not-run';
+  if (cssCheck === 'pass') {
+    logger.logSuccess('CssCheck story passed');
+  } else if (cssCheck === 'fail') {
+    logger.logError('CssCheck story failed');
+  } else {
+    logger.log('CssCheck story not run');
+  }
+
+  // Baseline pass: same story files, executed inside withBaselinePreviewEnvironment.
+  // NOTE(review): presumably that helper swaps the original preview config back in for the
+  // duration of `fn` — confirm in story-render.ts.
+  const baselinePreviewRun = await withBaselinePreviewEnvironment({
+    repoRoot,
+    baselineCommit,
+    fileChanges,
+    fn: () =>
+      runStoryRenderPass({
+        projectPath,
+        resultsDir,
+        storyFiles: generatedStoryFiles,
+        outputBaseName: 'baseline-story-render',
+        label: 'baseline story tests (original preview)',
+        logger,
+      }),
+  });
+
+  const trialGrade: Grade = {
+    buildSuccess,
+    buildError: buildSuccess ? undefined : truncateEnd(buildOutput, 2000),
+    typeCheckErrors,
+    typeCheckOutput: typeCheckErrors > 0 ? truncateEnd(tscOutput, 2000) : undefined,
+    fileChanges,
+    storybookChanges,
+    baselineGhostStories,
+    ghostStories,
+    baselinePreviewStories: baselinePreviewRun.summary,
+    storyRender: storyRenderRun.summary,
+  };
+
+  // Only the two story-render summaries feed the score.
+  const score = computeQualityScore({
+    baselinePreviewStories: baselinePreviewRun.summary,
+    storyRender: storyRenderRun.summary,
+  });
+
+  return { grade: trialGrade, score };
+}
+
+/**
+ * Pass rate (`passed / total`) of a story-render run.
+ *
+ * @param storyRender Pass/total counts, or `undefined` when the run produced no summary.
+ * @returns The rate, or `undefined` when there is no summary, no stories (`total <= 0`), or the
+ *   division does not produce a number.
+ */
+function getStoryRenderRate(
+  storyRender?: Pick<StoryRenderGrade, 'passed' | 'total'>
+): number | undefined {
+  if (!storyRender || storyRender.total <= 0) {
+    return undefined;
+  }
+
+  const rate = storyRender.passed / storyRender.total;
+  return Number.isNaN(rate) ? undefined : rate;
+}
+
+/**
+ * Keep roughly the last `maxChars` characters of `text`, dropping the partial first line of the
+ * kept tail so the result starts on a line boundary.
+ *
+ * - Text that already fits is returned unchanged.
+ * - A non-positive `maxChars` yields an empty string.
+ * - If snapping would discard everything (the tail's only newline is its last character), the
+ *   unsnapped tail is returned so a long final line is not lost.
+ */
+function truncateEnd(text: string, maxChars: number): string {
+  if (text.length <= maxChars) return text;
+  // `slice(-0)` is `slice(0)`, which would hand back the whole text instead of nothing.
+  if (maxChars <= 0) return '';
+
+  const tail = text.slice(-maxChars);
+  const firstNewline = tail.indexOf('\n');
+  if (firstNewline < 0) return tail;
+
+  const snapped = tail.slice(firstNewline + 1);
+  return snapped.length > 0 ? snapped : tail;
+}
+
+/**
+ * Reduce a raw `--name-status` code (for example `M` or `R100`) to its leading letter.
+ * Anything other than A, M, D or R — including a missing status — is reported as `M`.
+ */
+function parseGitDiffStatus(rawStatus?: string): GitDiffStatus {
+  switch (rawStatus?.charAt(0)) {
+    case 'A':
+      return 'A';
+    case 'D':
+      return 'D';
+    case 'R':
+      return 'R';
+    default:
+      return 'M';
+  }
+}
+
+/**
+ * List every file that differs from `baseline` in the repository at `repoRoot`.
+ *
+ * Side effect: stages all changes (`git add -A`) in that repository.
+ *
+ * @param repoRoot Directory in which the git commands run.
+ * @param baseline Commit to diff the staged state against.
+ * @returns The parsed `--name-status` entries; a failing diff is not thrown
+ *   (`throwOnError: false`) and whatever stdout it produced is parsed.
+ */
+async function getChangedFiles(repoRoot: string, baseline: string): Promise<FileChange[]> {
+  // Stage all files so `git diff --cached` picks up new files the agent created.
+  // Safe: this runs on an ephemeral trial copy, not the real repo.
+  await x('git', ['add', '-A'], { nodeOptions: { cwd: repoRoot } });
+  const { stdout } = await x('git', ['diff', '--cached', '--name-status', baseline], {
+    throwOnError: false,
+    nodeOptions: { cwd: repoRoot },
+  });
+  return parseChangedFiles(stdout);
+}
+
+/**
+ * Run the ghost-stories check for a project and summarise the result.
+ *
+ * Candidate component files come from `getComponentCandidates`. They are passed to the project's
+ * script run command (see `getScriptRunCommand`) both as arguments and through
+ * `STORYBOOK_COMPONENT_PATHS`, with a JSON reporter writing to a timestamped file inside
+ * `projectPath`. The summary is suite-level and read from that report.
+ *
+ * This function never throws: every failure is logged through `logger` and reported as
+ * `undefined`.
+ *
+ * @param projectPath Project directory; used as cwd and as the location of the JSON report.
+ * @param logger Receives step, success and error messages.
+ * @param label Human-readable name of the run used in log messages.
+ * @returns The summary, or `undefined` when there are no candidates, the run produced no
+ *   report, the report contains no suites, or anything threw.
+ */
+export async function collectGhostStoriesGrade(
+  projectPath: string,
+  logger: Logger,
+  label = 'ghost stories'
+): Promise<GhostStoryGrade | undefined> {
+  logger.logStep(`Running ${label}...`);
+
+  try {
+    const { candidates } = await getComponentCandidates({ sampleSize: 20, cwd: projectPath });
+    if (candidates.length === 0) {
+      logger.logError(`No candidate components found for ${label}`);
+      return undefined;
+    }
+    logger.logStep(`Found ${candidates.length} candidate component(s) for ${label}`);
+
+    const pm = detectPackageManager(resolveInstallRoot(projectPath));
+    const [runCmd, ...runArgs] = getScriptRunCommand(pm);
+    const outputFile = join(projectPath, `ghost-stories-report-${Date.now()}.json`);
+
+    const result = await x(
+      runCmd,
+      [
+        ...runArgs,
+        '--reporter=json',
+        '--testTimeout=1000',
+        `--outputFile=${outputFile}`,
+        ...candidates,
+      ],
+      {
+        throwOnError: false,
+        timeout: 300_000,
+        nodeOptions: {
+          cwd: projectPath,
+          env: {
+            ...process.env,
+            STORYBOOK_DISABLE_TELEMETRY: '1',
+            STORYBOOK_COMPONENT_PATHS: candidates.join(';'),
+          },
+        },
+      }
+    );
+
+    // A non-zero exit is only fatal when no report was written; failing tests also exit
+    // non-zero but still leave a report to parse. stderr is lower-cased for the matching below.
+    const stderr = result.stderr.toLowerCase();
+    if (result.exitCode !== 0 && !existsSync(outputFile)) {
+      const runError = stderr.includes('no tests found')
+        ? 'No tests found'
+        : stderr.includes('browsertype.launch')
+          ? 'Playwright is not installed'
+          : stderr.includes('startup error')
+            ? 'Startup Error'
+            : `Exit ${result.exitCode}`;
+      logger.logError(`${capitalizeFirst(label)}: ${runError}`);
+      return undefined;
+    }
+
+    if (!existsSync(outputFile)) {
+      logger.logError(`${capitalizeFirst(label)}: JSON report not found`);
+      return undefined;
+    }
+
+    const rawReport = JSON.parse(await readFile(outputFile, 'utf8'));
+    const parsed = parseVitestResults(rawReport);
+    const emptyRenders = parsed.summary?.passedButEmptyRender ?? 0;
+
+    // Suite-level: each file either loaded and rendered or it didn't.
+    const total: number = rawReport.numTotalTestSuites ?? 0;
+    // Clamp at zero so a report with more empty renders than passed suites cannot produce a
+    // negative count or success rate.
+    const passed = Math.max(0, (rawReport.numPassedTestSuites ?? 0) - emptyRenders);
+    const successRate = total > 0 ? passed / total : 0;
+
+    if (total === 0) {
+      logger.logError(`${capitalizeFirst(label)}: No tests found`);
+      return undefined;
+    }
+
+    logger.logSuccess(
+      `${capitalizeFirst(label)}: ${passed}/${total} passed (${Math.round(successRate * 100)}%)${emptyRenders > 0 ? ` (${emptyRenders} empty renders excluded)` : ''}`
+    );
+
+    return {
+      candidateCount: candidates.length,
+      total,
+      passed,
+      successRate,
+    };
+  } catch (error) {
+    logger.logError(
+      `${capitalizeFirst(label)}: ${error instanceof Error ? error.message : String(error)}`
+    );
+    return undefined;
+  }
+}
+
+/**
+ * Normalized preview gain: how much of the distance between the baseline pass rate and a
+ * perfect pass rate was covered by this run.
+ *
+ * - Either rate missing → 0
+ * - Baseline at (or above) 100% → 0, so a no-op on a perfect baseline is not scored as a win
+ * - Otherwise → (after − before) / (1 − before), limited to the range [0, 1]; with a baseline
+ *   of 0 this is simply `after`
+ */
+function computeNormalizedGain(beforeRate?: number, afterRate?: number) {
+  const rateMissing = beforeRate == null || afterRate == null;
+  if (rateMissing || beforeRate >= 1) {
+    return 0;
+  }
+
+  const closedFraction = (afterRate - beforeRate) / (1 - beforeRate);
+  if (Number.isNaN(closedFraction)) {
+    return 0;
+  }
+
+  return Math.min(1, Math.max(0, closedFraction));
+}
diff --git a/scripts/eval/lib/grading-helpers.test.ts b/scripts/eval/lib/grading-helpers.test.ts
new file mode 100644
index 000000000000..3f15c6089182
--- /dev/null
+++ b/scripts/eval/lib/grading-helpers.test.ts
@@ -0,0 +1,169 @@
+import { mkdirSync, writeFileSync, rmSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+
+import { afterEach, beforeEach, describe, expect, it } from 'vitest';
+
+import { getComponentCandidates } from 'storybook/internal/core-server';
+import {
+ computeQualityScore,
+ countTypeCheckErrors,
+ filterStorybookFiles,
+ parseChangedFiles,
+} from './grade.ts';
+/**
+ * Helper-level test: compose grading helpers on a fake project directory.
+ * This exercises candidate discovery, git-output parsing,
+ * and quality-score calculation without pretending to cover the full grade() flow.
+ */
+
+// Root of the throwaway project; assigned before each test and deleted after it.
+let TMP: string;
+
+beforeEach(() => {
+  TMP = join(tmpdir(), `eval-grading-helpers-${Date.now()}`);
+
+  const directories = [join(TMP, 'src', 'components'), join(TMP, '.storybook')];
+  for (const directory of directories) {
+    mkdirSync(directory, { recursive: true });
+  }
+});
+
+afterEach(() => {
+  const removal = { recursive: true, force: true };
+  rmSync(TMP, removal);
+});
+
+// Each case writes fixture files into the per-test project under TMP, discovers component
+// candidates there, and then feeds synthetic git / tsc output through the pure helpers
+// exported by grade.ts. No git, tsc or Storybook process is started.
+describe('grading helpers', () => {
+  it('composes helper signals for a well-configured project', async () => {
+    // Set up a realistic project with components and storybook config
+    writeFile(
+      'src/components/Button.tsx',
+      [
+        `import React from 'react';`,
+        `export function Button({ label }: { label: string }) {`,
+        `  return (`,
+        `    {label}`,
+        `  );`,
+        `}`,
+      ].join('\n')
+    );
+    writeFile(
+      'src/components/Card.tsx',
+      [
+        `import React from 'react';`,
+        `export function Card({ title }: { title: string }) {`,
+        `  return (`,
+        `    {title}
`,
+        `  );`,
+        `}`,
+      ].join('\n')
+    );
+    writeFile(
+      '.storybook/preview.tsx',
+      [
+        `import '../src/styles/globals.css';`,
+        `import { ThemeProvider } from '@emotion/react';`,
+      ].join('\n')
+    );
+    writeFile(
+      '.storybook/main.ts',
+      `export default { staticDirs: ['../public'], stories: ['../src/**/*.stories.tsx'] };`
+    );
+
+    // Step 1: Find candidates — both components should be discovered
+    const candidates = await findCandidates(TMP);
+    expect(candidates).toHaveLength(2);
+
+    // Step 2: Simulate git output where the agent added storybook config + one
+    // story per discovered candidate, plus modified package.json
+    const gitLines = [
+      'A\t.storybook/preview.tsx',
+      'A\t.storybook/main.ts',
+      ...candidates.map((c) => `A\t${c.replace(/\.tsx$/, '.stories.tsx')}`),
+      'M\tpackage.json',
+    ];
+    const changedFiles = parseChangedFiles(gitLines.join('\n'));
+    const storybookFiles = filterStorybookFiles(changedFiles);
+
+    // 2 config files + 1 story per candidate = storybook-related
+    expect(storybookFiles).toHaveLength(2 + candidates.length);
+    // Total includes package.json
+    expect(changedFiles).toHaveLength(storybookFiles.length + 1);
+
+    // Step 3: Score is now just story-render preview gain.
+    // Baseline 1/4 → final 4/4 closes the whole remaining gap, hence a score of 1.
+    const quality = computeQualityScore({
+      baselinePreviewStories: { passed: 1, total: 4 },
+      storyRender: { passed: 4, total: 4 },
+    });
+    expect(quality.score).toBe(1);
+  });
+
+  it('composes helper signals for a broken project', async () => {
+    writeFile(
+      'src/components/Widget.tsx',
+      [
+        `import React from 'react';`,
+        `export function Widget() {`,
+        `  return hello
;`,
+        `}`,
+      ].join('\n')
+    );
+
+    // Candidates still discoverable even when storybook setup is broken
+    const candidates = await findCandidates(TMP);
+    expect(candidates).toHaveLength(1);
+
+    // Simulate tsc output with errors proportional to candidate count
+    const tscLines = candidates.map(
+      (c, i) => `${c}(${i + 1},1): error TS2304: Cannot find name 'React'.`
+    );
+    // One extra error in a file that is not a candidate.
+    tscLines.push('src/App.tsx(10,5): error TS2345: Argument not assignable.');
+    const errorCount = countTypeCheckErrors(tscLines.join('\n'));
+    expect(errorCount).toBe(candidates.length + 1);
+
+    // Missing post-run story-render data means no measurable gain.
+    const quality = computeQualityScore({
+      baselinePreviewStories: { passed: 2, total: 5 },
+    });
+    expect(quality.score).toBe(0);
+  });
+
+  it('keeps helper output stable as candidate count grows', async () => {
+    // Rich project: many simple components
+    for (let i = 0; i < 5; i++) {
+      writeFile(
+        `src/components/Comp${i}.tsx`,
+        [
+          `import React from 'react';`,
+          `export function Comp${i}() {`,
+          `  return Component ${i}
;`,
+          `}`,
+        ].join('\n')
+      );
+    }
+    writeFile('.storybook/preview.tsx', `import { MemoryRouter } from 'react-router-dom';`);
+
+    const candidates = await findCandidates(TMP);
+    expect(candidates).toHaveLength(5);
+
+    // Agent wrote one story per candidate — all storybook-related
+    const gitOutput = candidates.map((c) => `A\t${c.replace(/\.tsx$/, '.stories.tsx')}`).join('\n');
+    const storybookFiles = filterStorybookFiles(parseChangedFiles(gitOutput));
+    expect(storybookFiles).toHaveLength(candidates.length);
+
+    // Score tracks only the increase in story-render success rate.
+    expect(
+      computeQualityScore({
+        baselinePreviewStories: { passed: 3, total: 5 },
+        storyRender: { passed: 5, total: 5 },
+      }).score
+    ).toBe(1);
+  });
+});
+
+/** Write `content` to `relativePath` inside the temp project, creating parent directories. */
+function writeFile(relativePath: string, content: string) {
+  const target = join(TMP, relativePath);
+  const parentDirectory = join(target, '..');
+
+  mkdirSync(parentDirectory, { recursive: true });
+  writeFileSync(target, content);
+}
+
+/** Discover component candidates under `cwd` and return them with the `cwd/` prefix removed. */
+async function findCandidates(cwd: string) {
+  const discovery = await getComponentCandidates({ cwd, sampleSize: 20 });
+  const prefix = `${cwd}/`;
+
+  return discovery.candidates.map((candidate) => candidate.replace(prefix, ''));
+}
diff --git a/scripts/eval/lib/output-preview.test.ts b/scripts/eval/lib/output-preview.test.ts
new file mode 100644
index 000000000000..a02c2b4f181b
--- /dev/null
+++ b/scripts/eval/lib/output-preview.test.ts
@@ -0,0 +1,20 @@
+import { describe, expect, it } from 'vitest';
+
+import { trimNonChatOutput } from './output-preview.ts';
+
+describe('trimNonChatOutput', () => {
+  it('trims large non-chat output to a shorter head and tail preview', () => {
+    // Twenty numbered lines: "line 1" through "line 20".
+    const numberedLines: string[] = [];
+    for (let lineNumber = 1; lineNumber <= 20; lineNumber++) {
+      numberedLines.push(`line ${lineNumber}`);
+    }
+
+    const preview = trimNonChatOutput(numberedLines.join('\n'));
+
+    // Head and tail survive; the middle collapses into a marker line.
+    expect(preview).toContain('line 1');
+    expect(preview).toContain('line 20');
+    expect(preview).toContain('… 8 more lines …');
+    expect(preview).not.toContain('line 7');
+  });
+
+  it('leaves short output untouched', () => {
+    const shortText = 'short output';
+    expect(trimNonChatOutput(shortText)).toBe(shortText);
+  });
+});
diff --git a/scripts/eval/lib/output-preview.ts b/scripts/eval/lib/output-preview.ts
new file mode 100644
index 000000000000..dd91347adb51
--- /dev/null
+++ b/scripts/eval/lib/output-preview.ts
@@ -0,0 +1,58 @@
+// Marker line placed between the kept head and tail when output is cut by character count.
+const OUTPUT_TRUNCATED_MARKER = '… output truncated …';
+// Line budget used by trimNonChatOutput when the caller passes no `maxLines`.
+const DEFAULT_MAX_LINES = 12;
+// Character budget used by trimNonChatOutput when the caller passes no `maxChars`.
+const DEFAULT_MAX_CHARS = 700;
+
+/**
+ * Counts the lines in `text` after trimming surrounding whitespace.
+ *
+ * @returns 0 for `undefined`, `null`, empty, or whitespace-only input;
+ *   otherwise the number of `\n`-separated lines in the trimmed text.
+ */
+export function countLines(text: string | undefined | null): number {
+  const content = text?.trim();
+  return content ? content.split('\n').length : 0;
+}
+
+/**
+ * Produces a short preview of `text`.
+ *
+ * Surrounding whitespace is trimmed, the result is cut to `maxLines` lines,
+ * and if that is still longer than `maxChars` characters it is cut again by
+ * character count. Empty (or whitespace-only) input yields an empty string.
+ */
+export function trimNonChatOutput(
+  text: string,
+  {
+    maxLines = DEFAULT_MAX_LINES,
+    maxChars = DEFAULT_MAX_CHARS,
+  }: {
+    maxLines?: number;
+    maxChars?: number;
+  } = {}
+) {
+  const body = text.trim();
+  if (body.length === 0) {
+    return body;
+  }
+
+  const byLines = trimByLines(body, maxLines);
+  return byLines.length <= maxChars ? byLines : trimByChars(byLines, maxChars);
+}
+
+/**
+ * Keeps the first and last lines of `text` when it has more than `maxLines`
+ * lines, replacing the middle with a `… N more lines …` marker line.
+ *
+ * The budget is split into `floor(maxLines / 2)` head lines and the remainder
+ * as tail lines, with at least one line kept on each side. The text is
+ * returned unchanged when it fits the budget, or when the kept head and tail
+ * already cover every line (nothing would be hidden).
+ */
+function trimByLines(text: string, maxLines: number) {
+  const lines = text.split('\n');
+  if (lines.length <= maxLines) {
+    return text;
+  }
+
+  const headCount = Math.max(1, Math.floor(maxLines / 2));
+  const tailCount = Math.max(1, maxLines - headCount);
+  const hiddenCount = lines.length - headCount - tailCount;
+
+  // With maxLines < 2 the one-line-per-side minimum can cover the whole text
+  // (e.g. two lines with maxLines = 1, or one line with maxLines = 0).
+  // Emitting "… 0 more lines …" there would make the preview longer than the
+  // input and could repeat a line, so hand the text back untouched.
+  if (hiddenCount <= 0) {
+    return text;
+  }
+
+  return [
+    ...lines.slice(0, headCount),
+    `… ${hiddenCount} more lines …`,
+    ...lines.slice(-tailCount),
+  ].join('\n');
+}
+
+/**
+ * Cuts `text` down to a head slice and a tail slice joined by the truncation
+ * marker on its own line. Each slice is at least one character long.
+ */
+function trimByChars(text: string, maxChars: number) {
+  // Reserve room for the marker plus the two newlines surrounding it.
+  const reserved = OUTPUT_TRUNCATED_MARKER.length + 2;
+  const budget = Math.max(0, maxChars - reserved);
+  const headLength = Math.max(1, Math.floor(budget / 2));
+  const tailLength = Math.max(1, budget - headLength);
+
+  const head = text.slice(0, headLength);
+  const tail = text.slice(-tailLength);
+  return [head, OUTPUT_TRUNCATED_MARKER, tail].join('\n');
+}
diff --git a/scripts/eval/lib/package-manager.test.ts b/scripts/eval/lib/package-manager.test.ts
new file mode 100644
index 000000000000..f1b59adf53d3
--- /dev/null
+++ b/scripts/eval/lib/package-manager.test.ts
@@ -0,0 +1,71 @@
+import { mkdirSync, rmSync, writeFileSync } from 'node:fs';
+import { dirname, join } from 'node:path';
+import { tmpdir } from 'node:os';
+
+import { afterEach, describe, expect, it } from 'vitest';
+
+import { detectPackageManager, resolveInstallRoot } from './package-manager.ts';
+
+// Directories registered by createTempDir; emptied and deleted after each test.
+const TEMP_DIRS: string[] = [];
+
+afterEach(() => {
+  const created = TEMP_DIRS.splice(0);
+  created.forEach((dir) => {
+    rmSync(dir, { recursive: true, force: true });
+  });
+});
+
+describe('detectPackageManager', () => {
+  it('recognizes npm from package-lock files', () => {
+    const projectRoot = createTempDir('npm-lock');
+    writeFile('package-lock.json', projectRoot);
+
+    const detected = detectPackageManager(projectRoot);
+
+    expect(detected).toBe('npm');
+  });
+});
+
+describe('resolveInstallRoot', () => {
+  it('keeps nested standalone apps on their own install root', () => {
+    const clonedRepo = createTempDir('nested-bun');
+    const appDir = join(clonedRepo, 'frontend');
+    mkdirSync(appDir, { recursive: true });
+    // The lockfile sits inside the app directory itself.
+    writeFile('frontend/bun.lock', clonedRepo);
+
+    const installRoot = resolveInstallRoot(appDir, clonedRepo);
+
+    expect(installRoot).toBe(appDir);
+  });
+
+  it('walks up to the repo workspace root when lockfiles live above projectDir', () => {
+    const clonedRepo = createTempDir('pnpm-workspace');
+    const packageDir = join(clonedRepo, 'packages', 'lib');
+    mkdirSync(packageDir, { recursive: true });
+    // Markers exist only at the repository root.
+    for (const marker of ['pnpm-lock.yaml', 'pnpm-workspace.yaml']) {
+      writeFile(marker, clonedRepo);
+    }
+
+    const installRoot = resolveInstallRoot(packageDir, clonedRepo);
+
+    expect(installRoot).toBe(clonedRepo);
+  });
+
+  it('does not walk above the cloned repo root', () => {
+    const outerDir = createTempDir('parent-lock');
+    const clonedRepo = join(outerDir, 'repo');
+    const packageDir = join(clonedRepo, 'packages', 'lib');
+    mkdirSync(packageDir, { recursive: true });
+    // The only lockfile lives one level above the repo root.
+    writeFile('yarn.lock', outerDir);
+
+    const installRoot = resolveInstallRoot(packageDir, clonedRepo);
+
+    expect(installRoot).toBe(packageDir);
+  });
+});
+
+/**
+ * Creates a uniquely named directory under the OS temp dir and registers it
+ * in TEMP_DIRS so the `afterEach` hook removes it.
+ */
+function createTempDir(name: string) {
+  const uniqueSuffix = `${Date.now()}-${Math.random().toString(16).slice(2)}`;
+  const dir = join(tmpdir(), `storybook-eval-${name}-${uniqueSuffix}`);
+  mkdirSync(dir, { recursive: true });
+  TEMP_DIRS.push(dir);
+  return dir;
+}
+
+/** Creates an empty file at `relativePath` inside `root`, making parent directories as needed. */
+function writeFile(relativePath: string, root: string) {
+  const target = join(root, relativePath);
+  const parentDir = dirname(target);
+  mkdirSync(parentDir, { recursive: true });
+  writeFileSync(target, '');
+}
diff --git a/scripts/eval/lib/package-manager.ts b/scripts/eval/lib/package-manager.ts
new file mode 100644
index 000000000000..ea61a5444e4f
--- /dev/null
+++ b/scripts/eval/lib/package-manager.ts
@@ -0,0 +1,99 @@
+/**
+ * Shared package manager detection and dependency installation.
+ *
+ * Used by trial preparation and any other eval flows that need a
+ * package-manager-aware install step.
+ */
+import { existsSync } from 'node:fs';
+import { dirname, join, resolve } from 'node:path';
+import { x } from 'tinyexec';
+import type { Logger } from './utils.ts';
+
+// Files whose presence in a directory identifies each package manager.
+// Read by detectPackageManager (per manager) and hasAnyMarker (all managers).
+const PACKAGE_MANAGER_MARKERS = {
+ pnpm: ['pnpm-lock.yaml', 'pnpm-workspace.yaml'],
+ yarn: ['yarn.lock'],
+ bun: ['bun.lockb', 'bun.lock'],
+ npm: ['package-lock.json', 'npm-shrinkwrap.json'],
+} as const;
+
+/**
+ * Detect the package manager from marker files in `dir`.
+ *
+ * Checks pnpm, then yarn, then bun, then npm, and returns the first match.
+ * Falls back to `'npm'` when no marker file is present.
+ */
+export function detectPackageManager(dir: string): string {
+  const hasMarker = (files: readonly string[]) =>
+    files.some((file) => existsSync(join(dir, file)));
+
+  const priority = ['pnpm', 'yarn', 'bun', 'npm'] as const;
+  for (const pm of priority) {
+    if (hasMarker(PACKAGE_MANAGER_MARKERS[pm])) {
+      return pm;
+    }
+  }
+  return 'npm';
+}
+
+/**
+ * Find the directory in which dependency installation should run.
+ *
+ * Starting at `dir`, each directory is checked for a package-manager marker
+ * and the first one that has a marker is returned. The climb ends at `stopAt`
+ * (when given) or at the filesystem root; if no marker was found by then,
+ * the resolved `dir` itself is returned.
+ */
+export function resolveInstallRoot(dir: string, stopAt?: string): string {
+  const origin = resolve(dir);
+  const limit = stopAt ? resolve(stopAt) : undefined;
+
+  let candidate = origin;
+  for (;;) {
+    if (hasAnyMarker(candidate)) {
+      return candidate;
+    }
+
+    const next = dirname(candidate);
+    const atBoundary = limit !== undefined && candidate === limit;
+    // dirname() returns its input unchanged once the filesystem root is reached.
+    const atFilesystemRoot = next === candidate;
+    if (atBoundary || atFilesystemRoot) {
+      return origin;
+    }
+
+    candidate = next;
+  }
+}
+
+/**
+ * Install dependencies using the detected package manager.
+ *
+ * The install runs in the directory returned by
+ * `resolveInstallRoot(dir, options?.stopAt)`, which may be an ancestor of
+ * `dir`; the log line names that directory when it differs from `dir`.
+ *
+ * @param dir Project directory whose dependencies should be installed.
+ * @param logger Receives one `logStep` message naming the package manager.
+ * @param env When provided, handed to the child process through `nodeOptions.env`.
+ * @param options `stopAt` bounds the upward search for the install root.
+ */
+export async function installDeps(
+  dir: string,
+  logger: Logger,
+  env?: Record<string, string | undefined>,
+  options?: { stopAt?: string }
+): Promise<void> {
+  const installRoot = resolveInstallRoot(dir, options?.stopAt);
+  const pm = detectPackageManager(installRoot);
+  const [cmd, args] = getInstallArgs(pm, installRoot);
+  logger.logStep(
+    installRoot === resolve(dir)
+      ? `Installing with ${pm}...`
+      : `Installing with ${pm} from ${installRoot}...`
+  );
+  // NOTE(review): the command's exit code is not inspected here — confirm that
+  // continuing after a failed install is intended.
+  await x(cmd, args, {
+    timeout: 300_000,
+    nodeOptions: { cwd: installRoot, ...(env && { env: env as NodeJS.ProcessEnv }) },
+  });
+}
+
+/** Returns true when `dir` contains a marker file of any known package manager. */
+function hasAnyMarker(dir: string): boolean {
+  for (const files of Object.values(PACKAGE_MANAGER_MARKERS)) {
+    for (const file of files) {
+      if (existsSync(join(dir, file))) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+/**
+ * Maps a package manager name to the `[command, args]` pair used to install.
+ * Any name other than pnpm, yarn, or bun is treated as npm.
+ */
+function getInstallArgs(pm: string, dir: string): [string, string[]] {
+  if (pm === 'pnpm') {
+    return ['pnpm', ['install', '--no-frozen-lockfile']];
+  }
+  if (pm === 'yarn') {
+    // A `.yarnrc.yml` in the install directory selects the `--no-immutable` form.
+    const hasYarnrc = existsSync(join(dir, '.yarnrc.yml'));
+    return ['yarn', hasYarnrc ? ['install', '--no-immutable'] : ['install']];
+  }
+  if (pm === 'bun') {
+    return ['bun', ['install']];
+  }
+  return ['npm', ['install', '--ignore-scripts']];
+}
diff --git a/scripts/eval/lib/prepare-trial.test.ts b/scripts/eval/lib/prepare-trial.test.ts
new file mode 100644
index 000000000000..d4760512ea81
--- /dev/null
+++ b/scripts/eval/lib/prepare-trial.test.ts
@@ -0,0 +1,239 @@
+import { existsSync, mkdirSync, mkdtempSync, rmSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+// Per-test scratch directory; assigned in beforeEach and cleared in afterEach.
+let TMP = '';
+
+beforeEach(() => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-prepare-trial-'));
+ // Tests register mocks with vi.doMock and then dynamically import the module under test.
+ vi.resetModules();
+});
+
+afterEach(() => {
+ // Undo the three vi.doMock registrations the tests in this file make.
+ vi.doUnmock('tinyexec');
+ vi.doUnmock('./package-manager.ts');
+ vi.doUnmock('./utils.ts');
+ vi.restoreAllMocks();
+ vi.resetModules();
+
+ if (TMP) {
+ rmSync(TMP, { recursive: true, force: true });
+ TMP = '';
+ }
+});
+
+/** Builds a logger stub whose four methods are independent `vi.fn()` spies. */
+function createLogger() {
+  const log = vi.fn();
+  const logStep = vi.fn();
+  const logSuccess = vi.fn();
+  const logError = vi.fn();
+  return { log, logStep, logSuccess, logError };
+}
+
+/** Builds the result object the mocked `x` resolves with; `stderr` is always empty. */
+function createExecResult(stdout = '', exitCode = 0) {
+  const result = { stdout, stderr: '', exitCode };
+  return result;
+}
+
+describe('prepareTrial', () => {
+ it('clones missing source repos, creates a worktree, and installs from the trial repo root', async () => {
+ const reposDir = join(TMP, 'repos');
+ const trialsDir = join(TMP, 'trials');
+ const installDeps = vi.fn().mockResolvedValue(undefined);
+ const calls: Array<{ cmd: string; args: string[]; cwd?: string }> = [];
+
+ vi.doMock('tinyexec', () => ({
+ x: vi.fn(
+ async (cmd: string, args: string[], options?: { nodeOptions?: { cwd?: string } }) => {
+ calls.push({ cmd, args, cwd: options?.nodeOptions?.cwd });
+
+ if (cmd === 'git' && args[0] === 'rev-parse') {
+ return createExecResult('deadbeef\n');
+ }
+
+ return createExecResult();
+ }
+ ),
+ }));
+
+ vi.doMock('./package-manager.ts', () => ({ installDeps }));
+ vi.doMock('./utils.ts', async () => {
+ const actual = await vi.importActual('./utils.ts');
+ return {
+ ...actual,
+ REPOS_DIR: reposDir,
+ TRIALS_DIR: trialsDir,
+ };
+ });
+
+ const { prepareTrial } = await import('./prepare-trial.ts');
+ const logger = createLogger();
+ const project = {
+ name: 'evergreen-ci',
+ repo: 'https://github.com/storybook-tmp/ui',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/ui',
+ projectDir: 'packages/lib',
+ };
+
+ const workspace = await prepareTrial(project, 'trial-123', logger);
+
+ expect(workspace).toEqual({
+ trialDir: join(trialsDir, 'trial-123'),
+ sourceDir: join(reposDir, 'evergreen-ci'),
+ repoRoot: join(trialsDir, 'trial-123', 'project'),
+ projectPath: join(trialsDir, 'trial-123', 'project', 'packages/lib'),
+ resultsDir: join(
+ trialsDir,
+ 'trial-123',
+ 'project',
+ 'packages/lib',
+ '.storybook',
+ 'eval-results'
+ ),
+ baselineCommit: 'deadbeef',
+ trialBranch: 'trial/trial-123',
+ });
+ expect(existsSync(workspace.resultsDir)).toBe(true);
+ expect(installDeps).toHaveBeenCalledWith(
+ join(trialsDir, 'trial-123', 'project', 'packages/lib'),
+ logger,
+ undefined,
+ { stopAt: join(trialsDir, 'trial-123', 'project') }
+ );
+
+ expect(calls).toEqual(
+ expect.arrayContaining([
+ {
+ cmd: 'git',
+ args: ['clone', '--branch', 'main', project.repo, join(reposDir, 'evergreen-ci')],
+ cwd: undefined,
+ },
+ {
+ cmd: 'git',
+ args: ['remote', 'set-url', 'origin', project.repo],
+ cwd: join(reposDir, 'evergreen-ci'),
+ },
+ {
+ cmd: 'git',
+ args: ['fetch', 'origin', '--prune'],
+ cwd: join(reposDir, 'evergreen-ci'),
+ },
+ {
+ cmd: 'git',
+ args: ['checkout', 'main'],
+ cwd: join(reposDir, 'evergreen-ci'),
+ },
+ {
+ cmd: 'git',
+ args: ['reset', '--hard', 'origin/main'],
+ cwd: join(reposDir, 'evergreen-ci'),
+ },
+ {
+ cmd: 'git',
+ args: ['rev-parse', 'HEAD'],
+ cwd: join(reposDir, 'evergreen-ci'),
+ },
+ {
+ cmd: 'git',
+ args: [
+ 'worktree',
+ 'add',
+ '-b',
+ 'trial/trial-123',
+ join(trialsDir, 'trial-123', 'project'),
+ 'main',
+ ],
+ cwd: join(reposDir, 'evergreen-ci'),
+ },
+ ])
+ );
+ });
+
+ it('reuses an existing source clone without recloning it', async () => {
+ const reposDir = join(TMP, 'repos');
+ const trialsDir = join(TMP, 'trials');
+ const sourceDir = join(reposDir, 'mealdrop');
+ const installDeps = vi.fn().mockResolvedValue(undefined);
+ const calls: Array<{ cmd: string; args: string[]; cwd?: string }> = [];
+
+ mkdirSync(join(sourceDir, '.git'), { recursive: true });
+
+ vi.doMock('tinyexec', () => ({
+ x: vi.fn(
+ async (cmd: string, args: string[], options?: { nodeOptions?: { cwd?: string } }) => {
+ calls.push({ cmd, args, cwd: options?.nodeOptions?.cwd });
+
+ if (cmd === 'git' && args[0] === 'rev-parse') {
+ return createExecResult('cafebabe\n');
+ }
+
+ return createExecResult();
+ }
+ ),
+ }));
+
+ vi.doMock('./package-manager.ts', () => ({ installDeps }));
+ vi.doMock('./utils.ts', async () => {
+ const actual = await vi.importActual('./utils.ts');
+ return {
+ ...actual,
+ REPOS_DIR: reposDir,
+ TRIALS_DIR: trialsDir,
+ };
+ });
+
+ const { prepareTrial } = await import('./prepare-trial.ts');
+ const logger = createLogger();
+ const project = {
+ name: 'mealdrop',
+ repo: 'https://github.com/storybook-tmp/mealdrop',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ };
+
+ const workspace = await prepareTrial(project, 'trial-456', logger);
+
+ expect(workspace.baselineCommit).toBe('cafebabe');
+ expect(calls.some((call) => call.args[0] === 'clone')).toBe(false);
+ expect(calls).toEqual(
+ expect.arrayContaining([
+ {
+ cmd: 'git',
+ args: ['remote', 'set-url', 'origin', project.repo],
+ cwd: sourceDir,
+ },
+ {
+ cmd: 'git',
+ args: ['fetch', 'origin', '--prune'],
+ cwd: sourceDir,
+ },
+ {
+ cmd: 'git',
+ args: ['checkout', 'main'],
+ cwd: sourceDir,
+ },
+ {
+ cmd: 'git',
+ args: ['reset', '--hard', 'origin/main'],
+ cwd: sourceDir,
+ },
+ {
+ cmd: 'git',
+ args: [
+ 'worktree',
+ 'add',
+ '-b',
+ 'trial/trial-456',
+ join(trialsDir, 'trial-456', 'project'),
+ 'main',
+ ],
+ cwd: sourceDir,
+ },
+ ])
+ );
+ expect(installDeps).toHaveBeenCalledTimes(1);
+ });
+});
diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts
new file mode 100644
index 000000000000..b4faec00d7e3
--- /dev/null
+++ b/scripts/eval/lib/prepare-trial.ts
@@ -0,0 +1,109 @@
+import { existsSync } from 'node:fs';
+import { mkdir, rm } from 'node:fs/promises';
+import { dirname, join } from 'node:path';
+import type { Logger } from './utils.ts';
+import type { Project } from './projects.ts';
+import { x } from 'tinyexec';
+import { installDeps } from './package-manager.ts';
+import { getEvalResultsDir, getProjectPath, REPOS_DIR, TRIALS_DIR } from './utils.ts';
+
+/** Paths and git references describing one prepared trial, as returned by prepareTrial. */
+export interface TrialWorkspace {
+ /** Directory for this trial: `TRIALS_DIR/<trialId>`. */
+ trialDir: string;
+ /** Persistent source clone for the project: `REPOS_DIR/<project.name>`. */
+ sourceDir: string;
+ /** Worktree checkout for the trial: `<trialDir>/project`. */
+ repoRoot: string;
+ /** Result of `getProjectPath(repoRoot, project.projectDir)`. */
+ projectPath: string;
+ /** Result of `getEvalResultsDir(projectPath)`; created during preparation. */
+ resultsDir: string;
+ /** `git rev-parse HEAD` of the source clone after it was synced. */
+ baselineCommit: string;
+ /** Branch created for the worktree: `trial/<trialId>`. */
+ trialBranch: string;
+}
+
+/**
+ * Maintain one persistent source clone per project and create a fresh worktree for every trial.
+ *
+ * Steps, in order: create the trial directory, ensure and sync the source
+ * clone, add a worktree on a new `trial/<trialId>` branch, create the eval
+ * results directory, and install dependencies for the project path (the
+ * install-root search is bounded by the worktree root).
+ *
+ * @param project Benchmark project to prepare.
+ * @param trialId Identifier used for the trial directory and branch name.
+ * @param logger Receives progress messages.
+ * @returns The paths, baseline commit, and branch name of the prepared trial.
+ */
+export async function prepareTrial(
+  project: Project,
+  trialId: string,
+  logger: Logger
+): Promise<TrialWorkspace> {
+  const sourceDir = join(REPOS_DIR, project.name);
+  const trialDir = join(TRIALS_DIR, trialId);
+  const repoRoot = join(trialDir, 'project');
+  const trialBranch = `trial/${trialId}`;
+  await mkdir(trialDir, { recursive: true });
+
+  await ensureSourceClone(project, sourceDir, logger);
+  const baselineCommit = await syncSourceClone(project, sourceDir, logger);
+  await createTrialWorktree({
+    sourceDir,
+    trialBranch,
+    repoRoot,
+    baseBranch: project.branch,
+    logger,
+  });
+
+  const projectPath = getProjectPath(repoRoot, project.projectDir);
+  const resultsDir = getEvalResultsDir(projectPath);
+  await mkdir(resultsDir, { recursive: true });
+  // `stopAt: repoRoot` keeps the install-root search inside the trial worktree.
+  await installDeps(projectPath, logger, undefined, { stopAt: repoRoot });
+
+  logger.logSuccess('Trial ready');
+  return { trialDir, sourceDir, repoRoot, projectPath, resultsDir, baselineCommit, trialBranch };
+}
+
+/**
+ * Ensure a git clone of `project.repo` exists at `sourceDir`.
+ *
+ * An existing clone (a `.git` entry inside `sourceDir`) is left untouched.
+ * A `sourceDir` that exists without `.git` is deleted first, then the repo is
+ * cloned at `project.branch`.
+ *
+ * @throws When `git clone` exits with a non-zero code.
+ */
+export async function ensureSourceClone(project: Project, sourceDir: string, logger: Logger) {
+  await mkdir(dirname(sourceDir), { recursive: true });
+
+  if (existsSync(join(sourceDir, '.git'))) {
+    return;
+  }
+
+  if (existsSync(sourceDir)) {
+    await rm(sourceDir, { recursive: true, force: true });
+  }
+
+  logger.logStep(`Cloning source repo ${project.repo}#${project.branch}...`);
+  // tinyexec resolves on non-zero exit codes unless `throwOnError` is set.
+  // Without it a failed clone would go unnoticed and every later git step
+  // would run against a missing repository.
+  await x('git', ['clone', '--branch', project.branch, project.repo, sourceDir], {
+    timeout: 120_000,
+    throwOnError: true,
+  });
+}
+
+/**
+ * Bring the source clone in line with the remote branch.
+ *
+ * Points `origin` at `project.repo`, fetches with pruning, checks out
+ * `project.branch`, and hard-resets it to `origin/<branch>`.
+ *
+ * @returns The commit hash of `HEAD` after the reset.
+ * @throws When any of the git commands exits with a non-zero code.
+ */
+async function syncSourceClone(project: Project, sourceDir: string, logger: Logger) {
+  logger.logStep(`Syncing ${project.name} source clone...`);
+  // tinyexec resolves on non-zero exit codes unless `throwOnError` is set.
+  // Each step depends on the previous one, so stop at the first failure
+  // instead of recording a stale baseline commit.
+  await x('git', ['remote', 'set-url', 'origin', project.repo], {
+    throwOnError: true,
+    nodeOptions: { cwd: sourceDir },
+  });
+  await x('git', ['fetch', 'origin', '--prune'], {
+    timeout: 120_000,
+    throwOnError: true,
+    nodeOptions: { cwd: sourceDir },
+  });
+  await x('git', ['checkout', project.branch], {
+    throwOnError: true,
+    nodeOptions: { cwd: sourceDir },
+  });
+  await x('git', ['reset', '--hard', `origin/${project.branch}`], {
+    throwOnError: true,
+    nodeOptions: { cwd: sourceDir },
+  });
+
+  return getGitHead(sourceDir);
+}
+
+/**
+ * Returns the trimmed output of `git rev-parse HEAD` run in `cwd`.
+ *
+ * @throws When git exits with a non-zero code, so a failure cannot surface as
+ *   an empty commit hash.
+ */
+async function getGitHead(cwd: string): Promise<string> {
+  const result = await x('git', ['rev-parse', 'HEAD'], {
+    throwOnError: true,
+    nodeOptions: { cwd },
+  });
+  return result.stdout.trim();
+}
+
+/**
+ * Adds a git worktree at `repoRoot` on a new branch `trialBranch` that starts
+ * from `baseBranch`. The command runs inside the source clone.
+ *
+ * @throws When `git worktree add` exits with a non-zero code.
+ */
+async function createTrialWorktree({
+  sourceDir,
+  trialBranch,
+  repoRoot,
+  baseBranch,
+  logger,
+}: {
+  sourceDir: string;
+  trialBranch: string;
+  repoRoot: string;
+  baseBranch: string;
+  logger: Logger;
+}) {
+  logger.logStep(`Creating worktree for ${trialBranch}...`);
+  // tinyexec resolves on non-zero exit codes unless `throwOnError` is set.
+  // Without it a failed `worktree add` would leave `repoRoot` missing while
+  // the caller carries on.
+  await x('git', ['worktree', 'add', '-b', trialBranch, repoRoot, baseBranch], {
+    timeout: 120_000,
+    throwOnError: true,
+    nodeOptions: { cwd: sourceDir },
+  });
+}
diff --git a/scripts/eval/lib/projects.test.ts b/scripts/eval/lib/projects.test.ts
new file mode 100644
index 000000000000..09f9299e0a71
--- /dev/null
+++ b/scripts/eval/lib/projects.test.ts
@@ -0,0 +1,33 @@
+import { describe, expect, it } from 'vitest';
+
+import { PROJECTS } from './projects.ts';
+
+// Matches `https://github.com/<owner>/<repo>` with no further path segments.
+const githubRepoUrl = /^https:\/\/github\.com\/[^/]+\/[^/]+$/;
+
+describe('PROJECTS', () => {
+  it('pins every benchmark project to a storybook-tmp main-branch repo', () => {
+    expect(PROJECTS.length).toBeGreaterThan(0);
+
+    PROJECTS.forEach((project) => {
+      expect(project).toMatchObject({
+        branch: 'main',
+        repo: expect.stringMatching(githubRepoUrl),
+        githubSlug: expect.stringMatching(/^storybook-tmp\/[^/]+$/),
+        description: expect.any(String),
+      });
+    });
+  });
+
+  it('keeps benchmark project metadata unambiguous', () => {
+    const uniqueNames = new Set(PROJECTS.map((project) => project.name));
+    const uniqueRepos = new Set(PROJECTS.map((project) => project.repo));
+
+    expect(uniqueNames.size).toBe(PROJECTS.length);
+    expect(uniqueRepos.size).toBe(PROJECTS.length);
+
+    // A projectDir, when set, must not be absolute or start with a `.` or `..` segment.
+    const projectDirs = PROJECTS.flatMap((project) =>
+      project.projectDir ? [project.projectDir] : []
+    );
+    for (const projectDir of projectDirs) {
+      expect(projectDir).toMatch(/^(?!\/)(?!\.\.?(?:\/|$)).+/);
+    }
+  });
+});
diff --git a/scripts/eval/lib/projects.ts b/scripts/eval/lib/projects.ts
new file mode 100644
index 000000000000..c344304dae37
--- /dev/null
+++ b/scripts/eval/lib/projects.ts
@@ -0,0 +1,70 @@
+/** One benchmark repository that eval trials run against. */
+export interface Project {
+ /** Short identifier; must be unique across PROJECTS. */
+ name: string;
+ /** Git URL of the repository. */
+ repo: string;
+ /** Branch the benchmark is pinned to. */
+ branch: string;
+ /** GitHub `owner/repo` slug. */
+ githubSlug: string;
+ /** Optional sub-directory of the repository that holds the project. */
+ projectDir?: string;
+ /** Optional free-text summary of the project. */
+ description?: string;
+}
+
+/**
+ * The benchmark projects. Every entry lives under the `storybook-tmp` GitHub
+ * organisation and is pinned to its `main` branch; entries with `projectDir`
+ * point at a sub-directory of their repository.
+ */
+export const PROJECTS: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: 'https://github.com/storybook-tmp/mealdrop',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ description: 'Styled components, Redux, React Router',
+ },
+ {
+ name: 'edgy',
+ repo: 'https://github.com/storybook-tmp/edgy',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/edgy',
+ description: 'Tailwind, HeadlessUI, React Router',
+ },
+ {
+ name: 'wikitok',
+ repo: 'https://github.com/storybook-tmp/wikitok',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/wikitok',
+ projectDir: 'frontend',
+ description: 'Simple project with Tailwind',
+ },
+ {
+ name: 'baklava',
+ repo: 'https://github.com/storybook-tmp/baklava',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/baklava',
+ description: 'Component library with Zustand',
+ },
+ {
+ name: 'echarts',
+ repo: 'https://github.com/storybook-tmp/echarts-react',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/echarts-react',
+ description: 'ECharts React wrapper',
+ },
+ {
+ name: 'evergreen-ci',
+ repo: 'https://github.com/storybook-tmp/ui',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/ui',
+ projectDir: 'packages/lib',
+ description: 'GraphQL',
+ },
+ {
+ name: 'excalidraw',
+ repo: 'https://github.com/storybook-tmp/excalidraw',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/excalidraw',
+ projectDir: 'excalidraw-app',
+ description: 'Monorepo with canvas based drawing app',
+ },
+ {
+ name: 'bluesky',
+ repo: 'https://github.com/storybook-tmp/bluesky',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/bluesky',
+ description: 'React Native + React app with highly complex providers setup',
+ },
+];
diff --git a/scripts/eval/lib/publish-trial.test.ts b/scripts/eval/lib/publish-trial.test.ts
new file mode 100644
index 000000000000..b8eb583859aa
--- /dev/null
+++ b/scripts/eval/lib/publish-trial.test.ts
@@ -0,0 +1,509 @@
+import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+// Per-test scratch directory; assigned in beforeEach and cleared in afterEach.
+let TMP = '';
+
+beforeEach(() => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-publish-trial-'));
+ // Tests register a tinyexec mock with vi.doMock and then dynamically import the module under test.
+ vi.resetModules();
+});
+
+afterEach(() => {
+ // Undo the vi.doMock('tinyexec') registration made by the tests in this file.
+ vi.doUnmock('tinyexec');
+ vi.restoreAllMocks();
+ vi.resetModules();
+
+ if (TMP) {
+ rmSync(TMP, { recursive: true, force: true });
+ TMP = '';
+ }
+});
+
+/** Builds a logger stub whose four methods are independent `vi.fn()` spies. */
+function createLogger() {
+  const log = vi.fn();
+  const logStep = vi.fn();
+  const logSuccess = vi.fn();
+  const logError = vi.fn();
+  return { log, logStep, logSuccess, logError };
+}
+
+/** Builds the result object the mocked `x` resolves with; `stderr` is always empty. */
+function createExecResult(stdout = '', exitCode = 0) {
+  const result = { stdout, stderr: '', exitCode };
+  return result;
+}
+
+/**
+ * Populates `<projectPath>/.storybook` with the files the publish step looks
+ * for: a `main.ts` config, four eval-support files, and three eval-results files.
+ */
+function writeEvalSupportFixture(projectPath: string) {
+  const storybookDir = join(projectPath, '.storybook');
+  const supportDir = join(storybookDir, 'eval-support');
+  const resultsDir = join(storybookDir, 'eval-results');
+
+  mkdirSync(supportDir, { recursive: true });
+  mkdirSync(resultsDir, { recursive: true });
+
+  const mainConfigSource = [
+    "import type { StorybookConfig } from '@storybook/react-vite';",
+    '',
+    'const config: StorybookConfig = {',
+    " stories: ['../src/**/*.stories.tsx', './eval-support/*.mdx'],",
+    '};',
+    '',
+    'export default config;',
+  ].join('\n');
+  writeFileSync(join(storybookDir, 'main.ts'), mainConfigSource);
+
+  const supportFiles = ['summary.mdx', 'transcript.mdx', 'transcript.tsx', 'transcript.types.ts'];
+  supportFiles.forEach((file) => {
+    writeFileSync(join(supportDir, file), `fixture ${file}\n`);
+  });
+
+  const resultFiles: Array<[string, string]> = [
+    ['data.json', '{}'],
+    ['build-output.txt', 'build output\n'],
+    ['typecheck-output.txt', 'typecheck output\n'],
+  ];
+  for (const [file, content] of resultFiles) {
+    writeFileSync(join(resultsDir, file), content);
+  }
+}
+
+describe('buildTrialLabels', () => {
+  it('includes eval, project, agent, model, effort, and prompt labels', async () => {
+    const { buildTrialLabels } = await import('./publish-trial.ts');
+
+    const labels = buildTrialLabels(
+      {
+        name: 'mealdrop',
+        repo: 'https://github.com/storybook-tmp/mealdrop',
+        branch: 'main',
+        githubSlug: 'storybook-tmp/mealdrop',
+      },
+      {
+        agent: 'claude',
+        model: 'sonnet-4.6',
+        effort: 'high',
+      },
+      'setup'
+    );
+
+    // One fixed `eval` label followed by one scoped label per dimension.
+    expect(labels).toEqual([
+      'eval',
+      'project:mealdrop',
+      'agent:claude',
+      'model:sonnet-4.6',
+      'effort:high',
+      'prompt:setup',
+    ]);
+  });
+});
+
+describe('publishTrialBranch', () => {
+ it('validates shared eval support, passes the PR body in-memory, and leaves Storybook config untouched', async () => {
+ const calls: Array<{ cmd: string; args: string[]; cwd?: string }> = [];
+
+ vi.doMock('tinyexec', () => ({
+ x: vi.fn(
+ async (cmd: string, args: string[], options?: { nodeOptions?: { cwd?: string } }) => {
+ calls.push({ cmd, args, cwd: options?.nodeOptions?.cwd });
+
+ if (cmd === 'gh' && args[0] === 'label' && args[1] === 'list') {
+ return createExecResult('');
+ }
+
+ if (cmd === 'git' && args[0] === 'config' && args.length === 2) {
+ return createExecResult('', 1);
+ }
+
+ if (cmd === 'gh' && args[0] === 'pr' && args[1] === 'create') {
+ return createExecResult('https://github.com/storybook-tmp/mealdrop/pull/123\n');
+ }
+
+ return createExecResult();
+ }
+ ),
+ }));
+
+ const { publishTrialBranch } = await import('./publish-trial.ts');
+ const repoRoot = join(TMP, 'repo');
+ const projectPath = join(repoRoot, 'packages', 'app');
+ const resultsDir = join(projectPath, '.storybook', 'eval-results');
+ const configPath = join(projectPath, '.storybook', 'main.ts');
+
+ writeEvalSupportFixture(projectPath);
+ const originalConfig = readFileSync(configPath, 'utf-8');
+
+ const publish = await publishTrialBranch({
+ data: {
+ schemaVersion: 4,
+ id: 'trial-123',
+ timestamp: '2026-04-02T00:00:00.000Z',
+ project: {
+ name: 'mealdrop',
+ repo: 'https://github.com/storybook-tmp/mealdrop',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ variant: {
+ agent: 'claude',
+ model: 'sonnet-4.6',
+ effort: 'high',
+ },
+ prompt: {
+ name: 'setup',
+ content: 'prompt body',
+ },
+ baselineCommit: 'deadbeef',
+ environment: {
+ nodeVersion: 'v22.21.1',
+ evalBranch: 'test-branch',
+ evalCommit: 'abc123',
+ },
+ execution: {
+ cost: 0.91,
+ duration: 45,
+ turns: 4,
+ terminalResultSubtype: 'success',
+ },
+ grade: {
+ baselineGhostStories: {
+ candidateCount: 6,
+ total: 4,
+ passed: 1,
+ successRate: 0.25,
+ },
+ baselinePreviewStories: {
+ total: 8,
+ passed: 4,
+ storyFiles: 3,
+ cssCheck: 'not-run' as const,
+ },
+ buildSuccess: true,
+ typeCheckErrors: 0,
+ fileChanges: [],
+ storybookChanges: [],
+ ghostStories: {
+ candidateCount: 6,
+ total: 4,
+ passed: 3,
+ successRate: 0.75,
+ },
+ storyRender: {
+ total: 8,
+ passed: 6,
+ storyFiles: 3,
+ cssCheck: 'not-run' as const,
+ },
+ },
+ score: {
+ score: 0.5,
+ breakdown: {
+ beforeRate: 0.5,
+ afterRate: 0.75,
+ gain: 0.5,
+ },
+ },
+ transcript: [],
+ artifacts: {
+ buildOutput: { path: '.storybook/eval-results/build-output.txt', success: true },
+ typecheckOutput: {
+ path: '.storybook/eval-results/typecheck-output.txt',
+ errorCount: 0,
+ },
+ },
+ docs: {
+ transcript: {
+ prompt: 'prompt body',
+ promptTokenCount: 3,
+ promptCost: 0,
+ messages: [],
+ },
+ },
+ },
+ workspace: {
+ trialDir: join(TMP, 'trial'),
+ sourceDir: join(TMP, 'source'),
+ repoRoot,
+ projectPath,
+ resultsDir,
+ baselineCommit: 'deadbeef',
+ trialBranch: 'trial/foo',
+ },
+ logger: createLogger(),
+ });
+
+ expect(publish).toMatchObject({
+ branch: 'trial/foo',
+ labels: expect.arrayContaining(['prompt:setup']),
+ url: 'https://github.com/storybook-tmp/mealdrop/pull/123',
+ });
+
+ expect(readFileSync(configPath, 'utf-8')).toBe(originalConfig);
+
+ const prCreateCall = calls.find(
+ (call) => call.cmd === 'gh' && call.args[0] === 'pr' && call.args[1] === 'create'
+ );
+ expect(prCreateCall).toBeDefined();
+ const prBody = prCreateCall!.args[prCreateCall!.args.indexOf('--body') + 1];
+ expect(prBody).toContain('ID: `trial-123`');
+ expect(prBody).toContain('Created at: `Apr 2 2026 00:00:00 UTC`');
+ expect(prBody).toContain('Score (preview gain): `50%`');
+ expect(prBody).toContain('Ghost stories before: `1/4 (25%)`');
+ expect(prBody).toContain('Ghost stories after: `3/4 (75%)`');
+ expect(prBody).toContain('Vitest pass rate before preview changes: `4/8 (50%)`');
+ expect(prBody).toContain('Vitest pass rate after preview changes: `6/8 (75%)`');
+ expect(prBody).toContain('CssCheck: `not-run`');
+ expect(prBody).toContain('[.storybook/eval-results/data.json](');
+ expect(prBody).toContain('Full prompt');
+ expect(prBody.match(//g)).toHaveLength(1);
+ expect(prBody).not.toContain('src/Button.stories.Primary.chromium.png');
+ expect(prBody).not.toContain('Screenshot');
+ expect(existsSync(join(resultsDir, 'pr-body.md'))).toBe(false);
+
+ expect(calls).toEqual(
+ expect.arrayContaining([
+ {
+ cmd: 'git',
+ args: ['add', '-A'],
+ cwd: repoRoot,
+ },
+ {
+ cmd: 'git',
+ args: ['commit', '--no-verify', '-m', 'eval: trial-123'],
+ cwd: repoRoot,
+ },
+ {
+ cmd: 'git',
+ args: ['push', '--set-upstream', 'origin', 'trial/foo'],
+ cwd: repoRoot,
+ },
+ ])
+ );
+
+ const labelCreateCalls = calls.filter(
+ (call) => call.cmd === 'gh' && call.args[0] === 'label' && call.args[1] === 'create'
+ );
+ for (const call of labelCreateCalls) {
+ expect(call.args).not.toContain('--color');
+ }
+ });
+
+ it('fails with a clear error when eval support files are missing', async () => {
+ vi.doMock('tinyexec', () => ({
+ x: vi.fn(async (cmd: string, args: string[]) => {
+ if (cmd === 'gh' && args[0] === 'label' && args[1] === 'list') {
+ return createExecResult('');
+ }
+
+ return createExecResult();
+ }),
+ }));
+
+ const { publishTrialBranch } = await import('./publish-trial.ts');
+ const repoRoot = join(TMP, 'repo');
+ const projectPath = join(repoRoot, 'packages', 'app');
+ const resultsDir = join(projectPath, '.storybook', 'eval-results');
+
+ mkdirSync(join(projectPath, '.storybook'), { recursive: true });
+ mkdirSync(resultsDir, { recursive: true });
+ writeFileSync(
+ join(projectPath, '.storybook', 'main.ts'),
+ "export default { stories: ['../src/**/*.stories.tsx'] };"
+ );
+
+ await expect(
+ publishTrialBranch({
+ data: {
+ schemaVersion: 4,
+ id: 'trial-456',
+ timestamp: '2026-04-02T00:00:00.000Z',
+ project: {
+ name: 'mealdrop',
+ repo: 'https://github.com/storybook-tmp/mealdrop',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ variant: {
+ agent: 'codex',
+ model: 'gpt-5.4',
+ effort: 'high',
+ },
+ prompt: {
+ name: 'setup',
+ content: 'prompt body',
+ },
+ baselineCommit: 'deadbeef',
+ environment: {
+ nodeVersion: 'v22.21.1',
+ evalBranch: 'test-branch',
+ evalCommit: 'abc123',
+ },
+ execution: {
+ duration: 30,
+ turns: 1,
+ },
+ grade: {
+ buildSuccess: true,
+ typeCheckErrors: 0,
+ fileChanges: [],
+ storybookChanges: [],
+ },
+ score: {
+ score: 1,
+ breakdown: {
+ beforeRate: 0,
+ afterRate: 1,
+ gain: 1,
+ },
+ },
+ transcript: [],
+ artifacts: {
+ buildOutput: {
+ path: '.storybook/eval-results/build-output.txt',
+ success: true,
+ },
+ typecheckOutput: {
+ path: '.storybook/eval-results/typecheck-output.txt',
+ errorCount: 0,
+ },
+ },
+ docs: {
+ transcript: {
+ prompt: 'prompt body',
+ promptTokenCount: 3,
+ promptCost: 0,
+ messages: [],
+ },
+ },
+ },
+ workspace: {
+ trialDir: join(TMP, 'trial'),
+ sourceDir: join(TMP, 'source'),
+ repoRoot,
+ projectPath,
+ resultsDir,
+ baselineCommit: 'deadbeef',
+ trialBranch: 'trial/bar',
+ },
+ logger: createLogger(),
+ })
+ ).rejects.toThrow('Eval support is not configured for mealdrop');
+ });
+
+ it('does not recreate labels that already exist in the repo', async () => {
+ const calls: Array<{ cmd: string; args: string[]; cwd?: string }> = [];
+
+ vi.doMock('tinyexec', () => ({
+ x: vi.fn(
+ async (cmd: string, args: string[], options?: { nodeOptions?: { cwd?: string } }) => {
+ calls.push({ cmd, args, cwd: options?.nodeOptions?.cwd });
+
+ if (cmd === 'gh' && args[0] === 'label' && args[1] === 'list') {
+ return createExecResult(
+ [
+ 'eval\tAutomated eval label for eval\t#D93F0B',
+ 'project:mealdrop\tAutomated eval label for project:mealdrop\t#1D76DB',
+ 'agent:claude\tAutomated eval label for agent:claude\t#C5DEF5',
+ 'model:sonnet-4.6\tAutomated eval label for model:sonnet-4.6\t#FBCA04',
+ 'effort:high\tAutomated eval label for effort:high\t#0E8A16',
+ 'prompt:setup\tAutomated eval label for prompt:setup\t#BFDADC',
+ ].join('\n')
+ );
+ }
+
+ if (cmd === 'git' && args[0] === 'config' && args.length === 2) {
+ return createExecResult('', 1);
+ }
+
+ if (cmd === 'gh' && args[0] === 'pr' && args[1] === 'create') {
+ return createExecResult('https://github.com/storybook-tmp/mealdrop/pull/789\n');
+ }
+
+ return createExecResult();
+ }
+ ),
+ }));
+
+ const { publishTrialBranch } = await import('./publish-trial.ts');
+ const repoRoot = join(TMP, 'repo');
+ const projectPath = join(repoRoot, 'packages', 'app');
+ const resultsDir = join(projectPath, '.storybook', 'eval-results');
+
+ writeEvalSupportFixture(projectPath);
+
+ await publishTrialBranch({
+ data: {
+ schemaVersion: 4,
+ id: 'trial-789',
+ timestamp: '2026-04-02T00:00:00.000Z',
+ project: {
+ name: 'mealdrop',
+ repo: 'https://github.com/storybook-tmp/mealdrop',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ variant: {
+ agent: 'claude',
+ model: 'sonnet-4.6',
+ effort: 'high',
+ },
+ prompt: {
+ name: 'setup',
+ content: 'prompt body',
+ },
+ baselineCommit: 'deadbeef',
+ environment: {
+ nodeVersion: 'v22.21.1',
+ evalBranch: 'test-branch',
+ evalCommit: 'abc123',
+ },
+ execution: {
+ duration: 30,
+ turns: 1,
+ },
+ grade: {
+ buildSuccess: true,
+ typeCheckErrors: 0,
+ fileChanges: [],
+ storybookChanges: [],
+ },
+ score: {
+ score: 1,
+ breakdown: {
+ beforeRate: 0,
+ afterRate: 1,
+ gain: 1,
+ },
+ },
+ transcript: [],
+ artifacts: {
+ buildOutput: { path: '.storybook/eval-results/build-output.txt', success: true },
+ typecheckOutput: {
+ path: '.storybook/eval-results/typecheck-output.txt',
+ errorCount: 0,
+ },
+ },
+ docs: {
+ transcript: {
+ prompt: 'prompt body',
+ promptTokenCount: 3,
+ promptCost: 0,
+ messages: [],
+ },
+ },
+ },
+ workspace: {
+ trialDir: join(TMP, 'trial'),
+ sourceDir: join(TMP, 'source'),
+ repoRoot,
+ projectPath,
+ resultsDir,
+ baselineCommit: 'deadbeef',
+ trialBranch: 'trial/baz',
+ },
+ logger: createLogger(),
+ });
+
+ const labelCreateCalls = calls.filter(
+ (call) => call.cmd === 'gh' && call.args[0] === 'label' && call.args[1] === 'create'
+ );
+ expect(labelCreateCalls).toHaveLength(0);
+ });
+});
diff --git a/scripts/eval/lib/publish-trial.ts b/scripts/eval/lib/publish-trial.ts
new file mode 100644
index 000000000000..8e604ddd304b
--- /dev/null
+++ b/scripts/eval/lib/publish-trial.ts
@@ -0,0 +1,309 @@
+import { existsSync } from 'node:fs';
+import { readFile } from 'node:fs/promises';
+import { join, relative } from 'node:path';
+import { x } from 'tinyexec';
+import type { TrialWorkspace } from './prepare-trial.ts';
+import type { EvalData } from './result-docs.ts';
+import {
+ formatCost,
+ formatDuration,
+ formatReadableUtcTimestamp,
+ formatScorePercent,
+ getEvalResultsDir,
+ getEvalResultsRelativePath,
+ type Logger,
+} from './utils.ts';
+
+/** Result of publishing a trial: the pushed branch, the labels added to its PR, and the PR URL. */
+export interface PublishMetadata {
+ branch: string;
+ labels: string[];
+ url: string;
+}
+
+/**
+ * Builds the label names attached to a trial's pull request.
+ *
+ * The list starts with the generic `eval` label, followed by one `key:value` label each for the
+ * project name, agent, model, effort and prompt name, in that order.
+ *
+ * @param project - Project whose `name` becomes the `project:` label.
+ * @param variant - Agent variant supplying the `agent:`, `model:` and `effort:` labels.
+ * @param prompt - Prompt name used for the `prompt:` label.
+ * @returns The six label names.
+ */
+export function buildTrialLabels(
+  project: EvalData['project'],
+  variant: EvalData['variant'],
+  prompt: string
+) {
+  const dimensions = [
+    ['project', project.name],
+    ['agent', variant.agent],
+    ['model', variant.model],
+    ['effort', variant.effort],
+    ['prompt', prompt],
+  ];
+
+  return ['eval', ...dimensions.map(([key, value]) => `${key}:${value}`)];
+}
+
+/**
+ * Commits the trial workspace, pushes its branch, and opens a labelled draft pull request.
+ *
+ * Steps, in order: validate the eval-support files, render the PR body, commit everything in
+ * `workspace.repoRoot` (with `--no-verify`), push `workspace.trialBranch` to `origin`, make sure
+ * the labels exist, create the draft PR with `gh`, then add the labels to it.
+ *
+ * Every git / `gh` command issued here must succeed for the next step to make sense, so each one
+ * is run with `throwOnError: true`; without that option tinyexec resolves even on a non-zero exit
+ * code and a failed push or PR creation would go unnoticed.
+ *
+ * @param opts.data - Eval result; supplies project, variant, prompt and id for labels, title and body.
+ * @param opts.workspace - Paths and branch name of the prepared trial checkout.
+ * @param opts.logger - Receives progress and success messages.
+ * @returns The pushed branch, the applied labels, and the PR URL printed by `gh pr create`.
+ * @throws If eval support is incomplete or one of the git / `gh` commands exits non-zero.
+ */
+export async function publishTrialBranch(opts: {
+  data: EvalData;
+  workspace: TrialWorkspace;
+  logger: Logger;
+}) {
+  const labels = buildTrialLabels(opts.data.project, opts.data.variant, opts.data.prompt.name);
+
+  // Fail before touching git when the project is missing the eval-support files.
+  await validateEvalSupportSetup({
+    projectName: opts.data.project.name,
+    projectPath: opts.workspace.projectPath,
+  });
+
+  const prBody = renderPrBody({
+    branch: opts.workspace.trialBranch,
+    data: opts.data,
+  });
+
+  opts.logger.logStep('Creating trial commit...');
+  await ensureGitIdentity(opts.workspace.repoRoot);
+  await x('git', ['add', '-A'], {
+    throwOnError: true,
+    nodeOptions: { cwd: opts.workspace.repoRoot },
+  });
+  await x('git', ['commit', '--no-verify', '-m', `eval: ${opts.data.id}`], {
+    throwOnError: true,
+    nodeOptions: { cwd: opts.workspace.repoRoot },
+  });
+
+  opts.logger.logStep(`Pushing ${opts.workspace.trialBranch}...`);
+  await x('git', ['push', '--set-upstream', 'origin', opts.workspace.trialBranch], {
+    timeout: 120_000,
+    throwOnError: true,
+    nodeOptions: { cwd: opts.workspace.repoRoot },
+  });
+
+  await ensureLabels(opts.data.project.githubSlug, labels);
+
+  opts.logger.logStep('Opening draft PR...');
+  const title = `[eval] ${opts.data.project.name} ${opts.data.id}`;
+  // The trimmed stdout of `gh pr create` is used as the PR URL.
+  const prUrl = (
+    await x(
+      'gh',
+      [
+        'pr',
+        'create',
+        '--repo',
+        opts.data.project.githubSlug,
+        '--base',
+        opts.data.project.branch,
+        '--head',
+        opts.workspace.trialBranch,
+        '--draft',
+        '--title',
+        title,
+        '--body',
+        prBody,
+      ],
+      { throwOnError: true, nodeOptions: { cwd: opts.workspace.repoRoot } }
+    )
+  ).stdout.trim();
+
+  await x(
+    'gh',
+    [
+      'pr',
+      'edit',
+      prUrl,
+      '--repo',
+      opts.data.project.githubSlug,
+      ...labels.flatMap((label) => ['--add-label', label]),
+    ],
+    {
+      throwOnError: true,
+      nodeOptions: { cwd: opts.workspace.repoRoot },
+    }
+  );
+
+  opts.logger.logSuccess(`Draft PR opened: ${prUrl}`);
+
+  return {
+    branch: opts.workspace.trialBranch,
+    labels,
+    url: prUrl,
+  } satisfies PublishMetadata;
+}
+
+/**
+ * Creates every label in `labels` that `gh label list` does not already report for `repo`.
+ *
+ * Existing names are taken from the first tab-separated column of the listing (at most 200 rows
+ * are requested). Label creation is attempted one label at a time with `throwOnError: false`.
+ *
+ * @param repo - Repository passed to `gh --repo`.
+ * @param labels - Label names that should exist.
+ */
+async function ensureLabels(repo: string, labels: string[]) {
+  const listing = await x('gh', ['label', 'list', '--repo', repo, '--limit', '200'], {
+    nodeOptions: { cwd: process.cwd() },
+  });
+
+  const known = new Set<string>();
+  for (const row of listing.stdout.split('\n')) {
+    const name = row.split('\t')[0]?.trim();
+    if (name) {
+      known.add(name);
+    }
+  }
+
+  const missing = labels.filter((label) => !known.has(label));
+  for (const label of missing) {
+    const description = `Automated eval label for ${label}`;
+    await x('gh', ['label', 'create', label, '--repo', repo, '--description', description], {
+      throwOnError: false,
+    });
+  }
+}
+
+/**
+ * Verifies that a project contains everything publishing relies on and throws a single error
+ * listing all missing pieces otherwise.
+ *
+ * Checked, in this order: a Storybook main config that mentions `./eval-support/*.mdx`, the four
+ * files in `.storybook/eval-support`, and `data.json`, `build-output.txt` and
+ * `typecheck-output.txt` in the eval results directory.
+ *
+ * @param opts.projectName - Used only in the error message.
+ * @param opts.projectPath - Project root; missing paths are reported relative to it.
+ * @throws Error naming every missing item.
+ */
+async function validateEvalSupportSetup(opts: { projectName: string; projectPath: string }) {
+  const { projectName, projectPath } = opts;
+  const missing: string[] = [];
+
+  const mainFile = await findStorybookMainFile(projectPath);
+  if (mainFile) {
+    const source = await readFile(mainFile, 'utf-8');
+    if (!source.includes('./eval-support/*.mdx')) {
+      missing.push(
+        `${relative(projectPath, mainFile) || '.storybook/main'} missing ./eval-support/*.mdx`
+      );
+    }
+  } else {
+    missing.push('.storybook/main.(ts|js|mts|mjs|cjs)');
+  }
+
+  // Records each of `files` that does not exist inside `dir`.
+  const requireFiles = (dir: string, files: string[]) => {
+    for (const file of files) {
+      const target = join(dir, file);
+      if (!existsSync(target)) {
+        missing.push(relative(projectPath, target));
+      }
+    }
+  };
+
+  requireFiles(join(projectPath, '.storybook', 'eval-support'), [
+    'summary.mdx',
+    'transcript.mdx',
+    'transcript.tsx',
+    'transcript.types.ts',
+  ]);
+  requireFiles(getEvalResultsDir(projectPath), [
+    'data.json',
+    'build-output.txt',
+    'typecheck-output.txt',
+  ]);
+
+  if (missing.length === 0) {
+    return;
+  }
+
+  throw new Error(
+    `Eval support is not configured for ${projectName}. Missing: ${missing.join(', ')}`
+  );
+}
+
+/**
+ * Returns the path of the first existing Storybook main config in `<projectPath>/.storybook`,
+ * trying `main.ts`, `main.js`, `main.mts`, `main.mjs` and `main.cjs` in that order, or `undefined`
+ * when none exists.
+ */
+async function findStorybookMainFile(projectPath: string) {
+  const configDir = join(projectPath, '.storybook');
+
+  for (const file of ['main.ts', 'main.js', 'main.mts', 'main.mjs', 'main.cjs']) {
+    const candidate = join(configDir, file);
+    if (existsSync(candidate)) {
+      return candidate;
+    }
+  }
+
+  return undefined;
+}
+
+/**
+ * Makes sure `git config user.name` and `git config user.email` yield a value inside `repoRoot`.
+ *
+ * Each key is read first; when the read exits non-zero or prints nothing, a fallback identity
+ * (`Storybook Eval` / `storybook-eval@local`) is written with `git config <key> <value>`.
+ */
+async function ensureGitIdentity(repoRoot: string) {
+  const fallbacks = [
+    ['user.name', 'Storybook Eval'],
+    ['user.email', 'storybook-eval@local'],
+  ] as const;
+
+  for (const [key, fallback] of fallbacks) {
+    const current = await x('git', ['config', key], {
+      throwOnError: false,
+      nodeOptions: { cwd: repoRoot },
+    });
+
+    const isConfigured = current.exitCode === 0 && current.stdout.trim() !== '';
+    if (!isConfigured) {
+      await x('git', ['config', key, fallback], {
+        nodeOptions: { cwd: repoRoot },
+      });
+    }
+  }
+}
+
+/**
+ * Builds the github.com "blob" URL of `filePath` on `branch` of `repo`.
+ *
+ * Branch and path are percent-encoded segment by segment: `/` separators are kept, while
+ * characters such as spaces, `#` or `?` can no longer truncate the URL or break the Markdown link
+ * it is embedded in.
+ *
+ * @param repo - Repository slug, inserted verbatim after `https://github.com/`.
+ * @param branch - Branch name; may contain `/`.
+ * @param filePath - `/`-separated path of the file inside the repository.
+ */
+function createBlobUrl(repo: string, branch: string, filePath: string) {
+  const encodeSegments = (value: string) =>
+    value
+      .split('/')
+      .map((segment) => encodeURIComponent(segment))
+      .join('/');
+
+  return `https://github.com/${repo}/blob/${encodeSegments(branch)}/${encodeSegments(filePath)}`;
+}
+
+/**
+ * Renders the Markdown body of the trial pull request.
+ *
+ * The body is a bullet list of the trial's metadata and grades, a link to the raw `data.json`,
+ * links to the build / typecheck logs only when the build failed or type errors were found, and
+ * the full prompt inside a collapsed `<details>` section fenced with four backticks.
+ *
+ * @param opts.branch - Branch the blob links point at.
+ * @param opts.data - Eval result to summarise.
+ * @returns The body text, lines joined with `\n`.
+ */
+function renderPrBody(opts: { branch: string; data: EvalData }) {
+  const { branch, data } = opts;
+  const { project, grade, artifacts } = data;
+
+  const dataPath = getEvalResultsRelativePath('data.json', project.projectDir);
+  const dataUrl = createBlobUrl(project.githubSlug, branch, dataPath);
+  const buildOutputUrl = createBlobUrl(project.githubSlug, branch, artifacts.buildOutput.path);
+  const typecheckOutputUrl = createBlobUrl(
+    project.githubSlug,
+    branch,
+    artifacts.typecheckOutput.path
+  );
+  const baselineGhostStories = formatGhostStories(grade.baselineGhostStories);
+  const postAgentGhostStories = formatGhostStories(grade.ghostStories);
+  const baselinePreviewStories = formatStoryRender(grade.baselinePreviewStories);
+  const postAgentStoryRender = formatStoryRender(grade.storyRender);
+
+  const lines = [
+    '# Eval Trial',
+    '',
+    `- ID: \`${data.id}\``,
+    `- Created at: \`${formatReadableUtcTimestamp(data.timestamp)}\``,
+    `- Project: \`${project.name}\``,
+    `- Agent: \`${data.variant.agent}\``,
+    `- Model: \`${data.variant.model}\``,
+    `- Effort: \`${data.variant.effort}\``,
+    `- Prompt: \`${data.prompt.name}\``,
+    `- Score (preview gain): \`${formatScorePercent(data.score.score)}\``,
+    `- Build: \`${grade.buildSuccess ? 'PASS' : 'FAIL'}\``,
+    `- TypeScript errors: \`${grade.typeCheckErrors}\``,
+    `- Ghost stories before: \`${baselineGhostStories}\``,
+    `- Ghost stories after: \`${postAgentGhostStories}\``,
+    `- Vitest pass rate before preview changes: \`${baselinePreviewStories}\``,
+    `- Vitest pass rate after preview changes: \`${postAgentStoryRender}\``,
+    `- CssCheck: \`${grade.storyRender?.cssCheck ?? 'not-run'}\``,
+    `- Duration: \`${formatDuration(data.execution.duration)}\``,
+    `- Cost: \`${formatCost(data.execution.cost)}\``,
+    `- Raw data: [${dataPath}](${dataUrl})`,
+  ];
+
+  // Log links are only worth showing when there is something to investigate.
+  if (!grade.buildSuccess) {
+    lines.push(`- Build log: [${artifacts.buildOutput.path}](${buildOutputUrl})`);
+  }
+
+  if (grade.typeCheckErrors > 0) {
+    lines.push(`- Typecheck log: [${artifacts.typecheckOutput.path}](${typecheckOutputUrl})`);
+  }
+
+  // Collapsible section; the blank line after <summary> is required for GitHub to render the
+  // fenced block inside it as Markdown.
+  lines.push(
+    '',
+    '<details>',
+    '<summary>Full prompt</summary>',
+    '',
+    '````md',
+    data.prompt.content,
+    '````',
+    '</details>'
+  );
+
+  return lines.join('\n');
+}
+
+/**
+ * Formats a story-render grade as `passed/total (NN%)`, or `-` when no grade is given.
+ * The percentage is rounded to the nearest integer and is `0` when `total` is not positive.
+ */
+function formatStoryRender(storyRender?: EvalData['grade']['storyRender']) {
+  if (storyRender) {
+    const { passed, total } = storyRender;
+    const percent = Math.round((total > 0 ? passed / total : 0) * 100);
+    return `${passed}/${total} (${percent}%)`;
+  }
+
+  return '-';
+}
+
+/**
+ * Formats a ghost-stories grade as `passed/total (NN%)`, or `-` when no grade is given.
+ * The percentage is rounded to the nearest integer and is `0` when `total` is not positive.
+ */
+function formatGhostStories(ghost?: EvalData['grade']['ghostStories']) {
+  if (ghost === undefined || ghost === null) {
+    return '-';
+  }
+
+  const hasStories = ghost.total > 0;
+  const percent = Math.round((hasStories ? ghost.passed / ghost.total : 0) * 100);
+  return [`${ghost.passed}/${ghost.total}`, `(${percent}%)`].join(' ');
+}
diff --git a/scripts/eval/lib/result-docs.test.ts b/scripts/eval/lib/result-docs.test.ts
new file mode 100644
index 000000000000..f53b31a1536e
--- /dev/null
+++ b/scripts/eval/lib/result-docs.test.ts
@@ -0,0 +1,313 @@
+import { describe, expect, it } from 'vitest';
+
+import { buildEvalData, normalizeTranscriptForDocs } from './result-docs.ts';
+
+// Transcript normalisation: raw Claude / Codex events in, transcript props out.
+describe('normalizeTranscriptForDocs', () => {
+  it('normalizes claude transcript entries into MCP transcript props', () => {
+    const normalized = normalizeTranscriptForDocs({
+      prompt: 'Write stories',
+      summary: {
+        execution: { turns: 2, duration: 12, durationApi: 8, cost: 0.42 },
+      },
+      transcript: [
+        {
+          type: 'system',
+          subtype: 'init',
+          agent: 'Claude Code',
+          model: 'claude-opus',
+          tools: ['Read', 'Write'],
+          cwd: '/repo',
+        },
+        {
+          type: 'assistant',
+          message: {
+            content: [
+              { type: 'text', text: 'I will inspect the codebase.' },
+              {
+                type: 'tool_use',
+                id: 'tool_1',
+                name: 'Read',
+                input: { path: 'src/Button.tsx' },
+              },
+            ],
+            usage: { output_tokens: 42 },
+          },
+        },
+        {
+          type: 'user',
+          message: {
+            content: [
+              {
+                type: 'tool_result',
+                tool_use_id: 'tool_1',
+                content: 'file contents',
+              },
+            ],
+          },
+        },
+        {
+          type: 'result',
+          subtype: 'success',
+          num_turns: 2,
+          total_cost_usd: 0.42,
+          duration_ms: 12000,
+          duration_api_ms: 8000,
+        },
+      ],
+    });
+
+    expect(normalized.prompt).toBe('Write stories');
+    expect(normalized.messages).toMatchObject([
+      {
+        type: 'system',
+        subtype: 'init',
+        agent: 'Claude Code',
+        model: 'claude-opus',
+        tools: ['Read', 'Write'],
+        cwd: '/repo',
+      },
+      {
+        type: 'assistant',
+        message: {
+          content: [
+            { type: 'text', text: 'I will inspect the codebase.' },
+            {
+              type: 'tool_use',
+              id: 'tool_1',
+              name: 'Read',
+              input: { path: 'src/Button.tsx' },
+              isMCP: false,
+            },
+          ],
+        },
+        tokenCount: 42,
+      },
+      {
+        type: 'user',
+        message: {
+          content: [
+            {
+              type: 'tool_result',
+              tool_use_id: 'tool_1',
+              content: 'file contents',
+            },
+          ],
+        },
+      },
+      {
+        type: 'result',
+        subtype: 'success',
+        num_turns: 2,
+        total_cost_usd: 0.42,
+        duration_ms: 12000,
+        duration_api_ms: 8000,
+      },
+    ]);
+  });
+
+  it('normalizes codex command entries into a copied transcript-compatible tool call/result pair', () => {
+    const normalized = normalizeTranscriptForDocs({
+      prompt: 'Build a story',
+      summary: {
+        variant: { agent: 'codex', model: 'gpt-5.4' },
+        execution: { turns: 3, duration: 9, cost: 0.1 },
+      },
+      transcript: [
+        {
+          type: 'command_execution',
+          command: 'npm test',
+          exit_code: 1,
+          aggregated_output: 'failing output',
+        },
+      ],
+    });
+
+    // The system and result messages are synthesised from `summary`.
+    expect(normalized.messages).toMatchObject([
+      {
+        type: 'system',
+        subtype: 'init',
+        agent: 'Codex',
+        model: 'gpt-5.4',
+      },
+      {
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'tool_use',
+              name: 'Bash',
+              input: { command: 'npm test' },
+              isMCP: false,
+            },
+          ],
+        },
+      },
+      {
+        type: 'user',
+        message: {
+          content: [
+            {
+              type: 'tool_result',
+              content: 'Exit code: 1\n\nfailing output',
+            },
+          ],
+        },
+      },
+      {
+        type: 'result',
+        subtype: 'success',
+        num_turns: 3,
+      },
+    ]);
+  });
+
+  it('maps Claude terminal subtypes other than success to error in transcript props', () => {
+    const normalized = normalizeTranscriptForDocs({
+      prompt: 'Write stories',
+      summary: {
+        execution: {
+          turns: 51,
+          duration: 12,
+          durationApi: 8,
+          cost: 0.42,
+          terminalResultSubtype: 'error_max_turns',
+        },
+      },
+      transcript: [
+        {
+          type: 'result',
+          subtype: 'error_max_turns',
+          num_turns: 51,
+          total_cost_usd: 0.42,
+          duration_ms: 12000,
+          duration_api_ms: 8000,
+        },
+      ],
+    });
+
+    expect(normalized.messages).toMatchObject([
+      {
+        type: 'result',
+        subtype: 'error',
+        num_turns: 51,
+        total_cost_usd: 0.42,
+      },
+    ]);
+  });
+
+  it('treats missing Claude tool result content as empty text', () => {
+    const normalized = normalizeTranscriptForDocs({
+      prompt: 'Write stories',
+      transcript: [
+        {
+          type: 'user',
+          message: {
+            content: [
+              {
+                type: 'tool_result',
+                tool_use_id: 'tool_1',
+              },
+            ],
+          },
+        },
+      ],
+    });
+
+    expect(normalized.messages).toHaveLength(1);
+    expect(normalized.messages[0]).toMatchObject({
+      type: 'user',
+      message: {
+        content: [
+          {
+            type: 'tool_result',
+            tool_use_id: 'tool_1',
+            content: '',
+          },
+        ],
+      },
+    });
+  });
+});
+
+// Kept in its own suite so a failure is reported under the function it actually exercises.
+describe('buildEvalData', () => {
+  it('builds a single persisted eval data object', () => {
+    const data = buildEvalData({
+      id: '20260402T041205123Z-deadbeef',
+      timestamp: '2026-04-02T04:12:05.123Z',
+      project: {
+        name: 'mealdrop',
+        repo: 'https://github.com/storybook-tmp/mealdrop',
+        branch: 'main',
+        githubSlug: 'storybook-tmp/mealdrop',
+      },
+      variant: {
+        agent: 'codex',
+        model: 'gpt-5.4',
+        effort: 'medium',
+      },
+      prompt: {
+        name: 'setup',
+        content: 'Write stories',
+      },
+      baselineCommit: 'deadbeef',
+      environment: {
+        nodeVersion: 'v22.21.1',
+        evalBranch: 'branch',
+        evalCommit: 'commit',
+      },
+      execution: {
+        duration: 12,
+        turns: 2,
+      },
+      grade: {
+        buildSuccess: true,
+        typeCheckErrors: 0,
+        fileChanges: [],
+        storybookChanges: [],
+      },
+      score: {
+        score: 1,
+        breakdown: { beforeRate: 0, afterRate: 1, gain: 1 },
+      },
+      transcript: [],
+      artifacts: {
+        buildOutput: { path: '.storybook/eval-results/build-output.txt', success: true },
+        typecheckOutput: {
+          path: '.storybook/eval-results/typecheck-output.txt',
+          errorCount: 0,
+        },
+      },
+    });
+
+    expect(data).toMatchObject({
+      schemaVersion: 4,
+      id: '20260402T041205123Z-deadbeef',
+      prompt: {
+        name: 'setup',
+        content: 'Write stories',
+      },
+      artifacts: {
+        buildOutput: { path: '.storybook/eval-results/build-output.txt', success: true },
+        typecheckOutput: {
+          path: '.storybook/eval-results/typecheck-output.txt',
+          errorCount: 0,
+        },
+      },
+      docs: {
+        transcript: {
+          prompt: 'Write stories',
+          messages: [
+            expect.objectContaining({
+              type: 'system',
+              subtype: 'init',
+            }),
+            expect.objectContaining({
+              type: 'result',
+              subtype: 'success',
+            }),
+          ],
+        },
+      },
+    });
+    expect(data).not.toHaveProperty('screenshots');
+    expect(data).not.toHaveProperty('artifacts.screenshotOutput');
+  });
+});
diff --git a/scripts/eval/lib/result-docs.ts b/scripts/eval/lib/result-docs.ts
new file mode 100644
index 000000000000..b3895fb15d48
--- /dev/null
+++ b/scripts/eval/lib/result-docs.ts
@@ -0,0 +1,699 @@
+import type { AgentVariant, Execution } from './agents/config.ts';
+import type { Grade, QualityScore } from './grade.ts';
+import type { Project } from './projects.ts';
+import type {
+ AssistantMessage,
+ ResultMessage,
+ SystemMessage,
+ TranscriptMessage,
+ TranscriptProps,
+ ToolResultContent,
+ ToolUseContent,
+ UserMessage,
+} from './transcript-types.ts';
+import type { EvalEnvironment } from './utils.ts';
+
+/** Locations of the build and typecheck output files of a trial, each with its headline result. */
+export interface EvalArtifacts {
+ buildOutput: {
+ path: string;
+ success: boolean;
+ };
+ typecheckOutput: {
+ path: string;
+ errorCount: number;
+ };
+}
+
+/**
+ * The single object describing one eval trial, as assembled by `buildEvalData`.
+ *
+ * `transcript` holds the raw agent events untouched; `docs.transcript` holds the same events
+ * after `normalizeTranscriptForDocs`.
+ */
+export interface EvalData {
+ schemaVersion: 4;
+ id: string;
+ timestamp: string;
+ project: Project;
+ variant: AgentVariant;
+ prompt: {
+ name: string;
+ content: string;
+ };
+ baselineCommit: string;
+ environment: EvalEnvironment;
+ execution: Execution;
+ grade: Grade;
+ score: QualityScore;
+ transcript: unknown[];
+ artifacts: EvalArtifacts;
+ docs: {
+ transcript: TranscriptProps;
+ };
+}
+
+/**
+ * Loosely typed run summary used to fill gaps while normalising a transcript; every field is
+ * optional. `duration` and `durationApi` are multiplied by 1000 to obtain the `*_ms` fields of the
+ * result message (presumably seconds; confirm against `Execution`).
+ */
+interface EvalSummaryLike {
+ variant?: {
+ agent?: string;
+ model?: string;
+ };
+ execution?: {
+ cost?: number;
+ duration?: number;
+ durationApi?: number;
+ turns?: number;
+ terminalResultSubtype?: string;
+ };
+}
+
+/**
+ * Assembles the `EvalData` object for a trial.
+ *
+ * All inputs are copied through unchanged, `schemaVersion` is fixed to `4`, and
+ * `docs.transcript` is derived from the raw transcript with `normalizeTranscriptForDocs`, using
+ * the variant and execution as the summary.
+ */
+export function buildEvalData(opts: {
+  id: string;
+  timestamp: string;
+  project: Project;
+  variant: AgentVariant;
+  prompt: {
+    name: string;
+    content: string;
+  };
+  baselineCommit: string;
+  environment: EvalEnvironment;
+  execution: Execution;
+  grade: Grade;
+  score: QualityScore;
+  transcript: unknown[];
+  artifacts: EvalArtifacts;
+}): EvalData {
+  const {
+    id,
+    timestamp,
+    project,
+    variant,
+    prompt,
+    baselineCommit,
+    environment,
+    execution,
+    grade,
+    score,
+    transcript,
+    artifacts,
+  } = opts;
+
+  const docsTranscript = normalizeTranscriptForDocs({
+    prompt: prompt.content,
+    transcript,
+    summary: { variant, execution },
+  });
+
+  // Key order is kept stable because it is the order the object serialises in.
+  return {
+    schemaVersion: 4,
+    id,
+    timestamp,
+    project,
+    variant,
+    prompt,
+    baselineCommit,
+    environment,
+    execution,
+    grade,
+    score,
+    transcript,
+    artifacts,
+    docs: { transcript: docsTranscript },
+  };
+}
+
+/**
+ * Converts raw agent transcript events into `TranscriptProps`.
+ *
+ * Each entry is normalised on its own (one entry may yield zero, one or several messages). A
+ * system message is then prepended and a result message appended when the transcript did not
+ * contain one and `summary` provides the data for it.
+ *
+ * @param opts.prompt - Prompt text; also used for the estimated `promptTokenCount`.
+ * @param opts.transcript - Raw events of unknown shape.
+ * @param opts.summary - Optional summary; ignored unless it is an object.
+ */
+export function normalizeTranscriptForDocs(opts: {
+  prompt: string;
+  transcript: unknown[];
+  summary?: unknown;
+}): TranscriptProps {
+  const { prompt, transcript } = opts;
+  const summary = isEvalSummaryLike(opts.summary) ? opts.summary : undefined;
+
+  const messages: TranscriptMessage[] = [];
+  transcript.forEach((entry, index) => {
+    messages.push(...normalizeTranscriptEntry(entry, index, summary));
+  });
+
+  ensureSystemMessage(messages, summary);
+  ensureResultMessage(messages, summary);
+
+  return {
+    prompt,
+    promptTokenCount: estimateTokenCount(prompt),
+    promptCost: 0,
+    messages,
+  };
+}
+
+/**
+ * Normalises one raw transcript event into zero or more transcript messages.
+ *
+ * Non-object entries are dropped. Known Claude and Codex event shapes are tried in a fixed order;
+ * anything unrecognised is rendered as an assistant text message containing the event as JSON.
+ *
+ * @param entry - Raw event.
+ * @param index - Position of the event in the transcript, used to derive synthetic ids.
+ * @param summary - Optional summary used as fallback data for result events.
+ */
+function normalizeTranscriptEntry(
+  entry: unknown,
+  index: number,
+  summary?: EvalSummaryLike
+): TranscriptMessage[] {
+  // Most event kinds end up as a single assistant text message.
+  const asText = (text: string, ms?: number): TranscriptMessage[] => [
+    createAssistantTextMessage(text, ms),
+  ];
+
+  if (typeof entry !== 'object' || entry === null) {
+    return [];
+  }
+
+  // --- Claude events ---
+  if (looksLikeClaudeSystem(entry)) {
+    return [normalizeClaudeSystem(entry)];
+  }
+  if (looksLikeClaudeAssistant(entry)) {
+    return [normalizeClaudeAssistant(entry)];
+  }
+  if (looksLikeClaudeUser(entry)) {
+    return [normalizeClaudeUser(entry, index)];
+  }
+  if (looksLikeClaudeResult(entry)) {
+    return [normalizeClaudeResult(entry, summary)];
+  }
+  if (looksLikeClaudeStatus(entry)) {
+    return asText(`Status: ${entry.status ?? 'unknown'}`, entry.ms);
+  }
+  if (looksLikeClaudeApiRetry(entry)) {
+    const attempt = entry.attempt ?? '?';
+    const maxRetries = entry.max_retries ?? '?';
+    return asText(`API retry: attempt ${attempt} / ${maxRetries}`, entry.ms);
+  }
+  if (looksLikeClaudeToolUseSummary(entry)) {
+    return asText(entry.summary, entry.ms);
+  }
+  if (looksLikeClaudeRateLimitEvent(entry)) {
+    const info = entry.rate_limit_info ?? {};
+    const status = info.status ?? 'unknown';
+    const resetsAt = info.resetsAt ?? 'unknown';
+    return asText(`Rate limited — status: ${status}, resets at: ${resetsAt}`, entry.ms);
+  }
+
+  // --- Codex events ---
+  if (looksLikeCodexAgentMessage(entry)) {
+    return asText(entry.text);
+  }
+  if (looksLikeCodexReasoning(entry)) {
+    return asText(`Reasoning\n\n${entry.text}`);
+  }
+  if (looksLikeCodexCommand(entry)) {
+    return normalizeCodexCommand(entry, index);
+  }
+  if (looksLikeCodexFileChange(entry)) {
+    const bullets = entry.changes.map((change) => `- ${change.kind} ${change.path}`);
+    return asText(['File changes:', ...bullets].join('\n'));
+  }
+  if (looksLikeCodexError(entry)) {
+    return asText(`Error\n\n${entry.message}`);
+  }
+
+  // Unknown event: keep it visible instead of dropping it.
+  const json = JSON.stringify(entry, null, 2);
+  return asText(`Raw event\n\n\`\`\`json\n${json}\n\`\`\``);
+}
+
+/**
+ * Maps a Claude `system`/`init` event to a `SystemMessage`, defaulting missing fields: agent
+ * `Claude`, model `unknown`, empty tool list, empty `cwd`, `ms` 0. Non-string tools are dropped.
+ */
+function normalizeClaudeSystem(entry: ClaudeSystemEntry): SystemMessage {
+  const tools = entry.tools?.filter(isString) ?? [];
+  const mcpServers = normalizeMcpServers(entry.mcp_servers);
+
+  return {
+    type: 'system',
+    subtype: 'init',
+    agent: entry.agent ?? 'Claude',
+    model: entry.model ?? 'unknown',
+    tools,
+    mcp_servers: mcpServers,
+    cwd: entry.cwd ?? '',
+    ms: getNumber(entry.ms),
+    tokenCount: getOptionalNumber(entry.tokenCount),
+    costUSD: getOptionalNumber(entry.costUSD),
+  };
+}
+
+/**
+ * Maps a Claude assistant event to an `AssistantMessage`.
+ *
+ * Only `text` blocks with string text and `tool_use` blocks with a string name are kept. A tool
+ * call without a string id gets `tool-<name>`. The token count is the entry's own `tokenCount`
+ * when present, otherwise the reported output tokens, otherwise an estimate from the content; a
+ * count of 0 is stored as `undefined`.
+ */
+function normalizeClaudeAssistant(entry: ClaudeAssistantEntry): AssistantMessage {
+  const content: Array<{ type: 'text'; text: string } | ToolUseContent> = [];
+
+  for (const block of entry.message.content) {
+    if (block.type === 'text' && typeof block.text === 'string') {
+      content.push({ type: 'text', text: block.text });
+    } else if (block.type === 'tool_use' && typeof block.name === 'string') {
+      content.push({
+        type: 'tool_use',
+        id: typeof block.id === 'string' ? block.id : `tool-${block.name}`,
+        name: block.name,
+        input: isRecord(block.input) ? block.input : {},
+        isMCP: isMcpToolName(block.name),
+      });
+    }
+  }
+
+  const usage = entry.message.usage;
+  const outputTokens = getNumber(usage?.output_tokens);
+  const inputTokens = getNumber(usage?.input_tokens);
+
+  const explicitTokenCount = getOptionalNumber(entry.tokenCount);
+  const tokenCount =
+    explicitTokenCount ?? (outputTokens || estimateAssistantContentTokens(content));
+
+  return {
+    type: 'assistant',
+    message: {
+      content,
+      usage: {
+        input_tokens: inputTokens,
+        output_tokens: outputTokens,
+      },
+    },
+    ms: getNumber(entry.ms),
+    tokenCount: tokenCount || undefined,
+    costUSD: getOptionalNumber(entry.costUSD),
+  };
+}
+
+/**
+ * Maps a Claude user event to a `UserMessage` whose content blocks are all `tool_result`s.
+ *
+ * A block without a string `tool_use_id` gets `tool-result-<index>-<blockIndex>`. The token count
+ * is the entry's own `tokenCount` when present, otherwise the summed estimate over all blocks; a
+ * count of 0 is stored as `undefined`.
+ */
+function normalizeClaudeUser(entry: ClaudeUserEntry, index: number): UserMessage {
+  const content = entry.message.content.map((block, blockIndex) => {
+    const toolUseId = block.tool_use_id;
+    return {
+      type: 'tool_result' as const,
+      tool_use_id:
+        typeof toolUseId === 'string' ? toolUseId : `tool-result-${index}-${blockIndex}`,
+      content: normalizeToolResultContent(block.content),
+    };
+  });
+
+  let estimatedTokens = 0;
+  for (const block of content) {
+    estimatedTokens += estimateToolResultTokens(block.content);
+  }
+  const tokenCount = getOptionalNumber(entry.tokenCount) ?? estimatedTokens;
+
+  return {
+    type: 'user',
+    message: { content },
+    ms: getNumber(entry.ms),
+    tokenCount: tokenCount || undefined,
+    costUSD: getOptionalNumber(entry.costUSD),
+  };
+}
+
+/**
+ * Maps a Claude result event to a `ResultMessage`.
+ *
+ * Any subtype other than `success` becomes `error`. Durations, turn count and cost come from the
+ * event; when a value is missing or 0, the corresponding `summary.execution` field is used instead
+ * (durations multiplied by 1000 and rounded).
+ */
+function normalizeClaudeResult(entry: ClaudeResultEntry, summary?: EvalSummaryLike): ResultMessage {
+  const execution = summary?.execution;
+
+  const durationMs =
+    getNumber(entry.duration_ms) || Math.round(getNumber(execution?.duration) * 1000);
+  const durationApiMs =
+    getNumber(entry.duration_api_ms) || Math.round(getNumber(execution?.durationApi) * 1000);
+  const turns = getNumber(entry.num_turns) || getNumber(execution?.turns);
+  const totalCost = getNumber(entry.total_cost_usd) || getNumber(execution?.cost);
+
+  return {
+    type: 'result',
+    subtype: entry.subtype === 'success' ? 'success' : 'error',
+    duration_ms: durationMs,
+    duration_api_ms: durationApiMs,
+    num_turns: turns,
+    total_cost_usd: totalCost,
+    ms: getNumber(entry.ms),
+    tokenCount: getOptionalNumber(entry.tokenCount),
+    costUSD: getOptionalNumber(entry.costUSD),
+  };
+}
+
+/**
+ * Turns a Codex command execution into a tool call / tool result pair: an assistant message with
+ * a `Bash` tool use for the command, followed by a user message carrying the command output.
+ * Both share the synthetic id `codex-command-<index>`; token counts are estimates.
+ */
+function normalizeCodexCommand(entry: CodexCommandEntry, index: number): TranscriptMessage[] {
+  const toolUseId = `codex-command-${index}`;
+  const output = buildCodexCommandOutput(entry);
+  const commandTokens = estimateTokenCount(entry.command);
+
+  const toolCall: AssistantMessage = {
+    type: 'assistant',
+    message: {
+      content: [
+        {
+          type: 'tool_use',
+          id: toolUseId,
+          name: 'Bash',
+          input: { command: entry.command },
+          isMCP: false,
+        },
+      ],
+      usage: {
+        input_tokens: 0,
+        output_tokens: commandTokens,
+      },
+    },
+    ms: 0,
+    tokenCount: commandTokens,
+  };
+
+  const toolResult: UserMessage = {
+    type: 'user',
+    message: {
+      content: [
+        {
+          type: 'tool_result',
+          tool_use_id: toolUseId,
+          content: output,
+        },
+      ],
+    },
+    ms: 0,
+    tokenCount: estimateToolResultTokens(output),
+  };
+
+  return [toolCall, toolResult];
+}
+
+/**
+ * Prepends a synthetic `system`/`init` message built from `summary.variant` when `messages`
+ * contains no system message. Does nothing when one exists or no variant is available.
+ * Mutates `messages` in place.
+ */
+function ensureSystemMessage(messages: TranscriptMessage[], summary?: EvalSummaryLike) {
+  const variant = summary?.variant;
+  const hasSystemMessage = messages.some((message) => message.type === 'system');
+
+  if (hasSystemMessage || !variant) {
+    return;
+  }
+
+  messages.unshift({
+    type: 'system',
+    subtype: 'init',
+    agent: formatAgentName(variant.agent),
+    model: variant.model ?? 'unknown',
+    tools: [],
+    mcp_servers: [],
+    cwd: '',
+    ms: 0,
+  });
+}
+
+/**
+ * Appends a synthetic result message built from `summary.execution` when `messages` contains no
+ * result message. The subtype is `error` only when a non-empty `terminalResultSubtype` other than
+ * `success` was recorded. Mutates `messages` in place.
+ */
+function ensureResultMessage(messages: TranscriptMessage[], summary?: EvalSummaryLike) {
+  const execution = summary?.execution;
+  const hasResultMessage = messages.some((message) => message.type === 'result');
+
+  if (hasResultMessage || !execution) {
+    return;
+  }
+
+  const terminalSubtype = execution.terminalResultSubtype;
+  const failed = Boolean(terminalSubtype) && terminalSubtype !== 'success';
+
+  messages.push({
+    type: 'result',
+    subtype: failed ? 'error' : 'success',
+    duration_ms: Math.round(getNumber(execution.duration) * 1000),
+    duration_api_ms: Math.round(getNumber(execution.durationApi) * 1000),
+    num_turns: getNumber(execution.turns),
+    total_cost_usd: getNumber(execution.cost),
+    ms: 0,
+  });
+}
+
+/**
+ * Normalises the `content` of a tool result.
+ *
+ * Strings pass through. Arrays are mapped item by item to `{ type, text, isError }`, with `type`
+ * defaulting to `text`. Anything else is pretty-printed as JSON; values JSON cannot represent
+ * (such as `undefined`) become the empty string.
+ */
+function normalizeToolResultContent(content: unknown): ToolResultContent['content'] {
+  if (typeof content === 'string') {
+    return content;
+  }
+
+  if (!Array.isArray(content)) {
+    return JSON.stringify(content, null, 2) ?? '';
+  }
+
+  return content.map((item) => {
+    const type = isRecord(item) ? item.type : undefined;
+    const text = isRecord(item) ? item.text : undefined;
+
+    return {
+      type: typeof type === 'string' ? type : 'text',
+      text: typeof text === 'string' ? text : undefined,
+      isError: isRecord(item) && item.isError === true,
+    };
+  });
+}
+
+/**
+ * Normalises the `mcp_servers` list of a system event.
+ *
+ * Returns an empty list for non-arrays. Items that are not objects with a string `name` are
+ * dropped; a status other than `connected`, `disconnected` or `unknown` is reported as `unknown`.
+ */
+function normalizeMcpServers(value: unknown): SystemMessage['mcp_servers'] {
+  if (!Array.isArray(value)) {
+    return [];
+  }
+
+  return value
+    .filter(
+      (server): server is Record<string, unknown> & { name: string } =>
+        isRecord(server) && typeof server.name === 'string'
+    )
+    .map((server) => {
+      const status = server.status;
+      return {
+        name: server.name,
+        status:
+          status === 'connected' || status === 'disconnected' || status === 'unknown'
+            ? status
+            : 'unknown',
+      };
+    });
+}
+
+/**
+ * Builds the text shown as the result of a Codex command: an `Exit code: N` line when the exit
+ * code is a number, then the trimmed aggregated output when it is non-blank, separated by a blank
+ * line. When neither is available the command itself is returned.
+ */
+function buildCodexCommandOutput(entry: CodexCommandEntry) {
+  const exitLine =
+    typeof entry.exit_code === 'number' ? `Exit code: ${entry.exit_code}` : undefined;
+  const output =
+    typeof entry.aggregated_output === 'string' ? entry.aggregated_output.trim() : '';
+
+  const parts = [exitLine, output].filter((part): part is string => Boolean(part));
+  return (parts.length > 0 ? parts : [entry.command]).join('\n\n');
+}
+
+/**
+ * Wraps plain text in an `AssistantMessage` with a single text block. The estimated token count
+ * of the text is used both as `output_tokens` and as `tokenCount`; `input_tokens` is 0.
+ *
+ * @param text - Text to show.
+ * @param ms - Value for the message's `ms` field; defaults to 0.
+ */
+function createAssistantTextMessage(text: string, ms = 0): AssistantMessage {
+  const estimated = estimateTokenCount(text);
+  const usage = { input_tokens: 0, output_tokens: estimated };
+
+  return {
+    type: 'assistant',
+    message: { content: [{ type: 'text', text }], usage },
+    ms,
+    tokenCount: estimated,
+  };
+}
+
+/**
+ * Estimates the token count of assistant content: text blocks by their text, every other block by
+ * its name followed by its JSON-serialised input.
+ */
+function estimateAssistantContentTokens(content: AssistantMessage['message']['content']) {
+  let total = 0;
+
+  for (const item of content) {
+    const measured =
+      item.type === 'text' ? item.text : `${item.name}\n${JSON.stringify(item.input)}`;
+    total += estimateTokenCount(measured);
+  }
+
+  return total;
+}
+
+/**
+ * Estimates the token count of tool result content: 0 for `null`/`undefined`, the estimate of the
+ * string for string content, and for arrays the sum over items of the estimate of the item's
+ * `type` and `text` (empty parts omitted) joined by a newline.
+ */
+function estimateToolResultTokens(content: ToolResultContent['content'] | undefined) {
+  if (content == null) {
+    return 0;
+  }
+
+  if (typeof content === 'string') {
+    return estimateTokenCount(content);
+  }
+
+  let total = 0;
+  for (const item of content) {
+    const parts = [item.type, item.text].filter(Boolean);
+    total += estimateTokenCount(parts.join('\n'));
+  }
+  return total;
+}
+
+/**
+ * Rough token estimate for a text: 0 for blank text, otherwise one token per four characters,
+ * rounded up, with a minimum of 1.
+ */
+function estimateTokenCount(text: string) {
+  const isBlank = text.trim().length === 0;
+  return isBlank ? 0 : Math.max(1, Math.ceil(text.length / 4));
+}
+
+/**
+ * Display name for an agent id: `claude` becomes `Claude`, `codex` becomes `Codex`, any other id
+ * is returned as is, and a missing id becomes `Agent`.
+ */
+function formatAgentName(agent?: string) {
+  switch (agent) {
+    case 'claude':
+      return 'Claude';
+    case 'codex':
+      return 'Codex';
+    default:
+      return agent ?? 'Agent';
+  }
+}
+
+/**
+ * Whether a tool name is treated as an MCP tool: it starts with `mcp` (case-insensitive) or
+ * contains `mcp_` anywhere (which also covers the `mcp__` prefix form).
+ */
+function isMcpToolName(name: string) {
+  const startsWithMcp = /^mcp/i.test(name);
+  return startsWithMcp || name.includes('mcp_');
+}
+
+/**
+ * Type guard for `EvalSummaryLike`. Since every field of that type is optional, any non-null
+ * object is accepted.
+ */
+function isEvalSummaryLike(value: unknown): value is EvalSummaryLike {
+  const isObject = isRecord(value);
+  return isObject;
+}
+
+/**
+ * Type guard for "non-null object", narrowing to a string-keyed record of unknown values so that
+ * properties can be read and then checked individually. Arrays also pass.
+ */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null;
+}
+
+/** Type guard that is true exactly for primitive strings. */
+function isString(value: unknown): value is string {
+  const kind = typeof value;
+  return kind === 'string';
+}
+
+/** Returns `value` when it is a finite number, otherwise 0. */
+function getNumber(value: unknown) {
+  if (typeof value !== 'number') {
+    return 0;
+  }
+
+  return Number.isFinite(value) ? value : 0;
+}
+
+/** Returns `value` when it is a finite number, otherwise `undefined`. */
+function getOptionalNumber(value: unknown) {
+  if (typeof value !== 'number') {
+    return undefined;
+  }
+
+  return Number.isFinite(value) ? value : undefined;
+}
+
+/**
+ * Shape assumed for a raw Claude `system`/`init` event after `looksLikeClaudeSystem` matched.
+ * Only `type` and `subtype` are verified by that guard; the remaining fields are defaulted or
+ * validated in `normalizeClaudeSystem`.
+ */
+interface ClaudeSystemEntry {
+ type: 'system';
+ subtype: 'init';
+ agent?: string;
+ model?: string;
+ tools?: unknown[];
+ mcp_servers?: unknown;
+ cwd?: string;
+ ms?: number;
+ tokenCount?: number;
+ costUSD?: number;
+}
+
+/**
+ * Shape assumed for a raw Claude assistant event after `looksLikeClaudeAssistant` matched.
+ * That guard only verifies `type` and that `message.content` is an array, so individual blocks
+ * are re-checked at runtime in `normalizeClaudeAssistant`.
+ */
+interface ClaudeAssistantEntry {
+  type: 'assistant';
+  message: {
+    content: Array<
+      | {
+          type: 'text';
+          text: string;
+        }
+      | {
+          type: 'tool_use';
+          id?: string;
+          name: string;
+          /** Tool arguments as a string-keyed object. */
+          input: Record<string, unknown>;
+        }
+    >;
+    usage?: {
+      input_tokens?: number;
+      output_tokens?: number;
+    };
+  };
+  ms?: number;
+  tokenCount?: number;
+  costUSD?: number;
+}
+
+/**
+ * Shape assumed for a raw Claude user event after `looksLikeClaudeUser` matched. That guard only
+ * verifies `type` and that `message.content` is an array; `normalizeClaudeUser` treats every
+ * block as a tool result and normalises its `content` defensively.
+ */
+interface ClaudeUserEntry {
+ type: 'user';
+ message: {
+ content: Array<{
+ type: 'tool_result';
+ tool_use_id?: string;
+ content:
+ | string
+ | Array<{
+ type: string;
+ text?: string;
+ isError?: boolean;
+ }>;
+ }>;
+ };
+ ms?: number;
+ tokenCount?: number;
+ costUSD?: number;
+}
+
+/**
+ * Shape assumed for a raw Claude result event after `looksLikeClaudeResult` matched (`type` is
+ * `result` and `subtype` is a string). `subtype` stays a free-form string here;
+ * `normalizeClaudeResult` collapses everything except `success` to `error`.
+ */
+interface ClaudeResultEntry {
+ type: 'result';
+ subtype: string;
+ duration_ms?: number;
+ duration_api_ms?: number;
+ num_turns?: number;
+ total_cost_usd?: number;
+ ms?: number;
+ tokenCount?: number;
+ costUSD?: number;
+}
+
+// --- Transcript narrowing helpers (Claude / Codex SDK message shapes) ---
+// If upstream SDKs add new message kinds, extend these predicates and any consumers in lockstep.
+
+/**
+ * Shape assumed for a raw Codex command execution event after `looksLikeCodexCommand` matched
+ * (`type` is `command_execution` and `command` is a string). Exit code and output are optional
+ * and re-checked in `buildCodexCommandOutput`.
+ */
+interface CodexCommandEntry {
+ type: 'command_execution';
+ command: string;
+ exit_code?: number;
+ aggregated_output?: string;
+}
+
+/** True for objects whose `type` is `system` and whose `subtype` is `init`. */
+function looksLikeClaudeSystem(entry: unknown): entry is ClaudeSystemEntry {
+  if (!isRecord(entry)) {
+    return false;
+  }
+
+  return entry.type === 'system' && entry.subtype === 'init';
+}
+
+/** True for objects with `type` `assistant` whose `message.content` is an array. */
+function looksLikeClaudeAssistant(entry: unknown): entry is ClaudeAssistantEntry {
+  if (!isRecord(entry) || entry.type !== 'assistant') {
+    return false;
+  }
+
+  const message = entry.message;
+  return isRecord(message) && Array.isArray(message.content);
+}
+
+/** True for objects with `type` `user` whose `message.content` is an array. */
+function looksLikeClaudeUser(entry: unknown): entry is ClaudeUserEntry {
+  if (!isRecord(entry) || entry.type !== 'user') {
+    return false;
+  }
+
+  const message = entry.message;
+  return isRecord(message) && Array.isArray(message.content);
+}
+
+/** True for objects with `type` `result` and a string `subtype`. */
+function looksLikeClaudeResult(entry: unknown): entry is ClaudeResultEntry {
+  if (!isRecord(entry)) {
+    return false;
+  }
+
+  return entry.type === 'result' && typeof entry.subtype === 'string';
+}
+
+/** True for objects whose `type` is `system` and whose `subtype` is `status`. */
+function looksLikeClaudeStatus(entry: unknown): entry is {
+  type: 'system';
+  subtype: 'status';
+  status?: string;
+  ms?: number;
+} {
+  if (!isRecord(entry)) {
+    return false;
+  }
+
+  return entry.type === 'system' && entry.subtype === 'status';
+}
+
+/** True for objects whose `type` is `system` and whose `subtype` is `api_retry`. */
+function looksLikeClaudeApiRetry(entry: unknown): entry is {
+  type: 'system';
+  subtype: 'api_retry';
+  attempt?: number;
+  max_retries?: number;
+  ms?: number;
+} {
+  if (!isRecord(entry)) {
+    return false;
+  }
+
+  return entry.type === 'system' && entry.subtype === 'api_retry';
+}
+
+/** True for objects with `type` `tool_use_summary` and a string `summary`. */
+function looksLikeClaudeToolUseSummary(
+  entry: unknown
+): entry is { type: 'tool_use_summary'; summary: string; ms?: number } {
+  if (!isRecord(entry)) {
+    return false;
+  }
+
+  return entry.type === 'tool_use_summary' && typeof entry.summary === 'string';
+}
+
+/** True for objects whose `type` is `rate_limit_event`; `rate_limit_info` is not inspected. */
+function looksLikeClaudeRateLimitEvent(entry: unknown): entry is {
+  type: 'rate_limit_event';
+  rate_limit_info?: { status?: string; resetsAt?: string };
+  ms?: number;
+} {
+  if (!isRecord(entry)) {
+    return false;
+  }
+
+  return entry.type === 'rate_limit_event';
+}
+
+/** True for objects with `type` `agent_message` and a string `text`. */
+function looksLikeCodexAgentMessage(
+  entry: unknown
+): entry is { type: 'agent_message'; text: string } {
+  if (!isRecord(entry)) {
+    return false;
+  }
+
+  return entry.type === 'agent_message' && typeof entry.text === 'string';
+}
+
+/** True for objects with `type` `reasoning` and a string `text`. */
+function looksLikeCodexReasoning(entry: unknown): entry is { type: 'reasoning'; text: string } {
+  if (!isRecord(entry)) {
+    return false;
+  }
+
+  return entry.type === 'reasoning' && typeof entry.text === 'string';
+}
+
+/** True for objects with `type` `command_execution` and a string `command`. */
+function looksLikeCodexCommand(entry: unknown): entry is CodexCommandEntry {
+  if (!isRecord(entry)) {
+    return false;
+  }
+
+  return entry.type === 'command_execution' && typeof entry.command === 'string';
+}
+
+/**
+ * True for objects with `type` `file_change` whose `changes` is an array in which every element
+ * is an object with string `kind` and `path`.
+ *
+ * The elements are checked as well so that the narrowed type is actually true: callers read
+ * `change.kind` / `change.path` directly, and a `null` or malformed element would otherwise throw
+ * or render as `undefined`. Events that fail the check are handled by whatever the caller does
+ * with unmatched entries.
+ */
+function looksLikeCodexFileChange(entry: unknown): entry is {
+  type: 'file_change';
+  changes: Array<{ kind: string; path: string }>;
+} {
+  if (!isRecord(entry) || entry.type !== 'file_change' || !Array.isArray(entry.changes)) {
+    return false;
+  }
+
+  return entry.changes.every(
+    (change: unknown) =>
+      isRecord(change) && typeof change.kind === 'string' && typeof change.path === 'string'
+  );
+}
+
+/** True for objects with `type` `error` and a string `message`. */
+function looksLikeCodexError(entry: unknown): entry is { type: 'error'; message: string } {
+  if (!isRecord(entry)) {
+    return false;
+  }
+
+  return entry.type === 'error' && typeof entry.message === 'string';
+}
diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts
new file mode 100644
index 000000000000..957492069b2d
--- /dev/null
+++ b/scripts/eval/lib/run-trial.test.ts
@@ -0,0 +1,471 @@
+import { mkdirSync, readFileSync, rmSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
+
+import type { TrialConfig, TrialReport } from './run-trial.ts';
+
+// Mock external dependencies to avoid real git/storybook/vitest calls
+vi.mock('./prepare-trial', () => ({
+  prepareTrial: vi.fn(),
+}));
+vi.mock('./grade', () => ({
+  grade: vi.fn(),
+  collectGhostStoriesGrade: vi.fn(),
+}));
+// Publishing always "succeeds" with a fixed branch, label set, and PR URL.
+vi.mock('./publish-trial', () => ({
+  publishTrialBranch: vi.fn().mockResolvedValue({
+    branch: 'trial/test-branch',
+    labels: [
+      'eval',
+      'project:test-project',
+      'agent:claude',
+      'model:sonnet-4.6',
+      'effort:high',
+      'prompt:setup',
+    ],
+    url: 'https://github.com/storybook-tmp/test-project/pull/123',
+  }),
+}));
+// Partial mock: keep the real utils module and replace only `captureEnvironment`.
+// `importOriginal` resolves to `unknown` unless it is given the module type, and
+// `unknown` cannot be spread, so the module type is passed explicitly.
+vi.mock('./utils', async (importOriginal) => {
+  const actual = await importOriginal<typeof import('./utils.ts')>();
+  return {
+    ...actual,
+    captureEnvironment: vi.fn().mockResolvedValue({
+      nodeVersion: 'v22.21.1',
+      evalBranch: 'test-branch',
+      evalCommit: 'abc123',
+    }),
+  };
+});
+vi.mock('./agents/claude-code', () => ({
+  claudeAgent: { name: 'claude', execute: vi.fn() },
+}));
+vi.mock('./agents/codex', () => ({
+  codexAgent: { name: 'codex', execute: vi.fn() },
+}));
+// Every `x(...)` process spawn resolves with a canned successful `ai setup` output.
+vi.mock('tinyexec', () => ({
+  x: vi.fn().mockResolvedValue({
+    exitCode: 0,
+    stdout: '# Storybook Setup\n\nFull project-aware instructions...',
+    stderr: '',
+  }),
+}));
+
+import { x } from 'tinyexec';
+import { claudeAgent } from './agents/claude-code.ts';
+import { collectGhostStoriesGrade, grade } from './grade.ts';
+import { prepareTrial } from './prepare-trial.ts';
+import { publishTrialBranch } from './publish-trial.ts';
+import { runTrial } from './run-trial.ts';
+import { captureEnvironment } from './utils.ts';
+
+// Per-test scratch directory: assigned in `beforeEach`, removed in `afterEach`.
+let TMP: string;
+
+beforeEach(() => {
+ // Drop call history recorded by earlier tests so call-index assertions start at 0.
+ vi.clearAllMocks();
+ TMP = join(tmpdir(), `eval-run-trial-${Date.now()}`);
+ // Pre-create the results directory that the mocked workspace reports as `resultsDir`.
+ mkdirSync(join(TMP, '.storybook', 'eval-results'), { recursive: true });
+});
+
+afterEach(() => {
+ rmSync(TMP, { recursive: true, force: true });
+});
+
+// Default trial configuration shared by the tests; individual tests spread and override it.
+const baseConfig: TrialConfig = {
+ project: {
+ name: 'test-project',
+ repo: 'https://github.com/storybook-tmp/test-project',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/test-project',
+ },
+ variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' },
+ prompt: 'setup',
+};
+
+describe('runTrial pipeline', () => {
+ // Happy path: the resolved value merges config, mocked execution, mocked grade/score,
+ // and the mocked publish metadata into one report.
+ it('assembles a complete TrialReport from pipeline steps', async () => {
+ setupMocks();
+
+ const result = await runTrial(baseConfig);
+
+ expect(result).toMatchObject({
+ schemaVersion: 4,
+ id: expect.any(String),
+ project: {
+ name: 'test-project',
+ repo: 'https://github.com/storybook-tmp/test-project',
+ branch: 'main',
+ },
+ variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' },
+ prompt: {
+ name: 'setup',
+ },
+ baselineCommit: 'deadbeef',
+ execution: {
+ cost: 0.42,
+ duration: 45.2,
+ turns: 12,
+ },
+ grade: {
+ baselineGhostStories: {
+ candidateCount: 5,
+ total: 4,
+ passed: 2,
+ successRate: 0.5,
+ },
+ ghostStories: {
+ candidateCount: 5,
+ total: 4,
+ passed: 3,
+ successRate: 0.75,
+ },
+ baselinePreviewStories: {
+ total: 6,
+ passed: 2,
+ storyFiles: 3,
+ },
+ buildSuccess: true,
+ },
+ score: {
+ score: 0.5,
+ },
+ publish: {
+ branch: 'trial/test-branch',
+ url: 'https://github.com/storybook-tmp/test-project/pull/123',
+ },
+ });
+ // The report must not carry the legacy top-level `screenshots` field.
+ expect(result).not.toHaveProperty('screenshots');
+ expect(result.timestamp).toMatch(/^\d{4}-\d{2}-\d{2}T/);
+ });
+
+ // Inspects the arguments each mocked collaborator received (prepare, environment capture,
+ // agent driver, `ai setup` capture, baseline ghost stories, grade, publish).
+ it('calls pipeline steps with correct arguments', async () => {
+ setupMocks();
+
+ const config: TrialConfig = {
+ ...baseConfig,
+ project: {
+ name: 'mealdrop',
+ repo: 'https://github.com/storybook-tmp/mealdrop',
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ };
+
+ await runTrial(config);
+
+ expect(vi.mocked(prepareTrial).mock.calls[0][0]).toMatchObject({
+ name: 'mealdrop',
+ repo: 'https://github.com/storybook-tmp/mealdrop',
+ branch: 'main',
+ });
+ // Third argument to prepareTrial is the logger.
+ expect(vi.mocked(prepareTrial).mock.calls[0][2]).toBeDefined();
+
+ expect(vi.mocked(captureEnvironment)).toHaveBeenCalledWith();
+
+ const params = vi.mocked(claudeAgent.execute).mock.calls[0][0];
+ expect(params).toMatchObject({
+ prompt: expect.stringContaining('npx storybook ai setup'),
+ projectPath: TMP,
+ variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' },
+ resultsDir: join(TMP, '.storybook', 'eval-results'),
+ env: { EVAL_SETUP_PROMPT: 'setup' },
+ });
+ expect(params.logger).toBeDefined();
+
+ // The harness itself runs `npx storybook ai setup` in the workspace to snapshot the prompt.
+ expect(vi.mocked(x)).toHaveBeenCalledWith(
+ 'npx',
+ ['storybook', 'ai', 'setup'],
+ expect.objectContaining({
+ nodeOptions: expect.objectContaining({
+ cwd: TMP,
+ env: expect.objectContaining({
+ EVAL_SETUP_PROMPT: 'setup',
+ STORYBOOK_DISABLE_TELEMETRY: '1',
+ }),
+ }),
+ })
+ );
+
+ const gradeWorkspace = vi.mocked(grade).mock.calls[0][0];
+ expect(gradeWorkspace).toMatchObject({
+ baselineCommit: 'deadbeef',
+ projectPath: TMP,
+ resultsDir: join(TMP, '.storybook', 'eval-results'),
+ });
+ expect(vi.mocked(grade).mock.calls[0][1]).toBeDefined();
+ expect(vi.mocked(collectGhostStoriesGrade)).toHaveBeenCalledWith(
+ TMP,
+ expect.anything(),
+ 'baseline ghost stories'
+ );
+ // The baseline ghost-stories result is forwarded to grade() as its third argument.
+ expect(vi.mocked(grade).mock.calls[0][2]).toMatchObject({
+ candidateCount: 5,
+ total: 4,
+ passed: 2,
+ successRate: 0.5,
+ });
+ expect(vi.mocked(publishTrialBranch).mock.calls[0][0]).toMatchObject({
+ data: expect.objectContaining({
+ id: expect.any(String),
+ prompt: expect.objectContaining({ name: 'setup' }),
+ }),
+ workspace: expect.objectContaining({
+ trialBranch: 'trial/test-branch',
+ }),
+ });
+ });
+
+ // Reads the files runTrial wrote into the real temp results directory and checks their content.
+ it('writes data.json, prompt.md, and setup-prompt.md to results dir', async () => {
+ setupMocks();
+
+ await runTrial(baseConfig);
+
+ const resultsDir = join(TMP, '.storybook', 'eval-results');
+
+ const data: TrialReport = JSON.parse(readFileSync(join(resultsDir, 'data.json'), 'utf-8'));
+ expect(data).toMatchObject({
+ schemaVersion: 4,
+ id: expect.any(String),
+ execution: { cost: 0.42 },
+ grade: {
+ buildSuccess: true,
+ baselineGhostStories: {
+ candidateCount: 5,
+ total: 4,
+ passed: 2,
+ successRate: 0.5,
+ },
+ },
+ prompt: {
+ name: 'setup',
+ content: expect.stringContaining('Full project-aware instructions'),
+ },
+ artifacts: {
+ buildOutput: { path: '.storybook/eval-results/build-output.txt', success: true },
+ typecheckOutput: {
+ path: '.storybook/eval-results/typecheck-output.txt',
+ errorCount: 0,
+ },
+ },
+ docs: {
+ transcript: {
+ prompt: expect.stringContaining('Full project-aware instructions'),
+ },
+ },
+ });
+ expect(data).not.toHaveProperty('screenshots');
+ expect(data).not.toHaveProperty('artifacts.screenshotOutput');
+
+ // prompt.md holds the short nudge prompt; setup-prompt.md holds the captured `ai setup` stdout.
+ const promptContent = readFileSync(join(resultsDir, 'prompt.md'), 'utf-8');
+ expect(promptContent).toContain('npx storybook ai setup');
+
+ const setupPromptContent = readFileSync(join(resultsDir, 'setup-prompt.md'), 'utf-8');
+ expect(setupPromptContent).toContain('Full project-aware instructions');
+
+ // These files must not exist: reading them is expected to throw.
+ expect(() => readFileSync(join(resultsDir, 'summary.json'), 'utf-8')).toThrow();
+ expect(() => readFileSync(join(resultsDir, 'transcript.json'), 'utf-8')).toThrow();
+ });
+
+ // A failing grade (build failed, 5 type errors, score 0) must surface unchanged in the result.
+ it('propagates failed build into result', async () => {
+ setupMocks({ buildSuccess: false, typeCheckErrors: 5 });
+
+ await expect(runTrial(baseConfig)).resolves.toMatchObject({
+ grade: { buildSuccess: false, typeCheckErrors: 5 },
+ score: { score: 0 },
+ });
+ });
+
+ // Even on a failed build, the data handed to publishTrialBranch has no `screenshotOutput` artifact.
+ it('does not include screenshot-era fields when the build fails', async () => {
+ setupMocks({ buildSuccess: false, typeCheckErrors: 5 });
+
+ await runTrial(baseConfig);
+
+ expect(vi.mocked(publishTrialBranch)).toHaveBeenCalledWith(
+ expect.objectContaining({
+ data: expect.objectContaining({
+ artifacts: expect.not.objectContaining({
+ screenshotOutput: expect.anything(),
+ }),
+ }),
+ })
+ );
+ });
+
+ // Same no-screenshot check as above, but with the `pattern-copy-play` prompt variant selected.
+ it('keeps play-prompt output on the no-screenshot schema', async () => {
+ setupMocks();
+
+ await runTrial({
+ ...baseConfig,
+ prompt: 'pattern-copy-play',
+ });
+
+ expect(vi.mocked(publishTrialBranch)).toHaveBeenCalledWith(
+ expect.objectContaining({
+ data: expect.objectContaining({
+ prompt: expect.objectContaining({ name: 'pattern-copy-play' }),
+ artifacts: expect.not.objectContaining({
+ screenshotOutput: expect.anything(),
+ }),
+ }),
+ })
+ );
+ });
+
+ // Each mock appends a marker to `callOrder` when invoked; the final assertion pins the
+ // sequence prepare -> baseline ghost stories -> agent -> grade.
+ it('does not call grade before agent finishes', async () => {
+ // Use execution order tracking to verify sequencing
+ const callOrder: string[] = [];
+
+ vi.mocked(prepareTrial).mockImplementation(async () => {
+ callOrder.push('prepare');
+ return {
+ trialDir: TMP,
+ sourceDir: join(TMP, 'source'),
+ repoRoot: TMP,
+ projectPath: TMP,
+ resultsDir: join(TMP, '.storybook', 'eval-results'),
+ baselineCommit: 'deadbeef',
+ trialBranch: 'trial/test-branch',
+ };
+ });
+
+ vi.mocked(claudeAgent.execute).mockImplementation(async () => {
+ callOrder.push('agent');
+ return {
+ execution: { cost: 0.1, duration: 10, turns: 3 },
+ transcript: [],
+ };
+ });
+
+ vi.mocked(collectGhostStoriesGrade).mockImplementation(async () => {
+ callOrder.push('baseline-ghost');
+ return {
+ candidateCount: 3,
+ total: 2,
+ passed: 1,
+ successRate: 0.5,
+ };
+ });
+
+ vi.mocked(grade).mockImplementation(async () => {
+ callOrder.push('grade');
+ return {
+ grade: {
+ buildSuccess: true,
+ typeCheckErrors: 0,
+ fileChanges: [],
+ storybookChanges: [],
+ },
+ score: {
+ score: 0,
+ breakdown: {
+ beforeRate: 0,
+ afterRate: 0,
+ gain: 0,
+ },
+ },
+ };
+ });
+
+ await runTrial(baseConfig);
+
+ expect(callOrder).toEqual(['prepare', 'baseline-ghost', 'agent', 'grade']);
+ });
+});
+
+/**
+ * Install default resolved values on the mocked pipeline collaborators
+ * (`prepareTrial`, `claudeAgent.execute`, `collectGhostStoriesGrade`, `grade`).
+ *
+ * @param overrides - Optional knobs: `buildSuccess` (default `true`),
+ *   `typeCheckErrors` (default `0`), and the agent `cost` (default `0.42`).
+ *   A failed build drops `ghostStories`/`storyRender` and zeroes the score.
+ */
+function setupMocks(overrides?: {
+  buildSuccess?: boolean;
+  typeCheckErrors?: number;
+  cost?: number;
+}) {
+  const settings = overrides ?? {};
+  const { buildSuccess = true } = settings;
+  const { typeCheckErrors = 0 } = settings;
+  const { cost = 0.42 } = settings;
+
+  // Workspace: everything points at the per-test temp directory.
+  vi.mocked(prepareTrial).mockResolvedValue({
+    trialDir: TMP,
+    sourceDir: join(TMP, 'source'),
+    repoRoot: TMP,
+    projectPath: TMP,
+    resultsDir: join(TMP, '.storybook', 'eval-results'),
+    baselineCommit: 'deadbeef',
+    trialBranch: 'trial/test-branch',
+  });
+
+  // Agent run: a single assistant message and a successful terminal result.
+  vi.mocked(claudeAgent.execute).mockResolvedValue({
+    execution: {
+      cost,
+      duration: 45.2,
+      turns: 12,
+      terminalResultSubtype: 'success',
+    },
+    transcript: [
+      {
+        type: 'assistant',
+        message: {
+          content: [{ type: 'text', text: 'done' }],
+          usage: { input_tokens: 1, output_tokens: 1 },
+        },
+      },
+    ],
+  });
+
+  // Baseline ghost-stories pass collected before the agent runs.
+  vi.mocked(collectGhostStoriesGrade).mockResolvedValue({
+    candidateCount: 5,
+    total: 4,
+    passed: 2,
+    successRate: 0.5,
+  });
+
+  // Values that only exist (or are non-zero) when the build succeeded.
+  const finalScore = buildSuccess ? 0.5 : 0;
+  const afterRate = buildSuccess ? 4 / 6 : 0;
+  const gain = buildSuccess ? 0.5 : 0;
+  const storyRenderFields = buildSuccess
+    ? {
+        storyRender: {
+          total: 6,
+          passed: 4,
+          storyFiles: 3,
+          cssCheck: 'pass' as const,
+        },
+      }
+    : {};
+
+  vi.mocked(grade).mockResolvedValue({
+    grade: {
+      baselineGhostStories: {
+        candidateCount: 5,
+        total: 4,
+        passed: 2,
+        successRate: 0.5,
+      },
+      ghostStories: buildSuccess
+        ? {
+            candidateCount: 5,
+            total: 4,
+            passed: 3,
+            successRate: 0.75,
+          }
+        : undefined,
+      baselinePreviewStories: {
+        total: 6,
+        passed: 2,
+        storyFiles: 3,
+        cssCheck: 'not-run' as const,
+      },
+      buildSuccess,
+      typeCheckErrors,
+      fileChanges: [
+        { path: '.storybook/preview.tsx', gitStatus: 'A' },
+        { path: 'src/Button.stories.tsx', gitStatus: 'A' },
+      ],
+      storybookChanges: [
+        { path: '.storybook/preview.tsx', gitStatus: 'A' },
+        { path: 'src/Button.stories.tsx', gitStatus: 'A' },
+      ],
+      ...storyRenderFields,
+    },
+    score: {
+      score: finalScore,
+      breakdown: {
+        beforeRate: 2 / 6,
+        afterRate,
+        gain,
+      },
+    },
+  });
+}
diff --git a/scripts/eval/lib/run-trial.ts b/scripts/eval/lib/run-trial.ts
new file mode 100644
index 000000000000..03152c03a1c2
--- /dev/null
+++ b/scripts/eval/lib/run-trial.ts
@@ -0,0 +1,235 @@
+import { writeFile } from 'node:fs/promises';
+import { join } from 'pathe';
+import { x } from 'tinyexec';
+import type { Logger } from './utils.ts';
+import type { AgentId, AgentDriver, AgentVariant } from './agents/config.ts';
+import type { Project } from './projects.ts';
+import { collectGhostStoriesGrade, grade } from './grade.ts';
+import { claudeAgent } from './agents/claude-code.ts';
+import { codexAgent } from './agents/codex.ts';
+import { publishTrialBranch, type PublishMetadata } from './publish-trial.ts';
+import { prepareTrial } from './prepare-trial.ts';
+import { buildEvalData, type EvalData } from './result-docs.ts';
+import {
+ captureEnvironment,
+ createLogger,
+ generateTrialId,
+ getEvalResultsRelativePath,
+ loadPrompt,
+} from './utils.ts';
+
+/** Input to `runTrial`: which project to evaluate, with which agent variant and prompt. */
+export interface TrialConfig {
+ /** Which project to evaluate from its normalized benchmark baseline branch. */
+ project: Project;
+ /** Agent, model, and effort level. */
+ variant: AgentVariant;
+ /** Prompt variant name — registered in `code/lib/cli-storybook/src/ai/prompts/` (e.g. "pattern-copy-play"). */
+ prompt: string;
+ /** Log agent messages to stdout. Forwarded unchanged to the agent driver. */
+ verbose?: boolean;
+}
+
+/** The report persisted as `data.json`; an alias of `EvalData`. */
+export type TrialReport = EvalData;
+
+/** What `runTrial` resolves with: the final report plus the metadata returned by publishing. */
+export interface RunTrialResult extends EvalData {
+ publish: PublishMetadata;
+}
+
+/** Lookup from agent id (as found in `variant.agent`) to the driver that executes it. */
+const drivers: Record<AgentId, AgentDriver> = {
+  claude: claudeAgent,
+  codex: codexAgent,
+};
+
+/**
+ * Run a full eval trial: prepare -> execute agent -> grade -> save.
+ *
+ * @param config - Project, agent variant, prompt name, and verbosity for this trial.
+ * @param logger - Optional logger; when omitted one is created with `createLogger()`.
+ * @returns The final report (the same object written to `data.json`) plus the
+ *   metadata returned by `publishTrialBranch`.
+ */
+export async function runTrial(config: TrialConfig, logger?: Logger): Promise<RunTrialResult> {
+  const { project, variant, prompt: promptName } = config;
+  const { agent: agentName, model } = variant;
+  const log = logger ?? createLogger();
+  const trialId = generateTrialId();
+  const timestamp = new Date().toISOString();
+
+  log.log(`Preparing ${project.name}...`);
+
+  // 1. Prepare the trial
+  const workspace = await prepareTrial(project, trialId, log);
+
+  // 2. Capture environment
+  const environment = await captureEnvironment();
+
+  // 3. Capture a baseline ghost-stories score before the agent changes the repo.
+  const baselineGhostStories = await collectGhostStoriesGrade(
+    workspace.projectPath,
+    log,
+    'baseline ghost stories'
+  );
+
+  // 4. Load the nudge prompt the agent will receive. The agent itself runs
+  //    `npx storybook ai setup` as a tool call — mirroring what real users do
+  //    when they copy the "Set up Storybook with AI" prompt from the UI.
+  const prompt = loadPrompt(promptName);
+  await writeFile(join(workspace.resultsDir, 'prompt.md'), prompt);
+
+  // 5. Capture the full markdown the agent will receive from `ai setup` so
+  //    the trial record contains a reproducible, project-aware snapshot of
+  //    the instructions (not just the one-line nudge). Runs the same CLI the
+  //    agent will run, in the same workspace, with the same env. Persisted as
+  //    a separate file so the resulting PR diff shows the exact instructions
+  //    the agent was given for this trial.
+  const promptContent = await captureAiSetupMarkdown(workspace.projectPath, promptName, log);
+  await writeFile(join(workspace.resultsDir, 'setup-prompt.md'), promptContent);
+
+  // 6. Execute the agent. EVAL_SETUP_PROMPT is forwarded into the agent's
+  //    environment so its `ai setup` tool call resolves to the selected
+  //    prompt variant (unset for real users → always the default).
+  log.log(` Running ${agentName} (${model}, effort=${variant.effort})...`);
+  const driver = drivers[agentName];
+  const { execution, transcript } = await driver.execute({
+    prompt,
+    projectPath: workspace.projectPath,
+    variant,
+    resultsDir: workspace.resultsDir,
+    logger: log,
+    verbose: config.verbose,
+    env: { EVAL_SETUP_PROMPT: promptName },
+  });
+  log.logSuccess(
+    `Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : 'cost N/A'}, ${execution.turns} turns)`
+  );
+
+  // Artifact paths are known up front; success/errorCount are placeholders until grading.
+  const provisionalArtifacts = {
+    buildOutput: {
+      path: getEvalResultsRelativePath('build-output.txt', project.projectDir),
+      success: false,
+    },
+    typecheckOutput: {
+      path: getEvalResultsRelativePath('typecheck-output.txt', project.projectDir),
+      errorCount: 0,
+    },
+  };
+
+  // 7. Write provisional data so the baseline-owned MDX files can resolve it during grading.
+  const provisionalData = buildEvalData({
+    id: trialId,
+    timestamp,
+    project,
+    variant,
+    prompt: {
+      name: promptName,
+      content: promptContent,
+    },
+    baselineCommit: workspace.baselineCommit,
+    environment,
+    execution,
+    grade: {
+      baselineGhostStories,
+      buildSuccess: false,
+      typeCheckErrors: 0,
+      fileChanges: [],
+      storybookChanges: [],
+    },
+    score: {
+      score: 0,
+      breakdown: {
+        beforeRate: 0,
+        afterRate: 0,
+        gain: 0,
+      },
+    },
+    transcript,
+    artifacts: provisionalArtifacts,
+  });
+
+  await writeFile(
+    join(workspace.resultsDir, 'data.json'),
+    JSON.stringify(provisionalData, null, 2)
+  );
+
+  // 8. Grade the results using story-render preview gain as the score.
+  const { grade: trialGrade, score } = await grade(workspace, log, baselineGhostStories);
+
+  // 9. Rewrite the provisional data with the final grade.
+  const reportForCommit = buildEvalData({
+    ...provisionalData,
+    grade: trialGrade,
+    score,
+    artifacts: {
+      ...provisionalArtifacts,
+      buildOutput: {
+        ...provisionalArtifacts.buildOutput,
+        success: trialGrade.buildSuccess,
+      },
+      typecheckOutput: {
+        ...provisionalArtifacts.typecheckOutput,
+        errorCount: trialGrade.typeCheckErrors,
+      },
+    },
+  });
+
+  await writeFile(
+    join(workspace.resultsDir, 'data.json'),
+    JSON.stringify(reportForCommit, null, 2)
+  );
+
+  // 10. Commit, push, and open the benchmark PR
+  const publish = await publishTrialBranch({
+    data: reportForCommit,
+    workspace,
+    logger: log,
+  });
+
+  log.logSuccess(`Results saved to ${workspace.resultsDir}`);
+
+  return {
+    ...reportForCommit,
+    publish,
+  };
+}
+
+/**
+ * Run `npx storybook ai setup` inside the prepared trial workspace and return
+ * its stdout — the exact project-aware markdown the agent will receive from
+ * the same CLI invocation. `EVAL_SETUP_PROMPT` selects the variant;
+ * `STORYBOOK_DISABLE_TELEMETRY` keeps the harness's capture invocation out of
+ * telemetry.
+ *
+ * Failures (spawn errors, timeouts, non-zero exit) are logged and swallowed:
+ * capturing the prompt content is bookkeeping, not the thing being measured,
+ * so it must never abort the trial.
+ *
+ * @param projectPath - Working directory for the CLI invocation.
+ * @param promptName - Prompt variant name, passed as `EVAL_SETUP_PROMPT`.
+ * @param log - Logger that receives the failure messages.
+ * @returns The trimmed stdout on success, or an empty string on any failure.
+ */
+export async function captureAiSetupMarkdown(
+  projectPath: string,
+  promptName: string,
+  log: Logger
+): Promise<string> {
+  try {
+    const result = await x('npx', ['storybook', 'ai', 'setup'], {
+      throwOnError: false,
+      timeout: 60_000,
+      nodeOptions: {
+        cwd: projectPath,
+        env: {
+          ...process.env,
+          EVAL_SETUP_PROMPT: promptName,
+          STORYBOOK_DISABLE_TELEMETRY: '1',
+        },
+      },
+    });
+
+    if (result.exitCode !== 0) {
+      log.logError(
+        `Failed to capture ai setup markdown (exit ${result.exitCode}). Falling back to nudge-only record.`
+      );
+      // Prefer stderr for the diagnostic; fall back to stdout when stderr is empty.
+      log.logError(result.stderr.trim() || result.stdout.trim());
+      return '';
+    }
+
+    return result.stdout.trim();
+  } catch (error) {
+    log.logError(
+      `Failed to capture ai setup markdown (${error instanceof Error ? error.message : String(error)}). Falling back to nudge-only record.`
+    );
+    return '';
+  }
+}
diff --git a/scripts/eval/lib/story-render.test.ts b/scripts/eval/lib/story-render.test.ts
new file mode 100644
index 000000000000..5c9363f61e6a
--- /dev/null
+++ b/scripts/eval/lib/story-render.test.ts
@@ -0,0 +1,73 @@
+import { join } from 'pathe';
+import { describe, expect, it } from 'vitest';
+
+import {
+ getChangedStoryFiles,
+ getGeneratedStoryFiles,
+ getPreviewEnvironmentFiles,
+} from './story-render.ts';
+import type { FileChange } from './grade.ts';
+
+// Added/modified `.stories.`/`.story.` files are kept; non-story files and deleted stories are dropped.
+describe('getChangedStoryFiles', () => {
+ it('returns created and modified story files only', () => {
+ const fileChanges: FileChange[] = [
+ { path: 'src/Button.stories.tsx', gitStatus: 'A' },
+ { path: 'src/Header.story.ts', gitStatus: 'M' },
+ { path: '.storybook/main.ts', gitStatus: 'M' },
+ { path: 'src/Deleted.stories.tsx', gitStatus: 'D' },
+ ];
+
+ expect(getChangedStoryFiles('/repo', fileChanges)).toEqual([
+ join('/repo', 'src', 'Button.stories.tsx'),
+ join('/repo', 'src', 'Header.story.ts'),
+ ]);
+ });
+});
+
+// Story files outside the evaluated project path (here `docs/`) must be excluded.
+describe('getGeneratedStoryFiles', () => {
+ it('returns generated story files that live under the evaluated project path', () => {
+ const fileChanges: FileChange[] = [
+ { path: 'frontend/src/App.stories.tsx', gitStatus: 'A' },
+ { path: 'frontend/src/components/Button.stories.tsx', gitStatus: 'M' },
+ { path: 'frontend/.storybook/preview.tsx', gitStatus: 'M' },
+ { path: 'docs/Button.stories.tsx', gitStatus: 'A' },
+ ];
+
+ // Both sides are sorted so the assertion does not depend on result order.
+ expect(getGeneratedStoryFiles('/repo', '/repo/frontend', fileChanges).sort()).toEqual(
+ [
+ join('/repo', 'frontend', 'src', 'App.stories.tsx'),
+ join('/repo', 'frontend', 'src', 'components', 'Button.stories.tsx'),
+ ].sort()
+ );
+ });
+});
+
+// Only `.storybook/preview.<ext>` paths count; other `.storybook` files do not.
+describe('getPreviewEnvironmentFiles', () => {
+ it('returns only preview files for baseline rollback', () => {
+ const fileChanges: FileChange[] = [
+ { path: 'frontend/.storybook/main.ts', gitStatus: 'M' },
+ { path: 'frontend/.storybook/preview.tsx', gitStatus: 'M' },
+ { path: 'frontend/.storybook/preview-body.html', gitStatus: 'A' },
+ { path: 'frontend/.storybook/wiki-mocks.ts', gitStatus: 'A' },
+ { path: 'frontend/.storybook/eval-support/summary.mdx', gitStatus: 'M' },
+ { path: 'frontend/.storybook/eval-results/data.json', gitStatus: 'M' },
+ ];
+
+ expect(getPreviewEnvironmentFiles(fileChanges)).toEqual(['frontend/.storybook/preview.tsx']);
+ });
+
+ // For a rename, both the new path and `previousPath` are reported, in sorted order.
+ it('considers renamed preview files via previousPath', () => {
+ const fileChanges: FileChange[] = [
+ {
+ path: 'frontend/.storybook/preview.ts',
+ previousPath: 'frontend/.storybook/preview.tsx',
+ gitStatus: 'R',
+ },
+ ];
+
+ expect(getPreviewEnvironmentFiles(fileChanges)).toEqual([
+ 'frontend/.storybook/preview.ts',
+ 'frontend/.storybook/preview.tsx',
+ ]);
+ });
+});
diff --git a/scripts/eval/lib/story-render.ts b/scripts/eval/lib/story-render.ts
new file mode 100644
index 000000000000..7c88909b6c08
--- /dev/null
+++ b/scripts/eval/lib/story-render.ts
@@ -0,0 +1,273 @@
+import { existsSync } from 'node:fs';
+import { mkdir, readFile, rm, writeFile } from 'node:fs/promises';
+import { dirname, join, relative, resolve } from 'pathe';
+import { x } from 'tinyexec';
+import { parseVitestResults } from '../../../code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts';
+import type { FileChange } from './grade.ts';
+import { detectPackageManager, resolveInstallRoot } from './package-manager.ts';
+import type { Logger } from './utils.ts';
+
+// Matches paths ending in `.stories.` or `.story.` followed by `ts`, `tsx`, `js`, or `jsx`.
+const STORY_FILE_PATTERN = /\.(stories|story)\.[tj]sx?$/;
+
+/** Summary of one story-render run: test counts from the report plus the number of story files run. */
+export interface StoryRenderGrade {
+ total: number;
+ passed: number;
+ storyFiles: number;
+ // 'not-run' is what runStoryRenderPass reports when there were no story files to run.
+ cssCheck: 'pass' | 'fail' | 'not-run';
+}
+
+/** Outcome of `runStoryRenderPass`. */
+export interface StoryRenderRunResult {
+ // False when no runnable story files were found and no command was executed.
+ attempted: boolean;
+ success: boolean;
+ // File the combined stdout/stderr of the run was written to.
+ outputPath?: string;
+ // Path requested for the JSON report via `--outputFile`.
+ reportPath?: string;
+ // Set on timeout, or when no report summary could be read.
+ runError?: string;
+ summary?: StoryRenderGrade;
+}
+
+/** Saved content of one file, keyed by its path relative to the repo root. */
+interface FileSnapshot {
+ path: string;
+ // `null` records that the file did not exist when the snapshot was taken.
+ content: Buffer | null;
+}
+
+/**
+ * Absolute paths of changed (non-deleted) story files that live strictly
+ * inside `projectPath`.
+ *
+ * @param repoRoot - Directory that the `fileChanges` paths are relative to.
+ * @param projectPath - Directory the returned story files must be under.
+ * @param fileChanges - Changed files to filter.
+ */
+export function getGeneratedStoryFiles(
+  repoRoot: string,
+  projectPath: string,
+  fileChanges: FileChange[]
+): string[] {
+  const isInsideProject = (storyFile: string): boolean => {
+    const fromProject = relative(projectPath, storyFile);
+    if (fromProject === '') {
+      return false;
+    }
+    // A relative path starting with `..` points outside the project directory.
+    return !fromProject.startsWith('..');
+  };
+
+  return getChangedStoryFiles(repoRoot, fileChanges).filter(isInsideProject);
+}
+
+/**
+ * Absolute paths of every changed file that matches the story-file pattern
+ * and was not deleted (`gitStatus !== 'D'`), in input order.
+ */
+export function getChangedStoryFiles(repoRoot: string, fileChanges: FileChange[]): string[] {
+  const storyFiles: string[] = [];
+
+  for (const { gitStatus, path } of fileChanges) {
+    if (gitStatus === 'D' || !STORY_FILE_PATTERN.test(path)) {
+      continue;
+    }
+    storyFiles.push(resolve(repoRoot, path));
+  }
+
+  return storyFiles;
+}
+
+/**
+ * Collect the distinct `.storybook/preview.*` paths touched by the changes,
+ * looking at both the current `path` and (for renames) `previousPath`.
+ * The result is sorted.
+ */
+export function getPreviewEnvironmentFiles(fileChanges: FileChange[]): string[] {
+  const previewPaths = new Set<string>();
+
+  for (const { path, previousPath } of fileChanges) {
+    for (const candidate of [path, previousPath]) {
+      if (candidate && isPreviewEnvironmentPath(candidate)) {
+        previewPaths.add(candidate);
+      }
+    }
+  }
+
+  return [...previewPaths].sort();
+}
+
+/**
+ * Each benchmark repo must define a `vitest:storybook` script in its package.json
+ * that knows how to run the storybook vitest project for that repo.
+ * The grading harness appends `--reporter=json --outputFile=<report path>`
+ * followed by the story file paths.
+ *
+ * @param opts.projectPath - Working directory for the run; story files outside it are skipped.
+ * @param opts.resultsDir - Directory that receives `<outputBaseName>-report.json` and
+ *   `<outputBaseName>-output.txt`.
+ * @param opts.storyFiles - Candidate story files to run.
+ * @param opts.outputBaseName - Prefix for the report/output file names.
+ * @param opts.label - Tag used in log messages; defaults to `story-render`.
+ * @param opts.logger - Logger for progress and result messages.
+ * @returns `attempted: false` with an all-zero summary when nothing is runnable;
+ *   otherwise the run outcome. Errors other than the recognised timeout are rethrown.
+ */
+export async function runStoryRenderPass(opts: {
+  projectPath: string;
+  resultsDir: string;
+  storyFiles: string[];
+  outputBaseName: string;
+  label?: string;
+  logger: Logger;
+}): Promise<StoryRenderRunResult> {
+  const tag = opts.label ?? 'story-render';
+  const runnableStoryFiles = opts.storyFiles.filter((storyFile) => {
+    const rel = relative(opts.projectPath, storyFile);
+    return rel !== '' && !rel.startsWith('..');
+  });
+
+  if (runnableStoryFiles.length === 0) {
+    opts.logger.logStep(`No generated story files found for ${tag}.`);
+    return {
+      attempted: false,
+      success: true,
+      summary: {
+        total: 0,
+        passed: 0,
+        storyFiles: 0,
+        cssCheck: 'not-run' as const,
+      },
+    };
+  }
+
+  const reportPath = join(opts.resultsDir, `${opts.outputBaseName}-report.json`);
+  const outputPath = join(opts.resultsDir, `${opts.outputBaseName}-output.txt`);
+
+  opts.logger.logStep(`Running ${tag} for ${runnableStoryFiles.length} story file(s)...`);
+
+  const pm = detectPackageManager(resolveInstallRoot(opts.projectPath));
+  const [runCmd, ...runArgs] = getScriptRunCommand(pm);
+
+  const timeoutMs = STORY_RENDER_TIMEOUT_MS;
+  let result: { exitCode: number | null; stdout: string; stderr: string };
+  let timedOut = false;
+  try {
+    result = await x(
+      runCmd,
+      [...runArgs, '--reporter=json', `--outputFile=${reportPath}`, ...runnableStoryFiles],
+      {
+        throwOnError: false,
+        timeout: timeoutMs,
+        nodeOptions: {
+          cwd: opts.projectPath,
+          env: {
+            ...process.env,
+            STORYBOOK_DISABLE_TELEMETRY: '1',
+          },
+        },
+      }
+    );
+  } catch (error) {
+    // A timeout is reported as a failed run; anything else is unexpected and propagates.
+    if (isAbortTimeoutError(error)) {
+      timedOut = true;
+      result = { exitCode: null, stdout: '', stderr: `Timed out after ${timeoutMs / 1000}s` };
+    } else {
+      throw error;
+    }
+  }
+
+  const output = `${result.stdout}\n${result.stderr}`.trim();
+  await writeFile(outputPath, output);
+
+  // The report may exist even when the command exited non-zero, so parse it whenever present.
+  const summary = existsSync(reportPath)
+    ? await readStoryRenderSummary(reportPath, runnableStoryFiles.length)
+    : undefined;
+
+  const success = !timedOut && result.exitCode === 0;
+  if (success) {
+    opts.logger.logSuccess(`${tag}: passed`);
+  } else if (timedOut) {
+    opts.logger.logError(`${tag}: timed out after ${timeoutMs / 1000}s`);
+  } else {
+    const rate = summary ? `${summary.passed}/${summary.total} passed` : `exit ${result.exitCode}`;
+    opts.logger.logError(`${tag}: ${rate}`);
+  }
+
+  return {
+    attempted: true,
+    success,
+    outputPath,
+    reportPath,
+    runError: timedOut
+      ? `Story-render run timed out after ${timeoutMs / 1000}s`
+      : summary
+        ? undefined
+        : output || 'Story-render report not found',
+    summary,
+  };
+}
+
+// Timeout for one story-render run, in milliseconds (600 s).
+const STORY_RENDER_TIMEOUT_MS = 600_000;
+
+/**
+ * Whether `error` looks like a timeout/abort: an `Error` whose `code` is
+ * `ABORT_ERR`, or whose `cause` is an `Error` named `TimeoutError`.
+ */
+function isAbortTimeoutError(error: unknown): boolean {
+  if (error instanceof Error) {
+    const hasAbortCode = 'code' in error && (error as { code?: string }).code === 'ABORT_ERR';
+    if (hasAbortCode) {
+      return true;
+    }
+
+    const { cause } = error as { cause?: unknown };
+    return cause instanceof Error && cause.name === 'TimeoutError';
+  }
+
+  return false;
+}
+
+/**
+ * Build the command tokens that run the `vitest:storybook` package script with
+ * the given package manager. Unrecognised values fall back to npm.
+ *
+ * NOTE(review): npm is the only variant given a trailing `--`, presumably
+ * because it needs the separator to forward extra arguments to the script — confirm.
+ */
+export function getScriptRunCommand(pm: string): string[] {
+  const script = 'vitest:storybook';
+
+  if (pm === 'yarn') {
+    return ['yarn', script];
+  }
+  if (pm === 'pnpm' || pm === 'bun') {
+    return [pm, 'run', script];
+  }
+  return ['npm', 'run', script, '--'];
+}
+
+/**
+ * Run `fn` while the changed `.storybook/preview.*` files are temporarily put
+ * back to their state at `baselineCommit`, then restore the current versions.
+ *
+ * When no preview file is among `fileChanges`, `fn` is called directly.
+ *
+ * @param opts.repoRoot - Repository root; change paths are resolved against it.
+ * @param opts.baselineCommit - Commit whose preview files are used while `fn` runs.
+ * @param opts.fileChanges - Changed files, used to find the preview files to swap.
+ * @param opts.fn - Work to perform against the baseline preview environment.
+ * @returns Whatever `fn` resolves with.
+ */
+export async function withBaselinePreviewEnvironment<T>(opts: {
+  repoRoot: string;
+  baselineCommit: string;
+  fileChanges: FileChange[];
+  fn: () => Promise<T>;
+}): Promise<T> {
+  const previewFiles = getPreviewEnvironmentFiles(opts.fileChanges);
+  if (previewFiles.length === 0) {
+    return opts.fn();
+  }
+
+  // Snapshot first so the current files can be put back even if `fn` throws.
+  const snapshots = await snapshotFiles(opts.repoRoot, previewFiles);
+
+  try {
+    await restoreFilesFromCommit(opts.repoRoot, opts.baselineCommit, previewFiles);
+    return await opts.fn();
+  } finally {
+    await restoreSnapshots(opts.repoRoot, snapshots);
+  }
+}
+
+/**
+ * Read the JSON report at `reportPath` and reduce it to a `StoryRenderGrade`.
+ * Returns `undefined` when the file cannot be read or parsed, or when the
+ * parsed report has no summary.
+ *
+ * @param reportPath - Path of the JSON report file.
+ * @param storyFiles - Number of story files that were run; copied into the result.
+ */
+async function readStoryRenderSummary(reportPath: string, storyFiles: number) {
+  try {
+    const reportText = await readFile(reportPath, 'utf8');
+    const { summary } = parseVitestResults(JSON.parse(reportText));
+
+    if (!summary) {
+      return undefined;
+    }
+
+    const { total, passed, cssCheck } = summary;
+    return { total, passed, storyFiles, cssCheck } satisfies StoryRenderGrade;
+  } catch {
+    // Unreadable or malformed report: treated the same as a missing summary.
+    return undefined;
+  }
+}
+
+/**
+ * Read the current content of each path (relative to `repoRoot`).
+ *
+ * @returns One snapshot per input path, in input order; `content` is `null`
+ *   for paths that do not exist.
+ */
+async function snapshotFiles(repoRoot: string, paths: string[]): Promise<FileSnapshot[]> {
+  return Promise.all(
+    paths.map(async (path) => {
+      const absolutePath = join(repoRoot, path);
+      return {
+        path,
+        content: existsSync(absolutePath) ? await readFile(absolutePath) : null,
+      };
+    })
+  );
+}
+
+/**
+ * Overwrite each path in the working tree with its content at `baselineCommit`
+ * (read via `git show <commit>:<path>`). When `git show` exits non-zero for a
+ * path, the working-tree file is removed instead.
+ */
+async function restoreFilesFromCommit(repoRoot: string, baselineCommit: string, paths: string[]) {
+  const restoreOne = async (path: string): Promise<void> => {
+    const target = join(repoRoot, path);
+    const { exitCode, stdout } = await x('git', ['show', `${baselineCommit}:${path}`], {
+      throwOnError: false,
+      nodeOptions: { cwd: repoRoot },
+    });
+
+    if (exitCode !== 0) {
+      await rm(target, { force: true });
+      return;
+    }
+
+    await mkdir(dirname(target), { recursive: true });
+    await writeFile(target, stdout);
+  };
+
+  await Promise.all(paths.map((path) => restoreOne(path)));
+}
+
+/**
+ * Put every snapshotted file back: rewrite the saved content, or delete the
+ * file when the snapshot recorded that it did not exist (`content` is null).
+ */
+async function restoreSnapshots(repoRoot: string, snapshots: FileSnapshot[]) {
+  const pending = snapshots.map(async (snapshot) => {
+    const target = join(repoRoot, snapshot.path);
+
+    if (snapshot.content != null) {
+      await mkdir(dirname(target), { recursive: true });
+      await writeFile(target, snapshot.content);
+      return;
+    }
+
+    await rm(target, { force: true });
+  });
+
+  await Promise.all(pending);
+}
+
+/**
+ * True for a path whose last two segments are `.storybook/preview.<ext>`.
+ * Backslashes are treated as path separators.
+ */
+function isPreviewEnvironmentPath(path: string) {
+  const forwardSlashed = path.split('\\').join('/');
+  return /(^|\/)\.storybook\/preview\.[^/]+$/.test(forwardSlashed);
+}
diff --git a/scripts/eval/lib/transcript-types.ts b/scripts/eval/lib/transcript-types.ts
new file mode 100644
index 000000000000..4eff59f2144e
--- /dev/null
+++ b/scripts/eval/lib/transcript-types.ts
@@ -0,0 +1,81 @@
+/** A text content part, tagged with `type: 'text'`. */
+export interface TextContent {
+ type: 'text';
+ text: string;
+}
+
+/** A tool-invocation content part, tagged with `type: 'tool_use'`. */
+export interface ToolUseContent {
+  type: 'tool_use';
+  id: string;
+  name: string;
+  // Tool arguments keyed by parameter name. The value type was lost from the source;
+  // `unknown` forces readers to narrow before use — TODO confirm against the original.
+  input: Record<string, unknown>;
+  isMCP: boolean;
+}
+
+/** A tool-result content part, tagged with `type: 'tool_result'`. */
+export interface ToolResultContent {
+ // NOTE(review): presumably the `id` of the matching ToolUseContent — confirm.
+ tool_use_id: string;
+ type: 'tool_result';
+ // Either a plain string or a list of typed parts.
+ content: string | Array<{ type: string; text?: string; isError?: boolean }>;
+}
+
+/** Input/output token counts attached to a message. */
+export interface MessageUsage {
+ input_tokens: number;
+ output_tokens: number;
+}
+
+/** Transcript entry tagged `type: 'assistant'`; its content parts are text and/or tool uses. */
+export interface AssistantMessage {
+ type: 'assistant';
+ message: {
+ content: (TextContent | ToolUseContent)[];
+ usage: MessageUsage;
+ };
+ // NOTE(review): presumably a time value in milliseconds — confirm origin/epoch.
+ ms: number;
+ tokenCount?: number;
+ costUSD?: number;
+}
+
+/** Transcript entry tagged `type: 'user'`; its content parts are tool results only. */
+export interface UserMessage {
+ type: 'user';
+ message: {
+ content: ToolResultContent[];
+ };
+ // NOTE(review): presumably a time value in milliseconds — confirm origin/epoch.
+ ms: number;
+ tokenCount?: number;
+ costUSD?: number;
+}
+
+/** Transcript entry tagged `type: 'system'`, `subtype: 'init'`: agent, model, tools, MCP servers, cwd. */
+export interface SystemMessage {
+ type: 'system';
+ subtype: 'init';
+ agent: string;
+ model: string;
+ tools: string[];
+ mcp_servers: Array<{
+ name: string;
+ status: 'connected' | 'disconnected' | 'unknown';
+ }>;
+ cwd: string;
+ // NOTE(review): presumably a time value in milliseconds — confirm origin/epoch.
+ ms: number;
+ tokenCount?: number;
+ costUSD?: number;
+}
+
+/** Transcript entry tagged `type: 'result'`: success/error outcome with duration, turn, and cost totals. */
+export interface ResultMessage {
+ type: 'result';
+ subtype: 'success' | 'error';
+ duration_ms: number;
+ duration_api_ms: number;
+ num_turns: number;
+ total_cost_usd: number;
+ // NOTE(review): presumably a time value in milliseconds — confirm origin/epoch.
+ ms: number;
+ tokenCount?: number;
+ costUSD?: number;
+}
+
+/** Any transcript entry; discriminate on the `type` field. */
+export type TranscriptMessage = AssistantMessage | UserMessage | SystemMessage | ResultMessage;
+
+/**
+ * A prompt, its token count and cost, and the transcript messages.
+ * NOTE(review): the name suggests these are props for a transcript-rendering component — confirm.
+ */
+export interface TranscriptProps {
+ prompt: string;
+ promptTokenCount: number;
+ promptCost: number;
+ messages: TranscriptMessage[];
+}
diff --git a/scripts/eval/lib/utils.test.ts b/scripts/eval/lib/utils.test.ts
new file mode 100644
index 000000000000..bb197efcfd98
--- /dev/null
+++ b/scripts/eval/lib/utils.test.ts
@@ -0,0 +1,212 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+ EXAMPLE_PROMPT_BASENAME,
+ formatDuration,
+ formatCost,
+ formatScore,
+ formatScorePercent,
+ formatReadableUtcTimestamp,
+ generateTrialId,
+ loadPrompt,
+ listPrompts,
+ formatTable,
+ formatHelp,
+} from './utils.ts';
+
+// formatDuration: rounded seconds below one minute, `<minutes>m<seconds>s` from one minute up.
+describe('formatDuration', () => {
+  // Runs every [input, expected] pair through formatDuration, in order.
+  const expectAll = (cases: Array<[number, string]>) => {
+    for (const [input, expected] of cases) {
+      expect(formatDuration(input)).toBe(expected);
+    }
+  };
+
+  it('formats seconds under a minute', () => {
+    expectAll([
+      [0, '0s'],
+      [1, '1s'],
+      [45, '45s'],
+    ]);
+  });
+
+  it('rounds fractional seconds', () => {
+    expectAll([
+      [2.7, '3s'],
+      [59.4, '59s'],
+    ]);
+  });
+
+  it('formats minutes and seconds', () => {
+    expectAll([
+      [60, '1m0s'],
+      [61, '1m1s'],
+      [90, '1m30s'],
+      [125, '2m5s'],
+      [3661, '61m1s'],
+    ]);
+  });
+});
+
+// formatCost: a dash when no cost is given, otherwise dollars with two decimals.
+describe('formatCost', () => {
+  it('returns dash for undefined', () => {
+    const explicit = formatCost(undefined);
+    const omitted = formatCost();
+    expect(explicit).toBe('-');
+    expect(omitted).toBe('-');
+  });
+
+  it('formats dollar amounts', () => {
+    const cases: Array<[number, string]> = [
+      [0, '$0.00'],
+      [1.5, '$1.50'],
+    ];
+    for (const [amount, text] of cases) {
+      expect(formatCost(amount)).toBe(text);
+    }
+  });
+});
+
+// formatScore: at most three decimals, with trailing zeroes removed.
+describe('formatScore', () => {
+  it('keeps integer scores compact', () => {
+    const one = formatScore(1);
+    const zero = formatScore(0);
+    expect(one).toBe('1');
+    expect(zero).toBe('0');
+  });
+
+  it('formats gain scores without unnecessary trailing zeroes', () => {
+    const cases: Array<[number, string]> = [
+      [0.25, '0.25'],
+      [1 / 3, '0.333'],
+      [-0.125, '-0.125'],
+    ];
+    for (const [score, text] of cases) {
+      expect(formatScore(score)).toBe(text);
+    }
+  });
+});
+
+// formatScorePercent: whole percentages when exact, one decimal otherwise.
+describe('formatScorePercent', () => {
+  const expectAll = (cases: Array<[number, string]>) => {
+    for (const [score, text] of cases) {
+      expect(formatScorePercent(score)).toBe(text);
+    }
+  };
+
+  it('formats 0-1 scores as whole-number percentages when exact', () => {
+    expectAll([
+      [0, '0%'],
+      [1, '100%'],
+      [0.5, '50%'],
+      [0.75, '75%'],
+    ]);
+  });
+
+  it('uses one decimal when the percentage is not an integer', () => {
+    expectAll([
+      [1 / 3, '33.3%'],
+      [-0.125, '-12.5%'],
+    ]);
+  });
+});
+
+// generateTrialId: `<UTC timestamp with dashes>-<8 hex chars>`, different on every call.
+describe('generateTrialId', () => {
+  it('starts with a readable branch-safe UTC timestamp', () => {
+    const trialIdShape = /^\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}Z-[a-f0-9]{8}$/;
+    expect(generateTrialId()).toMatch(trialIdShape);
+  });
+
+  it('generates unique IDs', () => {
+    const first = generateTrialId();
+    const second = generateTrialId();
+    expect(first).not.toBe(second);
+  });
+});
+
+// formatReadableUtcTimestamp: readable UTC text for valid input, the input itself otherwise.
+describe('formatReadableUtcTimestamp', () => {
+  it('formats ISO timestamps in UTC for PR output', () => {
+    const formatted = formatReadableUtcTimestamp('2026-04-02T10:23:40.000Z');
+    expect(formatted).toBe('Apr 2 2026 10:23:40 UTC');
+  });
+
+  it('falls back to the original value for invalid dates', () => {
+    const formatted = formatReadableUtcTimestamp('not-a-date');
+    expect(formatted).toBe('not-a-date');
+  });
+});
+
+// listPrompts: must expose the registered prompt names, including the default one.
+describe('listPrompts', () => {
+  it('mirrors the CLI prompt registry', () => {
+    const names = listPrompts();
+    for (const registered of ['pattern-copy-play', 'setup']) {
+      expect(names).toContain(registered);
+    }
+    expect(names).not.toContain('pattern-copy');
+  });
+
+  it('includes the default/example prompt', () => {
+    const names = listPrompts();
+    expect(names).toContain(EXAMPLE_PROMPT_BASENAME);
+  });
+});
+
+// loadPrompt: validates the name and hands back the nudge text, never the full instructions.
+describe('loadPrompt', () => {
+  it('returns the nudge string the agent receives (not the resolved instructions)', () => {
+    const nudge = loadPrompt(EXAMPLE_PROMPT_BASENAME);
+    expect(nudge).toContain('npx storybook ai setup');
+    expect(nudge).not.toContain('### Step 1');
+  });
+
+  it('rejects unknown prompt names', () => {
+    const loadUnknown = () => loadPrompt('nonexistent-prompt-xyz');
+    expect(loadUnknown).toThrow('Prompt not found');
+  });
+
+  it('accepts every registered prompt name', () => {
+    listPrompts().forEach((name) => {
+      expect(() => loadPrompt(name)).not.toThrow();
+    });
+  });
+});
+
+// formatHelp: renders usage, description and one line per option.
+describe('formatHelp', () => {
+ it('formats usage, description, and options into a help message', () => {
+ const result = formatHelp('node eval.ts [options]', 'Run an eval trial.', {
+ project: { type: 'string', short: 'p', description: 'Project name' },
+ verbose: { type: 'boolean', short: 'v', description: 'Verbose output' },
+ help: { type: 'boolean', short: 'h', description: 'Show this help and exit' },
+ });
+
+ expect(result).toContain('Usage: node eval.ts [options]');
+ expect(result).toContain('Run an eval trial.');
+ // NOTE(review): the trailing space mirrors what formatHelp appends to string-typed options.
+ expect(result).toContain('-p, --project ');
+ expect(result).toContain('-v, --verbose');
+ expect(result).toContain('-h, --help');
+ expect(result).toContain('Project name');
+ });
+
+ it('pads options without a short flag', () => {
+ const result = formatHelp('node tool.ts', 'A tool.', {
+ verbose: { type: 'boolean', short: 'v', description: 'Verbose' },
+ 'dry-run': { type: 'boolean', description: 'Dry run' },
+ });
+
+ expect(result).toContain('-v, --verbose');
+ // Only checks that some whitespace precedes the long name, not how much.
+ expect(result).toContain(' --dry-run');
+ });
+
+ it('aligns descriptions across options of different name lengths', () => {
+ const result = formatHelp('node tool.ts', 'A tool.', {
+ x: { type: 'boolean', description: 'Short name' },
+ 'very-long-option': { type: 'boolean', description: 'Long name' },
+ });
+
+ // Keep only the option lines, then compare the column where each description starts.
+ const lines = result.split('\n').filter((l) => l.includes('--'));
+ const descStartX = lines[0].indexOf('Short name');
+ const descStartLong = lines[1].indexOf('Long name');
+ expect(descStartX).toBe(descStartLong);
+ });
+});
+
+// formatTable: header row, divider row, then one aligned row per data row.
+describe('formatTable', () => {
+  it('formats a simple table with aligned columns', () => {
+    const result = formatTable(
+      ['Name', 'Score'],
+      [
+        ['Alice', '100'],
+        ['Bob', '95'],
+      ]
+    );
+    const lines = result.split('\n');
+    expect(lines).toHaveLength(4); // header + divider + 2 rows
+    expect(lines[0]).toContain('Name');
+    expect(lines[0]).toContain('Score');
+    expect(lines[1]).toMatch(/^-+\+-+$/);
+    expect(lines[2]).toContain('Alice');
+    expect(lines[3]).toContain('Bob');
+  });
+
+  it('auto-sizes columns to fit content', () => {
+    const result = formatTable(['X', 'Y'], [['short', 'a-much-longer-value']]);
+    const lines = result.split('\n');
+    const headerCols = lines[0].split(' | ');
+    const dataCols = lines[2].split(' | ');
+    expect(headerCols[1].trim().length).toBeLessThanOrEqual(dataCols[1].trim().length);
+    // The trimmed comparison above holds even without padding, so also pin the
+    // layout: the header is padded to the data row's width and both rows put
+    // the column separator in the same place.
+    expect(lines[0].length).toBe(lines[2].length);
+    expect(lines[0].indexOf(' | ')).toBe(lines[2].indexOf(' | '));
+  });
+
+  it('handles ANSI escape codes in cells', () => {
+    const green = '\x1b[32mPASS\x1b[39m';
+    const result = formatTable(['Status'], [[green], ['FAIL']]);
+    const lines = result.split('\n');
+    expect(lines[2]).toContain('PASS');
+    expect(lines[3]).toContain('FAIL');
+    // Once the escape codes are removed, the coloured row must be exactly as
+    // wide as the plain row; otherwise the invisible bytes were counted.
+    const visible = (line: string) => line.replace(/\x1b\[[0-9;]*m/g, '');
+    expect(visible(lines[2]).length).toBe(lines[3].length);
+  });
+
+  it('handles empty rows', () => {
+    const result = formatTable(['A', 'B'], []);
+    const lines = result.split('\n');
+    expect(lines).toHaveLength(2); // header + divider only
+  });
+});
diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts
new file mode 100644
index 000000000000..94d6472fdc2d
--- /dev/null
+++ b/scripts/eval/lib/utils.ts
@@ -0,0 +1,239 @@
+import { join, resolve, sep } from 'node:path';
+import pc from 'picocolors';
+import { x } from 'tinyexec';
+
+import { AI_SETUP_PROMPT } from '../../../code/core/src/shared/constants/ai-prompts.ts';
+import {
+ DEFAULT_PROMPT_NAME,
+ PROMPT_NAMES,
+} from '../../../code/lib/cli-storybook/src/ai/setup-prompts/index.ts';
+
+/** Four message sinks: plain output plus step, success and error variants. */
+export interface Logger {
+ log: (msg: string) => void;
+ logStep: (msg: string) => void;
+ logSuccess: (msg: string) => void;
+ logError: (msg: string) => void;
+}
+
+/** Directory three levels above this module's directory. */
+export const REPO_ROOT = resolve(import.meta.dirname, '..', '..', '..');
+/** `storybook-eval` directory located next to (not inside) `REPO_ROOT`. */
+export const EVAL_ROOT = resolve(REPO_ROOT, '..', 'storybook-eval');
+/** `repos` and `trials` sub-directories of `EVAL_ROOT`. */
+export const REPOS_DIR = resolve(EVAL_ROOT, 'repos');
+export const TRIALS_DIR = resolve(EVAL_ROOT, 'trials');
+/** Name used in docs and tests when a concrete prompt must be named. Tracks the CLI default. */
+export const EXAMPLE_PROMPT_BASENAME = DEFAULT_PROMPT_NAME;
+// Script paths of the eval entry points, written relative to the repository root.
+export const NODE_EVAL_TRIAL_SCRIPT = 'scripts/eval/eval.ts' as const;
+export const NODE_EVAL_RUN_BATCH_SCRIPT = 'scripts/eval/run-batch.ts' as const;
+export const NODE_EVAL_SYNC_BASELINES_SCRIPT = 'scripts/eval/sync-baselines.ts' as const;
+export const NODE_EVAL_SYNC_STORYBOOK_VERSION_SCRIPT =
+ 'scripts/eval/sync-storybook-version.ts' as const;
+export const NODE_EVAL_COLLECT_PR_DATA_SCRIPT = 'scripts/eval/collect-pr-data.ts' as const;
+// Directory names joined onto a project path by the helpers below.
+export const STORYBOOK_DIRNAME = '.storybook';
+export const EVAL_RESULTS_DIRNAME = 'eval-results';
+
+/**
+ * Build a {@link Logger} that writes every message through `console.log`.
+ *
+ * @param prefix Optional label; when given, each line starts with a dimmed
+ *   `[prefix]` followed by a space.
+ * @returns Logger whose step/success/error variants put a coloured marker
+ *   (`>`, `✓`, `✗`) in front of the message.
+ */
+export function createLogger(prefix?: string): Logger {
+  const tag = prefix ? pc.dim(`[${prefix}]`) + ' ' : '';
+  const emit = (body: string) => console.log(`${tag}${body}`);
+
+  return {
+    log: (msg: string) => emit(msg),
+    logStep: (msg: string) => emit(` ${pc.cyan('>')} ${msg}`),
+    logSuccess: (msg: string) => emit(` ${pc.green('✓')} ${msg}`),
+    logError: (msg: string) => emit(` ${pc.red('✗')} ${msg}`),
+  };
+}
+
+/**
+ * Render a duration given in seconds as `<s>s`, or `<m>m<s>s` once it reaches
+ * a full minute. The input is rounded to whole seconds first.
+ */
+export const formatDuration = (s: number) => {
+  const rounded = Math.round(s);
+  const wholeMinutes = Math.floor(rounded / 60);
+  const leftover = rounded % 60;
+  if (wholeMinutes > 0) {
+    return `${wholeMinutes}m${leftover}s`;
+  }
+  return `${leftover}s`;
+};
+
+/** Render a cost as `$<amount>` with two decimals, or `-` when no cost is given. */
+export const formatCost = (cost?: number) => {
+  if (cost == null) {
+    return '-';
+  }
+  return `$${cost.toFixed(2)}`;
+};
+
+/**
+ * Raw 0-1 index (used in data.json and when you need the exact ratio).
+ * Fixed to three decimals, then trailing zeroes (and a bare trailing `.`) are dropped.
+ */
+export const formatScore = (score: number) => {
+  const fixed = score.toFixed(3);
+  return fixed.replace(/(?:\.0+|(\.\d*?)0+)$/, '$1');
+};
+
+/**
+ * Human-readable percentage for the same 0-1 score (data.json keeps the ratio).
+ * Non-finite input is stringified unchanged; values within 1e-6 of a whole
+ * percent print without decimals, everything else with one decimal.
+ */
+export function formatScorePercent(score: number): string {
+  if (!Number.isFinite(score)) {
+    return String(score);
+  }
+
+  const percent = score * 100;
+  const nearest = Math.round(percent);
+  const isWhole = Math.abs(percent - nearest) < 1e-6;
+  return isWhole ? `${nearest}%` : `${percent.toFixed(1)}%`;
+}
+
+/** Join `projectDir` onto `repoRoot`; without a (non-empty) `projectDir` the root itself is returned. */
+export function getProjectPath(repoRoot: string, projectDir?: string) {
+  if (!projectDir) {
+    return repoRoot;
+  }
+  return join(repoRoot, projectDir);
+}
+
+/** Path of the `STORYBOOK_DIRNAME` directory inside `projectPath`. */
+export function getStorybookDir(projectPath: string) {
+  const storybookDir = join(projectPath, STORYBOOK_DIRNAME);
+  return storybookDir;
+}
+
+/** Path of the `eval-support` directory inside the project's Storybook directory. */
+export function getEvalSupportDir(projectPath: string) {
+  const storybookDir = getStorybookDir(projectPath);
+  return join(storybookDir, 'eval-support');
+}
+
+/** Path of the `EVAL_RESULTS_DIRNAME` directory inside the project's Storybook directory. */
+export function getEvalResultsDir(projectPath: string) {
+  const storybookDir = getStorybookDir(projectPath);
+  return join(storybookDir, EVAL_RESULTS_DIRNAME);
+}
+
+/**
+ * Eval-results directory as a `/`-separated relative path, optionally nested
+ * under `projectDir`.
+ */
+export function getEvalResultsRelativeDir(projectDir?: string) {
+  const resultsTail = join(STORYBOOK_DIRNAME, EVAL_RESULTS_DIRNAME);
+  const relativeDir = projectDir ? join(projectDir, resultsTail) : resultsTail;
+  return toPosixPath(relativeDir);
+}
+
+/** `/`-separated relative path of `fileName` inside the eval-results directory. */
+export function getEvalResultsRelativePath(fileName: string, projectDir?: string) {
+  const relativeDir = getEvalResultsRelativeDir(projectDir);
+  return [relativeDir, fileName].join('/');
+}
+
+/**
+ * Build a trial id of the form `<ISO UTC timestamp>-<8 hex chars>`.
+ * The timestamp drops its milliseconds and uses `-` instead of `:`; the suffix
+ * is the first eight hex digits of a random UUID.
+ */
+export function generateTrialId() {
+  const iso = new Date().toISOString();
+  const stamp = iso.replace(/\.\d{3}Z$/, 'Z').replace(/:/g, '-');
+  const suffix = crypto.randomUUID().replace(/-/g, '').slice(0, 8);
+  return `${stamp}-${suffix}`;
+}
+
+/** Uppercase the first character and leave the rest untouched (shared eval string helper). */
+export function capitalizeFirst(value: string) {
+  const head = value.slice(0, 1);
+  const tail = value.slice(1);
+  return head.toUpperCase() + tail;
+}
+
+/**
+ * Render a parseable timestamp as `Mon D YYYY HH:MM:SS UTC` using UTC fields.
+ * Input that `Date` cannot parse is returned unchanged.
+ */
+export function formatReadableUtcTimestamp(timestamp: string) {
+  const parsed = new Date(timestamp);
+  if (Number.isNaN(parsed.getTime())) {
+    return timestamp;
+  }
+
+  const monthNames = [
+    'Jan',
+    'Feb',
+    'Mar',
+    'Apr',
+    'May',
+    'Jun',
+    'Jul',
+    'Aug',
+    'Sep',
+    'Oct',
+    'Nov',
+    'Dec',
+  ];
+  const twoDigits = (part: number) => `${part}`.padStart(2, '0');
+
+  const datePart = [
+    monthNames[parsed.getUTCMonth()],
+    parsed.getUTCDate(),
+    parsed.getUTCFullYear(),
+  ].join(' ');
+  const timePart = [parsed.getUTCHours(), parsed.getUTCMinutes(), parsed.getUTCSeconds()]
+    .map(twoDigits)
+    .join(':');
+
+  return `${datePart} ${timePart} UTC`;
+}
+
+/**
+ * Format data as an aligned table with automatic column widths.
+ *
+ * Output is the header row, a divider row, then one line per data row, joined
+ * with newlines. Widths are measured on visible text: ANSI styling sequences
+ * are ignored for headers as well as cells, so a styled header does not make
+ * its column wider than what is displayed.
+ *
+ * @param headers Column titles; their count defines the number of sized columns.
+ * @param rows Data rows; a missing cell counts as empty when sizing a column.
+ */
+export function formatTable(headers: string[], rows: string[][]): string {
+  const visibleWidth = (text: string) => stripAnsi(text).length;
+
+  const widths = headers.map((header, column) =>
+    Math.max(visibleWidth(header), ...rows.map((row) => visibleWidth(row[column] ?? '')))
+  );
+
+  // Pads on the visible width so styled and plain cells end at the same column.
+  const pad = (text: string, width: number) =>
+    text + ' '.repeat(Math.max(0, width - visibleWidth(text)));
+
+  // Named so it does not shadow the `sep` imported from `node:path`.
+  const columnSeparator = ' | ';
+  return [
+    headers.map((header, column) => pad(header, widths[column])).join(columnSeparator),
+    widths.map((width) => '-'.repeat(width)).join('-+-'),
+    ...rows.map((row) =>
+      row.map((cell, column) => pad(cell, widths[column])).join(columnSeparator)
+    ),
+  ].join('\n');
+}
+
+/**
+ * Returns the exact nudge string a real user copies from the Storybook UI —
+ * "Run `npx storybook ai setup` and follow its instructions precisely." The
+ * agent then runs `ai setup` itself as a tool call, as a real user would.
+ * Which prompt variant is used is decided by the `EVAL_SETUP_PROMPT` env var
+ * on the agent's spawn, not here: this function only checks that `name` is a
+ * registered prompt and returns the same text for every valid name.
+ *
+ * @throws Error listing the available names when `name` is not registered.
+ */
+export function loadPrompt(name: string): string {
+  const known = listPrompts();
+  if (known.includes(name)) {
+    return AI_SETUP_PROMPT;
+  }
+  throw new Error(`Prompt not found: ${name}\nAvailable: ${known.join(', ')}`);
+}
+
+/** List available prompt names as a fresh array. Mirrors the builder registry in the CLI. */
+export function listPrompts(): string[] {
+  return Array.from(PROMPT_NAMES);
+}
+
+/** Snapshot of the harness environment, as produced by `captureEnvironment`. */
+export interface EvalEnvironment {
+ // Filled from `process.version` by `captureEnvironment`.
+ nodeVersion: string;
+ /** Git branch of the eval harness (storybook monorepo), not the evaluated project. */
+ evalBranch: string;
+ /** Git commit of the eval harness (storybook monorepo), not the evaluated project. */
+ evalCommit: string;
+}
+
+/**
+ * Capture the Node version plus the git branch and commit of the harness
+ * checkout at `REPO_ROOT`.
+ *
+ * Each git value falls back to `'unknown'` when the command throws or when it
+ * prints nothing (a failing `git` resolves with empty stdout if the exec
+ * helper does not throw on a non-zero exit code).
+ *
+ * @returns The populated {@link EvalEnvironment}; never rejects because of git.
+ */
+export async function captureEnvironment(): Promise<EvalEnvironment> {
+  const readGit = async (args: string[]): Promise<string> => {
+    try {
+      const { stdout } = await x('git', args, { nodeOptions: { cwd: REPO_ROOT } });
+      // Empty output means git produced no answer; report that as unknown too.
+      return stdout.trim() || 'unknown';
+    } catch {
+      /* git missing or not runnable */
+      return 'unknown';
+    }
+  };
+
+  const evalBranch = await readGit(['rev-parse', '--abbrev-ref', 'HEAD']);
+  const evalCommit = await readGit(['rev-parse', 'HEAD']);
+  return { nodeVersion: process.version, evalBranch, evalCommit };
+}
+
+/** One CLI option as consumed by `formatHelp`: value type, optional short flag, optional help text. */
+export interface HelpOption {
+ type: 'string' | 'boolean';
+ short?: string;
+ description?: string;
+}
+
+/**
+ * Format a --help message from the same options object passed to parseArgs.
+ * Each option may carry a `description` field (ignored by parseArgs at runtime).
+ *
+ * @param usage Text printed after `Usage: `.
+ * @param description Free-form paragraph printed below the usage line.
+ * @param options Option name to definition; one output line per entry, in
+ *   insertion order, with long names padded so descriptions line up.
+ * @returns The help text, lines joined with `\n`.
+ */
+export function formatHelp(
+  usage: string,
+  description: string,
+  options: Record<string, HelpOption>
+): string {
+  // Options without a short flag get blanks as wide as the `-x, ` prefix so
+  // every long name starts in the same column.
+  const shortColumnWidth = '-x, '.length;
+
+  const formatted = Object.entries(options).map(([name, opt]) => ({
+    short: opt.short ? `-${opt.short}, ` : ' '.repeat(shortColumnWidth),
+    // NOTE(review): string options end in a bare space; a value placeholder
+    // may have been intended here — confirm before changing the text.
+    long: opt.type === 'string' ? `--${name} ` : `--${name}`,
+    desc: opt.description ?? '',
+  }));
+
+  // The leading 0 keeps the width finite when there are no options at all.
+  const maxLong = Math.max(0, ...formatted.map((f) => f.long.length));
+
+  return [
+    `Usage: ${usage}`,
+    '',
+    description,
+    '',
+    'Options:',
+    ...formatted.map((f) => ` ${f.short}${f.long.padEnd(maxLong)} ${f.desc}`),
+  ].join('\n');
+}
+
+/** Remove `ESC[...m` escape sequences so that only the visible characters are measured. */
+function stripAnsi(str: string) {
+  const ansiStyleSequence = /\x1b\[[0-9;]*m/g;
+  return str.replace(ansiStyleSequence, '');
+}
+
+/** Replace every platform path separator in `value` with `/`. */
+export function toPosixPath(value: string) {
+  const segments = value.split(sep);
+  return segments.join('/');
+}
diff --git a/scripts/eval/run-batch.test.ts b/scripts/eval/run-batch.test.ts
new file mode 100644
index 000000000000..49ad2a712868
--- /dev/null
+++ b/scripts/eval/run-batch.test.ts
@@ -0,0 +1,506 @@
+import { EventEmitter } from 'node:events';
+import { existsSync, mkdtempSync, readFileSync, rmSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+import { PassThrough } from 'node:stream';
+
+import { afterEach, describe, expect, it, vi } from 'vitest';
+
+import { NODE_EVAL_TRIAL_SCRIPT } from './lib/utils.ts';
+import {
+ BATCH_DEFAULT_CLAUDE_EFFORTS,
+ BATCH_DEFAULT_EFFORTS,
+ BATCH_DEFAULT_AGENT_IDS,
+ BATCH_EXCLUDED_PROJECT_NAMES,
+ BATCH_MATRIX_MODELS,
+ BATCH_PROJECT_NAMES,
+ BATCH_REPETITIONS,
+ BATCH_VARIANTS,
+ buildBatchRunDescriptors,
+ buildBatchVariants,
+ main,
+ parseRunBatchArgs,
+ runBatch,
+ type SpawnedBatchChild,
+} from './run-batch.ts';
+
+// Prompt name passed to every descriptor built in this file.
+const TEST_PROMPT = 'pattern-copy-play';
+
+// Temp directory of the current test; empty when the test did not create one.
+let TMP = '';
+
+// Remove the temp directory (if any) after each test and reset the marker.
+afterEach(() => {
+ if (TMP) {
+ rmSync(TMP, { recursive: true, force: true });
+ TMP = '';
+ }
+});
+
+// buildBatchRunDescriptors: expands prompt + overrides into one descriptor per
+// project x variant x repetition, each carrying the CLI args for a trial run.
+describe('buildBatchRunDescriptors', () => {
+  it('creates the default batch matrix with full repetition coverage', () => {
+    const descriptors = buildBatchRunDescriptors({ prompt: TEST_PROMPT });
+    // Typed explicitly so the collected repetition lists are `number[]`, not `any`.
+    const combinations = new Map<string, number[]>();
+
+    expect(descriptors).toHaveLength(
+      BATCH_PROJECT_NAMES.length * BATCH_VARIANTS.length * BATCH_REPETITIONS
+    );
+    expect(new Set(descriptors.map((descriptor) => descriptor.label)).size).toBe(
+      descriptors.length
+    );
+    expect(new Set(descriptors.map((descriptor) => descriptor.project))).toEqual(
+      new Set(BATCH_PROJECT_NAMES)
+    );
+    expect(new Set(descriptors.map((descriptor) => descriptor.prompt))).toEqual(
+      new Set(['pattern-copy-play'])
+    );
+    expect(new Set(BATCH_PROJECT_NAMES)).not.toContain('baklava');
+
+    // Group repetitions by project/agent/model/effort and check each descriptor's args.
+    for (const descriptor of descriptors) {
+      const key = `${descriptor.project}:${descriptor.agent}:${descriptor.model}:${descriptor.effort}`;
+      combinations.set(key, [...(combinations.get(key) ?? []), descriptor.repetition]);
+      expect(descriptor.args).toEqual([
+        NODE_EVAL_TRIAL_SCRIPT,
+        '-p',
+        descriptor.project,
+        '-a',
+        descriptor.agent,
+        '-m',
+        descriptor.model,
+        '-e',
+        descriptor.effort,
+        '--prompt',
+        descriptor.prompt,
+      ]);
+    }
+
+    expect(combinations.size).toBe(BATCH_PROJECT_NAMES.length * BATCH_VARIANTS.length);
+
+    // Every combination must have repetitions 1..BATCH_REPETITIONS exactly once.
+    for (const repetitions of combinations.values()) {
+      expect([...repetitions].sort((a, b) => a - b)).toEqual(
+        Array.from({ length: BATCH_REPETITIONS }, (_, index) => index + 1)
+      );
+    }
+  });
+
+  it('can restrict the batch to Claude only when explicitly requested', () => {
+    const descriptors = buildBatchRunDescriptors({ prompt: TEST_PROMPT, agents: ['claude'] });
+
+    expect(descriptors).toHaveLength(BATCH_PROJECT_NAMES.length * BATCH_REPETITIONS);
+    expect(new Set(descriptors.map((descriptor) => descriptor.agent))).toEqual(new Set(['claude']));
+  });
+
+  it('uses the configured Claude effort override when building descriptors', () => {
+    const descriptors = buildBatchRunDescriptors({ prompt: TEST_PROMPT, claudeEffort: 'high' });
+
+    expect(
+      new Set(
+        descriptors
+          .filter((descriptor) => descriptor.agent === 'claude')
+          .map((descriptor) => descriptor.effort)
+      )
+    ).toEqual(new Set(['high']));
+  });
+
+  it('supports multiple Claude efforts in a single batch', () => {
+    const descriptors = buildBatchRunDescriptors({
+      prompt: TEST_PROMPT,
+      agents: ['claude'],
+      claudeEfforts: ['max', 'high'],
+    });
+
+    expect(descriptors).toHaveLength(BATCH_PROJECT_NAMES.length * 2 * BATCH_REPETITIONS);
+    expect(
+      new Set(
+        descriptors
+          .filter((descriptor) => descriptor.agent === 'claude')
+          .map((descriptor) => descriptor.effort)
+      )
+    ).toEqual(new Set(['max', 'high']));
+  });
+
+  it('uses the configured codex effort override when codex is enabled', () => {
+    const descriptors = buildBatchRunDescriptors({
+      prompt: TEST_PROMPT,
+      agents: ['claude', 'codex'],
+      codexEffort: 'medium',
+    });
+
+    expect(
+      new Set(
+        descriptors
+          .filter((descriptor) => descriptor.agent === 'codex')
+          .map((descriptor) => descriptor.effort)
+      )
+    ).toEqual(new Set(['medium']));
+  });
+
+  it('uses a prompt override for every batch run descriptor', () => {
+    const descriptors = buildBatchRunDescriptors({ prompt: TEST_PROMPT });
+
+    expect(new Set(descriptors.map((descriptor) => descriptor.prompt))).toEqual(
+      new Set(['pattern-copy-play'])
+    );
+    expect(descriptors[0]?.args).toContain('--prompt');
+    expect(descriptors[0]?.args).toContain('pattern-copy-play');
+    expect(descriptors[0]?.label).toContain('-pattern-copy-play-');
+  });
+
+  it('interleaves projects first so batch startup spreads across repos', () => {
+    const descriptors = buildBatchRunDescriptors({ prompt: TEST_PROMPT });
+
+    // The first projects x variants descriptors are all repetition 1, project-major within each agent.
+    expect(
+      descriptors
+        .slice(0, BATCH_PROJECT_NAMES.length * BATCH_VARIANTS.length)
+        .map((descriptor) => ({
+          project: descriptor.project,
+          agent: descriptor.agent,
+          repetition: descriptor.repetition,
+        }))
+    ).toEqual([
+      { project: 'mealdrop', agent: 'claude', repetition: 1 },
+      { project: 'edgy', agent: 'claude', repetition: 1 },
+      { project: 'wikitok', agent: 'claude', repetition: 1 },
+      { project: 'echarts', agent: 'claude', repetition: 1 },
+      { project: 'evergreen-ci', agent: 'claude', repetition: 1 },
+      { project: 'excalidraw', agent: 'claude', repetition: 1 },
+      { project: 'bluesky', agent: 'claude', repetition: 1 },
+      { project: 'mealdrop', agent: 'codex', repetition: 1 },
+      { project: 'edgy', agent: 'codex', repetition: 1 },
+      { project: 'wikitok', agent: 'codex', repetition: 1 },
+      { project: 'echarts', agent: 'codex', repetition: 1 },
+      { project: 'evergreen-ci', agent: 'codex', repetition: 1 },
+      { project: 'excalidraw', agent: 'codex', repetition: 1 },
+      { project: 'bluesky', agent: 'codex', repetition: 1 },
+    ]);
+  });
+});
+
+// buildBatchVariants: the agent/model/effort combinations a batch iterates over,
+// together with the exported defaults they are derived from.
+describe('buildBatchVariants', () => {
+ it('returns the default benchmark variants when no overrides are provided', () => {
+ expect(buildBatchVariants()).toEqual(BATCH_VARIANTS);
+ expect(BATCH_VARIANTS).toEqual([
+ {
+ agent: 'claude',
+ model: BATCH_MATRIX_MODELS.claude,
+ effort: BATCH_DEFAULT_CLAUDE_EFFORTS[0],
+ },
+ { agent: 'codex', model: BATCH_MATRIX_MODELS.codex, effort: BATCH_DEFAULT_EFFORTS.codex },
+ ]);
+ });
+
+ it('enables both Claude and Codex by default', () => {
+ expect(BATCH_DEFAULT_AGENT_IDS).toEqual(['claude', 'codex']);
+ });
+
+ it('excludes baklava from the default batch projects', () => {
+ expect(BATCH_EXCLUDED_PROJECT_NAMES).toEqual(['baklava']);
+ // Pins both the membership and the order of the batch project list.
+ expect(BATCH_PROJECT_NAMES).toEqual([
+ 'mealdrop',
+ 'edgy',
+ 'wikitok',
+ 'echarts',
+ 'evergreen-ci',
+ 'excalidraw',
+ 'bluesky',
+ ]);
+ });
+
+ it('supports Claude-only variants when requested', () => {
+ expect(buildBatchVariants({ agents: ['claude'] })).toEqual([
+ {
+ agent: 'claude',
+ model: BATCH_MATRIX_MODELS.claude,
+ effort: BATCH_DEFAULT_CLAUDE_EFFORTS[0],
+ },
+ ]);
+ });
+
+ it('supports multiple Claude variants when multiple efforts are requested', () => {
+ // One variant per requested effort, in the order the efforts were given.
+ expect(buildBatchVariants({ agents: ['claude'], claudeEfforts: ['max', 'high'] })).toEqual([
+ { agent: 'claude', model: BATCH_MATRIX_MODELS.claude, effort: 'max' },
+ { agent: 'claude', model: BATCH_MATRIX_MODELS.claude, effort: 'high' },
+ ]);
+ });
+});
+
+// parseRunBatchArgs: turns CLI argv into the batch options object.
+describe('parseRunBatchArgs', () => {
+ it('parses optional effort overrides, prompt, and concurrency from the CLI', () => {
+ // Comma-separated `--agents` becomes an array and `--concurrency` becomes a number.
+ expect(
+ parseRunBatchArgs([
+ '--prompt',
+ 'pattern-copy-play',
+ '--agents',
+ 'claude,codex',
+ '--claude-effort',
+ 'high',
+ '--codex-effort',
+ 'medium',
+ '--concurrency',
+ '3',
+ ])
+ ).toEqual({
+ prompt: 'pattern-copy-play',
+ agents: ['claude', 'codex'],
+ claudeEffort: 'high',
+ codexEffort: 'medium',
+ concurrency: 3,
+ });
+ });
+
+ it('parses multiple Claude efforts from the CLI', () => {
+ // Flags that are not passed must not appear in the result (toEqual ignores undefined-valued keys).
+ expect(parseRunBatchArgs(['--prompt', TEST_PROMPT, '--claude-efforts', 'max,high'])).toEqual({
+ prompt: TEST_PROMPT,
+ claudeEfforts: ['max', 'high'],
+ });
+ });
+});
+
+// runBatch / main: scheduling, failure handling and on-disk output, driven
+// through injected spawn doubles instead of real child processes.
+describe('runBatch', () => {
+ it('caps concurrency and keeps queued work moving as slots free up', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-run-batch-concurrency-'));
+ const descriptors = buildBatchRunDescriptors({ prompt: TEST_PROMPT }).slice(0, 5);
+ const controller = createControlledSpawn();
+
+ // Not awaited yet: the batch only progresses as the test finishes children by hand.
+ const batchPromise = runBatch(
+ {
+ descriptors,
+ concurrency: 2,
+ evalRoot: TMP,
+ batchTimestamp: '2026-04-03T04-05-06-789Z',
+ log: () => {},
+ },
+ { spawn: controller.spawn }
+ );
+
+ await waitForCondition(
+ () => controller.controllers.length === 2,
+ 'expected first two runs to start'
+ );
+ expect(controller.maxActive).toBe(2);
+
+ // Each finished child should free one slot and let exactly one queued run start.
+ controller.controllers[0].finish();
+ await waitForCondition(
+ () => controller.controllers.length === 3,
+ 'expected third run to start'
+ );
+ expect(controller.maxActive).toBe(2);
+
+ controller.controllers[1].finish();
+ await waitForCondition(
+ () => controller.controllers.length === 4,
+ 'expected fourth run to start'
+ );
+
+ controller.controllers[2].finish();
+ await waitForCondition(
+ () => controller.controllers.length === 5,
+ 'expected fifth run to start'
+ );
+
+ controller.controllers[3].finish();
+ controller.controllers[4].finish();
+
+ const summary = await batchPromise;
+
+ expect(summary.totalRuns).toBe(5);
+ expect(summary.failed).toBe(0);
+ expect(controller.maxActive).toBe(2);
+ });
+
+ it('continues after failures and returns a nonzero main result when any run fails', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-run-batch-failure-'));
+ const descriptors = buildBatchRunDescriptors({ prompt: TEST_PROMPT }).slice(0, 3);
+ // Exit codes handed out in spawn order: the second run fails with code 2.
+ const spawn = createAutoSpawn([0, 2, 0]);
+
+ const exitCode = await main(
+ {
+ descriptors,
+ concurrency: 3,
+ evalRoot: TMP,
+ batchTimestamp: '2026-04-03T06-07-08-999Z',
+ log: () => {},
+ },
+ { spawn }
+ );
+
+ const summaryPath = join(TMP, 'batches', '2026-04-03T06-07-08-999Z', 'summary.json');
+ const summary = JSON.parse(readFileSync(summaryPath, 'utf-8'));
+
+ expect(exitCode).toBe(1);
+ expect(spawn).toHaveBeenCalledTimes(3);
+ expect(summary.failed).toBe(1);
+ expect(summary.succeeded).toBe(2);
+ expect(summary.runs.map((run: { status: string }) => run.status)).toEqual([
+ 'success',
+ 'failed',
+ 'success',
+ ]);
+ });
+
+ it('writes summary metadata and per-run logs under the batch directory', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-run-batch-summary-'));
+ const descriptor = buildBatchRunDescriptors({ prompt: TEST_PROMPT })[0];
+ const spawn = createAutoSpawn([0]);
+
+ const summary = await runBatch(
+ {
+ descriptors: [descriptor],
+ concurrency: 1,
+ evalRoot: TMP,
+ batchTimestamp: '2026-04-03T08-09-10-111Z',
+ log: () => {},
+ },
+ { spawn }
+ );
+
+ // Expected layout: <evalRoot>/batches/<timestamp>/{summary.json,logs/<label>.log}
+ const batchDir = join(TMP, 'batches', '2026-04-03T08-09-10-111Z');
+ const logPath = join(batchDir, 'logs', `${descriptor.label}.log`);
+ const persisted = JSON.parse(readFileSync(summary.summaryPath, 'utf-8'));
+
+ expect(summary.batchDir).toBe(batchDir);
+ expect(summary.logsDir).toBe(join(batchDir, 'logs'));
+ expect(summary.summaryPath).toBe(join(batchDir, 'summary.json'));
+ expect(summary.runs[0]).toMatchObject({
+ ...descriptor,
+ logPath,
+ exitCode: 0,
+ signal: null,
+ status: 'success',
+ });
+ expect(persisted.runs[0].logPath).toBe(logPath);
+ expect(existsSync(logPath)).toBe(true);
+
+ // The log should contain the command line plus both output streams of the child.
+ const logContents = readFileSync(logPath, 'utf-8');
+ expect(logContents).toContain(`$ node ${NODE_EVAL_TRIAL_SCRIPT}`);
+ expect(logContents).toContain('--prompt pattern-copy-play');
+ expect(logContents).toContain(`stdout:${descriptor.label}`);
+ expect(logContents).toContain(`stderr:${descriptor.label}`);
+ });
+});
+
+// Test double for a spawned child: an EventEmitter with a pid and in-memory
+// stdout/stderr streams that the spawn doubles below write to and end.
+class MockChildProcess extends EventEmitter implements SpawnedBatchChild {
+ pid?: number;
+ stdout = new PassThrough();
+ stderr = new PassThrough();
+
+ constructor(pid: number) {
+ super();
+ this.pid = pid;
+ }
+}
+
+/**
+ * Spawn double whose children stay "running" until the test finishes them.
+ *
+ * Each call to the returned `spawn` mock creates a `MockChildProcess`, counts
+ * it as active and appends a controller. `finish()` ends both output streams,
+ * marks the child inactive and emits `close`; repeated calls are ignored.
+ * `maxActive` reports the highest number of simultaneously active children.
+ */
+function createControlledSpawn() {
+  let running = 0;
+  let peakRunning = 0;
+  let pidCounter = 1000;
+  const controllers: Array<{
+    child: MockChildProcess;
+    finish: (exitCode?: number | null, signal?: NodeJS.Signals | null) => void;
+  }> = [];
+
+  const spawn = vi.fn(() => {
+    running += 1;
+    if (running > peakRunning) {
+      peakRunning = running;
+    }
+
+    const child = new MockChildProcess(pidCounter);
+    pidCounter += 1;
+    let finished = false;
+
+    const finish = (exitCode: number | null = 0, signal: NodeJS.Signals | null = null) => {
+      if (finished) {
+        return;
+      }
+      finished = true;
+      child.stdout.end(`stdout:${child.pid}\n`);
+      child.stderr.end(`stderr:${child.pid}\n`);
+      running -= 1;
+      child.emit('close', exitCode, signal);
+    };
+
+    controllers.push({ child, finish });
+    return child;
+  });
+
+  return {
+    spawn,
+    controllers,
+    get maxActive() {
+      return peakRunning;
+    },
+  };
+}
+
+/**
+ * Spawn double whose children finish on their own in a microtask.
+ *
+ * @param outcomes Result per spawn call, in call order: a number is emitted as
+ *   the `close` exit code, an `Error` is emitted as an `error` event. Calls
+ *   beyond the list exit with code 0. The caller's array is left untouched.
+ * @returns A `vi.fn` mock with the `(command, args)` spawn signature.
+ */
+function createAutoSpawn(outcomes: Array<number | Error>) {
+  let nextPid = 2000;
+  // Consume a private copy so `shift()` does not drain the caller's array.
+  const pending = [...outcomes];
+
+  return vi.fn((_command: string, args: string[]) => {
+    const descriptor = getDescriptorFromArgs(args);
+    const outcome = pending.shift() ?? 0;
+    const child = new MockChildProcess(nextPid++);
+
+    // Deferred so the caller can attach listeners before anything is emitted.
+    queueMicrotask(() => {
+      child.stdout.end(`stdout:${descriptor.label}\n`);
+      child.stderr.end(`stderr:${descriptor.label}\n`);
+
+      if (outcome instanceof Error) {
+        child.emit('error', outcome);
+        return;
+      }
+
+      child.emit('close', outcome, null);
+    });
+
+    return child;
+  });
+}
+
+/**
+ * Recover the batch descriptor that produced a given argv.
+ *
+ * Reads `--prompt`, `-a` and `-e` from `args`, rebuilds the descriptors for
+ * that agent/effort/prompt and returns the one whose `args` match exactly.
+ *
+ * @throws Error when no rebuilt descriptor has the same args.
+ */
+function getDescriptorFromArgs(args: string[]) {
+  // Value following `flag`, or undefined when the flag is absent.
+  const readFlag = (flag: string) => {
+    const index = args.indexOf(flag);
+    return index === -1 ? undefined : args[index + 1];
+  };
+
+  const agent = readFlag('-a');
+  const effort = readFlag('-e');
+  const options: NonNullable<Parameters<typeof buildBatchRunDescriptors>[0]> = {
+    prompt: readFlag('--prompt') ?? TEST_PROMPT,
+  };
+
+  if (agent === 'claude') {
+    options.agents = ['claude'];
+    // The literal comparisons narrow `effort` to the values accepted for Claude.
+    if (effort === 'low' || effort === 'medium' || effort === 'high' || effort === 'max') {
+      options.claudeEfforts = [effort];
+    }
+  }
+
+  if (agent === 'codex') {
+    options.agents = ['codex'];
+    if (effort === 'low' || effort === 'medium' || effort === 'high' || effort === 'xhigh') {
+      options.codexEffort = effort;
+    }
+  }
+
+  // Joining on NUL compares whole argv lists without one arg bleeding into the next.
+  const wanted = args.join('\0');
+  const descriptor = buildBatchRunDescriptors(options).find(
+    (candidate) => candidate.args.join('\0') === wanted
+  );
+
+  if (!descriptor) {
+    throw new Error(`Unknown descriptor for args: ${args.join(' ')}`);
+  }
+
+  return descriptor;
+}
+
+/**
+ * Poll `check` until it returns true or the timeout elapses.
+ *
+ * `check` is always evaluated at least once, and once more after the final
+ * sleep, so a condition that becomes true right at the deadline still counts
+ * as success.
+ *
+ * @param check Predicate to poll.
+ * @param message Message of the error thrown on timeout.
+ * @param options `timeoutMs` (default 2000) and `intervalMs` (default 5).
+ * @throws Error with `message` when the condition never became true in time.
+ */
+async function waitForCondition(
+  check: () => boolean,
+  message: string,
+  options: { timeoutMs?: number; intervalMs?: number } = {}
+) {
+  const { timeoutMs = 2_000, intervalMs = 5 } = options;
+  const timeoutAt = Date.now() + timeoutMs;
+
+  for (;;) {
+    if (check()) {
+      return;
+    }
+
+    // Give up only after a failed check, never before looking one last time.
+    if (Date.now() >= timeoutAt) {
+      throw new Error(message);
+    }
+
+    await new Promise((resolve) => setTimeout(resolve, intervalMs));
+  }
+}
diff --git a/scripts/eval/run-batch.ts b/scripts/eval/run-batch.ts
new file mode 100644
index 000000000000..323550ab0341
--- /dev/null
+++ b/scripts/eval/run-batch.ts
@@ -0,0 +1,663 @@
+import { spawn as spawnChild, type SpawnOptions } from 'node:child_process';
+import { once } from 'node:events';
+import { createWriteStream } from 'node:fs';
+import { mkdir, readFile, writeFile } from 'node:fs/promises';
+import { join, resolve } from 'node:path';
+import type { Readable } from 'node:stream';
+import { parseArgs } from 'node:util';
+
+import pLimit from 'p-limit';
+import { z } from 'zod';
+
+import { createInterface } from 'node:readline/promises';
+
+import { esMain } from '../utils/esmain.ts';
+import { AGENTS, CLAUDE_EFFORTS, CODEX_EFFORTS, type AgentVariant } from './lib/agents/config.ts';
+import { PROJECTS } from './lib/projects.ts';
+import {
+ EVAL_ROOT,
+ formatHelp,
+ listPrompts,
+ NODE_EVAL_RUN_BATCH_SCRIPT,
+ NODE_EVAL_TRIAL_SCRIPT,
+ REPO_ROOT,
+} from './lib/utils.ts';
+
+/** Projects that are left out of the batch matrix. */
+export const BATCH_EXCLUDED_PROJECT_NAMES = ['baklava'] as const;
+/** Names of every entry in PROJECTS that is not listed in BATCH_EXCLUDED_PROJECT_NAMES. */
+export const BATCH_PROJECT_NAMES = PROJECTS.filter(
+  // Derived from the exclusion list so the two constants cannot drift apart.
+  (project) => !BATCH_EXCLUDED_PROJECT_NAMES.some((excluded) => excluded === project.name)
+).map((project) => project.name);
+/** Agents accepted by the batch runner. */
+export const BATCH_AGENT_IDS = ['claude', 'codex'] as const;
+/** Agents used when the caller does not select any. */
+export const BATCH_DEFAULT_AGENT_IDS = ['claude', 'codex'] as const;
+/** Claude effort levels used when the caller does not select any. */
+export const BATCH_DEFAULT_CLAUDE_EFFORTS = ['high'] as const;
+/** Default effort for agents that run a single effort level. */
+export const BATCH_DEFAULT_EFFORTS = {
+  codex: 'high',
+} as const;
+/** Default models for the batch matrix — single place to change (codex follows AGENTS). */
+export const BATCH_MATRIX_MODELS = {
+  claude: 'opus-4.6',
+  codex: AGENTS.codex.defaultModel,
+} as const satisfies Record<'claude' | 'codex', string>;
+
+// Must stay below the defaults above: buildBatchVariants() reads them during module evaluation.
+export const BATCH_VARIANTS = buildBatchVariants();
+/** Default number of repetitions per (project × variant). */
+export const BATCH_REPETITIONS = 10;
+/** Default number of trials that run at the same time. */
+export const BATCH_CONCURRENCY = 8;
+
+/** One planned trial: a project, an agent variant, a prompt and a repetition number. */
+export interface BatchRunDescriptor {
+ project: (typeof BATCH_PROJECT_NAMES)[number];
+ agent: AgentVariant['agent'];
+ model: AgentVariant['model'];
+ effort: AgentVariant['effort'];
+ prompt: string;
+ /** Repetition number; descriptors built by buildBatchRunDescriptors count from 1. */
+ repetition: number;
+ /** Sanitized identifier of the run; also the base name of its log file. */
+ label: string;
+ /** Arguments passed to `node` when the run is spawned. */
+ args: string[];
+}
+
+/** Outcome of one executed descriptor, as stored in the batch summary. */
+export interface BatchRunSummaryEntry extends BatchRunDescriptor {
+ /** Process id of the child; omitted when the spawned child reported none. */
+ pid?: number;
+ /** ISO-8601 time taken just before the child was spawned. */
+ startTimestamp: string;
+ /** ISO-8601 time taken after the run's log stream was closed. */
+ endTimestamp: string;
+ durationMs: number;
+ /** Exit code from the child's `close` event; null when the run ended with an `error` event. */
+ exitCode: number | null;
+ signal: NodeJS.Signals | null;
+ /** 'success' only when there was no spawn error, the exit code is 0 and no signal was reported. */
+ status: 'success' | 'failed';
+ /** Path of the file that received the child's stdout and stderr. */
+ logPath: string;
+}
+
+/** Aggregate result of a batch; runBatch writes it to `summaryPath` as JSON and returns it. */
+export interface BatchSummary {
+ /** Identifier of the batch; also the name of its directory under `<evalRoot>/batches`. */
+ batchTimestamp: string;
+ batchDir: string;
+ /** Directory that holds one log file per run. */
+ logsDir: string;
+ summaryPath: string;
+ /** ISO-8601 start and end of the batch as reported by the injected clock. */
+ startedAt: string;
+ endedAt: string;
+ durationMs: number;
+ totalRuns: number;
+ concurrency: number;
+ succeeded: number;
+ failed: number;
+ /** One entry per descriptor, in descriptor order. */
+ runs: BatchRunSummaryEntry[];
+}
+
+/** Options accepted by runBatch and main. */
+export interface RunBatchOptions {
+ /** Explicit run list; when given, the matrix is not built and no confirmation is requested. */
+ descriptors?: BatchRunDescriptor[];
+ /** Maximum number of trials running at once. Defaults to BATCH_CONCURRENCY. */
+ concurrency?: number;
+ /** Working directory of the spawned trials. Defaults to REPO_ROOT. */
+ repoRoot?: string;
+ /** Directory under which `batches/<batchTimestamp>` is created. Defaults to EVAL_ROOT. */
+ evalRoot?: string;
+ /** Overrides the batch identifier that is otherwise derived from the current time. */
+ batchTimestamp?: string;
+ /** Required when `descriptors` are not provided — prompt variant name from the CLI registry. */
+ prompt?: string;
+ /** Skip interactive confirmation (large API / token usage). */
+ yes?: boolean;
+ /** Agents to include; an empty or missing list selects BATCH_DEFAULT_AGENT_IDS. */
+ agents?: (typeof BATCH_AGENT_IDS)[number][];
+ /** Claude effort levels to run; takes precedence over `claudeEffort` when non-empty. */
+ claudeEfforts?: (typeof CLAUDE_EFFORTS)[number][];
+ /** Single Claude effort level, used only when `claudeEfforts` is empty or missing. */
+ claudeEffort?: (typeof CLAUDE_EFFORTS)[number];
+ /** Codex effort level. Defaults to BATCH_DEFAULT_EFFORTS.codex. */
+ codexEffort?: (typeof CODEX_EFFORTS)[number];
+ /** Repetitions per (project × variant). Defaults to BATCH_REPETITIONS. */
+ repetitions?: number;
+ /** Progress sink. Defaults to console.log. */
+ log?: (message: string) => void;
+}
+
+/** The subset of a spawned child process that the batch runner uses. */
+export interface SpawnedBatchChild {
+ pid?: number;
+ /** Piped into the run's log file when present. */
+ stdout?: Readable | null;
+ /** Piped into the same log file as `stdout` when present. */
+ stderr?: Readable | null;
+ once(
+ event: 'close',
+ listener: (code: number | null, signal: NodeJS.Signals | null) => void
+ ): this;
+ once(event: 'error', listener: (error: Error) => void): this;
+}
+
+/** Injectable collaborators of runBatch, so callers can replace the clock and process launcher. */
+export interface BatchRunnerDeps {
+ /** Clock used for all timestamps. Defaults to `() => new Date()`. */
+ now?: () => Date;
+ /** Process launcher. Defaults to `spawn` from `node:child_process`. */
+ spawn?: (command: string, args: string[], options: SpawnOptions) => SpawnedBatchChild;
+}
+
+/**
+ * Runs a batch and converts its outcome into a process exit code.
+ *
+ * @returns 1 when at least one run failed, otherwise 0.
+ */
+export async function main(options: RunBatchOptions = {}, deps: BatchRunnerDeps = {}) {
+  const { failed } = await runBatch(options, deps);
+  if (failed > 0) {
+    return 1;
+  }
+  return 0;
+}
+
+/**
+ * Asks the operator to confirm a batch before any trial is launched.
+ *
+ * Returns immediately when `options.yes` is set. Otherwise stdin must be a TTY and the operator
+ * has to answer "yes" (case-insensitive, surrounding whitespace ignored).
+ *
+ * @param runCount Number of trials about to start; shown in the prompt.
+ * @param options `yes` skips the prompt entirely.
+ * @throws Error when stdin is not a TTY and `yes` is unset, or when the answer is not "yes".
+ */
+export async function confirmBatchStart(runCount: number, options: { yes?: boolean } = {}) {
+  if (options.yes) {
+    return;
+  }
+
+  if (!process.stdin.isTTY) {
+    throw new Error(
+      'This batch runs many trials and can consume substantial API quota. In non-interactive mode, pass --yes to confirm.'
+    );
+  }
+
+  const prompt = createInterface({ input: process.stdin, output: process.stdout });
+  let reply: string;
+  try {
+    reply = await prompt.question(
+      `This will launch ${runCount} eval trial(s) and may use significant API quota. Type "yes" to continue: `
+    );
+  } finally {
+    // Close the readline interface whether or not the question resolved.
+    prompt.close();
+  }
+
+  if (reply.trim().toLowerCase() !== 'yes') {
+    throw new Error('Batch aborted.');
+  }
+}
+
+/**
+ * Runs every batch descriptor as a `node` child process with bounded concurrency.
+ *
+ * Each run gets its own log file under `<evalRoot>/batches/<batchTimestamp>/logs`; once all runs
+ * have finished, a `summary.json` is written next to the logs directory and the same summary is
+ * returned. Progress and a final list of failures are reported through `options.log`.
+ *
+ * @param options Matrix selection, paths, concurrency and logging; see RunBatchOptions.
+ * @param deps Optional clock and process launcher overrides.
+ * @returns The summary that was written to disk.
+ * @throws Error when no descriptors are given and the prompt is missing or unknown, or when the
+ *   confirmation is declined.
+ */
+export async function runBatch(
+  options: RunBatchOptions = {},
+  deps: BatchRunnerDeps = {}
+): Promise<BatchSummary> {
+  const descriptors =
+    options.descriptors ??
+    buildBatchRunDescriptors({
+      prompt: requireBatchPrompt(options),
+      agents: options.agents,
+      claudeEfforts: options.claudeEfforts,
+      claudeEffort: options.claudeEffort,
+      codexEffort: options.codexEffort,
+      repetitions: options.repetitions,
+    });
+
+  // Only a matrix built here is confirmed; explicitly supplied descriptors run as given.
+  if (!options.yes && options.descriptors === undefined) {
+    await confirmBatchStart(descriptors.length, { yes: false });
+  }
+
+  const now = deps.now ?? (() => new Date());
+  const spawn = deps.spawn ?? defaultSpawn;
+  const log = options.log ?? console.log;
+  const concurrency = options.concurrency ?? BATCH_CONCURRENCY;
+  const repoRoot = resolve(options.repoRoot ?? REPO_ROOT);
+  const evalRoot = resolve(options.evalRoot ?? EVAL_ROOT);
+  // The clock is consulted for the identifier only when the caller did not provide one.
+  const batchTimestamp = options.batchTimestamp ?? formatBatchTimestamp(now());
+  const batchDir = join(evalRoot, 'batches', batchTimestamp);
+  const logsDir = join(batchDir, 'logs');
+  const summaryPath = join(batchDir, 'summary.json');
+
+  await mkdir(logsDir, { recursive: true });
+
+  const batchStart = now();
+  // Results are stored by descriptor index so the summary keeps descriptor order.
+  const results: BatchRunSummaryEntry[] = new Array<BatchRunSummaryEntry>(descriptors.length);
+  const limit = pLimit(concurrency);
+  let started = 0;
+  let finished = 0;
+
+  log(
+    `Starting eval batch ${batchTimestamp}: ${descriptors.length} runs, concurrency ${concurrency}, logs ${logsDir}`
+  );
+
+  await Promise.all(
+    descriptors.map((descriptor, index) =>
+      limit(async () => {
+        started += 1;
+        log(`[start ${started}/${descriptors.length}] ${descriptor.label}`);
+
+        const result = await runBatchDescriptor(descriptor, {
+          repoRoot,
+          logsDir,
+          now,
+          spawn,
+        });
+        results[index] = result;
+
+        finished += 1;
+        const reason = result.status === 'failed' ? await readFailureReason(result.logPath) : '';
+        log(
+          `[finish ${finished}/${descriptors.length}] ${descriptor.label} ${result.status} ${formatExitResult(result)} ${result.durationMs}ms${reason ? ` — ${reason}` : ''}`
+        );
+      })
+    )
+  );
+
+  const batchEnd = now();
+  const summary: BatchSummary = {
+    batchTimestamp,
+    batchDir,
+    logsDir,
+    summaryPath,
+    startedAt: batchStart.toISOString(),
+    endedAt: batchEnd.toISOString(),
+    durationMs: batchEnd.getTime() - batchStart.getTime(),
+    totalRuns: results.length,
+    concurrency,
+    succeeded: results.filter((result) => result.status === 'success').length,
+    failed: results.filter((result) => result.status === 'failed').length,
+    runs: results,
+  };
+
+  await writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`);
+
+  log(
+    `Finished eval batch ${batchTimestamp}: ${summary.totalRuns} total, ${summary.succeeded} succeeded, ${summary.failed} failed`
+  );
+
+  if (summary.failed > 0) {
+    log('Failures:');
+    for (const run of summary.runs) {
+      if (run.status === 'success') continue;
+      const reason = await readFailureReason(run.logPath);
+      log(` - ${run.label}${reason ? `\n ${reason}` : ''}\n ${run.logPath}`);
+    }
+  }
+
+  return summary;
+}
+
+/** Longest failure reason, in characters, that is reported before truncation. */
+const FAILURE_REASON_MAX_LEN = 200;
+
+/**
+ * Read the most informative trailing line from a failed trial log to surface inline.
+ *
+ * Lines are trimmed and blank ones dropped. The last line that mentions error, abort, timeout or
+ * failed (case-insensitive) and does not start with `at ` wins; without such a line the last
+ * non-blank line is used. Results longer than FAILURE_REASON_MAX_LEN are cut and end in `…`.
+ *
+ * @param logPath Path of the trial log.
+ * @returns The chosen line, or an empty string when the log is empty or cannot be read.
+ */
+async function readFailureReason(logPath: string): Promise<string> {
+  try {
+    const text = await readFile(logPath, 'utf8');
+    const lines = text
+      .split(/\r?\n/)
+      .map((line) => line.trim())
+      .filter(Boolean);
+    if (lines.length === 0) return '';
+    // Search from the end; lines starting with "at " are stack frames and are skipped.
+    const errorLine = [...lines]
+      .reverse()
+      .find((line) => /error|abort|timeout|failed/i.test(line) && !line.startsWith('at '));
+    const candidate = errorLine ?? lines[lines.length - 1];
+    return candidate.length > FAILURE_REASON_MAX_LEN
+      ? `${candidate.slice(0, FAILURE_REASON_MAX_LEN - 1)}…`
+      : candidate;
+  } catch {
+    // Best effort: a missing or unreadable log only means there is no reason to show.
+    return '';
+  }
+}
+
+/**
+ * Expands the selected agents into the list of agent variants to run.
+ *
+ * Claude contributes one variant per resolved effort level; codex contributes a single variant
+ * using `codexEffort` or its default. Models come from BATCH_MATRIX_MODELS.
+ *
+ * @param options Agent and effort selection; anything left out falls back to the batch defaults.
+ * @returns Variants in agent order, with Claude variants in effort order.
+ */
+export function buildBatchVariants(
+  options: {
+    agents?: RunBatchOptions['agents'];
+    claudeEfforts?: RunBatchOptions['claudeEfforts'];
+    claudeEffort?: RunBatchOptions['claudeEffort'];
+    codexEffort?: RunBatchOptions['codexEffort'];
+  } = {}
+): AgentVariant[] {
+  const agents = resolveBatchAgents(options.agents);
+  const claudeEfforts = resolveClaudeEfforts(options);
+
+  return agents.flatMap((agent): AgentVariant[] => {
+    if (agent !== 'claude') {
+      return [
+        {
+          agent: 'codex',
+          model: BATCH_MATRIX_MODELS.codex,
+          effort: options.codexEffort ?? BATCH_DEFAULT_EFFORTS.codex,
+        },
+      ];
+    }
+
+    return claudeEfforts.map(
+      (effort): AgentVariant => ({
+        agent: 'claude',
+        model: BATCH_MATRIX_MODELS.claude,
+        effort,
+      })
+    );
+  });
+}
+
+/**
+ * Builds the full run list: every repetition × agent variant × batch project.
+ *
+ * The precomputed BATCH_VARIANTS are used unless an agent or effort option is supplied, in which
+ * case the variants are rebuilt from `options`.
+ *
+ * @param options Prompt name plus optional agent, effort and repetition overrides.
+ * @returns Descriptors ordered by repetition, then variant, then project.
+ * @throws Error when a batch project is not present in PROJECTS.
+ */
+export function buildBatchRunDescriptors(options: {
+  prompt: string;
+  agents?: RunBatchOptions['agents'];
+  claudeEfforts?: RunBatchOptions['claudeEfforts'];
+  claudeEffort?: RunBatchOptions['claudeEffort'];
+  codexEffort?: RunBatchOptions['codexEffort'];
+  repetitions?: number;
+}): BatchRunDescriptor[] {
+  const knownProjects = new Set(PROJECTS.map((project) => project.name));
+  const missingProject = BATCH_PROJECT_NAMES.find((project) => !knownProjects.has(project));
+  if (missingProject !== undefined) {
+    throw new Error(`Configured batch project is missing from PROJECTS: ${missingProject}`);
+  }
+
+  const overrides = [
+    options.agents,
+    options.claudeEfforts,
+    options.claudeEffort,
+    options.codexEffort,
+  ];
+  const variants = overrides.every((override) => override == null)
+    ? BATCH_VARIANTS
+    : buildBatchVariants(options);
+  const totalRepetitions = options.repetitions ?? BATCH_REPETITIONS;
+
+  const descriptors: BatchRunDescriptor[] = [];
+  for (let repetition = 1; repetition <= totalRepetitions; repetition += 1) {
+    for (const variant of variants) {
+      for (const project of BATCH_PROJECT_NAMES) {
+        descriptors.push(createBatchRunDescriptor(project, variant, repetition, options.prompt));
+      }
+    }
+  }
+
+  return descriptors;
+}
+
+/**
+ * Resolves `options.prompt` to the spelling returned by `listPrompts()`.
+ *
+ * The lookup ignores case and surrounding whitespace.
+ *
+ * @returns The matching prompt name exactly as listed.
+ * @throws Error when the prompt is missing, blank, or not among the listed prompts.
+ */
+function requireBatchPrompt(options: RunBatchOptions): string {
+  const requested = options.prompt?.trim() ?? '';
+  if (requested === '') {
+    throw new Error(
+      'runBatch: pass `prompt` (prompt template basename) or provide `descriptors` explicitly.'
+    );
+  }
+
+  const available = listPrompts();
+  const wanted = requested.toLowerCase();
+  const canonical = available.find((name) => name.toLowerCase() === wanted);
+  if (canonical) {
+    return canonical;
+  }
+
+  throw new Error(`Unknown prompt "${requested}". Available prompts: ${available.join(', ')}`);
+}
+
+/**
+ * Executes one descriptor as `node <args>` and records its outcome.
+ *
+ * The child's stdout and stderr are piped into `<logsDir>/<label>.log`, which starts with the
+ * command line. A synchronous spawn failure or an `error` event from the child is appended to the
+ * log and marks the run as failed instead of rejecting.
+ *
+ * @param descriptor The run to execute.
+ * @param context Working directory, log directory, clock and process launcher.
+ * @returns The summary entry; `status` is 'success' only for exit code 0 without signal or error.
+ */
+async function runBatchDescriptor(
+  descriptor: BatchRunDescriptor,
+  context: {
+    repoRoot: string;
+    logsDir: string;
+    now: () => Date;
+    spawn: NonNullable<BatchRunnerDeps['spawn']>;
+  }
+): Promise<BatchRunSummaryEntry> {
+  const logPath = join(context.logsDir, `${descriptor.label}.log`);
+  const logStream = createWriteStream(logPath);
+  const start = context.now();
+  let pid: number | undefined;
+  let exitCode: number | null = null;
+  let signal: NodeJS.Signals | null = null;
+  let spawnError: Error | undefined;
+
+  logStream.write(`$ node ${descriptor.args.join(' ')}\n\n`);
+
+  try {
+    const stdio: ['ignore', 'pipe', 'pipe'] = ['ignore', 'pipe', 'pipe'];
+    const child = context.spawn('node', descriptor.args, {
+      cwd: context.repoRoot,
+      stdio,
+    });
+
+    pid = child.pid ?? undefined;
+    // `end: false` keeps the shared log open until both pipes and the runner are done with it.
+    child.stdout?.pipe(logStream, { end: false });
+    child.stderr?.pipe(logStream, { end: false });
+
+    const result = await waitForChild(child);
+    exitCode = result.exitCode;
+    signal = result.signal;
+    spawnError = result.error;
+  } catch (error) {
+    // A launcher that throws synchronously is treated like an `error` event.
+    spawnError = toError(error);
+  }
+
+  if (spawnError) {
+    logStream.write(`\n[batch runner] ${formatError(spawnError)}\n`);
+  }
+
+  await closeLogStream(logStream);
+
+  const end = context.now();
+
+  return {
+    ...descriptor,
+    ...(pid == null ? {} : { pid }),
+    startTimestamp: start.toISOString(),
+    endTimestamp: end.toISOString(),
+    durationMs: end.getTime() - start.getTime(),
+    exitCode,
+    signal,
+    status: spawnError || exitCode !== 0 || signal !== null ? 'failed' : 'success',
+    logPath,
+  };
+}
+
+/** Replaces every character other than ASCII letters, digits, `.`, `_` and `-` with `_`. */
+function sanitizeLabelSegment(value: string) {
+  const unsafeCharacters = /[^a-zA-Z0-9._-]/g;
+  return value.replace(unsafeCharacters, '_');
+}
+
+/**
+ * Creates the descriptor for one project × variant × repetition.
+ *
+ * The label joins the sanitized project, agent, model, effort, prompt and a zero-padded
+ * `rNN` repetition tag with `-`; `args` is the trial script path followed by its flags.
+ */
+function createBatchRunDescriptor(
+  project: BatchRunDescriptor['project'],
+  variant: AgentVariant,
+  repetition: number,
+  prompt: string
+): BatchRunDescriptor {
+  const { agent, model, effort } = variant;
+  const repetitionTag = `r${String(repetition).padStart(2, '0')}`;
+  const label = [project, agent, model, effort, prompt, repetitionTag]
+    .map((segment) => sanitizeLabelSegment(segment))
+    .join('-');
+  const args = [
+    NODE_EVAL_TRIAL_SCRIPT,
+    '-p',
+    project,
+    '-a',
+    agent,
+    '-m',
+    model,
+    '-e',
+    effort,
+    '--prompt',
+    prompt,
+  ];
+
+  return { project, agent, model, effort, prompt, repetition, label, args };
+}
+
+/** Launcher used when no `spawn` is injected: delegates to `spawn` from `node:child_process`. */
+function defaultSpawn(command: string, args: string[], options: SpawnOptions) {
+  const child = spawnChild(command, args, options);
+  return child;
+}
+
+/**
+ * Validation schema for the parsed CLI flags. `concurrency` and `repetitions` arrive as strings
+ * and are coerced to positive integers; `prompt` is the only required field.
+ */
+const runBatchArgsSchema = z.object({
+ concurrency: z.coerce.number().int().positive().optional(),
+ prompt: z.string().min(1),
+ yes: z.boolean().optional(),
+ // When given, the list options must contain at least one entry.
+ agents: z.array(z.enum(BATCH_AGENT_IDS)).nonempty().optional(),
+ claudeEfforts: z.array(z.enum(CLAUDE_EFFORTS)).nonempty().optional(),
+ claudeEffort: z.enum(CLAUDE_EFFORTS).optional(),
+ codexEffort: z.enum(CODEX_EFFORTS).optional(),
+ repetitions: z.coerce.number().int().positive().optional(),
+});
+
+/**
+ * Option table passed to `parseArgs`; the same object is handed to `formatHelp` for `--help`.
+ * Numeric options are declared as strings and converted by runBatchArgsSchema.
+ */
+const runBatchOptions = {
+  concurrency: {
+    type: 'string' as const,
+    // Interpolated so the help text follows the constant, like `repetitions` below.
+    description: `Max concurrent runs (default: ${BATCH_CONCURRENCY})`,
+  },
+  prompt: {
+    type: 'string' as const,
+    description:
+      'Prompt variant name (required; registered in code/lib/cli-storybook/src/ai/setup-prompts/)',
+  },
+  agents: {
+    type: 'string' as const,
+    description: 'Comma-separated agent list (claude,codex)',
+  },
+  'claude-efforts': {
+    type: 'string' as const,
+    description: 'Comma-separated Claude effort levels',
+  },
+  'claude-effort': { type: 'string' as const, description: 'Single Claude effort level' },
+  'codex-effort': { type: 'string' as const, description: 'Single Codex effort level' },
+  repetitions: {
+    type: 'string' as const,
+    description: `Repetitions per (project × variant) (default: ${BATCH_REPETITIONS})`,
+  },
+  yes: {
+    type: 'boolean' as const,
+    short: 'y',
+    description: 'Skip the confirmation prompt (non-interactive / CI)',
+  },
+  help: { type: 'boolean' as const, short: 'h', description: 'Show this help and exit' },
+};
+
+/**
+ * Parses and validates the CLI arguments of the batch runner.
+ *
+ * @param argv Arguments after the script path.
+ * @returns `{ help: true }` when help was requested, otherwise the validated options.
+ * @throws Error listing one `path: message` line per validation issue; unknown flags are
+ *   rejected by `parseArgs` because it runs in strict mode.
+ */
+export function parseRunBatchArgs(
+  argv: string[]
+):
+  | Pick<
+      RunBatchOptions,
+      | 'agents'
+      | 'claudeEfforts'
+      | 'claudeEffort'
+      | 'codexEffort'
+      | 'concurrency'
+      | 'prompt'
+      | 'yes'
+      | 'repetitions'
+    >
+  | { help: true } {
+  const { values: flags } = parseArgs({ args: argv, strict: true, options: runBatchOptions });
+
+  if (flags.help) {
+    return { help: true };
+  }
+
+  // Kebab-case flags are mapped to the camelCase option names before validation.
+  const candidate = {
+    concurrency: flags.concurrency,
+    prompt: flags.prompt,
+    yes: flags.yes,
+    agents: parseAgentArgs(flags.agents),
+    claudeEfforts: parseClaudeEfforts(flags['claude-efforts']),
+    claudeEffort: flags['claude-effort'],
+    codexEffort: flags['codex-effort'],
+    repetitions: flags.repetitions,
+  };
+
+  const validation = runBatchArgsSchema.safeParse(candidate);
+  if (validation.success) {
+    return validation.data;
+  }
+
+  const report = validation.error.issues
+    .map((issue) => `${issue.path.join('.')}: ${issue.message}`)
+    .join('\n');
+  throw new Error(report);
+}
+
+/**
+ * Splits a comma-separated agent list into trimmed, non-empty entries.
+ *
+ * @returns undefined when the flag was not given, otherwise the entries in input order.
+ */
+function parseAgentArgs(value?: string) {
+  if (value == null) {
+    return undefined;
+  }
+
+  const agents: string[] = [];
+  for (const entry of value.split(',')) {
+    const agent = entry.trim();
+    if (agent) {
+      agents.push(agent);
+    }
+  }
+  return agents;
+}
+
+/**
+ * Splits a comma-separated list of Claude effort levels into trimmed, non-empty entries.
+ *
+ * @returns undefined when the flag was not given, otherwise the entries in input order.
+ */
+function parseClaudeEfforts(value?: string) {
+  return value == null
+    ? undefined
+    : value.split(',').flatMap((entry) => {
+        const effort = entry.trim();
+        return effort === '' ? [] : [effort];
+      });
+}
+
+/**
+ * Resolves the agent selection: the defaults for a missing or empty list, otherwise the
+ * requested agents in BATCH_AGENT_IDS order.
+ */
+function resolveBatchAgents(agents?: RunBatchOptions['agents']) {
+  const requested = agents ?? [];
+  if (requested.length === 0) {
+    return [...BATCH_DEFAULT_AGENT_IDS];
+  }
+
+  return BATCH_AGENT_IDS.filter((agent) => requested.includes(agent));
+}
+
+/**
+ * Resolves the Claude effort levels to run: a non-empty `claudeEfforts` list (deduplicated)
+ * wins over `claudeEffort`, which wins over BATCH_DEFAULT_CLAUDE_EFFORTS.
+ */
+function resolveClaudeEfforts(options: {
+  claudeEfforts?: RunBatchOptions['claudeEfforts'];
+  claudeEffort?: RunBatchOptions['claudeEffort'];
+}) {
+  const { claudeEfforts, claudeEffort } = options;
+
+  if (claudeEfforts?.length) {
+    return [...new Set(claudeEfforts)];
+  }
+
+  return claudeEffort != null ? [claudeEffort] : [...BATCH_DEFAULT_CLAUDE_EFFORTS];
+}
+
+/** Formats a date as its ISO-8601 string with every `:` and `.` replaced by `-`. */
+function formatBatchTimestamp(date: Date) {
+  const iso = date.toISOString();
+  return iso.replace(/[:.]/g, '-');
+}
+
+/**
+ * Renders how a run ended for the progress log: `signal=<name>` when a signal was reported,
+ * otherwise `exit=<code>` (with `null` spelled out when there is no exit code).
+ */
+function formatExitResult(result: Pick<BatchRunSummaryEntry, 'exitCode' | 'signal'>) {
+  return result.signal ? `signal=${result.signal}` : `exit=${result.exitCode ?? 'null'}`;
+}
+
+/**
+ * Waits for the first `error` or `close` event of a child and reports it as a value.
+ *
+ * The returned promise never rejects: an `error` event resolves with `error` set and a null
+ * exit code and signal; a `close` event resolves with the reported exit code and signal.
+ */
+async function waitForChild(child: SpawnedBatchChild) {
+  type ChildOutcome = { exitCode: number | null; signal: NodeJS.Signals | null; error?: Error };
+
+  return new Promise<ChildOutcome>((resolveResult) => {
+    let pending = true;
+
+    // Only the first event counts; a later one is ignored.
+    const settle = (outcome: ChildOutcome) => {
+      if (!pending) {
+        return;
+      }
+      pending = false;
+      resolveResult(outcome);
+    };
+
+    child.once('error', (error) => {
+      settle({ exitCode: null, signal: null, error });
+    });
+    child.once('close', (exitCode, signal) => {
+      settle({ exitCode, signal });
+    });
+  });
+}
+
+/**
+ * Ends a log stream and waits until everything written to it has been flushed.
+ *
+ * @param logStream Stream created with `createWriteStream`.
+ */
+async function closeLogStream(logStream: ReturnType<typeof createWriteStream>) {
+  // Subscribe first so the wait does not depend on 'finish' being emitted asynchronously.
+  const finished = once(logStream, 'finish');
+  logStream.end();
+  await finished;
+}
+
+/** Returns `error` unchanged when it is an Error, otherwise wraps its string form in one. */
+function toError(error: unknown) {
+  if (error instanceof Error) {
+    return error;
+  }
+  return new Error(String(error));
+}
+
+/** Renders an error for the log: its stack when available, otherwise `name: message`. */
+function formatError(error: Error) {
+  const trace = error.stack;
+  if (trace != null) {
+    return trace;
+  }
+  return `${error.name}: ${error.message}`;
+}
+
+// CLI entry point: runs only when this module is executed directly.
+if (esMain(import.meta.url)) {
+  try {
+    const cliOptions = parseRunBatchArgs(process.argv.slice(2));
+    if ('help' in cliOptions) {
+      const usage = `node ${NODE_EVAL_RUN_BATCH_SCRIPT} [options]`;
+      const synopsis = 'Run a batch of eval trials across all benchmark projects.';
+      console.log(formatHelp(usage, synopsis, runBatchOptions));
+      process.exit(0);
+    }
+    // The exit code is set rather than forced so the process can finish on its own.
+    process.exitCode = await main(cliOptions);
+  } catch (failure) {
+    console.error(failure);
+    process.exitCode = 1;
+  }
+}
diff --git a/scripts/eval/sync-baselines.test.ts b/scripts/eval/sync-baselines.test.ts
new file mode 100644
index 000000000000..c61296b62cf1
--- /dev/null
+++ b/scripts/eval/sync-baselines.test.ts
@@ -0,0 +1,585 @@
+import { execFileSync } from 'node:child_process';
+import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { mkdir } from 'node:fs/promises';
+import { dirname, join } from 'node:path';
+import { tmpdir } from 'node:os';
+import { afterEach, describe, expect, it } from 'vitest';
+import { BASELINE_STORYBOOK_FILES } from './lib/baseline-template-files.ts';
+import type { Project } from './lib/projects.ts';
+import { syncBaselines } from './sync-baselines.ts';
+
+// Temp directory of the running test; the empty string means nothing needs cleaning up.
+let TMP = '';
+
+afterEach(() => {
+  if (!TMP) {
+    return;
+  }
+
+  rmSync(TMP, { recursive: true, force: true });
+  TMP = '';
+});
+
+describe('syncBaselines', () => {
+ // Three projects (two at the repo root, one nested under `frontend`) are synced and pushed;
+ // the assertions then inspect the edgy and wikitok working copies and their remotes.
+ it('syncs authoritative mealdrop files into root and nested repos, removes old eval-results, and pushes main', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-baselines-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ {
+ name: 'edgy',
+ repo: join(remotesRoot, 'edgy.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/edgy',
+ },
+ {
+ name: 'wikitok',
+ repo: join(remotesRoot, 'wikitok.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/wikitok',
+ projectDir: 'frontend',
+ },
+ ];
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'mealdrop'),
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ storybookDir: '.storybook',
+ mainFile: 'main.ts',
+ mainContents: [
+ "import type { StorybookConfig } from '@storybook/react-vite';",
+ '',
+ 'const config: StorybookConfig = {',
+ " stories: ['../src/**/*.stories.tsx', './eval-support/*.mdx'],",
+ '};',
+ '',
+ 'export default config;',
+ ].join('\n'),
+ evalSupportFiles: {
+ 'summary.mdx': "import data from '../../eval-results/data.json';\n\n# Source Summary\n",
+ 'transcript.mdx':
+ "import data from '../../eval-results/data.json';\nimport { Transcript } from './transcript';\n\n# Transcript\n\n\n",
+ 'transcript.tsx': 'export const Transcript = () => null;\n',
+ 'transcript.types.ts':
+ 'export interface TranscriptProps { messages: unknown[]; prompt: string; promptTokenCount: number; promptCost: number; }\n',
+ },
+ previewContents: "export default { parameters: { a11y: { test: 'todo' } } };\n",
+ rootEvalResultsFiles: {
+ 'summary.json': '{ "empty": true }\n',
+ 'transcript.json': '[]\n',
+ },
+ });
+
+ // edgy starts with a JavaScript main file and a stale eval-support helper.
+ setupRepo({
+ repoRoot: join(reposRoot, 'edgy'),
+ remoteRoot: join(remotesRoot, 'edgy.git'),
+ storybookDir: '.storybook',
+ mainFile: 'main.js',
+ mainContents: [
+ '/** @type { import("@storybook/react-vite").StorybookConfig } */',
+ 'const config = {',
+ " stories: ['../src/**/*.stories.tsx'],",
+ '};',
+ '',
+ 'export default config;',
+ ].join('\n'),
+ evalSupportFiles: {
+ 'old-helper.ts': 'export const stale = true;\n',
+ },
+ previewContents: 'export default { parameters: { old: true } };\n',
+ rootEvalResultsFiles: {
+ 'data.json': '{}\n',
+ 'summary.json': '{ "empty": true }\n',
+ },
+ });
+
+ // wikitok keeps its Storybook config in the nested `frontend` project directory.
+ setupRepo({
+ repoRoot: join(reposRoot, 'wikitok'),
+ remoteRoot: join(remotesRoot, 'wikitok.git'),
+ storybookDir: 'frontend/.storybook',
+ mainFile: 'main.ts',
+ mainContents: [
+ "import type { StorybookConfig } from '@storybook/react-vite';",
+ '',
+ 'const config: StorybookConfig = {',
+ ' stories: [',
+ " '../src/**/*.stories.tsx',",
+ ' ],',
+ '};',
+ '',
+ 'export default config;',
+ ].join('\n'),
+ evalSupportFiles: {
+ 'old.txt': 'stale\n',
+ },
+ previewContents: 'export default { parameters: { old: true } };\n',
+ rootEvalResultsFiles: {
+ 'transcript.json': '[]\n',
+ },
+ });
+
+ await syncBaselines({
+ reposRoot,
+ projects,
+ push: true,
+ log: () => {},
+ });
+
+ // The synced eval-support docs import their data from the sibling eval-results directory.
+ expect(
+ readFileSync(join(reposRoot, 'edgy', '.storybook', 'eval-support', 'summary.mdx'), 'utf-8')
+ ).toContain('../eval-results/data.json');
+ expect(
+ readFileSync(join(reposRoot, 'edgy', '.storybook', 'eval-support', 'summary.mdx'), 'utf-8')
+ ).toContain('# Eval Summary');
+ expect(
+ readFileSync(join(reposRoot, 'edgy', '.storybook', 'eval-support', 'summary.mdx'), 'utf-8')
+ ).toContain("{data.project?.name ?? '-'}");
+ expect(
+ readFileSync(
+ join(reposRoot, 'wikitok', 'frontend', '.storybook', 'eval-support', 'transcript.mdx'),
+ 'utf-8'
+ )
+ ).toContain('../eval-results/data.json');
+ // NOTE(review): the expected substring below is empty, so this assertion always passes —
+ // confirm which markup it was meant to look for.
+ expect(
+ readFileSync(
+ join(reposRoot, 'wikitok', 'frontend', '.storybook', 'eval-support', 'transcript.mdx'),
+ 'utf-8'
+ )
+ ).toContain(
+ ""
+ );
+
+ // An empty data.json placeholder sits inside each project's .storybook directory.
+ expect(
+ readFileSync(join(reposRoot, 'edgy', '.storybook', 'eval-results', 'data.json'), 'utf-8')
+ ).toBe('{}\n');
+ expect(
+ readFileSync(
+ join(reposRoot, 'wikitok', 'frontend', '.storybook', 'eval-results', 'data.json'),
+ 'utf-8'
+ )
+ ).toBe('{}\n');
+
+ // Legacy locations and stale files are gone after the sync.
+ expect(existsSync(join(reposRoot, 'edgy', 'eval-results'))).toBe(false);
+ expect(existsSync(join(reposRoot, 'wikitok', 'frontend', 'eval-results'))).toBe(false);
+ expect(existsSync(join(reposRoot, 'edgy', '.storybook', 'main.js'))).toBe(false);
+ expect(existsSync(join(reposRoot, 'edgy', '.storybook', 'eval-support', 'old-helper.ts'))).toBe(
+ false
+ );
+ expect(
+ existsSync(join(reposRoot, 'wikitok', 'frontend', '.storybook', 'eval-support', 'old.txt'))
+ ).toBe(false);
+
+ expect(readFileSync(join(reposRoot, 'edgy', '.storybook', 'main.ts'), 'utf-8')).toContain(
+ './eval-support/*.mdx'
+ );
+ expect(readFileSync(join(reposRoot, 'edgy', '.storybook', 'preview.tsx'), 'utf-8')).toBe(
+ baselineTemplate('.storybook/preview.tsx')
+ );
+ expect(
+ readFileSync(join(reposRoot, 'wikitok', 'frontend', '.storybook', 'main.ts'), 'utf-8')
+ ).toContain('./eval-support/*.mdx');
+ expect(
+ readFileSync(join(reposRoot, 'edgy', '.storybook', 'eval-support', 'transcript.tsx'), 'utf-8')
+ ).toBe(resultDocTemplate('transcript.tsx'));
+ expect(
+ readFileSync(
+ join(reposRoot, 'wikitok', 'frontend', '.storybook', 'eval-support', 'transcript.types.ts'),
+ 'utf-8'
+ )
+ ).toBe(resultDocTemplate('transcript.types.ts'));
+
+ // Local HEAD equal to the remote head shows that the sync result was pushed.
+ expect(getHead(join(reposRoot, 'edgy'))).toBe(getRemoteHead(join(remotesRoot, 'edgy.git')));
+ expect(getHead(join(reposRoot, 'wikitok'))).toBe(
+ getRemoteHead(join(remotesRoot, 'wikitok.git'))
+ );
+ }, 30_000);
+
+ // A target working copy with an extra file must make the sync reject before it writes anything.
+ it('fails fast when a non-source target repo is dirty', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-baselines-dirty-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ {
+ name: 'edgy',
+ repo: join(remotesRoot, 'edgy.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/edgy',
+ },
+ ];
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'mealdrop'),
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ storybookDir: '.storybook',
+ mainFile: 'main.ts',
+ mainContents: "export default { stories: ['./eval-support/*.mdx'] };\n",
+ evalSupportFiles: {
+ 'summary.mdx': "import data from '../../eval-results/data.json';\n",
+ 'transcript.mdx': "import data from '../../eval-results/data.json';\n",
+ 'transcript.tsx': 'export const Transcript = () => null;\n',
+ 'transcript.types.ts': 'export interface TranscriptProps {}\n',
+ },
+ previewContents: 'export default {};\n',
+ rootEvalResultsFiles: {
+ 'summary.json': '{ "empty": true }\n',
+ },
+ });
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'edgy'),
+ remoteRoot: join(remotesRoot, 'edgy.git'),
+ storybookDir: '.storybook',
+ mainFile: 'main.js',
+ mainContents: 'export default { stories: [] };\n',
+ evalSupportFiles: {},
+ previewContents: 'export default {};\n',
+ rootEvalResultsFiles: {},
+ });
+
+ // Written after setupRepo and never committed — presumably this is what makes edgy dirty.
+ writeFileSync(join(reposRoot, 'edgy', 'README.md'), 'dirty\n');
+
+ await expect(
+ syncBaselines({
+ reposRoot,
+ projects,
+ push: true,
+ log: () => {},
+ })
+ ).rejects.toThrow('edgy has local changes');
+
+ // Nothing was created in the dirty repo.
+ expect(existsSync(join(reposRoot, 'edgy', '.storybook', 'eval-results', 'data.json'))).toBe(
+ false
+ );
+ expect(existsSync(join(reposRoot, 'edgy', 'eval-results'))).toBe(false);
+ });
+
+ // The nested target is set up with no eval-support and no root eval-results files at all.
+ it('syncs nested project baselines even when there is no legacy eval-results directory', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-baselines-nested-no-legacy-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ {
+ name: 'excalidraw',
+ repo: join(remotesRoot, 'excalidraw.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/excalidraw',
+ projectDir: 'excalidraw-app',
+ },
+ ];
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'mealdrop'),
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ storybookDir: '.storybook',
+ mainFile: 'main.ts',
+ mainContents: "export default { stories: ['./eval-support/*.mdx'] };\n",
+ evalSupportFiles: {
+ 'summary.mdx': '# Source Summary\n',
+ 'transcript.mdx': '# Transcript\n',
+ 'transcript.tsx': 'export const Transcript = () => null;\n',
+ 'transcript.types.ts': 'export interface TranscriptProps {}\n',
+ },
+ previewContents: 'export default {};\n',
+ rootEvalResultsFiles: {
+ 'summary.json': '{ "empty": true }\n',
+ },
+ });
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'excalidraw'),
+ remoteRoot: join(remotesRoot, 'excalidraw.git'),
+ storybookDir: 'excalidraw-app/.storybook',
+ mainFile: 'main.ts',
+ mainContents: 'export default { stories: ["../stories/**/*.stories.tsx"] };\n',
+ evalSupportFiles: {},
+ previewContents: 'export default { parameters: { old: true } };\n',
+ rootEvalResultsFiles: {},
+ });
+
+ await syncBaselines({
+ reposRoot,
+ projects,
+ push: true,
+ log: () => {},
+ });
+
+ // The baseline lands inside the nested project directory and is pushed.
+ expect(
+ readFileSync(
+ join(reposRoot, 'excalidraw', 'excalidraw-app', '.storybook', 'main.ts'),
+ 'utf-8'
+ )
+ ).toContain('./eval-support/*.mdx');
+ expect(
+ existsSync(join(reposRoot, 'excalidraw', 'excalidraw-app', '.storybook', 'eval-support'))
+ ).toBe(true);
+ expect(
+ readFileSync(
+ join(reposRoot, 'excalidraw', 'excalidraw-app', '.storybook', 'eval-results', 'data.json'),
+ 'utf-8'
+ )
+ ).toBe('{}\n');
+ expect(getHead(join(reposRoot, 'excalidraw'))).toBe(
+ getRemoteHead(join(remotesRoot, 'excalidraw.git'))
+ );
+ });
+
+ // Only the remotes exist up front; the local working copies must be created by the sync itself.
+ it('auto-clones repos that have not been cloned yet', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-baselines-auto-clone-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ {
+ name: 'wikitok',
+ repo: join(remotesRoot, 'wikitok.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/wikitok',
+ projectDir: 'frontend',
+ },
+ ];
+
+ // Set up bare remotes with some initial content, but do NOT clone them locally.
+ setupBareRemoteWithContent({
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ files: { 'src/index.ts': 'export {};\n' },
+ });
+ setupBareRemoteWithContent({
+ remoteRoot: join(remotesRoot, 'wikitok.git'),
+ files: { 'frontend/src/index.ts': 'export {};\n' },
+ });
+
+ // Precondition: no local working copies yet.
+ expect(existsSync(join(reposRoot, 'mealdrop'))).toBe(false);
+ expect(existsSync(join(reposRoot, 'wikitok'))).toBe(false);
+
+ await syncBaselines({
+ reposRoot,
+ projects,
+ push: true,
+ log: () => {},
+ });
+
+ expect(existsSync(join(reposRoot, 'mealdrop', '.git'))).toBe(true);
+ expect(existsSync(join(reposRoot, 'wikitok', '.git'))).toBe(true);
+
+ expect(readFileSync(join(reposRoot, 'mealdrop', '.storybook', 'main.ts'), 'utf-8')).toContain(
+ './eval-support/*.mdx'
+ );
+ expect(
+ readFileSync(join(reposRoot, 'mealdrop', '.storybook', 'eval-results', 'data.json'), 'utf-8')
+ ).toBe('{}\n');
+
+ expect(
+ readFileSync(join(reposRoot, 'wikitok', 'frontend', '.storybook', 'main.ts'), 'utf-8')
+ ).toContain('./eval-support/*.mdx');
+ expect(
+ readFileSync(
+ join(reposRoot, 'wikitok', 'frontend', '.storybook', 'eval-results', 'data.json'),
+ 'utf-8'
+ )
+ ).toBe('{}\n');
+
+ // Both fresh clones end up level with their remotes.
+ expect(getHead(join(reposRoot, 'mealdrop'))).toBe(
+ getRemoteHead(join(remotesRoot, 'mealdrop.git'))
+ );
+ expect(getHead(join(reposRoot, 'wikitok'))).toBe(
+ getRemoteHead(join(remotesRoot, 'wikitok.git'))
+ );
+ });
+
+ // The edgy remote receives a commit from a second clone, so the local edgy copy is behind
+ // when the sync starts.
+ it('fast-forwards a clean target repo before copying the baseline', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-baselines-target-behind-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ {
+ name: 'edgy',
+ repo: join(remotesRoot, 'edgy.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/edgy',
+ },
+ ];
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'mealdrop'),
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ storybookDir: '.storybook',
+ mainFile: 'main.ts',
+ mainContents: 'export default { stories: [] };\n',
+ evalSupportFiles: {
+ 'summary.mdx': '# Old Summary\n',
+ 'transcript.mdx': '# Transcript\n',
+ 'transcript.tsx': 'export const Transcript = () => null;\n',
+ 'transcript.types.ts': 'export interface TranscriptProps {}\n',
+ },
+ previewContents: 'export default {};\n',
+ rootEvalResultsFiles: {
+ 'summary.json': '{ "empty": true }\n',
+ },
+ });
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'edgy'),
+ remoteRoot: join(remotesRoot, 'edgy.git'),
+ storybookDir: '.storybook',
+ mainFile: 'main.ts',
+ mainContents: 'export default { stories: [] };\n',
+ evalSupportFiles: {},
+ previewContents: 'export default {};\n',
+ rootEvalResultsFiles: {},
+ });
+
+ // Commit and push a README change to the edgy remote from a separate clone.
+ const targetRemoteWorktree = join(TMP, 'edgy-remote-worktree');
+ execFileSync('git', ['clone', join(remotesRoot, 'edgy.git'), targetRemoteWorktree]);
+ execFileSync('git', ['-C', targetRemoteWorktree, 'config', 'user.name', 'Test User']);
+ execFileSync('git', ['-C', targetRemoteWorktree, 'config', 'user.email', 'test@example.com']);
+ writeFileSync(join(targetRemoteWorktree, 'README.md'), 'updated upstream\n');
+ execFileSync('git', ['-C', targetRemoteWorktree, 'add', '-A']);
+ execFileSync('git', ['-C', targetRemoteWorktree, 'commit', '-m', 'update target upstream']);
+ execFileSync('git', ['-C', targetRemoteWorktree, 'push', 'origin', 'main']);
+
+ await syncBaselines({
+ reposRoot,
+ projects,
+ push: true,
+ log: () => {},
+ });
+
+ expect(
+ readFileSync(
+ join(reposRoot, 'mealdrop', '.storybook', 'eval-support', 'summary.mdx'),
+ 'utf-8'
+ )
+ ).toBe(baselineTemplate('.storybook/eval-support/summary.mdx'));
+ expect(
+ readFileSync(join(reposRoot, 'edgy', '.storybook', 'eval-support', 'summary.mdx'), 'utf-8')
+ ).toBe(baselineTemplate('.storybook/eval-support/summary.mdx'));
+ // The upstream README arrived locally; CRLF is normalised before comparing.
+ expect(readFileSync(join(reposRoot, 'edgy', 'README.md'), 'utf-8').replace(/\r\n/g, '\n')).toBe(
+ 'updated upstream\n'
+ );
+ expect(getHead(join(reposRoot, 'edgy'))).toBe(getRemoteHead(join(remotesRoot, 'edgy.git')));
+ });
+});
+
+/**
+ * Test fixture: creates a bare remote at `opts.remoteRoot` and a working repo at `opts.repoRoot`
+ * (both on `main`), seeds the working repo with a Storybook directory, commits, and pushes to
+ * the remote as `origin`.
+ *
+ * - `storybookDir` is relative to the repo root; its parent directory is used as the project root.
+ * - `mainFile` / `mainContents` and `previewContents` (written to `preview.tsx`) go into the
+ *   Storybook directory.
+ * - `evalSupportFiles` (file name -> contents) are written into `<storybookDir>/eval-support`.
+ * - `rootEvalResultsFiles` (file name -> contents) are written into `<project root>/eval-results`;
+ *   that directory is only created when at least one file is given.
+ */
+function setupRepo(opts: {
+  repoRoot: string;
+  remoteRoot: string;
+  storybookDir: string;
+  mainFile: string;
+  mainContents: string;
+  evalSupportFiles: Record<string, string>;
+  previewContents: string;
+  rootEvalResultsFiles: Record<string, string>;
+}) {
+  execFileSync('git', ['init', '--bare', '--initial-branch=main', opts.remoteRoot]);
+  execFileSync('git', ['init', '--initial-branch=main', opts.repoRoot]);
+  // A committer identity is configured per repo so `git commit` works without global config.
+  execFileSync('git', ['-C', opts.repoRoot, 'config', 'user.name', 'Test User']);
+  execFileSync('git', ['-C', opts.repoRoot, 'config', 'user.email', 'test@example.com']);
+
+  const storybookRoot = join(opts.repoRoot, opts.storybookDir);
+  const projectRoot = join(opts.repoRoot, dirname(opts.storybookDir));
+  mkdirSyncRecursive(join(storybookRoot, 'eval-support'));
+  writeFileSync(join(storybookRoot, opts.mainFile), opts.mainContents);
+  writeFileSync(join(storybookRoot, 'preview.tsx'), opts.previewContents);
+  for (const [name, contents] of Object.entries(opts.evalSupportFiles)) {
+    writeFileSync(join(storybookRoot, 'eval-support', name), contents);
+  }
+
+  if (Object.keys(opts.rootEvalResultsFiles).length > 0) {
+    mkdirSyncRecursive(join(projectRoot, 'eval-results'));
+    for (const [name, contents] of Object.entries(opts.rootEvalResultsFiles)) {
+      writeFileSync(join(projectRoot, 'eval-results', name), contents);
+    }
+  }
+
+  execFileSync('git', ['-C', opts.repoRoot, 'add', '-A']);
+  execFileSync('git', ['-C', opts.repoRoot, 'commit', '-m', 'initial']);
+  execFileSync('git', ['-C', opts.repoRoot, 'remote', 'add', 'origin', opts.remoteRoot]);
+  execFileSync('git', ['-C', opts.repoRoot, 'push', '-u', 'origin', 'main']);
+}
+
+/** Creates `path` together with any missing parent directories. */
+function mkdirSyncRecursive(path: string) {
+  const options = { recursive: true };
+  mkdirSync(path, options);
+}
+
+/** Returns the commit SHA that `HEAD` points at in the working repo at `repoRoot`. */
+function getHead(repoRoot: string) {
+  const args = ['-C', repoRoot, 'rev-parse', 'HEAD'];
+  const stdout = execFileSync('git', args, { encoding: 'utf-8' });
+  return stdout.trim();
+}
+
+/** Returns the commit SHA of `refs/heads/main` in the bare repository at `remoteRoot`. */
+function getRemoteHead(remoteRoot: string) {
+  const args = ['--git-dir', remoteRoot, 'rev-parse', 'refs/heads/main'];
+  const stdout = execFileSync('git', args, { encoding: 'utf-8' });
+  return stdout.trim();
+}
+
+/**
+ * Looks up the expected contents of a baseline file in `BASELINE_STORYBOOK_FILES`.
+ *
+ * @param path Template path; a leading `.storybook/` is stripped before the lookup, so both
+ *   `.storybook/eval-support/summary.mdx` and `eval-support/summary.mdx` resolve to the same key.
+ * @returns The template contents.
+ * @throws Error when no template is registered under the normalized path.
+ */
+function baselineTemplate(path: string) {
+  const normalizedPath = path.replace(/^\.storybook\//, '');
+  const template = (BASELINE_STORYBOOK_FILES as Record<string, string>)[normalizedPath];
+  if (template == null) {
+    throw new Error(`Missing baseline template for ${path}`);
+  }
+  return template;
+}
+
+/** Returns the baseline contents of one of the transcript files under `eval-support/`. */
+function resultDocTemplate(file: 'transcript.tsx' | 'transcript.types.ts') {
+  const templatePath = `eval-support/${file}`;
+  return baselineTemplate(templatePath);
+}
+
+/**
+ * Create a bare remote repo with initial file content (no local clone).
+ *
+ * The content is committed from a temporary staging clone and pushed to `main`. The staging
+ * clone is removed afterwards, including when one of the git steps fails.
+ *
+ * @param opts.remoteRoot Path at which the bare repository is created.
+ * @param opts.files Map of repo-relative file path -> file contents for the initial commit.
+ */
+function setupBareRemoteWithContent(opts: { remoteRoot: string; files: Record<string, string> }) {
+  const staging = mkdtempSync(join(tmpdir(), 'eval-sync-staging-'));
+  try {
+    execFileSync('git', ['init', '--bare', '--initial-branch=main', opts.remoteRoot]);
+    execFileSync('git', ['clone', opts.remoteRoot, staging]);
+    execFileSync('git', ['-C', staging, 'config', 'user.name', 'Test User']);
+    execFileSync('git', ['-C', staging, 'config', 'user.email', 'test@example.com']);
+    for (const [path, contents] of Object.entries(opts.files)) {
+      mkdirSyncRecursive(join(staging, dirname(path)));
+      writeFileSync(join(staging, path), contents);
+    }
+    execFileSync('git', ['-C', staging, 'add', '-A']);
+    execFileSync('git', ['-C', staging, 'commit', '-m', 'initial']);
+    execFileSync('git', ['-C', staging, 'push', 'origin', 'main']);
+  } finally {
+    // The staging dir lives outside the per-test TMP dir, so it has to be cleaned up here.
+    rmSync(staging, { recursive: true, force: true });
+  }
+}
diff --git a/scripts/eval/sync-baselines.ts b/scripts/eval/sync-baselines.ts
new file mode 100644
index 000000000000..ad7d593c08ed
--- /dev/null
+++ b/scripts/eval/sync-baselines.ts
@@ -0,0 +1,310 @@
+import { existsSync } from 'node:fs';
+import { mkdir, rm, writeFile } from 'node:fs/promises';
+import { dirname, join, relative, resolve } from 'node:path';
+import { parseArgs } from 'node:util';
+import pc from 'picocolors';
+import { x } from 'tinyexec';
+import { esMain } from '../utils/esmain.ts';
+import { BASELINE_STORYBOOK_FILES } from './lib/baseline-template-files.ts';
+import { ensureSourceClone } from './lib/prepare-trial.ts';
+import { PROJECTS, type Project } from './lib/projects.ts';
+import {
+ createLogger,
+ formatHelp,
+ formatTable,
+ getEvalResultsDir,
+ getEvalSupportDir,
+ getProjectPath,
+ getStorybookDir,
+ NODE_EVAL_SYNC_BASELINES_SCRIPT,
+ REPOS_DIR,
+} from './lib/utils.ts';
+
+/** Commit message used for baseline sync commits when the caller does not pass `commitMessage`. */
+const COMMIT_MESSAGE = 'Eval: sync .storybook baseline';
+
+/** Filesystem locations for one benchmark project, as produced by `resolveProjectPaths`. */
+export interface ProjectPaths {
+ /** Root of the project's git clone; used as the working directory for git commands. */
+ repoRoot: string;
+ /** Result of `getProjectPath(repoRoot, project.projectDir)`. */
+ projectPath: string;
+ /** Result of `getStorybookDir(projectPath)`; this directory is replaced by the baseline. */
+ storybookDir: string;
+ /** Result of `getEvalSupportDir(projectPath)`. */
+ evalSupportDir: string;
+ /** Result of `getEvalResultsDir(projectPath)`; an empty `data.json` is written here. */
+ evalResultsDir: string;
+}
+
+/** Options accepted by `syncBaselines`; every field is optional. */
+export interface SyncBaselinesOptions {
+ /** Directory containing one clone per project (`<reposRoot>/<project.name>`). Defaults to `REPOS_DIR`. */
+ reposRoot?: string;
+ /** Projects to sync. Defaults to `PROJECTS`. */
+ projects?: Project[];
+ /** Whether to push the sync commit to `origin`. Defaults to `true`. */
+ push?: boolean;
+ /** Commit message for the sync commit. Defaults to `COMMIT_MESSAGE`. */
+ commitMessage?: string;
+ /** Receives progress lines and the final summary table. Defaults to a no-op. */
+ log?: (message: string) => void;
+}
+
+/** Per-project outcome returned by `syncBaselines`. */
+export interface SyncResult {
+ /** The project's `name`. */
+ project: string;
+ /** `true` when the sync staged changes and created a commit. */
+ changed: boolean;
+ /** HEAD after the sync commit; only set when `changed` is `true`. */
+ commitSha?: string;
+}
+
+/**
+ * CLI option definitions. Passed to `parseArgs` for parsing and to `formatHelp` for the help
+ * text, so each entry carries a `description` in addition to the `parseArgs` fields.
+ */
+const syncBaselinesOptions = {
+ project: {
+ type: 'string' as const,
+ multiple: true,
+ description: 'Project(s) to sync (repeatable)',
+ },
+ 'skip-push': {
+ type: 'boolean' as const,
+ description: 'Commit locally but do not push',
+ },
+ help: { type: 'boolean' as const, short: 'h', description: 'Show this help and exit' },
+};
+
+// CLI entry point, guarded by `esMain(import.meta.url)`.
+// NOTE(review): presumably esMain is true only when this file is run directly — confirm.
+if (esMain(import.meta.url)) {
+  const { values } = parseArgs({
+    args: process.argv.slice(2),
+    options: syncBaselinesOptions,
+    strict: true,
+  });
+
+  if (values.help) {
+    console.log(
+      formatHelp(
+        `node ${NODE_EVAL_SYNC_BASELINES_SCRIPT} [options]`,
+        'Push the canonical .storybook baseline to each benchmark repo.',
+        syncBaselinesOptions
+      )
+    );
+    process.exit(0);
+  }
+
+  const requestedNames = values.project ?? [];
+
+  // Reject names that match no known project. Previously they were silently filtered out, so a
+  // typo resulted in nothing being synced while the command still exited successfully.
+  const knownNames = PROJECTS.map((project) => project.name);
+  const unknownNames = requestedNames.filter((name) => !knownNames.includes(name));
+  if (unknownNames.length > 0) {
+    console.error(
+      `Unknown project(s): ${unknownNames.join(', ')}. Available: ${knownNames.join(', ')}`
+    );
+    process.exit(1);
+  }
+
+  // No --project flag means "all projects".
+  const selectedProjects =
+    requestedNames.length > 0
+      ? PROJECTS.filter((project) => requestedNames.includes(project.name))
+      : PROJECTS;
+
+  await syncBaselines({
+    projects: selectedProjects,
+    push: !values['skip-push'],
+    log: (message) => console.log(message),
+  });
+}
+
+/**
+ * Writes the canonical Storybook baseline into every selected project clone.
+ *
+ * All repos are validated up front (cloned if needed, on the expected branch, fetched, clean)
+ * before any of them is modified. Projects are then synced one after another, and a summary
+ * table is sent to `log`.
+ *
+ * @param options See `SyncBaselinesOptions` for fields and defaults.
+ * @returns One `SyncResult` per project, in input order.
+ */
+export async function syncBaselines(options: SyncBaselinesOptions = {}) {
+  const emit = options.log ?? (() => {});
+  const reposRoot = resolve(options.reposRoot ?? REPOS_DIR);
+  const targets = options.projects ?? PROJECTS;
+  const shouldPush = options.push ?? true;
+  const message = options.commitMessage ?? COMMIT_MESSAGE;
+
+  const resolved = await Promise.all(
+    targets.map(async (project) => ({
+      project,
+      paths: await resolveProjectPaths(project, join(reposRoot, project.name)),
+    }))
+  );
+
+  // Fail before touching anything if any repo is on the wrong branch or has local changes.
+  await ensureReposAreClean(resolved);
+  const baselineFiles = await readBaselineStorybookDir();
+
+  const results: SyncResult[] = [];
+  for (const { project, paths } of resolved) {
+    emit(pc.bold(`\nSyncing ${project.name}`));
+    const outcome = await syncProjectRepo({
+      project,
+      paths,
+      baselineFiles,
+      push: shouldPush,
+      commitMessage: message,
+      log: emit,
+    });
+    const entry: SyncResult = {
+      project: project.name,
+      changed: outcome.changed,
+      commitSha: outcome.commitSha,
+    };
+    results.push(entry);
+  }
+
+  const rows = results.map(({ project, changed, commitSha }) => [
+    project,
+    changed ? 'yes' : 'no',
+    commitSha ? commitSha.slice(0, 8) : '-',
+  ]);
+  emit(`\n${formatTable(['Project', 'Changed', 'Commit'], rows)}`);
+
+  return results;
+}
+
+/**
+ * Derives the directories the sync works with for one project.
+ *
+ * @param project Project definition; only `projectDir` is read here.
+ * @param repoRoot Root of the project's git clone.
+ * @returns The repo root plus the project, Storybook, eval-support and eval-results directories
+ *   computed by the shared path helpers.
+ */
+export async function resolveProjectPaths(
+  project: Project,
+  repoRoot: string
+): Promise<ProjectPaths> {
+  const projectPath = getProjectPath(repoRoot, project.projectDir);
+  const storybookDir = getStorybookDir(projectPath);
+
+  return {
+    repoRoot,
+    projectPath,
+    storybookDir,
+    evalSupportDir: getEvalSupportDir(projectPath),
+    evalResultsDir: getEvalResultsDir(projectPath),
+  };
+}
+
+/**
+ * Pre-flight check run for every project before any repo is modified.
+ *
+ * For each project, in order: make sure the clone exists (`ensureSourceClone`), require the
+ * checked-out branch to be `project.branch`, fetch `origin` with pruning, and require a clean
+ * working tree.
+ *
+ * @throws Error naming the project when it is on another branch (or detached) or has local changes.
+ */
+async function ensureReposAreClean(projects: Array<{ project: Project; paths: ProjectPaths }>) {
+  const logger = createLogger();
+
+  for (const entry of projects) {
+    const { project } = entry;
+    const { repoRoot } = entry.paths;
+
+    await ensureSourceClone(project, repoRoot, logger);
+
+    const branch = await getCurrentBranch(repoRoot);
+    if (branch !== project.branch) {
+      // `git branch --show-current` prints nothing on a detached HEAD.
+      const found = branch || 'detached';
+      throw new Error(`${project.name} must be on ${project.branch} before sync (found ${found})`);
+    }
+
+    await x('git', ['fetch', 'origin', '--prune'], {
+      timeout: 120_000,
+      nodeOptions: { cwd: repoRoot },
+    });
+
+    const dirty = await getDirtyFiles(repoRoot);
+    if (dirty.length > 0) {
+      throw new Error(`${project.name} has local changes: ${dirty.join(', ')}`);
+    }
+  }
+}
+
+/**
+ * Applies the baseline to a single project clone and commits the result.
+ *
+ * Steps: check out and fast-forward `project.branch`, replace the Storybook directory with the
+ * baseline files, delete the legacy `<projectPath>/eval-results` directory, write an empty
+ * `data.json` into the current eval-results directory, stage the managed paths, and, if anything
+ * is staged, commit (hooks skipped via `--no-verify`) and optionally push.
+ *
+ * @param opts.baselineFiles Relative file path -> contents, written under the Storybook directory.
+ * @param opts.push When `true`, the new commit is pushed to `origin/<project.branch>`.
+ * @returns `{ changed: false }` when nothing was staged, otherwise `{ changed: true, commitSha }`.
+ */
+async function syncProjectRepo(opts: {
+  project: Project;
+  paths: ProjectPaths;
+  baselineFiles: Map<string, string>;
+  push: boolean;
+  commitMessage: string;
+  log: (message: string) => void;
+}) {
+  const { project, paths, baselineFiles, push, commitMessage, log } = opts;
+  // Resolved before the legacy directory is removed below, because the helper first checks
+  // whether that directory exists on disk.
+  const additionalManagedPaths = await getAdditionalManagedPaths(paths);
+
+  await x('git', ['checkout', project.branch], {
+    nodeOptions: { cwd: paths.repoRoot },
+  });
+  await x('git', ['pull', '--ff-only', 'origin', project.branch], {
+    timeout: 120_000,
+    nodeOptions: { cwd: paths.repoRoot },
+  });
+
+  await syncStorybookDir(paths.storybookDir, baselineFiles);
+  await rm(getLegacyEvalResultsDir(paths.projectPath), { recursive: true, force: true });
+  await mkdir(paths.evalResultsDir, { recursive: true });
+  await writeFile(join(paths.evalResultsDir, 'data.json'), '{}\n');
+
+  // Only the managed paths are staged, so unrelated files are never swept into the commit.
+  const managedPaths = getManagedPaths(paths, additionalManagedPaths);
+  await x('git', ['add', '-A', '--', ...managedPaths], {
+    nodeOptions: { cwd: paths.repoRoot },
+  });
+
+  const changed = await hasManagedChanges(paths.repoRoot, managedPaths);
+  if (!changed) {
+    // NOTE(review): this returns before the push below, so a commit created by an earlier run
+    // with push=false stays unpushed on a rerun — confirm whether that is intended.
+    log(`  ${pc.dim('no baseline changes')}`);
+    return { changed: false as const };
+  }
+
+  await x('git', ['commit', '--no-verify', '-m', commitMessage], {
+    nodeOptions: { cwd: paths.repoRoot },
+  });
+  const commitSha = await getHead(paths.repoRoot);
+
+  if (push) {
+    await x('git', ['push', 'origin', project.branch], {
+      timeout: 120_000,
+      nodeOptions: { cwd: paths.repoRoot },
+    });
+  }
+
+  return { changed: true as const, commitSha };
+}
+
+/** Returns the bundled baseline files as a map of relative path -> file contents. */
+async function readBaselineStorybookDir() {
+  const entries = Object.entries(BASELINE_STORYBOOK_FILES);
+  return new Map(entries);
+}
+
+/**
+ * Replaces `targetDir` with the given baseline files.
+ *
+ * The directory is deleted and recreated first, so files that are not part of the baseline are
+ * removed. While writing, any reference of the form `../…/eval-results/data.json` (one or more
+ * `../` segments) is rewritten to exactly `../eval-results/data.json`.
+ *
+ * @param targetDir Storybook directory to overwrite.
+ * @param sourceFiles Path relative to `targetDir` -> file contents.
+ */
+async function syncStorybookDir(targetDir: string, sourceFiles: Map<string, string>) {
+  await rm(targetDir, { recursive: true, force: true });
+  await mkdir(targetDir, { recursive: true });
+
+  for (const [name, contents] of sourceFiles) {
+    const targetPath = join(targetDir, name);
+    // Baseline entries may live in subdirectories (e.g. eval-support/…).
+    await mkdir(dirname(targetPath), { recursive: true });
+    const rewritten = contents.replace(
+      /(?:\.\.\/)+eval-results\/data\.json/g,
+      '../eval-results/data.json'
+    );
+    await writeFile(targetPath, rewritten);
+  }
+}
+
+/**
+ * Lists the repo-relative paths the sync stages and diffs: the Storybook directory first,
+ * followed by any `extraPaths`.
+ */
+function getManagedPaths(paths: ProjectPaths, extraPaths: string[] = []) {
+  const storybookRelative = relative(paths.repoRoot, paths.storybookDir);
+  return [storybookRelative].concat(extraPaths);
+}
+
+/**
+ * Decides whether the legacy `<projectPath>/eval-results` directory must be staged along with
+ * the Storybook directory.
+ *
+ * @returns `[<repo-relative legacy path>]` when that directory exists on disk or is tracked by
+ *   git, otherwise an empty array.
+ */
+async function getAdditionalManagedPaths(paths: ProjectPaths) {
+  const { repoRoot, projectPath } = paths;
+  const legacyRelative = normalizeRepoPath(
+    relative(repoRoot, getLegacyEvalResultsDir(projectPath))
+  );
+
+  if (legacyRelative.length === 0) {
+    return [];
+  }
+
+  if (existsSync(join(repoRoot, legacyRelative))) {
+    return [legacyRelative];
+  }
+
+  // Not on disk: `ls-files --error-unmatch` exits 0 only if the path is tracked.
+  const { exitCode } = await x('git', ['ls-files', '--error-unmatch', '--', legacyRelative], {
+    throwOnError: false,
+    nodeOptions: { cwd: repoRoot },
+  });
+  if (exitCode === 0) {
+    return [legacyRelative];
+  }
+  return [];
+}
+
+/** Returns the legacy eval-results location: `eval-results` directly under the project path. */
+function getLegacyEvalResultsDir(projectPath: string) {
+  const legacyDir = join(projectPath, 'eval-results');
+  return legacyDir;
+}
+
+/**
+ * Lists the paths reported by `git status --short` in `repoRoot`.
+ *
+ * @returns One entry per non-empty status line, with the first three characters (the status
+ *   columns and separator) removed; empty when the working tree is clean.
+ */
+async function getDirtyFiles(repoRoot: string) {
+  const { stdout } = await x('git', ['status', '--short'], {
+    nodeOptions: { cwd: repoRoot },
+  });
+
+  const files: string[] = [];
+  for (const line of stdout.split('\n')) {
+    if (line.trim().length === 0) {
+      continue;
+    }
+    // Slice the untrimmed line: the status columns may themselves start with a space.
+    files.push(line.slice(3).trim());
+  }
+  return files;
+}
+
+/**
+ * Reports whether anything is staged under `managedPaths`.
+ *
+ * Runs `git diff --cached --quiet`, which signals differences through a non-zero exit code, so
+ * the command is run with `throwOnError: false` and the exit code is inspected instead.
+ */
+async function hasManagedChanges(repoRoot: string, managedPaths: string[]) {
+  const args = ['diff', '--cached', '--quiet', '--', ...managedPaths];
+  const { exitCode } = await x('git', args, {
+    throwOnError: false,
+    nodeOptions: { cwd: repoRoot },
+  });
+  return exitCode !== 0;
+}
+
+/** Returns the trimmed output of `git branch --show-current` run in `repoRoot`. */
+async function getCurrentBranch(repoRoot: string) {
+  const { stdout } = await x('git', ['branch', '--show-current'], {
+    nodeOptions: { cwd: repoRoot },
+  });
+  return stdout.trim();
+}
+
+/** Returns the commit SHA of `HEAD` in `repoRoot` (trimmed `git rev-parse HEAD` output). */
+async function getHead(repoRoot: string) {
+  const { stdout } = await x('git', ['rev-parse', 'HEAD'], {
+    nodeOptions: { cwd: repoRoot },
+  });
+  return stdout.trim();
+}
+
+/**
+ * Strips a leading "current directory" marker from a repo-relative path.
+ *
+ * `./foo` becomes `foo` and a bare `.` becomes the empty string. Paths that merely start with a
+ * dot are left untouched: the previous pattern (`/^\.\/?/`) also removed the first character of
+ * dot-directories and parent references (`.config/x` -> `config/x`, `../x` -> `./x`).
+ *
+ * @param value Repo-relative path using `/` separators.
+ * @returns The path without a leading `./` (or `''` for `.`).
+ */
+function normalizeRepoPath(value: string) {
+  // Match a leading "." only when it is a whole path segment: followed by "/" or end of string.
+  return value.replace(/^\.(?:\/+|$)/, '');
+}
diff --git a/scripts/eval/sync-storybook-version.test.ts b/scripts/eval/sync-storybook-version.test.ts
new file mode 100644
index 000000000000..c53a580f6d85
--- /dev/null
+++ b/scripts/eval/sync-storybook-version.test.ts
@@ -0,0 +1,471 @@
+import { execFileSync } from 'node:child_process';
+import {
+  existsSync,
+  mkdirSync,
+  mkdtempSync,
+  readFileSync,
+  rmSync,
+  writeFileSync,
+} from 'node:fs';
+import { mkdir } from 'node:fs/promises';
+import { dirname, join } from 'node:path';
+import { tmpdir } from 'node:os';
+import { afterEach, describe, expect, it } from 'vitest';
+import type { Project } from './lib/projects.ts';
+import { syncStorybookVersion } from './sync-storybook-version.ts';
+
+// Scratch directory of the currently running test; empty when none has been created.
+let TMP = '';
+
+// Remove the scratch directory after every test and reset the marker.
+afterEach(() => {
+  if (!TMP) {
+    return;
+  }
+  rmSync(TMP, { recursive: true, force: true });
+  TMP = '';
+});
+
+describe('syncStorybookVersion', () => {
+ // Happy path: two projects (one with a nested `projectDir`) are upgraded through injected
+ // hooks; each must end up with an upgrade commit that is also present on its remote.
+ it('runs the upgrade for each project, commits, and pushes', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-storybook-version-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ {
+ name: 'wikitok',
+ repo: join(remotesRoot, 'wikitok.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/wikitok',
+ projectDir: 'frontend',
+ },
+ ];
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'mealdrop'),
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ packageJsonPath: 'package.json',
+ packageJson: {
+ name: 'mealdrop',
+ dependencies: { '@storybook/react-vite': '9.0.0', storybook: '9.0.0' },
+ },
+ });
+ setupRepo({
+ repoRoot: join(reposRoot, 'wikitok'),
+ remoteRoot: join(remotesRoot, 'wikitok.git'),
+ packageJsonPath: 'frontend/package.json',
+ packageJson: {
+ name: 'wikitok-frontend',
+ dependencies: { '@storybook/react-vite': '9.0.0', storybook: '9.0.0' },
+ },
+ });
+
+ // Arguments each runUpgrade call received, recorded for the assertions below.
+ const upgradeCalls: Array<{
+ version: string;
+ project: string;
+ repoRoot: string;
+ projectPath: string;
+ configDir: string;
+ }> = [];
+
+ // Interleaved record of install/upgrade hook invocations across projects.
+ const hookOrder: string[] = [];
+
+ const results = await syncStorybookVersion({
+ version: '9.1.0',
+ reposRoot,
+ projects,
+ push: true,
+ log: () => {},
+ installProjectDeps: async ({ project }) => {
+ hookOrder.push(`install:${project.name}`);
+ },
+ // Stand-in for the real upgrade: bump every storybook dependency in package.json.
+ runUpgrade: async ({ version, project, repoRoot, projectPath, configDir }) => {
+ hookOrder.push(`upgrade:${project.name}`);
+ upgradeCalls.push({
+ version,
+ project: project.name,
+ repoRoot,
+ projectPath,
+ configDir,
+ });
+ const pkgPath = join(projectPath, 'package.json');
+ const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
+ for (const key of Object.keys(pkg.dependencies ?? {})) {
+ if (key === 'storybook' || key.startsWith('@storybook/')) {
+ pkg.dependencies[key] = version;
+ }
+ }
+ writeFileSync(pkgPath, `${JSON.stringify(pkg, null, 2)}\n`);
+ },
+ });
+
+ // Dependencies are installed both before and after the upgrade, one project at a time.
+ expect(hookOrder).toEqual([
+ 'install:mealdrop',
+ 'upgrade:mealdrop',
+ 'install:mealdrop',
+ 'install:wikitok',
+ 'upgrade:wikitok',
+ 'install:wikitok',
+ ]);
+
+ // configDir is expected relative to the repo root, hence the `frontend/` prefix for wikitok.
+ expect(upgradeCalls).toEqual([
+ {
+ version: '9.1.0',
+ project: 'mealdrop',
+ repoRoot: join(reposRoot, 'mealdrop'),
+ projectPath: join(reposRoot, 'mealdrop'),
+ configDir: '.storybook',
+ },
+ {
+ version: '9.1.0',
+ project: 'wikitok',
+ repoRoot: join(reposRoot, 'wikitok'),
+ projectPath: join(reposRoot, 'wikitok', 'frontend'),
+ configDir: 'frontend/.storybook',
+ },
+ ]);
+
+ const mealdropPkg = JSON.parse(
+ readFileSync(join(reposRoot, 'mealdrop', 'package.json'), 'utf-8')
+ );
+ const wikitokPkg = JSON.parse(
+ readFileSync(join(reposRoot, 'wikitok', 'frontend', 'package.json'), 'utf-8')
+ );
+ expect(mealdropPkg.dependencies.storybook).toBe('9.1.0');
+ expect(mealdropPkg.dependencies['@storybook/react-vite']).toBe('9.1.0');
+ expect(wikitokPkg.dependencies.storybook).toBe('9.1.0');
+ expect(wikitokPkg.dependencies['@storybook/react-vite']).toBe('9.1.0');
+
+ expect(results.map((r) => r.project)).toEqual(['mealdrop', 'wikitok']);
+ expect(results.every((r) => r.changed)).toBe(true);
+ expect(results.every((r) => typeof r.commitSha === 'string' && r.commitSha.length > 0)).toBe(
+ true
+ );
+
+ expect(getLatestCommitMessage(join(reposRoot, 'mealdrop'))).toBe(
+ 'Eval: upgrade Storybook to 9.1.0'
+ );
+ // Local HEAD equal to the remote's main proves the commit was pushed.
+ expect(getHead(join(reposRoot, 'mealdrop'))).toBe(
+ getRemoteHead(join(remotesRoot, 'mealdrop.git'))
+ );
+ expect(getHead(join(reposRoot, 'wikitok'))).toBe(
+ getRemoteHead(join(remotesRoot, 'wikitok.git'))
+ );
+ });
+
+ // No-op case: the hooks modify nothing, so the result must be `changed: false` without a
+ // commitSha and HEAD must stay where it was.
+ it('reports no change and skips commit when upgrade does not modify files', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-storybook-version-noop-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ ];
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'mealdrop'),
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ packageJsonPath: 'package.json',
+ packageJson: {
+ name: 'mealdrop',
+ dependencies: { storybook: '9.1.0' },
+ },
+ });
+
+ const headBefore = getHead(join(reposRoot, 'mealdrop'));
+
+ const results = await syncStorybookVersion({
+ version: '9.1.0',
+ reposRoot,
+ projects,
+ push: true,
+ log: () => {},
+ installProjectDeps: async () => {},
+ runUpgrade: async () => {},
+ });
+
+ expect(results).toEqual([{ project: 'mealdrop', changed: false }]);
+ expect(getHead(join(reposRoot, 'mealdrop'))).toBe(headBefore);
+ });
+
+ // Dirty working tree: the sync must reject with a "has local changes" error and must not
+ // invoke runUpgrade at all.
+ it('fails fast when a target repo is dirty', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-storybook-version-dirty-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ ];
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'mealdrop'),
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ packageJsonPath: 'package.json',
+ packageJson: {
+ name: 'mealdrop',
+ dependencies: { storybook: '9.0.0' },
+ },
+ });
+
+ // An untracked file written after the initial commit is what makes the repo dirty.
+ writeFileSync(join(reposRoot, 'mealdrop', 'README.md'), 'dirty\n');
+
+ const upgradeCalls: string[] = [];
+
+ await expect(
+ syncStorybookVersion({
+ version: '9.1.0',
+ reposRoot,
+ projects,
+ push: true,
+ log: () => {},
+ installProjectDeps: async () => {},
+ runUpgrade: async ({ project }) => {
+ upgradeCalls.push(project.name);
+ },
+ })
+ ).rejects.toThrow('mealdrop has local changes');
+
+ expect(upgradeCalls).toEqual([]);
+ });
+
+ // Missing clone: only the bare remote exists up front. The sync is expected to create the
+ // clone under reposRoot, apply the upgrade there, and push the resulting commit.
+ it('auto-clones repos that have not been cloned yet', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-storybook-version-auto-clone-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ ];
+
+ setupBareRemoteWithContent({
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ files: {
+ 'package.json': `${JSON.stringify(
+ { name: 'mealdrop', dependencies: { storybook: '9.0.0' } },
+ null,
+ 2
+ )}\n`,
+ },
+ });
+
+ // Precondition: no local clone yet.
+ expect(existsSync(join(reposRoot, 'mealdrop'))).toBe(false);
+
+ const results = await syncStorybookVersion({
+ version: '9.1.0',
+ reposRoot,
+ projects,
+ push: true,
+ log: () => {},
+ installProjectDeps: async () => {},
+ runUpgrade: async ({ version, projectPath }) => {
+ const pkgPath = join(projectPath, 'package.json');
+ const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
+ pkg.dependencies.storybook = version;
+ writeFileSync(pkgPath, `${JSON.stringify(pkg, null, 2)}\n`);
+ },
+ });
+
+ expect(existsSync(join(reposRoot, 'mealdrop', '.git'))).toBe(true);
+ const pkg = JSON.parse(readFileSync(join(reposRoot, 'mealdrop', 'package.json'), 'utf-8'));
+ expect(pkg.dependencies.storybook).toBe('9.1.0');
+ expect(results).toEqual([
+ { project: 'mealdrop', changed: true, commitSha: getHead(join(reposRoot, 'mealdrop')) },
+ ]);
+ expect(getHead(join(reposRoot, 'mealdrop'))).toBe(
+ getRemoteHead(join(remotesRoot, 'mealdrop.git'))
+ );
+ });
+
+ // push=false: a local commit must be created (local HEAD moves) while the remote's main
+ // stays at its previous commit.
+ it('honors push=false by committing locally but not pushing', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-storybook-version-skip-push-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ ];
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'mealdrop'),
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ packageJsonPath: 'package.json',
+ packageJson: {
+ name: 'mealdrop',
+ dependencies: { storybook: '9.0.0' },
+ },
+ });
+
+ const remoteHeadBefore = getRemoteHead(join(remotesRoot, 'mealdrop.git'));
+
+ await syncStorybookVersion({
+ version: '9.1.0',
+ reposRoot,
+ projects,
+ push: false,
+ log: () => {},
+ installProjectDeps: async () => {},
+ runUpgrade: async ({ version, projectPath }) => {
+ const pkgPath = join(projectPath, 'package.json');
+ const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
+ pkg.dependencies.storybook = version;
+ writeFileSync(pkgPath, `${JSON.stringify(pkg, null, 2)}\n`);
+ },
+ });
+
+ const localHead = getHead(join(reposRoot, 'mealdrop'));
+ expect(localHead).not.toBe(remoteHeadBefore);
+ expect(getRemoteHead(join(remotesRoot, 'mealdrop.git'))).toBe(remoteHeadBefore);
+ });
+
+ // Resume case: a first run with push=false leaves an unpushed upgrade commit. A second run
+ // with push=true must report that same commit (no new one) and push it to the remote.
+ it('pushes an existing local upgrade commit on a rerun after skip-push', async () => {
+ TMP = mkdtempSync(join(tmpdir(), 'eval-sync-storybook-version-resume-push-'));
+ const reposRoot = join(TMP, 'repos');
+ const remotesRoot = join(TMP, 'remotes');
+ await mkdir(reposRoot, { recursive: true });
+ await mkdir(remotesRoot, { recursive: true });
+
+ const projects: Project[] = [
+ {
+ name: 'mealdrop',
+ repo: join(remotesRoot, 'mealdrop.git'),
+ branch: 'main',
+ githubSlug: 'storybook-tmp/mealdrop',
+ },
+ ];
+
+ setupRepo({
+ repoRoot: join(reposRoot, 'mealdrop'),
+ remoteRoot: join(remotesRoot, 'mealdrop.git'),
+ packageJsonPath: 'package.json',
+ packageJson: {
+ name: 'mealdrop',
+ dependencies: { storybook: '9.0.0' },
+ },
+ });
+
+ const remoteHeadBefore = getRemoteHead(join(remotesRoot, 'mealdrop.git'));
+
+ // First run: commit locally only.
+ await syncStorybookVersion({
+ version: '9.1.0',
+ reposRoot,
+ projects,
+ push: false,
+ log: () => {},
+ installProjectDeps: async () => {},
+ runUpgrade: async ({ version, projectPath }) => {
+ const pkgPath = join(projectPath, 'package.json');
+ const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
+ pkg.dependencies.storybook = version;
+ writeFileSync(pkgPath, `${JSON.stringify(pkg, null, 2)}\n`);
+ },
+ });
+
+ const localHead = getHead(join(reposRoot, 'mealdrop'));
+ expect(localHead).not.toBe(remoteHeadBefore);
+ expect(getRemoteHead(join(remotesRoot, 'mealdrop.git'))).toBe(remoteHeadBefore);
+
+ // Second run: the upgrade hook rewrites package.json with the same version, so there is
+ // nothing new to commit; only the pending commit should be pushed.
+ const results = await syncStorybookVersion({
+ version: '9.1.0',
+ reposRoot,
+ projects,
+ push: true,
+ log: () => {},
+ installProjectDeps: async () => {},
+ runUpgrade: async ({ version, projectPath }) => {
+ const pkgPath = join(projectPath, 'package.json');
+ const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
+ pkg.dependencies.storybook = version;
+ writeFileSync(pkgPath, `${JSON.stringify(pkg, null, 2)}\n`);
+ },
+ });
+
+ expect(results).toEqual([{ project: 'mealdrop', changed: true, commitSha: localHead }]);
+ expect(getRemoteHead(join(remotesRoot, 'mealdrop.git'))).toBe(localHead);
+ });
+});
+
+/**
+ * Test fixture: creates a bare remote at `opts.remoteRoot` and a working repo at `opts.repoRoot`
+ * (both on `main`), writes a single package.json, commits, and pushes to the remote as `origin`.
+ *
+ * @param opts.packageJsonPath Location of package.json relative to the repo root; parent
+ *   directories are created as needed.
+ * @param opts.packageJson Object serialized (2-space indent, trailing newline) as package.json.
+ */
+function setupRepo(opts: {
+  repoRoot: string;
+  remoteRoot: string;
+  packageJsonPath: string;
+  packageJson: Record<string, unknown>;
+}) {
+  execFileSync('git', ['init', '--bare', '--initial-branch=main', opts.remoteRoot]);
+  execFileSync('git', ['init', '--initial-branch=main', opts.repoRoot]);
+  // A committer identity is configured per repo so `git commit` works without global config.
+  execFileSync('git', ['-C', opts.repoRoot, 'config', 'user.name', 'Test User']);
+  execFileSync('git', ['-C', opts.repoRoot, 'config', 'user.email', 'test@example.com']);
+
+  const pkgPath = join(opts.repoRoot, opts.packageJsonPath);
+  mkdirSyncRecursive(dirname(pkgPath));
+  writeFileSync(pkgPath, `${JSON.stringify(opts.packageJson, null, 2)}\n`);
+
+  execFileSync('git', ['-C', opts.repoRoot, 'add', '-A']);
+  execFileSync('git', ['-C', opts.repoRoot, 'commit', '-m', 'initial']);
+  execFileSync('git', ['-C', opts.repoRoot, 'remote', 'add', 'origin', opts.remoteRoot]);
+  execFileSync('git', ['-C', opts.repoRoot, 'push', '-u', 'origin', 'main']);
+}
+
+/**
+ * Test fixture: creates a bare remote whose `main` branch holds one initial commit with the
+ * given files, without leaving a local clone behind.
+ *
+ * The content is committed from a temporary staging clone, which is removed afterwards,
+ * including when one of the git steps fails.
+ *
+ * @param opts.remoteRoot Path at which the bare repository is created.
+ * @param opts.files Map of repo-relative file path -> file contents for the initial commit.
+ */
+function setupBareRemoteWithContent(opts: { remoteRoot: string; files: Record<string, string> }) {
+  const staging = mkdtempSync(join(tmpdir(), 'eval-sync-storybook-version-staging-'));
+  try {
+    execFileSync('git', ['init', '--bare', '--initial-branch=main', opts.remoteRoot]);
+    execFileSync('git', ['clone', opts.remoteRoot, staging]);
+    execFileSync('git', ['-C', staging, 'config', 'user.name', 'Test User']);
+    execFileSync('git', ['-C', staging, 'config', 'user.email', 'test@example.com']);
+    for (const [path, contents] of Object.entries(opts.files)) {
+      mkdirSyncRecursive(join(staging, dirname(path)));
+      writeFileSync(join(staging, path), contents);
+    }
+    execFileSync('git', ['-C', staging, 'add', '-A']);
+    execFileSync('git', ['-C', staging, 'commit', '-m', 'initial']);
+    execFileSync('git', ['-C', staging, 'push', 'origin', 'main']);
+  } finally {
+    // The staging dir lives outside the per-test TMP dir, so it has to be cleaned up here.
+    rmSync(staging, { recursive: true, force: true });
+  }
+}
+
+/**
+ * Creates `path` together with any missing parent directories.
+ *
+ * Uses `mkdirSync` rather than spawning `mkdir -p`: on Windows `mkdir` is a shell builtin, not
+ * an executable, so `execFileSync('mkdir', …)` cannot be relied on there.
+ */
+function mkdirSyncRecursive(path: string) {
+  mkdirSync(path, { recursive: true });
+}
+
+/** Returns the commit SHA that `HEAD` points at in the working repo at `repoRoot`. */
+function getHead(repoRoot: string) {
+  const args = ['-C', repoRoot, 'rev-parse', 'HEAD'];
+  const stdout = execFileSync('git', args, { encoding: 'utf-8' });
+  return stdout.trim();
+}
+
+/** Returns the commit SHA of `refs/heads/main` in the bare repository at `remoteRoot`. */
+function getRemoteHead(remoteRoot: string) {
+  const args = ['--git-dir', remoteRoot, 'rev-parse', 'refs/heads/main'];
+  const stdout = execFileSync('git', args, { encoding: 'utf-8' });
+  return stdout.trim();
+}
+
+/** Returns the subject line (`%s`) of the most recent commit in the repo at `repoRoot`. */
+function getLatestCommitMessage(repoRoot: string) {
+  const args = ['-C', repoRoot, 'log', '-1', '--pretty=%s'];
+  const stdout = execFileSync('git', args, { encoding: 'utf-8' });
+  return stdout.trim();
+}
diff --git a/scripts/eval/sync-storybook-version.ts b/scripts/eval/sync-storybook-version.ts
new file mode 100644
index 000000000000..c4317fb6ccfe
--- /dev/null
+++ b/scripts/eval/sync-storybook-version.ts
@@ -0,0 +1,246 @@
+import { join, relative, resolve } from 'node:path';
+import { parseArgs } from 'node:util';
+import pc from 'picocolors';
+import { x } from 'tinyexec';
+import { esMain } from '../utils/esmain.ts';
+import { installDeps } from './lib/package-manager.ts';
+import { ensureSourceClone } from './lib/prepare-trial.ts';
+import { PROJECTS, type Project } from './lib/projects.ts';
+import {
+ createLogger,
+ formatHelp,
+ formatTable,
+ getProjectPath,
+ getStorybookDir,
+ NODE_EVAL_SYNC_STORYBOOK_VERSION_SCRIPT,
+ REPOS_DIR,
+ toPosixPath,
+} from './lib/utils.ts';
+
+/** Per-project context handed to the upgrade and install hooks. */
+type HookArgs = {
+  /** The benchmark project being processed. */
+  project: Project;
+  /** Absolute path of the project's clone (`<reposRoot>/<project.name>`). */
+  repoRoot: string;
+  /** Project directory inside the clone, as returned by `getProjectPath`. */
+  projectPath: string;
+  /** Storybook config directory relative to `repoRoot`, with POSIX separators. */
+  configDir: string;
+};
+
+/** Hook that performs the Storybook upgrade to `version` for one project. */
+type RunUpgrade = (args: HookArgs & { version: string }) => Promise<void>;
+/** Hook that installs dependencies for one project. */
+type RunInstall = (args: HookArgs) => Promise<void>;
+
+/** Options accepted by `syncStorybookVersion`. */
+export interface SyncStorybookVersionOptions {
+ /** Storybook version to upgrade to (e.g. `latest`, `9.1.0`, `0.0.0-pr-1-sha-abc`). */
+ version: string;
+ /** Per-project clones live under `reposRoot/<project name>`. Defaults to `REPOS_DIR`. */
+ reposRoot?: string;
+ /** Subset of benchmark projects (defaults to all). */
+ projects?: Project[];
+ /** Push the resulting commit to origin. Defaults to true. */
+ push?: boolean;
+ /** Sink for progress messages and the final summary table. Defaults to a no-op. */
+ log?: (message: string) => void;
+ /** Test hook; defaults to running `npx storybook@<version> upgrade ...` from the repo root. */
+ runUpgrade?: RunUpgrade;
+ /** Test hook; defaults to `installDeps(projectPath, ...)`. */
+ installProjectDeps?: RunInstall;
+}
+
+/** Outcome of syncing one project, as returned by `syncStorybookVersion`. */
+export interface SyncResult {
+ /** Name of the project (`project.name`). */
+ project: string;
+ /** True when a new commit was created or an existing local commit was pushed. */
+ changed: boolean;
+ /** SHA of `HEAD` after the sync; only set when `changed` is true. */
+ commitSha?: string;
+}
+
+// Command-line option definitions: passed to `parseArgs` for parsing and to `formatHelp` for the
+// `--help` output (which is why each entry also carries a `description`).
+const cliOptions = {
+ version: {
+ type: 'string' as const,
+ short: 'V',
+ description: 'Storybook version to upgrade to (e.g. latest, 9.1.0, 0.0.0-pr-1-sha-abc)',
+ },
+ project: {
+ type: 'string' as const,
+ multiple: true,
+ description: 'Project(s) to sync (repeatable)',
+ },
+ 'skip-push': {
+ type: 'boolean' as const,
+ description: 'Commit locally but do not push',
+ },
+ help: { type: 'boolean' as const, short: 'h', description: 'Show this help and exit' },
+};
+
+// CLI entry point, guarded by `esMain(import.meta.url)` (presumably true only when this file is
+// the process entry module, so importing it from tests does not trigger a sync — confirm).
+if (esMain(import.meta.url)) {
+  const { values } = parseArgs({
+    args: process.argv.slice(2),
+    options: cliOptions,
+    strict: true,
+  });
+
+  if (values.help) {
+    console.log(
+      formatHelp(
+        `node ${NODE_EVAL_SYNC_STORYBOOK_VERSION_SCRIPT} --version <version> [options]`,
+        'Upgrade Storybook in every benchmark repo to the given version.',
+        cliOptions
+      )
+    );
+    process.exit(0);
+  }
+
+  if (!values.version) {
+    console.error(
+      `Error: --version is required. See \`node ${NODE_EVAL_SYNC_STORYBOOK_VERSION_SCRIPT} --help\`.`
+    );
+    process.exit(1);
+  }
+
+  // Reject unknown project names up front: silently dropping them would let a typo turn the run
+  // into a no-op that still exits successfully.
+  const requested = values.project ?? [];
+  const unknown = requested.filter((name) => !PROJECTS.some((p) => p.name === name));
+  if (unknown.length > 0) {
+    console.error(
+      `Error: unknown project(s): ${unknown.join(', ')}. Known projects: ${PROJECTS.map((p) => p.name).join(', ')}.`
+    );
+    process.exit(1);
+  }
+
+  const selected =
+    requested.length > 0 ? PROJECTS.filter((p) => requested.includes(p.name)) : PROJECTS;
+
+  await syncStorybookVersion({
+    version: values.version,
+    projects: selected,
+    push: !values['skip-push'],
+    log: (message) => console.log(message),
+  });
+}
+
+/**
+ * Upgrades Storybook to `options.version` in each selected benchmark repo, commits the result and,
+ * unless `push` is false, pushes it to `origin`.
+ *
+ * Works in two passes: a preflight over every repo (ensure the clone exists, refuse dirty working
+ * trees) followed by a sequential upgrade pass. Every git command that must succeed is run with
+ * `throwOnError: true`, so a failed checkout, pull, commit or push aborts the run instead of being
+ * reported as a successful sync.
+ *
+ * @param options See `SyncStorybookVersionOptions`.
+ * @returns One `SyncResult` per processed project, in the order the projects were given.
+ * @throws If `version` is empty, a working tree has local changes, or a git command fails.
+ */
+export async function syncStorybookVersion(
+  options: SyncStorybookVersionOptions
+): Promise<SyncResult[]> {
+  const {
+    version,
+    reposRoot = REPOS_DIR,
+    projects = PROJECTS,
+    push = true,
+    log = () => {},
+    runUpgrade = defaultRunUpgrade,
+    installProjectDeps = defaultInstallProjectDeps,
+  } = options;
+
+  if (!version) {
+    throw new Error('syncStorybookVersion requires a non-empty `version`');
+  }
+
+  const logger = createLogger();
+  const resolved = projects.map((project) => {
+    const repoRoot = join(resolve(reposRoot), project.name);
+    const projectPath = getProjectPath(repoRoot, project.projectDir);
+    // An empty relative path means the config dir resolved to the repo root; fall back to the
+    // conventional `.storybook`.
+    const configDir = toPosixPath(relative(repoRoot, getStorybookDir(projectPath))) || '.storybook';
+    return { project, repoRoot, projectPath, configDir };
+  });
+
+  // Preflight: auto-clone missing repos and fail fast if any working tree is dirty.
+  for (const { project, repoRoot } of resolved) {
+    await ensureSourceClone(project, repoRoot, logger);
+    const { stdout } = await x('git', ['status', '--short'], {
+      throwOnError: true,
+      nodeOptions: { cwd: repoRoot },
+    });
+    if (stdout.trim()) {
+      throw new Error(`${project.name} has local changes: ${stdout.trim().replace(/\n/g, ', ')}`);
+    }
+  }
+
+  const results: SyncResult[] = [];
+  for (const hookArgs of resolved) {
+    const { project, repoRoot } = hookArgs;
+    log(pc.bold(`\nUpgrading ${project.name} to ${version}`));
+
+    await x('git', ['checkout', project.branch], {
+      throwOnError: true,
+      nodeOptions: { cwd: repoRoot },
+    });
+    await x('git', ['pull', '--ff-only', 'origin', project.branch], {
+      timeout: 120_000,
+      throwOnError: true,
+      nodeOptions: { cwd: repoRoot },
+    });
+
+    // `.storybook/main.ts` needs node_modules to evaluate during `storybook upgrade`,
+    // so install first. Install again afterwards because the upgrade's own install
+    // does not always refresh sub-package lockfiles (e.g. wikitok's `frontend/`).
+    await installProjectDeps(hookArgs);
+    await runUpgrade({ version, ...hookArgs });
+    await installProjectDeps(hookArgs);
+
+    await x('git', ['add', '-A'], { throwOnError: true, nodeOptions: { cwd: repoRoot } });
+    // `git diff --cached --quiet` signals "there are staged changes" through a non-zero exit code,
+    // so this is the one git call that must not throw.
+    const diff = await x('git', ['diff', '--cached', '--quiet'], {
+      throwOnError: false,
+      nodeOptions: { cwd: repoRoot },
+    });
+    if (diff.exitCode === 0) {
+      // Nothing new to commit. A previous run (e.g. with push disabled) may still have left a
+      // local commit that origin does not have yet.
+      const ahead = await x('git', ['rev-list', '--count', `origin/${project.branch}..HEAD`], {
+        throwOnError: true,
+        nodeOptions: { cwd: repoRoot },
+      });
+      if (push && ahead.stdout.trim() !== '0') {
+        await x('git', ['push', 'origin', project.branch], {
+          timeout: 120_000,
+          throwOnError: true,
+          nodeOptions: { cwd: repoRoot },
+        });
+        const head = await x('git', ['rev-parse', 'HEAD'], {
+          throwOnError: true,
+          nodeOptions: { cwd: repoRoot },
+        });
+        const commitSha = head.stdout.trim();
+        log(`  ${pc.dim('already on target version; pushed existing local commit')}`);
+        results.push({ project: project.name, changed: true, commitSha });
+      } else {
+        log(`  ${pc.dim('already on target version')}`);
+        results.push({ project: project.name, changed: false });
+      }
+      continue;
+    }
+
+    await x('git', ['commit', '--no-verify', '-m', `Eval: upgrade Storybook to ${version}`], {
+      throwOnError: true,
+      nodeOptions: { cwd: repoRoot },
+    });
+    const head = await x('git', ['rev-parse', 'HEAD'], {
+      throwOnError: true,
+      nodeOptions: { cwd: repoRoot },
+    });
+    const commitSha = head.stdout.trim();
+
+    if (push) {
+      await x('git', ['push', 'origin', project.branch], {
+        timeout: 120_000,
+        throwOnError: true,
+        nodeOptions: { cwd: repoRoot },
+      });
+    }
+
+    results.push({ project: project.name, changed: true, commitSha });
+  }
+
+  log(
+    `\n${formatTable(
+      ['Project', 'Changed', 'Commit'],
+      results.map((r) => [r.project, r.changed ? 'yes' : 'no', r.commitSha?.slice(0, 8) ?? '-'])
+    )}`
+  );
+
+  return results;
+}
+
+/**
+ * Default `runUpgrade` hook: runs `npx storybook@<version> upgrade` from `repoRoot` against the
+ * Storybook config in `configDir`, streaming the CLI output to this process' stdio.
+ *
+ * Rejects when the CLI exits non-zero, so a failed upgrade is never committed as if it succeeded.
+ */
+async function defaultRunUpgrade({
+  version,
+  repoRoot,
+  configDir,
+}: Parameters<RunUpgrade>[0]): Promise<void> {
+  // `--yes`/`--force` already disable prompts. `CI`, `YARN_ENABLE_IMMUTABLE_INSTALLS`,
+  // and `npm_config_frozen_lockfile` would refuse lockfile updates and leave
+  // package.json and the lockfile out of sync, so `CI` is removed and the other two are
+  // forced to `false` for the child process only.
+  const env: NodeJS.ProcessEnv = {
+    ...process.env,
+    YARN_ENABLE_IMMUTABLE_INSTALLS: 'false',
+    npm_config_frozen_lockfile: 'false',
+  };
+  delete env.CI;
+
+  await x(
+    'npx',
+    [
+      `storybook@${version}`,
+      'upgrade',
+      '--yes',
+      '--force',
+      '--skip-check',
+      '--skip-automigrations',
+      '-c',
+      configDir,
+    ],
+    {
+      timeout: 900_000,
+      throwOnError: true,
+      nodeOptions: { cwd: repoRoot, env, stdio: 'inherit' },
+    }
+  );
+}
+
+/**
+ * Default `installProjectDeps` hook: delegates to `installDeps` for `projectPath`, passing
+ * `repoRoot` as the `stopAt` boundary.
+ */
+async function defaultInstallProjectDeps({
+  repoRoot,
+  projectPath,
+}: Parameters<RunInstall>[0]): Promise<void> {
+  await installDeps(projectPath, createLogger(), undefined, { stopAt: repoRoot });
+}
diff --git a/scripts/event-log-collector.ts b/scripts/event-log-collector.ts
index 1e4a656960b1..5c07a55c93c3 100644
--- a/scripts/event-log-collector.ts
+++ b/scripts/event-log-collector.ts
@@ -1,23 +1,125 @@
-import { json } from '@polka/parse';
-import polka from 'polka';
+#!/usr/bin/env node
-const PORT = process.env.PORT || 6007;
+/**
+ * Telemetry event log collector for local development and testing.
+ *
+ * Usage:
+ * node scripts/event-log-collector.ts [--include <regex>] [--exclude <regex>] [--no-metadata]
+ *
+ * Then point Storybook at it:
+ * STORYBOOK_TELEMETRY_URL=http://localhost:6007/event-log yarn storybook
+ *
+ * Options:
+ * --include <regex> Only print events whose eventType matches the regex (all events are still stored)
+ * --exclude <regex> Do not print events whose eventType matches the regex (they are still stored)
+ * --no-metadata Hide the metadata property when logging events
+ *
+ * Examples:
+ * node scripts/event-log-collector.ts --include "ai-.*"
+ * node scripts/event-log-collector.ts --exclude "mocking"
+ * node scripts/event-log-collector.ts --include "ai-.*" --exclude "ai-debug"
+ * node scripts/event-log-collector.ts --no-metadata
+ *
+ * Endpoints:
+ * POST /event-log — receives telemetry events (logs + stores)
+ * GET /event-log — returns all received events as JSON array
+ * GET /events — alias: returns all received events as JSON array
+ * GET /events/:type — returns events filtered by eventType
+ */
-const server = polka();
-server.use(json());
+import { createServer } from 'node:http';
+import { writeFile, mkdir } from 'node:fs/promises';
+import { resolve } from 'node:path';
-const events: Record[] = [];
-server.post('/event-log', (req, res) => {
- console.log(`Received event ${req.body.eventType}`);
- events.push(req.body);
- res.end('OK');
-});
+const args = process.argv.slice(2);
+/**
+ * Looks up the value of a command-line flag in `args`.
+ *
+ * Supports both `--flag value` (returns the argument following the flag, or `undefined` when the
+ * flag is last) and `--flag=value`. Returns `undefined` when the flag is absent.
+ */
+const getFlag = (flag: string): string | undefined => {
+  const inlinePrefix = `${flag}=`;
+  for (const [position, current] of args.entries()) {
+    if (current === flag) return args[position + 1];
+    if (current.startsWith(inlinePrefix)) return current.slice(inlinePrefix.length);
+  }
+  return undefined;
+};
+
+// Filter configuration parsed from the command line. A regex is `null` when its flag was not
+// given (or was given an empty value), which disables that filter.
+const includePattern = getFlag('--include');
+const excludePattern = getFlag('--exclude');
+const includeRegex = includePattern ? new RegExp(includePattern) : null;
+const excludeRegex = excludePattern ? new RegExp(excludePattern) : null;
+// When set, the `metadata` property is blanked out of the console output.
+const hideMetadata = args.includes('--no-metadata');
+
+/**
+ * Decides whether an event type passes the `--include` / `--exclude` filters: it must match the
+ * include regex (when one is configured) and must not match the exclude regex (when configured).
+ */
+const matchesFilter = (eventType: string): boolean => {
+  const passesInclude = includeRegex === null || includeRegex.test(eventType);
+  return passesInclude && !(excludeRegex !== null && excludeRegex.test(eventType));
+};
+
+// Port to listen on; taken from the PORT environment variable, falling back to 6007.
+const PORT = Number(process.env.PORT || 6007);
+// Directory for the JSONL event logs: LOG_DIR or `.cache/telemetry-debug`, resolved to an absolute path.
+const LOG_DIR = resolve(process.env.LOG_DIR || '.cache/telemetry-debug');
+// In-memory list of every event received since startup, each stamped with `receivedAt`.
+const events: Array<{ receivedAt: string; [key: string]: unknown }> = [];
+
+// Make sure the log directory exists before the server starts accepting events.
+await mkdir(LOG_DIR, { recursive: true });
+
+// HTTP server for the collector.
+//   POST /event-log        parse a JSON event, keep it in memory, print it (subject to the
+//                          include/exclude filters) and append it to today's JSONL file in LOG_DIR
+//   GET  /event-log        all stored events as a JSON array
+//   GET  /events[/<type>]  all stored events, or only those whose `eventType` equals <type>
+// Anything else is answered with 404.
+const server = createServer(async (req, res) => {
+  // POST /event-log — receive a telemetry event
+  if (req.method === 'POST' && req.url === '/event-log') {
+    // Keep the raw chunks and decode once at the end: appending each Buffer to a string decodes
+    // chunks independently and corrupts a multi-byte UTF-8 character split across two chunks.
+    const chunks: Buffer[] = [];
+    req.on('data', (chunk: Buffer) => {
+      chunks.push(chunk);
+    });
+    req.on('end', async () => {
+      let data;
+      let eventType;
+      let entry;
+      try {
+        data = JSON.parse(Buffer.concat(chunks).toString('utf8'));
+        // Reading a property of a `null` payload throws, which is reported as bad input below.
+        eventType = data.eventType || 'unknown';
+        entry = { receivedAt: new Date().toISOString(), ...data };
+      } catch {
+        res.statusCode = 400;
+        res.end('bad json');
+        return;
+      }
+
+      events.push(entry);
+
+      if (matchesFilter(eventType)) {
+        console.log(`\n\x1b[1;32m[telemetry] ${eventType}\x1b[0m`);
+        // `JSON.stringify` omits `undefined` properties, so this hides `metadata` from the output.
+        const logged = hideMetadata ? { ...data, metadata: undefined } : data;
+        console.log(JSON.stringify(logged, null, 2));
+      }
+
+      try {
+        await writeFile(
+          resolve(LOG_DIR, `events-${new Date().toISOString().slice(0, 10)}.jsonl`),
+          JSON.stringify(entry) + '\n',
+          { flag: 'a' }
+        );
+      } catch (error) {
+        // A failed disk write is a server-side problem, not a malformed request.
+        console.error('[telemetry] failed to write event log', error);
+        res.statusCode = 500;
+        res.end('failed to write event log');
+        return;
+      }
+
+      res.statusCode = 200;
+      res.end('OK');
+    });
+    return;
+  }
+
+  // GET /event-log — return all events (used by event-log-checker)
+  if (req.method === 'GET' && req.url === '/event-log') {
+    console.log(`Sending ${events.length} events`);
+    res.setHeader('Content-Type', 'application/json');
+    res.end(JSON.stringify(events));
+    return;
+  }
+
+  // GET /events or GET /events/:type — return all or filtered events
+  if (req.method === 'GET' && req.url?.startsWith('/events')) {
+    const typeFilter = req.url.split('/events/')[1];
+    const filtered = typeFilter ? events.filter((e) => e.eventType === typeFilter) : events;
+    res.setHeader('Content-Type', 'application/json');
+    res.end(JSON.stringify(filtered));
+    return;
+  }
-server.get('/event-log', (_req, res) => {
- console.log(`Sending ${events.length} events`);
- res.end(JSON.stringify(events));
+  res.statusCode = 404;
+  res.end('not found');
});
server.listen(PORT, () => {
- console.log(`Event log listening on ${PORT}`);
+  // Startup banner: where to send events, how to read them back, and which filters are active.
+  console.log(`Event log collector listening on http://localhost:${PORT}/event-log`);
+  console.log(`GET http://localhost:${PORT}/events to see all received events`);
+  console.log(`GET http://localhost:${PORT}/events/<type> to filter by event type`);
+  if (includeRegex) console.log(`Including only events matching: ${includePattern}`);
+  if (excludeRegex) console.log(`Excluding events matching: ${excludePattern}`);
+  console.log(`Logs written to ${LOG_DIR}`);
});
diff --git a/scripts/package.json b/scripts/package.json
index 569b491a5b41..d644d08dfda7 100644
--- a/scripts/package.json
+++ b/scripts/package.json
@@ -10,6 +10,11 @@
"check-package": "jiti ./check-package.ts",
"docs:check": "jiti ./docs/check-docs.ts",
"docs:codemod": "jiti ./snippets/codemod.ts",
+ "eval": "node ./eval/eval.ts",
+ "eval:collect-pr-data": "node ./eval/collect-pr-data.ts",
+ "eval:run-batch": "node ./eval/run-batch.ts",
+ "eval:sync-baselines": "node ./eval/sync-baselines.ts",
+ "eval:sync-storybook-version": "node ./eval/sync-storybook-version.ts",
"generate-sandboxes": "jiti ./sandbox/generate.ts",
"get-report-message": "jiti ./get-report-message.ts",
"get-sandbox-dir": "jiti ./get-sandbox-dir.ts",
@@ -42,10 +47,12 @@
},
"dependencies": {
"@actions/core": "^1.11.1",
+ "@anthropic-ai/claude-agent-sdk": "^0.2.85",
"@fal-works/esbuild-plugin-global-externals": "^2.1.2",
"@google-cloud/bigquery": "^6.2.1",
"@octokit/graphql": "^5.0.6",
"@octokit/request": "^8.4.1",
+ "@openai/codex-sdk": "^0.117.0",
"@polka/parse": "^1.0.0-next.28",
"@testing-library/dom": "^10.4.0",
"@testing-library/jest-dom": "^6.9.1",
@@ -74,6 +81,7 @@
"@vitest/coverage-v8": "^4.1.0",
"ansi-regex": "^6.0.1",
"chromatic": "^13.3.4",
+ "citty": "^0.2.1",
"codecov": "^3.8.1",
"commander": "^14.0.2",
"cross-env": "^7.0.3",
@@ -123,6 +131,7 @@
"ora": "^5.4.1",
"p-limit": "^7.2.0",
"p-retry": "^7.1.0",
+ "pathe": "^1.1.2",
"picocolors": "^1.1.0",
"playwright": "1.58.2",
"playwright-core": "1.58.2",
diff --git a/scripts/tsconfig.json b/scripts/tsconfig.json
index 15b5fedccdfb..9c5b78519b8b 100644
--- a/scripts/tsconfig.json
+++ b/scripts/tsconfig.json
@@ -1,6 +1,7 @@
{
"compileOnSave": false,
"compilerOptions": {
+ "customConditions": ["code"],
"baseUrl": ".",
"noEmit": true,
"incremental": false,
@@ -11,11 +12,12 @@
"moduleResolution": "bundler",
"target": "ESNext",
"module": "Preserve",
+ // Required for native Node TS execution (node file.ts) — we are migrating from jiti to native node
+ "allowImportingTsExtensions": true,
"skipLibCheck": true,
"allowSyntheticDefaultImports": true,
"esModuleInterop": true,
"isolatedModules": true,
- "allowImportingTsExtensions": true,
"strictBindCallApply": true,
"lib": ["ESNext"],
"types": ["node"],
diff --git a/yarn.lock b/yarn.lock
index 05c0c5654d59..32fef2de6b92 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -436,6 +436,44 @@ __metadata:
languageName: node
linkType: hard
+"@anthropic-ai/claude-agent-sdk@npm:^0.2.85":
+ version: 0.2.85
+ resolution: "@anthropic-ai/claude-agent-sdk@npm:0.2.85"
+ dependencies:
+ "@img/sharp-darwin-arm64": "npm:^0.34.2"
+ "@img/sharp-darwin-x64": "npm:^0.34.2"
+ "@img/sharp-linux-arm": "npm:^0.34.2"
+ "@img/sharp-linux-arm64": "npm:^0.34.2"
+ "@img/sharp-linux-x64": "npm:^0.34.2"
+ "@img/sharp-linuxmusl-arm64": "npm:^0.34.2"
+ "@img/sharp-linuxmusl-x64": "npm:^0.34.2"
+ "@img/sharp-win32-arm64": "npm:^0.34.2"
+ "@img/sharp-win32-x64": "npm:^0.34.2"
+ peerDependencies:
+ zod: ^4.0.0
+ dependenciesMeta:
+ "@img/sharp-darwin-arm64":
+ optional: true
+ "@img/sharp-darwin-x64":
+ optional: true
+ "@img/sharp-linux-arm":
+ optional: true
+ "@img/sharp-linux-arm64":
+ optional: true
+ "@img/sharp-linux-x64":
+ optional: true
+ "@img/sharp-linuxmusl-arm64":
+ optional: true
+ "@img/sharp-linuxmusl-x64":
+ optional: true
+ "@img/sharp-win32-arm64":
+ optional: true
+ "@img/sharp-win32-x64":
+ optional: true
+ checksum: 10c0/5bb31712460b03b264b489c38a2ddcac62ba60aad50da8cd6d3cebdaf46fae84c37473f25b7a4e20a6bda6f2310b4cc9f3574bc3f2e8f73a4a6e6bd0e04bd827
+ languageName: node
+ linkType: hard
+
"@aw-web-design/x-default-browser@npm:1.4.126":
version: 1.4.126
resolution: "@aw-web-design/x-default-browser@npm:1.4.126"
@@ -2972,7 +3010,7 @@ __metadata:
languageName: node
linkType: hard
-"@img/sharp-darwin-arm64@npm:0.34.5":
+"@img/sharp-darwin-arm64@npm:0.34.5, @img/sharp-darwin-arm64@npm:^0.34.2":
version: 0.34.5
resolution: "@img/sharp-darwin-arm64@npm:0.34.5"
dependencies:
@@ -2984,7 +3022,7 @@ __metadata:
languageName: node
linkType: hard
-"@img/sharp-darwin-x64@npm:0.34.5":
+"@img/sharp-darwin-x64@npm:0.34.5, @img/sharp-darwin-x64@npm:^0.34.2":
version: 0.34.5
resolution: "@img/sharp-darwin-x64@npm:0.34.5"
dependencies:
@@ -3066,7 +3104,7 @@ __metadata:
languageName: node
linkType: hard
-"@img/sharp-linux-arm64@npm:0.34.5":
+"@img/sharp-linux-arm64@npm:0.34.5, @img/sharp-linux-arm64@npm:^0.34.2":
version: 0.34.5
resolution: "@img/sharp-linux-arm64@npm:0.34.5"
dependencies:
@@ -3078,7 +3116,7 @@ __metadata:
languageName: node
linkType: hard
-"@img/sharp-linux-arm@npm:0.34.5":
+"@img/sharp-linux-arm@npm:0.34.5, @img/sharp-linux-arm@npm:^0.34.2":
version: 0.34.5
resolution: "@img/sharp-linux-arm@npm:0.34.5"
dependencies:
@@ -3126,7 +3164,7 @@ __metadata:
languageName: node
linkType: hard
-"@img/sharp-linux-x64@npm:0.34.5":
+"@img/sharp-linux-x64@npm:0.34.5, @img/sharp-linux-x64@npm:^0.34.2":
version: 0.34.5
resolution: "@img/sharp-linux-x64@npm:0.34.5"
dependencies:
@@ -3138,7 +3176,7 @@ __metadata:
languageName: node
linkType: hard
-"@img/sharp-linuxmusl-arm64@npm:0.34.5":
+"@img/sharp-linuxmusl-arm64@npm:0.34.5, @img/sharp-linuxmusl-arm64@npm:^0.34.2":
version: 0.34.5
resolution: "@img/sharp-linuxmusl-arm64@npm:0.34.5"
dependencies:
@@ -3150,7 +3188,7 @@ __metadata:
languageName: node
linkType: hard
-"@img/sharp-linuxmusl-x64@npm:0.34.5":
+"@img/sharp-linuxmusl-x64@npm:0.34.5, @img/sharp-linuxmusl-x64@npm:^0.34.2":
version: 0.34.5
resolution: "@img/sharp-linuxmusl-x64@npm:0.34.5"
dependencies:
@@ -3171,7 +3209,7 @@ __metadata:
languageName: node
linkType: hard
-"@img/sharp-win32-arm64@npm:0.34.5":
+"@img/sharp-win32-arm64@npm:0.34.5, @img/sharp-win32-arm64@npm:^0.34.2":
version: 0.34.5
resolution: "@img/sharp-win32-arm64@npm:0.34.5"
conditions: os=win32 & cpu=arm64
@@ -3185,7 +3223,7 @@ __metadata:
languageName: node
linkType: hard
-"@img/sharp-win32-x64@npm:0.34.5":
+"@img/sharp-win32-x64@npm:0.34.5, @img/sharp-win32-x64@npm:^0.34.2":
version: 0.34.5
resolution: "@img/sharp-win32-x64@npm:0.34.5"
conditions: os=win32 & cpu=x64
@@ -4665,6 +4703,86 @@ __metadata:
languageName: node
linkType: hard
+"@openai/codex-darwin-arm64@npm:@openai/codex@0.117.0-darwin-arm64":
+ version: 0.117.0-darwin-arm64
+ resolution: "@openai/codex@npm:0.117.0-darwin-arm64"
+ conditions: os=darwin & cpu=arm64
+ languageName: node
+ linkType: hard
+
+"@openai/codex-darwin-x64@npm:@openai/codex@0.117.0-darwin-x64":
+ version: 0.117.0-darwin-x64
+ resolution: "@openai/codex@npm:0.117.0-darwin-x64"
+ conditions: os=darwin & cpu=x64
+ languageName: node
+ linkType: hard
+
+"@openai/codex-linux-arm64@npm:@openai/codex@0.117.0-linux-arm64":
+ version: 0.117.0-linux-arm64
+ resolution: "@openai/codex@npm:0.117.0-linux-arm64"
+ conditions: os=linux & cpu=arm64
+ languageName: node
+ linkType: hard
+
+"@openai/codex-linux-x64@npm:@openai/codex@0.117.0-linux-x64":
+ version: 0.117.0-linux-x64
+ resolution: "@openai/codex@npm:0.117.0-linux-x64"
+ conditions: os=linux & cpu=x64
+ languageName: node
+ linkType: hard
+
+"@openai/codex-sdk@npm:^0.117.0":
+ version: 0.117.0
+ resolution: "@openai/codex-sdk@npm:0.117.0"
+ dependencies:
+ "@openai/codex": "npm:0.117.0"
+ checksum: 10c0/96f86890fd45a4030a8e9b6f8466389a015d0ee534b1661b56463a1fd210c6fc3af0ea1f3ce57306a13a9b6ff6197d6409a4d5af7f6d7c90e672009eee15e3fd
+ languageName: node
+ linkType: hard
+
+"@openai/codex-win32-arm64@npm:@openai/codex@0.117.0-win32-arm64":
+ version: 0.117.0-win32-arm64
+ resolution: "@openai/codex@npm:0.117.0-win32-arm64"
+ conditions: os=win32 & cpu=arm64
+ languageName: node
+ linkType: hard
+
+"@openai/codex-win32-x64@npm:@openai/codex@0.117.0-win32-x64":
+ version: 0.117.0-win32-x64
+ resolution: "@openai/codex@npm:0.117.0-win32-x64"
+ conditions: os=win32 & cpu=x64
+ languageName: node
+ linkType: hard
+
+"@openai/codex@npm:0.117.0":
+ version: 0.117.0
+ resolution: "@openai/codex@npm:0.117.0"
+ dependencies:
+ "@openai/codex-darwin-arm64": "npm:@openai/codex@0.117.0-darwin-arm64"
+ "@openai/codex-darwin-x64": "npm:@openai/codex@0.117.0-darwin-x64"
+ "@openai/codex-linux-arm64": "npm:@openai/codex@0.117.0-linux-arm64"
+ "@openai/codex-linux-x64": "npm:@openai/codex@0.117.0-linux-x64"
+ "@openai/codex-win32-arm64": "npm:@openai/codex@0.117.0-win32-arm64"
+ "@openai/codex-win32-x64": "npm:@openai/codex@0.117.0-win32-x64"
+ dependenciesMeta:
+ "@openai/codex-darwin-arm64":
+ optional: true
+ "@openai/codex-darwin-x64":
+ optional: true
+ "@openai/codex-linux-arm64":
+ optional: true
+ "@openai/codex-linux-x64":
+ optional: true
+ "@openai/codex-win32-arm64":
+ optional: true
+ "@openai/codex-win32-x64":
+ optional: true
+ bin:
+ codex: bin/codex.js
+ checksum: 10c0/a5104a396f0f33558c9a402012bf2dd954f5d3465d3b0bb5fe780d265760a3c72b64af4a2d42a0012f661b7e4a274a42c5d4f5582de115613557f480dbec3b5b
+ languageName: node
+ linkType: hard
+
"@oxc-project/runtime@npm:0.115.0":
version: 0.115.0
resolution: "@oxc-project/runtime@npm:0.115.0"
@@ -8751,10 +8869,12 @@ __metadata:
resolution: "@storybook/scripts@workspace:scripts"
dependencies:
"@actions/core": "npm:^1.11.1"
+ "@anthropic-ai/claude-agent-sdk": "npm:^0.2.85"
"@fal-works/esbuild-plugin-global-externals": "npm:^2.1.2"
"@google-cloud/bigquery": "npm:^6.2.1"
"@octokit/graphql": "npm:^5.0.6"
"@octokit/request": "npm:^8.4.1"
+ "@openai/codex-sdk": "npm:^0.117.0"
"@polka/parse": "npm:^1.0.0-next.28"
"@testing-library/dom": "npm:^10.4.0"
"@testing-library/jest-dom": "npm:^6.9.1"
@@ -8784,6 +8904,7 @@ __metadata:
"@vitest/coverage-v8": "npm:^4.1.0"
ansi-regex: "npm:^6.0.1"
chromatic: "npm:^13.3.4"
+ citty: "npm:^0.2.1"
codecov: "npm:^3.8.1"
commander: "npm:^14.0.2"
cross-env: "npm:^7.0.3"
@@ -8834,6 +8955,7 @@ __metadata:
oxfmt: "npm:^0.41.0"
p-limit: "npm:^7.2.0"
p-retry: "npm:^7.1.0"
+ pathe: "npm:^1.1.2"
picocolors: "npm:^1.1.0"
playwright: "npm:1.58.2"
playwright-core: "npm:1.58.2"
@@ -13753,6 +13875,13 @@ __metadata:
languageName: node
linkType: hard
+"citty@npm:^0.2.1":
+ version: 0.2.1
+ resolution: "citty@npm:0.2.1"
+ checksum: 10c0/504ac5aeb076f750bf5f25d40c730083e8ed6112eac2f00dbe341a223c46ad16893ce73dfdb55b2d0da505100b9678968ee0443637c45b21917db48daa5a6977
+ languageName: node
+ linkType: hard
+
"cjs-module-lexer@npm:^1.2.3":
version: 1.4.3
resolution: "cjs-module-lexer@npm:1.4.3"