From 407f45195069bc6c05e3fa0ca51d79e686e55ac8 Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Mon, 20 Apr 2026 00:05:25 +0700
Subject: [PATCH 01/17] Eval: require a getComputedStyle assertion in stories
 to prove CSS is loaded
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses #34594. Adds a prompt-level instruction and a grading flag
that together catch "renders fine, but user CSS never loaded" failures.

- pattern-copy-play.md: new Step 7 requires exactly one story to assert
  a component-specific computed style via getComputedStyle.
- grade.ts: records hasComputedStyleAssertion based on whether the staged
  diff contains "getComputedStyle" (getChangedFiles now also returns the
  full `git diff --cached` patch, fetched in parallel with the existing
  --name-status call; no extra file reads).

Chose the prompt+diff approach over a runtime stylesheet heuristic
(filtering document.styleSheets, isolated all:initial probe, etc.)
because:

- The agent already knows what "styled correctly" means for a given
  component; a component-specific computed-style assertion catches the
  real failure ("bg-blue-600 did not apply") rather than a generic
  "something was applied" signal.
- No fragile filtering of vitest-browser / storybook / addon stylesheet
  sources. Addons keep shipping new sheets; that filter would bit-rot.
- Failures surface as normal Vitest assertion failures and already flow
  through pass/fail grading — no new counter, no new warning channel,
  no changes to render-analysis.
- Complementary to a future runtime heuristic if we want one: prompt-level
  catches "agent misconfigured the design system"; runtime catches "agent
  shipped a visibly unstyled story without the check".
---
 scripts/eval/lib/grade.ts                 | 36 ++++++++++++++++++-----
 scripts/eval/lib/publish-trial.test.ts    |  3 ++
 scripts/eval/lib/result-docs.test.ts      |  1 +
 scripts/eval/lib/run-trial.test.ts        |  2 ++
 scripts/eval/lib/run-trial.ts             |  1 +
 scripts/eval/prompts/pattern-copy-play.md | 18 ++++++++++--
 6 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts
index c6b1bb1c9888..9a537769cfa4 100644
--- a/scripts/eval/lib/grade.ts
+++ b/scripts/eval/lib/grade.ts
@@ -52,6 +52,8 @@ export interface Grade {
   ghostStories?: GhostStoryGrade;
   baselinePreviewStories?: StoryRenderGrade;
   storyRender?: StoryRenderGrade;
+  /** True when the agent added at least one `getComputedStyle` call (CSS-loaded assertion). */
+  hasComputedStyleAssertion: boolean;
 }
 
 /** Filter file changes to only storybook-related ones. */
@@ -121,12 +123,19 @@ export async function grade(
 
   // Changed files
   logger.logStep('Collecting agent changes...');
-  const fileChanges = await getChangedFiles(repoRoot, baselineCommit);
+  const { changes: fileChanges, rawDiff } = await getChangedFiles(repoRoot, baselineCommit);
   const storybookChanges = filterStorybookFiles(fileChanges);
   logger.logSuccess(
     `${fileChanges.length} files changed (${storybookChanges.length} storybook-related)`
   );
 
+  const hasComputedStyleAssertion = rawDiff.includes('getComputedStyle');
+  if (hasComputedStyleAssertion) {
+    logger.logSuccess('CSS-loaded assertion present (getComputedStyle found in diff)');
+  } else {
+    logger.logError('CSS-loaded assertion missing (no getComputedStyle in diff)');
+  }
+
   // Storybook build + TypeScript check in parallel
   logger.logStep('Running storybook build + typecheck...');
   const [build, tsc] = await Promise.all([
@@ -206,6 +215,7 @@ export async function grade(
     ghostStories,
     baselinePreviewStories: baselinePreviewRun.summary,
     storyRender: storyRenderRun.summary,
+    hasComputedStyleAssertion,
   };
 
   const score = computeQualityScore({
@@ -240,15 +250,27 @@ function parseGitDiffStatus(rawStatus?: string): GitDiffStatus {
     : 'M';
 }
 
-async function getChangedFiles(repoRoot: string, baseline: string): Promise<FileChange[]> {
+async function getChangedFiles(
+  repoRoot: string,
+  baseline: string
+): Promise<{ changes: FileChange[]; rawDiff: string }> {
   // Stage all files so `git diff --cached` picks up new files the agent created.
   // Safe: this runs on an ephemeral trial copy, not the real repo.
   await x('git', ['add', '-A'], { nodeOptions: { cwd: repoRoot } });
-  const { stdout } = await x('git', ['diff', '--cached', '--name-status', baseline], {
-    throwOnError: false,
-    nodeOptions: { cwd: repoRoot },
-  });
-  return parseChangedFiles(stdout);
+  const [nameStatus, patch] = await Promise.all([
+    x('git', ['diff', '--cached', '--name-status', baseline], {
+      throwOnError: false,
+      nodeOptions: { cwd: repoRoot },
+    }),
+    x('git', ['diff', '--cached', baseline], {
+      throwOnError: false,
+      nodeOptions: { cwd: repoRoot },
+    }),
+  ]);
+  return {
+    changes: parseChangedFiles(nameStatus.stdout),
+    rawDiff: patch.stdout,
+  };
 }
 
 export async function collectGhostStoriesGrade(
diff --git a/scripts/eval/lib/publish-trial.test.ts b/scripts/eval/lib/publish-trial.test.ts
index db700424b8ad..7700fb9dedde 100644
--- a/scripts/eval/lib/publish-trial.test.ts
+++ b/scripts/eval/lib/publish-trial.test.ts
@@ -178,6 +178,7 @@ describe('publishTrialBranch', () => {
           typeCheckErrors: 0,
           fileChanges: [],
           storybookChanges: [],
+          hasComputedStyleAssertion: false,
           ghostStories: {
             candidateCount: 6,
             total: 4,
@@ -341,6 +342,7 @@ describe('publishTrialBranch', () => {
             typeCheckErrors: 0,
             fileChanges: [],
             storybookChanges: [],
+            hasComputedStyleAssertion: false,
           },
           score: {
             score: 1,
@@ -460,6 +462,7 @@ describe('publishTrialBranch', () => {
           typeCheckErrors: 0,
           fileChanges: [],
           storybookChanges: [],
+          hasComputedStyleAssertion: false,
         },
         score: {
           score: 1,
diff --git a/scripts/eval/lib/result-docs.test.ts b/scripts/eval/lib/result-docs.test.ts
index f53b31a1536e..5afdc38f0cd0 100644
--- a/scripts/eval/lib/result-docs.test.ts
+++ b/scripts/eval/lib/result-docs.test.ts
@@ -262,6 +262,7 @@ describe('normalizeTranscriptForDocs', () => {
         typeCheckErrors: 0,
         fileChanges: [],
         storybookChanges: [],
+        hasComputedStyleAssertion: false,
       },
       score: {
         score: 1,
diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts
index 6022b45b5e62..2b9a9a182ea9 100644
--- a/scripts/eval/lib/run-trial.test.ts
+++ b/scripts/eval/lib/run-trial.test.ts
@@ -328,6 +328,7 @@ describe('runTrial pipeline', () => {
           typeCheckErrors: 0,
           fileChanges: [],
           storybookChanges: [],
+          hasComputedStyleAssertion: false,
         },
         score: {
           score: 0,
@@ -419,6 +420,7 @@ function setupMocks(overrides?: {
         { path: '.storybook/preview.tsx', gitStatus: 'A' },
         { path: 'src/Button.stories.tsx', gitStatus: 'A' },
       ],
+      hasComputedStyleAssertion: true,
       ...(buildSuccess
         ? {
             storyRender: {
diff --git a/scripts/eval/lib/run-trial.ts b/scripts/eval/lib/run-trial.ts
index 55e2040bcdaf..24ff5a81770d 100644
--- a/scripts/eval/lib/run-trial.ts
+++ b/scripts/eval/lib/run-trial.ts
@@ -113,6 +113,7 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
       typeCheckErrors: 0,
       fileChanges: [],
       storybookChanges: [],
+      hasComputedStyleAssertion: false,
     },
     score: {
       score: 0,
diff --git a/scripts/eval/prompts/pattern-copy-play.md b/scripts/eval/prompts/pattern-copy-play.md
index a8ce9b20bd94..081847605f37 100644
--- a/scripts/eval/prompts/pattern-copy-play.md
+++ b/scripts/eval/prompts/pattern-copy-play.md
@@ -462,7 +462,21 @@ Examples of useful checks:
 - a toast, alert, or badge has the expected accessible text and visual state
 - a CSS class or computed style confirms the real state that matters
 
-## 7. Cover the patterns you found
+## 7. Prove CSS is loaded in exactly one story
+
+In exactly one story, assert a component-specific computed style. `toBeVisible` passes on an unstyled component; a concrete style value proves the shared preview loaded the app's CSS.
+
+Read a styling value from the component's source and assert it with `getComputedStyle`:
+
+```tsx
+play: async ({ canvas }) => {
+  const button = canvas.getByRole('button', { name: /submit/i });
+  // PrimaryButton uses bg-blue-600 — fails if Tailwind / global CSS did not load.
+  await expect(getComputedStyle(button).backgroundColor).toBe('rgb(37, 99, 235)');
+},
+```
+
+## 8. Cover the patterns you found
 
 Write stories for the real patterns in the codebase, for example:
 - a low-level reusable component in real JSX usage
@@ -495,7 +509,7 @@ export const Default: Story = {
 };
 ```
 
-## 8. Verify both rendering and types
+## 9. Verify both rendering and types
 
 As you work, verify the stories with Vitest:
 

From 720d018e1afa1b66c3563704e09fe6088b10f4a1 Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Mon, 20 Apr 2026 00:37:59 +0700
Subject: [PATCH 02/17] Eval prompt: reword end-state sentence to avoid biasing
 toward render()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'Render call' could read as 'you need a render: () => ... function',
which is wrong — args stories have no render call and that's the
preferred shape for prop-driven components. Softening to 'just
rendering the component in the story is enough' keeps the intent
without steering toward render().
---
 scripts/eval/prompts/pattern-copy-play.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/eval/prompts/pattern-copy-play.md b/scripts/eval/prompts/pattern-copy-play.md
index 081847605f37..e5e26ba90223 100644
--- a/scripts/eval/prompts/pattern-copy-play.md
+++ b/scripts/eval/prompts/pattern-copy-play.md
@@ -3,7 +3,7 @@
 Your goal is to make Storybook fully functional in this project by analyzing the codebase,
 configuring the preview with the right decorators, and writing stories for some components.
 
-The end state should be a Storybook where any component — from a small button to a full page — can be added without story-specific workarounds. All necessary providers, CSS, browser state, and network mocks should live in the shared preview so that new stories only need the component import and a render call.
+The end state should be a Storybook where any component — from a small button to a full page — can be added without story-specific workarounds. All necessary providers, CSS, browser state, and network mocks should live in the shared preview so that just rendering the component in the story is enough.
 
 After each created story, run Vitest to verify it renders.
 If the test fails, read the error, fix the issue, and re-run until it passes before moving on.

From 88dffdb4ea4597ed027a9c8497a52f235edaa0b9 Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Mon, 20 Apr 2026 20:46:04 +0700
Subject: [PATCH 03/17] Eval grade: scope CSS-assertion check to added lines in
 story files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before: hasComputedStyleAssertion was a plain rawDiff.includes('getComputedStyle'),
which matched the prompt markdown (written to .storybook/eval-results/prompt.md
before grade runs) and the transcript JSON — both of which contain the token
verbatim because the new prompt Step 7 and the agent's own tool-output lines
include it. The flag was effectively tautological: true whenever the prompt was
staged, regardless of what the agent did.

After: parse the unified patch, track which file each hunk belongs to via the
'+++ b/<path>' headers, and only consider added lines (skipping the '+++' header
itself) that live in story files present in storybookChanges and not deleted.
Uses the existing STORY_FILE_PATTERN from story-render.ts (now exported) as the
single source of truth for what counts as a story file.

Exports diffAddsTokenInStoryFiles as a pure helper with unit tests covering the
false-positive paths (prompt.md / data.json), deleted lines, the +++ header,
and files not in storybookChanges.
---
 scripts/eval/lib/grade.test.ts   | 65 ++++++++++++++++++++++++++++++++
 scripts/eval/lib/grade.ts        | 45 ++++++++++++++++++++--
 scripts/eval/lib/story-render.ts |  2 +-
 3 files changed, 108 insertions(+), 4 deletions(-)

diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts
index 1d6cf13b1779..3d70e33be88b 100644
--- a/scripts/eval/lib/grade.test.ts
+++ b/scripts/eval/lib/grade.test.ts
@@ -1,6 +1,7 @@
 import { describe, expect, it } from 'vitest';
 
 import {
+  diffAddsTokenInStoryFiles,
   filterStorybookFiles,
   computeQualityScore,
   countTypeCheckErrors,
@@ -56,6 +57,70 @@ describe('filterStorybookFiles', () => {
   });
 });
 
+describe('diffAddsTokenInStoryFiles', () => {
+  const storyChanges: FileChange[] = [
+    { path: 'src/Button.stories.tsx', gitStatus: 'A' },
+    { path: '.storybook/preview.tsx', gitStatus: 'M' },
+  ];
+
+  it('returns true when the token is added inside a story file', () => {
+    const diff = [
+      'diff --git a/src/Button.stories.tsx b/src/Button.stories.tsx',
+      '--- a/src/Button.stories.tsx',
+      '+++ b/src/Button.stories.tsx',
+      '@@ -0,0 +1,3 @@',
+      '+  const button = canvas.getByRole("button");',
+      '+  await expect(getComputedStyle(button).backgroundColor).toBe("rgb(37, 99, 235)");',
+    ].join('\n');
+    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'getComputedStyle')).toBe(true);
+  });
+
+  it('ignores the token when it only appears in non-story files (prompt.md, data.json)', () => {
+    const diff = [
+      '+++ b/.storybook/eval-results/prompt.md',
+      '+Use getComputedStyle to prove CSS loaded',
+      '+++ b/.storybook/eval-results/data.json',
+      '+"content": "...getComputedStyle..."',
+    ].join('\n');
+    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'getComputedStyle')).toBe(false);
+  });
+
+  it('ignores the token on removed lines', () => {
+    const diff = [
+      '+++ b/src/Button.stories.tsx',
+      '-  await expect(getComputedStyle(button).backgroundColor).toBe("red");',
+    ].join('\n');
+    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'getComputedStyle')).toBe(false);
+  });
+
+  it('does not match the `+++ b/...` file header itself', () => {
+    const diff = ['+++ b/src/getComputedStyle-notes.stories.tsx'].join('\n');
+    // The header mentions the token, but no content line does. Also the file is not in changes.
+    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'getComputedStyle')).toBe(false);
+  });
+
+  it('only considers files present in storybookChanges as story files', () => {
+    const diff = [
+      '+++ b/src/Button.stories.tsx',
+      '+  await expect(getComputedStyle(button).backgroundColor).toBe("red");',
+    ].join('\n');
+    // Pass an empty list — even though the file is named like a story, it is not in changes.
+    expect(diffAddsTokenInStoryFiles(diff, [], 'getComputedStyle')).toBe(false);
+  });
+
+  it('ignores non-story storybook files like .storybook/preview.tsx', () => {
+    const diff = [
+      '+++ b/.storybook/preview.tsx',
+      '+import { initialize } from "getComputedStyle";',
+    ].join('\n');
+    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'getComputedStyle')).toBe(false);
+  });
+
+  it('returns false for an empty diff', () => {
+    expect(diffAddsTokenInStoryFiles('', storyChanges, 'getComputedStyle')).toBe(false);
+  });
+});
+
 describe('computeQualityScore', () => {
   it('uses normalized preview gain as the score', () => {
     const result = computeQualityScore({
diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts
index 9a537769cfa4..e718331129d0 100644
--- a/scripts/eval/lib/grade.ts
+++ b/scripts/eval/lib/grade.ts
@@ -11,6 +11,7 @@ import {
   getGeneratedStoryFiles,
   getScriptRunCommand,
   runStoryRenderPass,
+  STORY_FILE_PATTERN,
   type StoryRenderGrade,
   withBaselinePreviewEnvironment,
 } from './story-render.ts';
@@ -95,6 +96,40 @@ export function countTypeCheckErrors(tscOutput: string): number {
   return (tscOutput.match(/error TS\d+/g) || []).length;
 }
 
+/**
+ * Walks a unified `git diff` patch and returns true if any added line (`+`, not the `+++` header)
+ * inside a story file contains `token`.
+ *
+ * Guards against false positives from the prompt markdown, the agent transcript, and other
+ * artifacts that end up in the diff because we stage every file in the trial worktree.
+ */
+export function diffAddsTokenInStoryFiles(
+  rawDiff: string,
+  storybookChanges: FileChange[],
+  token: string
+): boolean {
+  const changedStoryPaths = new Set(
+    storybookChanges
+      .filter((change) => change.gitStatus !== 'D' && STORY_FILE_PATTERN.test(change.path))
+      .map((change) => change.path)
+  );
+  if (changedStoryPaths.size === 0) return false;
+
+  let currentPathIsStory = false;
+  for (const line of rawDiff.split('\n')) {
+    if (line.startsWith('+++ ')) {
+      // `+++ b/<path>` (or `+++ /dev/null` for deletions). Track whether we're now inside a story file.
+      const path = line.slice(4).replace(/^b\//, '');
+      currentPathIsStory = changedStoryPaths.has(path);
+      continue;
+    }
+    if (!currentPathIsStory) continue;
+    if (!line.startsWith('+') || line.startsWith('+++')) continue;
+    if (line.includes(token)) return true;
+  }
+  return false;
+}
+
 /** Parse git diff --name-status output into FileChange objects. */
 export function parseChangedFiles(gitOutput: string): FileChange[] {
   return gitOutput
@@ -129,11 +164,15 @@ export async function grade(
     `${fileChanges.length} files changed (${storybookChanges.length} storybook-related)`
   );
 
-  const hasComputedStyleAssertion = rawDiff.includes('getComputedStyle');
+  const hasComputedStyleAssertion = diffAddsTokenInStoryFiles(
+    rawDiff,
+    storybookChanges,
+    'getComputedStyle'
+  );
   if (hasComputedStyleAssertion) {
-    logger.logSuccess('CSS-loaded assertion present (getComputedStyle found in diff)');
+    logger.logSuccess('CSS-loaded assertion present (getComputedStyle added in a story file)');
   } else {
-    logger.logError('CSS-loaded assertion missing (no getComputedStyle in diff)');
+    logger.logError('CSS-loaded assertion missing (no getComputedStyle added in a story file)');
   }
 
   // Storybook build + TypeScript check in parallel
diff --git a/scripts/eval/lib/story-render.ts b/scripts/eval/lib/story-render.ts
index eb52af619c05..f8c6ef3473a2 100644
--- a/scripts/eval/lib/story-render.ts
+++ b/scripts/eval/lib/story-render.ts
@@ -7,7 +7,7 @@ import type { FileChange } from './grade.ts';
 import { detectPackageManager, resolveInstallRoot } from './package-manager.ts';
 import type { Logger } from './utils.ts';
 
-const STORY_FILE_PATTERN = /\.(stories|story)\.[tj]sx?$/;
+export const STORY_FILE_PATTERN = /\.(stories|story)\.[tj]sx?$/;
 
 export interface StoryRenderGrade {
   total: number;

From fd48b038d8bb1e23ac0eb3aa266a73fa9fc7413d Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Mon, 20 Apr 2026 21:13:13 +0700
Subject: [PATCH 04/17] Eval: name the CSS-check story `CssCheck` so telemetry
 can find it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Aligns the prompt + grade check with the Slack agreement: instead of
hoping the agent adds *some* `getComputedStyle` call somewhere, the
prompt now asks for one story explicitly named `CssCheck`. That
specific story name is what the AI-stories vitest run in core will
grep for to attribute the pass/fail result in the
`ai-setup-final-scoring` telemetry event.

- `pattern-copy-play.md` Step 7: heading + example updated to
  `export const CssCheck: Story = { ... }`.
- `grade.ts`: `hasComputedStyleAssertion` -> `hasCssCheckStory`,
  token matched in the diff changed from `getComputedStyle` to
  `export const CssCheck`.
- `grade.test.ts`: added two tests locking in the new use case
  (positive: story-file diff with the export; negative: prompt.md
  false positive).
- Trial / publish / result-docs test mocks renamed to match.

Rationale (from Slack): giving the story a known name means
telemetry in core can report on the CSS check result directly,
without layering on a separate tag. The story also ends up being
educational — a visible example of how to verify CSS loaded. No
tag, no new telemetry field required on top of whatever core
adds in a follow-up PR.
---
 scripts/eval/lib/grade.test.ts            | 23 ++++++++++++++++
 scripts/eval/lib/grade.ts                 | 19 +++++++------
 scripts/eval/lib/publish-trial.test.ts    |  6 ++---
 scripts/eval/lib/result-docs.test.ts      |  2 +-
 scripts/eval/lib/run-trial.test.ts        |  4 +--
 scripts/eval/lib/run-trial.ts             |  2 +-
 scripts/eval/prompts/pattern-copy-play.md | 33 +++++++++++++++++------
 7 files changed, 66 insertions(+), 23 deletions(-)

diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts
index 3d70e33be88b..737feec93425 100644
--- a/scripts/eval/lib/grade.test.ts
+++ b/scripts/eval/lib/grade.test.ts
@@ -119,6 +119,29 @@ describe('diffAddsTokenInStoryFiles', () => {
   it('returns false for an empty diff', () => {
     expect(diffAddsTokenInStoryFiles('', storyChanges, 'getComputedStyle')).toBe(false);
   });
+
+  it('detects an `export const CssCheck` story added in a story file', () => {
+    const diff = [
+      '+++ b/src/Button.stories.tsx',
+      '@@ -0,0 +1,6 @@',
+      '+export const CssCheck: Story = {',
+      '+  args: { children: "Submit" },',
+      '+  play: async ({ canvas }) => {',
+      '+    const button = canvas.getByRole("button");',
+      '+    await expect(getComputedStyle(button).backgroundColor).toBe("rgb(37, 99, 235)");',
+      '+  },',
+      '+};',
+    ].join('\n');
+    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'export const CssCheck')).toBe(true);
+  });
+
+  it('ignores `export const CssCheck` added outside of story files (e.g. prompt.md)', () => {
+    const diff = [
+      '+++ b/.storybook/eval-results/prompt.md',
+      '+Name this story `CssCheck`, for example `export const CssCheck: Story = { ... }`.',
+    ].join('\n');
+    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'export const CssCheck')).toBe(false);
+  });
 });
 
 describe('computeQualityScore', () => {
diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts
index e718331129d0..53a78faa3d5a 100644
--- a/scripts/eval/lib/grade.ts
+++ b/scripts/eval/lib/grade.ts
@@ -53,8 +53,11 @@ export interface Grade {
   ghostStories?: GhostStoryGrade;
   baselinePreviewStories?: StoryRenderGrade;
   storyRender?: StoryRenderGrade;
-  /** True when the agent added at least one `getComputedStyle` call (CSS-loaded assertion). */
-  hasComputedStyleAssertion: boolean;
+  /**
+   * True when the agent added a story named `CssCheck` (a `play` function that asserts a
+   * component-specific computed style, to prove the shared preview loaded the app's CSS).
+   */
+  hasCssCheckStory: boolean;
 }
 
 /** Filter file changes to only storybook-related ones. */
@@ -164,15 +167,15 @@ export async function grade(
     `${fileChanges.length} files changed (${storybookChanges.length} storybook-related)`
   );
 
-  const hasComputedStyleAssertion = diffAddsTokenInStoryFiles(
+  const hasCssCheckStory = diffAddsTokenInStoryFiles(
     rawDiff,
     storybookChanges,
-    'getComputedStyle'
+    'export const CssCheck'
   );
-  if (hasComputedStyleAssertion) {
-    logger.logSuccess('CSS-loaded assertion present (getComputedStyle added in a story file)');
+  if (hasCssCheckStory) {
+    logger.logSuccess('CssCheck story present (export const CssCheck added in a story file)');
   } else {
-    logger.logError('CSS-loaded assertion missing (no getComputedStyle added in a story file)');
+    logger.logError('CssCheck story missing (no export const CssCheck added in a story file)');
   }
 
   // Storybook build + TypeScript check in parallel
@@ -254,7 +257,7 @@ export async function grade(
     ghostStories,
     baselinePreviewStories: baselinePreviewRun.summary,
     storyRender: storyRenderRun.summary,
-    hasComputedStyleAssertion,
+    hasCssCheckStory,
   };
 
   const score = computeQualityScore({
diff --git a/scripts/eval/lib/publish-trial.test.ts b/scripts/eval/lib/publish-trial.test.ts
index 7700fb9dedde..a9400c7bf277 100644
--- a/scripts/eval/lib/publish-trial.test.ts
+++ b/scripts/eval/lib/publish-trial.test.ts
@@ -178,7 +178,7 @@ describe('publishTrialBranch', () => {
           typeCheckErrors: 0,
           fileChanges: [],
           storybookChanges: [],
-          hasComputedStyleAssertion: false,
+          hasCssCheckStory: false,
           ghostStories: {
             candidateCount: 6,
             total: 4,
@@ -342,7 +342,7 @@ describe('publishTrialBranch', () => {
             typeCheckErrors: 0,
             fileChanges: [],
             storybookChanges: [],
-            hasComputedStyleAssertion: false,
+            hasCssCheckStory: false,
           },
           score: {
             score: 1,
@@ -462,7 +462,7 @@ describe('publishTrialBranch', () => {
           typeCheckErrors: 0,
           fileChanges: [],
           storybookChanges: [],
-          hasComputedStyleAssertion: false,
+          hasCssCheckStory: false,
         },
         score: {
           score: 1,
diff --git a/scripts/eval/lib/result-docs.test.ts b/scripts/eval/lib/result-docs.test.ts
index 5afdc38f0cd0..878a9d4ceb1a 100644
--- a/scripts/eval/lib/result-docs.test.ts
+++ b/scripts/eval/lib/result-docs.test.ts
@@ -262,7 +262,7 @@ describe('normalizeTranscriptForDocs', () => {
         typeCheckErrors: 0,
         fileChanges: [],
         storybookChanges: [],
-        hasComputedStyleAssertion: false,
+        hasCssCheckStory: false,
       },
       score: {
         score: 1,
diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts
index 2b9a9a182ea9..b3e41d0347b2 100644
--- a/scripts/eval/lib/run-trial.test.ts
+++ b/scripts/eval/lib/run-trial.test.ts
@@ -328,7 +328,7 @@ describe('runTrial pipeline', () => {
           typeCheckErrors: 0,
           fileChanges: [],
           storybookChanges: [],
-          hasComputedStyleAssertion: false,
+          hasCssCheckStory: false,
         },
         score: {
           score: 0,
@@ -420,7 +420,7 @@ function setupMocks(overrides?: {
         { path: '.storybook/preview.tsx', gitStatus: 'A' },
         { path: 'src/Button.stories.tsx', gitStatus: 'A' },
       ],
-      hasComputedStyleAssertion: true,
+      hasCssCheckStory: true,
       ...(buildSuccess
         ? {
             storyRender: {
diff --git a/scripts/eval/lib/run-trial.ts b/scripts/eval/lib/run-trial.ts
index 24ff5a81770d..3ffbae95cb49 100644
--- a/scripts/eval/lib/run-trial.ts
+++ b/scripts/eval/lib/run-trial.ts
@@ -113,7 +113,7 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
       typeCheckErrors: 0,
       fileChanges: [],
       storybookChanges: [],
-      hasComputedStyleAssertion: false,
+      hasCssCheckStory: false,
     },
     score: {
       score: 0,
diff --git a/scripts/eval/prompts/pattern-copy-play.md b/scripts/eval/prompts/pattern-copy-play.md
index e5e26ba90223..7aca3ddf6296 100644
--- a/scripts/eval/prompts/pattern-copy-play.md
+++ b/scripts/eval/prompts/pattern-copy-play.md
@@ -19,6 +19,7 @@ Read enough of the app to understand the full runtime environment before writing
 
 Do not stop at `main.tsx` or `App.tsx`.
 Follow imports into providers, pages, hooks, and shared components until you know:
+
 - which providers exist
 - which CSS files are injected
 - which queries fetch data
@@ -42,6 +43,7 @@ createRoot(document.getElementById('root')!).render(
 ```
 
 That means Storybook should copy:
+
 - the `index.css` import
 - the `SessionProvider`
 - the same provider order
@@ -68,6 +70,7 @@ const savedTheme = localStorage.getItem('theme');
 ```
 
 That means the default Storybook setup should discover and prepare:
+
 - provider state
 - MSW handlers for queries
 - browser-state values that are actually read during render
@@ -77,6 +80,7 @@ That means the default Storybook setup should discover and prepare:
 Set up Storybook once so most stories work without story-specific setup.
 
 Start with the smallest faithful environment:
+
 - the real provider tree
 - the real root CSS
 - seeded browser state if the app reads it during render
@@ -116,6 +120,7 @@ export default preview;
 ```
 
 Use this same idea for:
+
 - providers
 - root CSS
 - browser state
@@ -141,6 +146,7 @@ export default preview;
 If the app uses portals, copy that setup into Storybook too.
 
 Look for patterns like:
+
 - `createPortal(...)`
 - modal, dialog, drawer, popover, tooltip, toast, or dropdown portal components
 - hard-coded roots such as `#portal-root`, `#modal-root`, `#drawer-root`, or `#toast-root`
@@ -270,6 +276,7 @@ Write colocated stories for top-level components, from low-level reusable compon
 Write up to 10 story files, or fewer only if the codebase clearly has fewer meaningful targets.
 
 The stories should use JSX copied from real usage patterns in:
+
 - pages
 - app shells
 - routes
@@ -401,6 +408,7 @@ Every named story export must have a `play` function.
 The `play` function is not optional, even for simple stories.
 
 The purpose of the `play` function is to prove that the story actually works in the copied Storybook environment:
+
 - the story renders something real and non-empty
 - the decorators provide the needed context
 - the CSS is applied well enough for the intended state to be visible
@@ -411,6 +419,7 @@ Use `play` functions to verify behavior, not just to click around.
 A story without assertions is incomplete.
 
 Use tools from `storybook/test` such as:
+
 - `expect`
 - `waitFor`
 
@@ -448,6 +457,7 @@ export const FilledForm: Story = {
 ```
 
 The assertions should match the real pattern you copied:
+
 - for provider-backed stories, assert the provider-dependent UI appears correctly
 - for mocked-data stories, wait for the mocked data to appear and assert on it
 - for CSS-sensitive states, assert on visibility, text layout, class-driven states, or meaningful computed styles
@@ -455,6 +465,7 @@ The assertions should match the real pattern you copied:
 - for portal stories, query from `canvasElement.ownerDocument` when the UI renders outside the canvas
 
 Examples of useful checks:
+
 - a themed button has the expected label and is visibly enabled or disabled
 - a modal opened through a decorator or provider is visible in the portal root
 - mocked API data appears in the page instead of a loading spinner forever
@@ -462,23 +473,27 @@ Examples of useful checks:
 - a toast, alert, or badge has the expected accessible text and visual state
 - a CSS class or computed style confirms the real state that matters
 
-## 7. Prove CSS is loaded in exactly one story
+## 7. Prove CSS is loaded in exactly one story named `CssCheck`
 
-In exactly one story, assert a component-specific computed style. `toBeVisible` passes on an unstyled component; a concrete style value proves the shared preview loaded the app's CSS.
+In exactly one story, named `CssCheck`, assert a component-specific computed style. `toBeVisible` passes on an unstyled component; a concrete style value proves the shared preview loaded the app's CSS.
 
-Read a styling value from the component's source and assert it with `getComputedStyle`:
+Pick a visually distinctive component, read a styling value from its source, and assert it with `getComputedStyle`:
 
 ```tsx
-play: async ({ canvas }) => {
-  const button = canvas.getByRole('button', { name: /submit/i });
-  // PrimaryButton uses bg-blue-600 — fails if Tailwind / global CSS did not load.
-  await expect(getComputedStyle(button).backgroundColor).toBe('rgb(37, 99, 235)');
-},
+export const CssCheck: Story = {
+  args: { children: 'Submit' },
+  play: async ({ canvas }) => {
+    const button = canvas.getByRole('button', { name: /submit/i });
+    // PrimaryButton uses bg-blue-600 — fails if Tailwind / global CSS did not load.
+    await expect(getComputedStyle(button).backgroundColor).toBe('rgb(37, 99, 235)');
+  },
+};
 ```
 
 ## 8. Cover the patterns you found
 
 Write stories for the real patterns in the codebase, for example:
+
 - a low-level reusable component in real JSX usage
 - a provider-backed component
 - a browser-state-backed component
@@ -534,6 +549,7 @@ npx storybook build
 If the build fails, fix the issue before finishing. Common build failures include missing dependencies, broken imports that only surface during static analysis, or configuration issues in `.storybook/main.ts`.
 
 Keep iterating until:
+
 - every story you wrote passes
 - every story you wrote has a meaningful passing `play` function
 - the changed stories and preview setup pass the project's real TypeScript check
@@ -541,3 +557,4 @@ Keep iterating until:
 - the rendered output looks sensible
 - the default global mocked environment is strong enough that stories do not need manual fetch overrides
 - stories no longer fail because the shared preview setup and story JSX are fixed
+

From 28fea359225cf6d84880b3a11de39032dd875709 Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Mon, 20 Apr 2026 21:27:03 +0700
Subject: [PATCH 05/17] Build: Source eval prompts from the CLI via
 EVAL_SETUP_PROMPT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the eval harness's prompt catalog into code/lib/cli-storybook/src/ai/prompts/
so trials exercise the exact prompt a real user gets from `npx storybook ai setup`.
Each variant lives in its own fully isolated .ts file; the registry selects one at
runtime via the internal EVAL_SETUP_PROMPT env var (unset for real users → always
the default). The harness now hands the agent the AI_SETUP_PROMPT nudge and sets
EVAL_SETUP_PROMPT on the agent's spawn, so the agent itself runs `ai setup` as a
tool call — mirroring the real user flow instead of resolving the prompt upfront.
---
 code/lib/cli-storybook/src/ai/prompt.ts       | 705 +-----------------
 .../lib/cli-storybook/src/ai/prompts/index.ts |  53 ++
 .../src/ai/prompts/pattern-copy-play.ts       | 691 +++++++++++++++++
 .../lib/cli-storybook/src/ai/prompts/setup.ts | 283 +++++++
 scripts/eval/README.md                        |  40 +-
 scripts/eval/eval.ts                          |  19 +-
 scripts/eval/lib/agents/claude-code.ts        |  10 +-
 scripts/eval/lib/agents/codex.ts              |  10 +-
 scripts/eval/lib/agents/config.ts             |   8 +
 scripts/eval/lib/run-trial.ts                 |  11 +-
 scripts/eval/lib/utils.test.ts                |  31 +-
 scripts/eval/lib/utils.ts                     |  32 +-
 scripts/eval/prompts/pattern-copy-play.md     | 529 -------------
 scripts/eval/prompts/setup.md                 | 204 -----
 scripts/eval/run-batch.ts                     |   5 +-
 15 files changed, 1141 insertions(+), 1490 deletions(-)
 create mode 100644 code/lib/cli-storybook/src/ai/prompts/index.ts
 create mode 100644 code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
 create mode 100644 code/lib/cli-storybook/src/ai/prompts/setup.ts
 delete mode 100644 scripts/eval/prompts/pattern-copy-play.md
 delete mode 100644 scripts/eval/prompts/setup.md

diff --git a/code/lib/cli-storybook/src/ai/prompt.ts b/code/lib/cli-storybook/src/ai/prompt.ts
index 1ad37f97de71..86c3258554af 100644
--- a/code/lib/cli-storybook/src/ai/prompt.ts
+++ b/code/lib/cli-storybook/src/ai/prompt.ts
@@ -1,708 +1,7 @@
 import { dedent } from 'ts-dedent';
 
-import type { ProjectInfo, AiPrompt } from './types.ts';
-
-/**
- * Builds a markdown-format docs URL with renderer and language query parameters.
- * Appending .md to any Storybook docs URL returns clean markdown with code examples.
- */
-export function getDocsMarkdownUrl(
-  path: string,
-  projectInfo?: Pick<ProjectInfo, 'majorVersion' | 'renderer' | 'language'>
-): string {
-  const { majorVersion, renderer = 'react', language = 'ts' } = projectInfo ?? {};
-  const versionSegment = majorVersion ? `/${majorVersion}` : '';
-  const params = new URLSearchParams();
-  if (renderer) {
-    params.set('renderer', renderer);
-  }
-  params.set('language', language);
-  const query = params.toString();
-  return `https://storybook.js.org/docs${versionSegment}/${path}.md${query ? `?${query}` : ''}`;
-}
-
-export function getPrompts(projectInfo: ProjectInfo): {
-  prompts: AiPrompt[];
-} {
-  const aiPrompts: AiPrompt[] = [];
-
-  aiPrompts.push({
-    name: 'setup',
-    description: 'Set up Storybook for success',
-    instructions: getSetupInstructions(projectInfo),
-  });
-
-  return { prompts: aiPrompts };
-}
-
-function getTypeImportSource(projectInfo: ProjectInfo): string {
-  return projectInfo.framework || projectInfo.rendererPackage || '@storybook/react';
-}
-
-function getSetupInstructions(projectInfo: ProjectInfo): string {
-  const configDir = projectInfo.configDir;
-  const typeImport = getTypeImportSource(projectInfo);
-
-  return dedent`
-    Attention: The following instructions must be followed in order to successfully set up Storybook in this project. Do not skip steps or attempt to do them out of order.
-
-    Your goal is to make Storybook fully functional in this project by analyzing the codebase,
-    configuring the preview with the right decorators, and writing stories for some components.
-
-    After each created story, run Vitest to verify it renders.
-    If the test fails, read the error, fix the issue, and re-run until it passes before moving on.
-
-    - Copy real patterns from the codebase
-    - Keep the app code unchanged
-    - Put the default setup in \`${configDir}/preview.tsx\`
-    - Keep app mocking and runtime setup in \`${configDir}/preview.tsx\`, not in the stories
-
-    ${getDocsReferenceSection(projectInfo)}
-
-    ### Step 1: Analyze the codebase
-
-    Read enough of the app to understand the full runtime environment before writing any stories.
-
-    Do not stop at \`main.tsx\` or \`App.tsx\`.
-    Follow imports into providers, pages, hooks, and shared components until you know:
-
-    - which providers exist
-    - which CSS files are injected
-    - which queries fetch data
-    - which browser-state reads happen
-    - which portals and portal roots exist
-    - which pages and components show the real usage patterns
-
-    Example of what to copy:
-
-    \`\`\`tsx
-    // src/main.tsx
-    import "./index.css";
-    import App from "./App";
-    import { SessionProvider } from "./contexts/SessionContext";
-
-    createRoot(document.getElementById("root")!).render(
-      <SessionProvider>
-        <App />
-      </SessionProvider>,
-    );
-    \`\`\`
-
-    That means Storybook should copy:
-
-    - the \`index.css\` import
-    - the \`SessionProvider\`
-    - the same provider order
-
-    Example of tracing the app deeper:
-
-    \`\`\`tsx
-    // src/App.tsx
-    function App() {
-      const { products, loadMoreProducts } = useProducts();
-      const { currentUser, signOut } = useSession();
-      // ...
-    }
-    \`\`\`
-
-    \`\`\`ts
-    // src/hooks/useProducts.ts
-    const response = await fetch(apiBaseUrl + "/products?page=1");
-    \`\`\`
-
-    \`\`\`ts
-    // src/hooks/useTheme.ts
-    const savedTheme = localStorage.getItem("theme");
-    \`\`\`
-
-    That means the default Storybook setup should discover and prepare:
-
-    - provider state
-    - MSW handlers for queries
-    - browser-state values that are actually read during render
-
-    ### Step 2: Build one default app environment in preview
-
-    Set up Storybook once so most stories work without story-specific setup.
-
-    Start with the smallest faithful environment:
-
-    - the real provider tree
-    - the real root CSS
-    - seeded browser state if the app reads it during render
-    - MSW for network/data queries
-
-    It is fine to seed browser state such as \`localStorage\`, \`sessionStorage\`, and cookies when the app reads them during render.
-    Seed only the specific app-owned keys and values you need.
-    Do not clear all \`localStorage\`, \`sessionStorage\`, or cookies, and do not reset Storybook's own state.
-    Do not mock or redefine the browser runtime itself.
-    The stories run in Vitest browser mode, so the real browser environment should already exist.
-
-    ${getPreviewConfigExample(projectInfo)}
-
-    Use this same idea for:
-
-    - providers
-    - root CSS
-    - browser state
-    - dates, and if the app logic depends on them during render then always use \`mockdate\`
-
-    Example with the \`mockdate\` package:
-
-    ${getMockDateExample(projectInfo)}
-
-    ### Step 3: Support portals with preview-body.html
-
-    If the app uses portals, copy that setup into Storybook too.
-
-    Look for patterns like:
-
-    - \`createPortal(...)\`
-    - modal, dialog, drawer, popover, tooltip, toast, or dropdown portal components
-    - hard-coded roots such as \`#portal-root\`, \`#modal-root\`, \`#drawer-root\`, or \`#toast-root\`
-
-    Example of what to copy:
-
-    \`\`\`tsx
-    // real component
-    return createPortal(<ModalContent />, document.getElementById("portal-root")!);
-    \`\`\`
-
-    That means Storybook should create the same portal root in \`${configDir}/preview-body.html\`:
-
-    \`\`\`html
-    <!-- ${configDir}/preview-body.html -->
-    <div id="portal-root"></div>
-    \`\`\`
-
-    If the app uses multiple portal roots, create all of them there:
-
-    \`\`\`html
-    <!-- ${configDir}/preview-body.html -->
-    <div id="modal-root"></div>
-    <div id="drawer-root"></div>
-    <div id="toast-root"></div>
-    \`\`\`
-
-    If a library portals directly to \`document.body\`, do not add extra roots for it.
-    Make sure the copied page shell, CSS, and layout still allow overlays, fixed positioning, and z-index stacking to render correctly.
-
-    ### Step 4: Mock side effects globally
-
-    All network/data queries should be handled by the default Storybook environment.
-
-    - Always use \`msw-storybook-addon\` for query mocking.
-    - If you introduce MSW, run \`npx msw init ./public --save\` to create the worker file.
-    - Make sure Storybook serves \`./public\` as a static dir so \`mockServiceWorker.js\` is available.
-    - Do not mock \`fetch\` directly.
-    - Network/data queries should return deterministic mock data.
-    - If you need to change dependencies, first check the lockfile and use that package manager for the change.
-
-    Example of copying a real fetch pattern into shared handlers:
-
-    \`\`\`ts
-    // real app hook
-    const response = await fetch(
-      apiBaseUrl +
-        "/products?" +
-        new URLSearchParams({
-          page: "1",
-          sort: "featured",
-        }),
-    );
-    \`\`\`
-
-    \`\`\`ts
-    // ${configDir}/msw-handlers.ts
-    import { http, HttpResponse } from "msw";
-
-    export const mswHandlers = {
-      products: [
-        http.get("https://api.example.com/products", () =>
-          HttpResponse.json({
-            items: [
-              {
-                id: "product-1",
-                name: "Example product",
-                description: "Mock product description",
-                imageUrl: "https://images.example.com/product.jpg",
-                price: 42,
-              },
-            ],
-          }),
-        ),
-      ],
-    };
-    \`\`\`
-
-    ${getMswPreviewExample(projectInfo)}
-
-    \`\`\`ts
-    // ${configDir}/main.ts
-    import type { StorybookConfig } from "${typeImport}";
-
-    const config: StorybookConfig = {
-      staticDirs: ["../public"],
-    };
-
-    export default config;
-    \`\`\`
-
-    Keep these mocks global.
-    Do not put fetch mocks in individual stories.
-    Only add handlers for requests that the shared preview setup or the stories actually use.
-    Do not add catch-all handlers that can hide unrelated failures.
-    If the defaults are not enough, improve the shared default setup instead.
-    Seed browser state when needed, but do not mock \`window\`, \`document\`, \`navigator\`, observers, or similar runtime APIs.
-    The only exception is \`mockdate\` when date-based rendering exists.
-
-    ### Step 5: Write stories
-
-    Try to find around 10 good candidate components for story files.
-    Write colocated stories for top-level components, from low-level reusable components up to page components.
-    Write up to 10 story files, or fewer only if the codebase clearly has fewer meaningful targets.
-
-    The stories should use JSX copied from real usage patterns in:
-
-    - pages
-    - app shells
-    - routes
-    - tests
-    - existing feature code
-
-    As a rule of thumb, each story file should have around 3 story exports when the component or page has enough meaningful states.
-    It can have more when the real usage supports it, up to 10 story exports in one file.
-
-    Always show all imports explicitly in story and preview files.
-    Do not rely on omitted or implied imports in examples or generated code.
-
-    #### Story tags
-
-    Every story meta must include the \`ai-generated\` tag to identify AI-created stories:
-
-    ${getStoryExample(projectInfo)}
-
-    If a story could not be fully fixed after the self-healing loop (the test still fails
-    or the rendering is incomplete), add the \`needs-work\` tag alongside \`ai-generated\`:
-
-    ${getNeedsWorkTagExample(projectInfo)}
-
-    Keep app mocking and runtime setup in preview, not in the stories.
-    Do not build large story-specific harnesses.
-    Do not write story files for subcomponents, hooks, contexts, or helpers.
-    Do not create new application components.
-    Do not add a custom \`title\`.
-    Do not stop after only a few easy targets if the codebase has more meaningful components or pages available.
-
-    ### Step 6: Write a play function for every story
-
-    Every named story export must have a \`play\` function.
-    The \`play\` function is not optional, even for simple stories.
-
-    The purpose of the \`play\` function is to prove that the story actually works in the copied Storybook environment:
-
-    - the story renders something real and non-empty
-    - the decorators provide the needed context
-    - the CSS is applied well enough for the intended state to be visible
-    - the MSW mocks or seeded browser state are actually being used
-    - important interactions, async loading states, and portals behave correctly
-
-    Use \`play\` functions to verify behavior, not just to click around.
-    A story without assertions is incomplete.
-
-    Use tools from \`storybook/test\` such as:
-
-    - \`expect\`
-    - \`waitFor\`
-
-    Prefer \`canvas\` and \`userEvent\` from the \`play\` context.
-    Do not destructure \`canvasElement\` just to create \`const canvas = within(canvasElement)\`.
-    Do not import \`userEvent\` from \`storybook/test\`; use \`userEvent\` from the \`play\` context instead.
-    Only use \`canvasElement.ownerDocument\` when you need to query outside the canvas, such as for portals.
-
-    Example:
-
-    \`\`\`tsx
-    import type { StoryObj } from "${typeImport}";
-
-    export const FilledForm: Story = {
-      play: async ({ canvas, userEvent }) => {
-        const emailInput = canvas.getByLabelText("email", {
-          selector: "input",
-        });
-
-        await userEvent.type(emailInput, "example-email@email.com", {
-          delay: 100,
-        });
-
-        const passwordInput = canvas.getByLabelText("password", {
-          selector: "input",
-        });
-
-        await userEvent.type(passwordInput, "ExamplePassword", {
-          delay: 100,
-        });
-
-        const submitButton = canvas.getByRole("button");
-        await userEvent.click(submitButton);
-      },
-    };
-    \`\`\`
-
-    The assertions should match the real pattern you copied:
-
-    - for provider-backed stories, assert the provider-dependent UI appears correctly
-    - for mocked-data stories, wait for the mocked data to appear and assert on it
-    - for CSS-sensitive states, assert on visibility, text layout, class-driven states, or meaningful computed styles
-    - for routing or navigation stories, assert the routed state or navigation outcome
-    - for portal stories, query from \`canvasElement.ownerDocument\` when the UI renders outside the canvas
-
-    Examples of useful checks:
-
-    - a themed button has the expected label and is visibly enabled or disabled
-    - a modal opened through a decorator or provider is visible in the portal root
-    - mocked API data appears in the page instead of a loading spinner forever
-    - a selected tab actually shows the selected panel
-    - a toast, alert, or badge has the expected accessible text and visual state
-    - a CSS class or computed style confirms the real state that matters
-
-    ### Step 7: Cover the patterns you found
-
-    Write stories for the real patterns in the codebase, for example:
-
-    - a low-level reusable component in real JSX usage
-    - a provider-backed component
-    - a browser-state-backed component
-    - a fetched-data component
-    - a real page component
-
-    Use \`App.tsx\` to inspect the real provider tree and usage patterns, but do not make a story for \`App\` when the codebase has actual page components.
-
-    Example page story:
-
-    ${getPageStoryExample(projectInfo)}
-
-    ### Step 8: Verify both rendering and types
-
-    As you work, verify the stories with Vitest:
-
-    \`\`\`bash
-    npx vitest --project storybook <path-to-story-file>
-    \`\`\`
-
-    Also verify types so you catch missing required props, broken imports, and preview typing issues. Run the same TypeScript command the project itself uses.
-
-    \`\`\`bash
-    <project-specific-typescript-command>
-    \`\`\`
-
-    After verification passes, review every changed file and remove anything that is not needed for the final solution, especially debug fixes, overly broad mocks, unnecessary dependencies, and eval artifacts.
-
-    Keep iterating until:
-
-    - every story you wrote passes
-    - every story you wrote has a meaningful passing \`play\` function
-    - the changed stories and preview setup pass the project's real TypeScript check
-    - the rendered output looks sensible
-    - the default global mocked environment is strong enough that stories do not need manual fetch overrides
-    - stories no longer fail because the shared preview setup and story JSX are fixed
-    - all passing stories have \`tags: ['ai-generated']\` in their meta
-    - any stories that still need work have \`tags: ['ai-generated', 'needs-work']\` in their meta
-  `;
-}
-
-function getDocsReferenceSection(projectInfo: ProjectInfo): string {
-  const docsUrl = (path: string) => getDocsMarkdownUrl(path, projectInfo);
-
-  return dedent`
-    ### Storybook Documentation Reference
-
-    Use the following references to look up Storybook APIs, concepts, or examples:
-
-    - Full docs index: https://storybook.js.org/llms.txt
-    - See code snippets only with codeOnly=true param e.g. ${docsUrl('writing-stories')}&codeOnly=true
-
-    Key documentation pages for this task:
-    - Writing stories: ${docsUrl('writing-stories')}
-    - Decorators: ${docsUrl('writing-stories/decorators')}
-    - Args: ${docsUrl('writing-stories/args')}
-    - Play functions: ${docsUrl('writing-stories/play-function')}
-    - Vitest integration: ${docsUrl('writing-tests/vitest-plugin')}
-
-    Fetch these URLs directly when you need guidance on Storybook APIs or patterns.
-  `;
-}
-
-function getPreviewConfigExample(projectInfo: ProjectInfo): string {
-  const configDir = projectInfo.configDir;
-  const typeImport = getTypeImportSource(projectInfo);
-
-  if (projectInfo.hasCsfFactoryPreview) {
-    return dedent`
-      \`\`\`tsx
-      // ${configDir}/preview.tsx
-      import '../src/index.css'; // import global styles
-      import MockDate from 'mockdate';
-
-      import { definePreview } from 'storybook/preview';
-      import { SessionProvider } from '../src/contexts/SessionContext';
-
-      export default definePreview({
-        decorators: [
-          (Story) => (
-            <SessionProvider>
-              <Story />
-            </SessionProvider>
-          ),
-        ],
-        async beforeEach() {
-          localStorage.setItem('theme', 'dark');
-          localStorage.setItem('sidebar:open', 'true');
-          MockDate.set('2024-04-01T12:00:00Z');
-        },
-      });
-      \`\`\`
-    `;
-  }
-
-  return dedent`
-    \`\`\`tsx
-    // ${configDir}/preview.tsx
-    import type { Preview } from '${typeImport}';
-    import MockDate from 'mockdate';
-    import '../src/index.css'; // import global styles
-    import { SessionProvider } from '../src/contexts/SessionContext';
-
-    const preview: Preview = {
-      decorators: [
-        (Story) => (
-          <SessionProvider>
-            <Story />
-          </SessionProvider>
-        ),
-      ],
-      async beforeEach() {
-        localStorage.setItem('theme', 'dark');
-        localStorage.setItem('sidebar:open', 'true');
-        MockDate.set('2024-04-01T12:00:00Z');
-      },
-    };
-
-    export default preview;
-    \`\`\`
-  `;
-}
-
-function getMockDateExample(projectInfo: ProjectInfo): string {
-  const typeImport = getTypeImportSource(projectInfo);
-
-  if (projectInfo.hasCsfFactoryPreview) {
-    return dedent`
-      \`\`\`tsx
-      import MockDate from 'mockdate';
-      import { definePreview } from 'storybook/preview';
-
-      export default definePreview({
-        async beforeEach() {
-          MockDate.set('2024-04-01T12:00:00Z');
-        },
-      });
-      \`\`\`
-    `;
-  }
-
-  return dedent`
-    \`\`\`tsx
-    import type { Preview } from '${typeImport}';
-    import MockDate from 'mockdate';
-
-    const preview: Preview = {
-      async beforeEach() {
-        MockDate.set('2024-04-01T12:00:00Z');
-      },
-    };
-
-    export default preview;
-    \`\`\`
-  `;
-}
-
-function getMswPreviewExample(projectInfo: ProjectInfo): string {
-  const configDir = projectInfo.configDir;
-  const typeImport = getTypeImportSource(projectInfo);
-
-  if (projectInfo.hasCsfFactoryPreview) {
-    return dedent`
-      \`\`\`tsx
-      // ${configDir}/preview.tsx
-      import { definePreview } from 'storybook/preview';
-      import { initialize, mswLoader } from 'msw-storybook-addon';
-      import { mswHandlers } from './msw-handlers';
-
-      initialize({
-        onUnhandledRequest: 'bypass',
-      });
-
-      export default definePreview({
-        loaders: [mswLoader],
-        parameters: {
-          msw: {
-            handlers: mswHandlers,
-          },
-        },
-      });
-      \`\`\`
-    `;
-  }
-
-  return dedent`
-    \`\`\`tsx
-    // ${configDir}/preview.tsx
-    import type { Preview } from '${typeImport}';
-    import { initialize, mswLoader } from 'msw-storybook-addon';
-    import { mswHandlers } from './msw-handlers';
-
-    initialize({
-      onUnhandledRequest: 'bypass',
-    });
-
-    const preview: Preview = {
-      loaders: [mswLoader],
-      parameters: {
-        msw: {
-          handlers: mswHandlers,
-        },
-      },
-    };
-
-    export default preview;
-    \`\`\`
-  `;
-}
-
-function getStoryExample(projectInfo: ProjectInfo): string {
-  if (projectInfo.hasCsfFactoryPreview) {
-    return dedent`
-      \`\`\`tsx
-      import preview from '#.storybook/preview';
-      import { expect } from 'storybook/test';
-      import { SomeComponent } from './SomeComponent';
-
-      const meta = preview.meta({
-        component: SomeComponent,
-        tags: ['ai-generated'],
-      });
-
-      export const Default = meta.story({
-        render: () => <SomeComponent variant="primary" disabled={false} />,
-        play: async ({ canvas }) => {
-          await expect(canvas.getByRole('button')).toBeVisible();
-        },
-      });
-      \`\`\`
-    `;
-  }
-
-  const typeImport = getTypeImportSource(projectInfo);
-
-  return dedent`
-    \`\`\`tsx
-    import type { Meta, StoryObj } from '${typeImport}';
-    import { expect } from 'storybook/test';
-    import { SomeComponent } from './SomeComponent';
-
-    const meta = {
-      component: SomeComponent,
-      tags: ['ai-generated'],
-    } satisfies Meta<typeof SomeComponent>;
-
-    export default meta;
-    type Story = StoryObj<typeof meta>;
-
-    export const Default: Story = {
-      render: () => <SomeComponent variant="primary" disabled={false} />,
-      play: async ({ canvas }) => {
-        await expect(canvas.getByRole('button')).toBeVisible();
-      },
-    };
-    \`\`\`
-  `;
-}
-
-function getNeedsWorkTagExample(projectInfo: ProjectInfo): string {
-  if (projectInfo.hasCsfFactoryPreview) {
-    return dedent`
-      \`\`\`ts
-      const meta = preview.meta({
-        component: SomeComponent,
-        tags: ['ai-generated', 'needs-work'],
-      });
-      \`\`\`
-    `;
-  }
-
-  return dedent`
-    \`\`\`ts
-    const meta = {
-      component: SomeComponent,
-      tags: ['ai-generated', 'needs-work'],
-    } satisfies Meta<typeof SomeComponent>;
-    \`\`\`
-  `;
-}
-
-function getPageStoryExample(projectInfo: ProjectInfo): string {
-  if (projectInfo.hasCsfFactoryPreview) {
-    return dedent`
-      \`\`\`tsx
-      import preview from '#.storybook/preview';
-      import { expect } from 'storybook/test';
-      import { ProductPage } from './ProductPage';
-
-      const meta = preview.meta({
-        component: ProductPage,
-        tags: ['ai-generated'],
-      });
-
-      export const Default = meta.story({
-        render: () => <ProductPage />,
-        play: async ({ canvas }) => {
-          await expect(
-            canvas.getByRole('heading', { name: /products/i }),
-          ).toBeVisible();
-        },
-      });
-      \`\`\`
-    `;
-  }
-
-  const typeImport = getTypeImportSource(projectInfo);
-
-  return dedent`
-    \`\`\`tsx
-    import type { Meta, StoryObj } from '${typeImport}';
-    import { expect } from 'storybook/test';
-    import { ProductPage } from './ProductPage';
-
-    const meta = {
-      component: ProductPage,
-      tags: ['ai-generated'],
-    } satisfies Meta<typeof ProductPage>;
-
-    export default meta;
-    type Story = StoryObj<typeof meta>;
-
-    export const Default: Story = {
-      render: () => <ProductPage />,
-      play: async ({ canvas }) => {
-        await expect(
-          canvas.getByRole('heading', { name: /products/i }),
-        ).toBeVisible();
-      },
-    };
-    \`\`\`
-  `;
-}
+import type { ProjectInfo } from './types.ts';
+import { getPrompts } from './prompts/index.ts';
 
 function getProjectOverview(projectInfo: ProjectInfo): string {
   return dedent`
diff --git a/code/lib/cli-storybook/src/ai/prompts/index.ts b/code/lib/cli-storybook/src/ai/prompts/index.ts
new file mode 100644
index 000000000000..799afe31cd5a
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/prompts/index.ts
@@ -0,0 +1,53 @@
+import type { AiPrompt, ProjectInfo } from '../types.ts';
+
+import { patternCopyPlayInstructions } from './pattern-copy-play.ts';
+import { setupInstructions } from './setup.ts';
+
+/**
+ * Registry of all prompt builders. Each key is a prompt identifier used only
+ * internally (by the eval harness via `EVAL_SETUP_PROMPT`); users never see
+ * these names.
+ */
+const PROMPT_BUILDERS = {
+  'pattern-copy-play': patternCopyPlayInstructions,
+  setup: setupInstructions,
+} satisfies Record<string, (projectInfo: ProjectInfo) => string>;
+
+export type PromptName = keyof typeof PROMPT_BUILDERS;
+
+export const PROMPT_NAMES = Object.keys(PROMPT_BUILDERS) as PromptName[];
+
+/**
+ * The single prompt variant that ships to real users. Running
+ * `npx storybook ai setup` without any overrides always produces this prompt.
+ */
+export const DEFAULT_PROMPT_NAME: PromptName = 'pattern-copy-play';
+
+/**
+ * Internal env var read only by `getPrompts`. The eval harness sets this
+ * before spawning `ai setup` to select a non-default prompt variant for A/B
+ * comparison. Unknown values fall back to the default so a typo never breaks
+ * the CLI for real users.
+ */
+const EVAL_SETUP_PROMPT_ENV = 'EVAL_SETUP_PROMPT';
+
+/** Resolves the prompt variant from the env var; unknown names yield the default. */
+function resolvePromptName(): PromptName {
+  const requested = process.env[EVAL_SETUP_PROMPT_ENV]?.trim();
+  // Own-key check: `in` would also accept inherited names such as `toString`.
+  const known = !!requested && Object.prototype.hasOwnProperty.call(PROMPT_BUILDERS, requested);
+  return known ? (requested as PromptName) : DEFAULT_PROMPT_NAME;
+}
+
+export function getPrompts(projectInfo: ProjectInfo): { prompts: AiPrompt[] } {
+  const name = resolvePromptName();
+  return {
+    prompts: [
+      {
+        name,
+        description: 'Set up Storybook for success',
+        instructions: PROMPT_BUILDERS[name](projectInfo),
+      },
+    ],
+  };
+}
diff --git a/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts b/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
new file mode 100644
index 000000000000..7f53f05d15bb
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
@@ -0,0 +1,691 @@
+import { dedent } from 'ts-dedent';
+
+import type { ProjectInfo } from '../types.ts';
+
+/**
+ * Returns the `.md` URL of a Storybook docs page, optionally scoped to a major
+ * version, with `renderer` (when truthy) and `language` as query parameters.
+ */
+function getDocsMarkdownUrl(
+  path: string,
+  projectInfo?: Pick<ProjectInfo, 'majorVersion' | 'renderer' | 'language'>
+): string {
+  const { majorVersion, renderer = 'react', language = 'ts' } = projectInfo ?? {};
+  // Seed the query with the renderer only when it is truthy; language always follows it.
+  const search = new URLSearchParams(renderer ? { renderer } : undefined);
+  search.set('language', language);
+  const suffix = search.toString();
+  const base = majorVersion
+    ? `https://storybook.js.org/docs/${majorVersion}`
+    : 'https://storybook.js.org/docs';
+  return `${base}/${path}.md${suffix ? `?${suffix}` : ''}`;
+}
+
+function getTypeImportSource({ framework, rendererPackage }: ProjectInfo): string {
+  return framework || rendererPackage || '@storybook/react';
+}
+
+function getDocsReferenceSection(projectInfo: ProjectInfo): string {
+  // All links share this project's docs version, renderer and language.
+  const md = (page: string): string => getDocsMarkdownUrl(page, projectInfo);
+  return dedent`
+    ### Storybook Documentation Reference
+
+    Use the following references to look up Storybook APIs, concepts, or examples:
+
+    - Full docs index: https://storybook.js.org/llms.txt
+    - See code snippets only with codeOnly=true param e.g. ${md('writing-stories')}&codeOnly=true
+
+    Key documentation pages for this task:
+    - Writing stories: ${md('writing-stories')}
+    - Decorators: ${md('writing-stories/decorators')}
+    - Args: ${md('writing-stories/args')}
+    - Play functions: ${md('writing-stories/play-function')}
+    - Vitest integration: ${md('writing-tests/vitest-plugin')}
+
+    Fetch these URLs directly when you need guidance on Storybook APIs or patterns.
+  `;
+}
+
+function getPreviewConfigExample(projectInfo: ProjectInfo): string {
+  const { configDir } = projectInfo;
+  // Without a CSF factory preview, show a typed `preview` object; otherwise `definePreview`.
+  if (!projectInfo.hasCsfFactoryPreview) {
+    const typeImport = getTypeImportSource(projectInfo);
+    return dedent`
+      \`\`\`tsx
+      // ${configDir}/preview.tsx
+      import type { Preview } from '${typeImport}';
+      import MockDate from 'mockdate';
+      import '../src/index.css'; // import global styles
+      import { SessionProvider } from '../src/contexts/SessionContext';
+
+      const preview: Preview = {
+        decorators: [
+          (Story) => (
+            <SessionProvider>
+              <Story />
+            </SessionProvider>
+          ),
+        ],
+        async beforeEach() {
+          localStorage.setItem('theme', 'dark');
+          localStorage.setItem('sidebar:open', 'true');
+          MockDate.set('2024-04-01T12:00:00Z');
+        },
+      };
+
+      export default preview;
+      \`\`\`
+    `;
+  }
+
+  return dedent`
+    \`\`\`tsx
+    // ${configDir}/preview.tsx
+    import '../src/index.css'; // import global styles
+    import MockDate from 'mockdate';
+
+    import { definePreview } from 'storybook/preview';
+    import { SessionProvider } from '../src/contexts/SessionContext';
+
+    export default definePreview({
+      decorators: [
+        (Story) => (
+          <SessionProvider>
+            <Story />
+          </SessionProvider>
+        ),
+      ],
+      async beforeEach() {
+        localStorage.setItem('theme', 'dark');
+        localStorage.setItem('sidebar:open', 'true');
+        MockDate.set('2024-04-01T12:00:00Z');
+      },
+    });
+    \`\`\`
+  `;
+}
+
+function getMockDateExample(projectInfo: ProjectInfo): string {
+  // Snippet of a preview whose only job is calling `MockDate.set` in `beforeEach`.
+  if (!projectInfo.hasCsfFactoryPreview) {
+    const typeImport = getTypeImportSource(projectInfo);
+    return dedent`
+      \`\`\`tsx
+      import type { Preview } from '${typeImport}';
+      import MockDate from 'mockdate';
+
+      const preview: Preview = {
+        async beforeEach() {
+          MockDate.set('2024-04-01T12:00:00Z');
+        },
+      };
+
+      export default preview;
+      \`\`\`
+    `;
+  }
+
+  return dedent`
+    \`\`\`tsx
+    import MockDate from 'mockdate';
+    import { definePreview } from 'storybook/preview';
+
+    export default definePreview({
+      async beforeEach() {
+        MockDate.set('2024-04-01T12:00:00Z');
+      },
+    });
+    \`\`\`
+  `;
+}
+
+function getMswPreviewExample(projectInfo: ProjectInfo): string {
+  const { configDir } = projectInfo;
+  // Both variants wire `mswLoader` and the shared `mswHandlers` into the preview.
+  if (!projectInfo.hasCsfFactoryPreview) {
+    const typeImport = getTypeImportSource(projectInfo);
+    return dedent`
+      \`\`\`tsx
+      // ${configDir}/preview.tsx
+      import type { Preview } from '${typeImport}';
+      import { initialize, mswLoader } from 'msw-storybook-addon';
+      import { mswHandlers } from './msw-handlers';
+
+      initialize({
+        onUnhandledRequest: 'bypass',
+      });
+
+      const preview: Preview = {
+        loaders: [mswLoader],
+        parameters: {
+          msw: {
+            handlers: mswHandlers,
+          },
+        },
+      };
+
+      export default preview;
+      \`\`\`
+    `;
+  }
+
+  return dedent`
+    \`\`\`tsx
+    // ${configDir}/preview.tsx
+    import { definePreview } from 'storybook/preview';
+    import { initialize, mswLoader } from 'msw-storybook-addon';
+    import { mswHandlers } from './msw-handlers';
+
+    initialize({
+      onUnhandledRequest: 'bypass',
+    });
+
+    export default definePreview({
+      loaders: [mswLoader],
+      parameters: {
+        msw: {
+          handlers: mswHandlers,
+        },
+      },
+    });
+    \`\`\`
+  `;
+}
+
+function getStoryExample(projectInfo: ProjectInfo): string {
+  // Example story file carrying the `ai-generated` tag and a one-assertion `play`.
+  if (!projectInfo.hasCsfFactoryPreview) {
+    const typeImport = getTypeImportSource(projectInfo);
+    return dedent`
+      \`\`\`tsx
+      import type { Meta, StoryObj } from '${typeImport}';
+      import { expect } from 'storybook/test';
+      import { SomeComponent } from './SomeComponent';
+
+      const meta = {
+        component: SomeComponent,
+        tags: ['ai-generated'],
+      } satisfies Meta<typeof SomeComponent>;
+
+      export default meta;
+      type Story = StoryObj<typeof meta>;
+
+      export const Default: Story = {
+        render: () => <SomeComponent variant="primary" disabled={false} />,
+        play: async ({ canvas }) => {
+          await expect(canvas.getByRole('button')).toBeVisible();
+        },
+      };
+      \`\`\`
+    `;
+  }
+
+  return dedent`
+    \`\`\`tsx
+    import preview from '#.storybook/preview';
+    import { expect } from 'storybook/test';
+    import { SomeComponent } from './SomeComponent';
+
+    const meta = preview.meta({
+      component: SomeComponent,
+      tags: ['ai-generated'],
+    });
+
+    export const Default = meta.story({
+      render: () => <SomeComponent variant="primary" disabled={false} />,
+      play: async ({ canvas }) => {
+        await expect(canvas.getByRole('button')).toBeVisible();
+      },
+    });
+    \`\`\`
+  `;
+}
+
+function getNeedsWorkTagExample(projectInfo: ProjectInfo): string {
+  // Meta snippet tagged `needs-work`: the `preview.meta` form when
+  // `hasCsfFactoryPreview` is truthy, the plain object form otherwise.
+  return projectInfo.hasCsfFactoryPreview
+    ? dedent`
+        \`\`\`ts
+        const meta = preview.meta({
+          component: SomeComponent,
+          tags: ['ai-generated', 'needs-work'],
+        });
+        \`\`\`
+      `
+    : dedent`
+        \`\`\`ts
+        const meta = {
+          component: SomeComponent,
+          tags: ['ai-generated', 'needs-work'],
+        } satisfies Meta<typeof SomeComponent>;
+        \`\`\`
+      `;
+}
+
+function getPageStoryExample(projectInfo: ProjectInfo): string {
+  // Example page-level story whose `play` asserts that the page heading is visible.
+  if (!projectInfo.hasCsfFactoryPreview) {
+    const typeImport = getTypeImportSource(projectInfo);
+    return dedent`
+      \`\`\`tsx
+      import type { Meta, StoryObj } from '${typeImport}';
+      import { expect } from 'storybook/test';
+      import { ProductPage } from './ProductPage';
+
+      const meta = {
+        component: ProductPage,
+        tags: ['ai-generated'],
+      } satisfies Meta<typeof ProductPage>;
+
+      export default meta;
+      type Story = StoryObj<typeof meta>;
+
+      export const Default: Story = {
+        render: () => <ProductPage />,
+        play: async ({ canvas }) => {
+          await expect(
+            canvas.getByRole('heading', { name: /products/i }),
+          ).toBeVisible();
+        },
+      };
+      \`\`\`
+    `;
+  }
+
+  return dedent`
+    \`\`\`tsx
+    import preview from '#.storybook/preview';
+    import { expect } from 'storybook/test';
+    import { ProductPage } from './ProductPage';
+
+    const meta = preview.meta({
+      component: ProductPage,
+      tags: ['ai-generated'],
+    });
+
+    export const Default = meta.story({
+      render: () => <ProductPage />,
+      play: async ({ canvas }) => {
+        await expect(
+          canvas.getByRole('heading', { name: /products/i }),
+        ).toBeVisible();
+      },
+    });
+    \`\`\`
+  `;
+}
+
+export function patternCopyPlayInstructions(projectInfo: ProjectInfo): string {
+  const configDir = projectInfo.configDir;
+  const typeImport = getTypeImportSource(projectInfo);
+
+  return dedent`
+    Attention: The following instructions must be followed in order to successfully set up Storybook in this project. Do not skip steps or attempt to do them out of order.
+
+    Your goal is to make Storybook fully functional in this project by analyzing the codebase,
+    configuring the preview with the right decorators, and writing stories for some components.
+
+    After each created story, run Vitest to verify it renders.
+    If the test fails, read the error, fix the issue, and re-run until it passes before moving on.
+
+    - Copy real patterns from the codebase
+    - Keep the app code unchanged
+    - Put the default setup in \`${configDir}/preview.tsx\`
+    - Keep app mocking and runtime setup in \`${configDir}/preview.tsx\`, not in the stories
+
+    ${getDocsReferenceSection(projectInfo)}
+
+    ### Step 1: Analyze the codebase
+
+    Read enough of the app to understand the full runtime environment before writing any stories.
+
+    Do not stop at \`main.tsx\` or \`App.tsx\`.
+    Follow imports into providers, pages, hooks, and shared components until you know:
+
+    - which providers exist
+    - which CSS files are injected
+    - which queries fetch data
+    - which browser-state reads happen
+    - which portals and portal roots exist
+    - which pages and components show the real usage patterns
+
+    Example of what to copy:
+
+    \`\`\`tsx
+    // src/main.tsx
+    import "./index.css";
+    import App from "./App";
+    import { SessionProvider } from "./contexts/SessionContext";
+
+    createRoot(document.getElementById("root")!).render(
+      <SessionProvider>
+        <App />
+      </SessionProvider>,
+    );
+    \`\`\`
+
+    That means Storybook should copy:
+
+    - the \`index.css\` import
+    - the \`SessionProvider\`
+    - the same provider order
+
+    Example of tracing the app deeper:
+
+    \`\`\`tsx
+    // src/App.tsx
+    function App() {
+      const { products, loadMoreProducts } = useProducts();
+      const { currentUser, signOut } = useSession();
+      // ...
+    }
+    \`\`\`
+
+    \`\`\`ts
+    // src/hooks/useProducts.ts
+    const response = await fetch(apiBaseUrl + "/products?page=1");
+    \`\`\`
+
+    \`\`\`ts
+    // src/hooks/useTheme.ts
+    const savedTheme = localStorage.getItem("theme");
+    \`\`\`
+
+    That means the default Storybook setup should discover and prepare:
+
+    - provider state
+    - MSW handlers for queries
+    - browser-state values that are actually read during render
+
+    ### Step 2: Build one default app environment in preview
+
+    Set up Storybook once so most stories work without story-specific setup.
+
+    Start with the smallest faithful environment:
+
+    - the real provider tree
+    - the real root CSS
+    - seeded browser state if the app reads it during render
+    - MSW for network/data queries
+
+    It is fine to seed browser state such as \`localStorage\`, \`sessionStorage\`, and cookies when the app reads them during render.
+    Seed only the specific app-owned keys and values you need.
+    Do not clear all \`localStorage\`, \`sessionStorage\`, or cookies, and do not reset Storybook's own state.
+    Do not mock or redefine the browser runtime itself.
+    The stories run in Vitest browser mode, so the real browser environment should already exist.
+
+    ${getPreviewConfigExample(projectInfo)}
+
+    Use this same idea for:
+
+    - providers
+    - root CSS
+    - browser state
+    - dates, and if the app logic depends on them during render then always use \`mockdate\`
+
+    Example with the \`mockdate\` package:
+
+    ${getMockDateExample(projectInfo)}
+
+    ### Step 3: Support portals with preview-body.html
+
+    If the app uses portals, copy that setup into Storybook too.
+
+    Look for patterns like:
+
+    - \`createPortal(...)\`
+    - modal, dialog, drawer, popover, tooltip, toast, or dropdown portal components
+    - hard-coded roots such as \`#portal-root\`, \`#modal-root\`, \`#drawer-root\`, or \`#toast-root\`
+
+    Example of what to copy:
+
+    \`\`\`tsx
+    // real component
+    return createPortal(<ModalContent />, document.getElementById("portal-root")!);
+    \`\`\`
+
+    That means Storybook should create the same portal root in \`${configDir}/preview-body.html\`:
+
+    \`\`\`html
+    <!-- ${configDir}/preview-body.html -->
+    <div id="portal-root"></div>
+    \`\`\`
+
+    If the app uses multiple portal roots, create all of them there:
+
+    \`\`\`html
+    <!-- ${configDir}/preview-body.html -->
+    <div id="modal-root"></div>
+    <div id="drawer-root"></div>
+    <div id="toast-root"></div>
+    \`\`\`
+
+    If a library portals directly to \`document.body\`, do not add extra roots for it.
+    Make sure the copied page shell, CSS, and layout still allow overlays, fixed positioning, and z-index stacking to render correctly.
+
+    ### Step 4: Mock side effects globally
+
+    All network/data queries should be handled by the default Storybook environment.
+
+    - Always use \`msw-storybook-addon\` for query mocking.
+    - If you introduce MSW, run \`npx msw init ./public --save\` to create the worker file.
+    - Make sure Storybook serves \`./public\` as a static dir so \`mockServiceWorker.js\` is available.
+    - Do not mock \`fetch\` directly.
+    - Network/data queries should return deterministic mock data.
+    - If you need to change dependencies, first check the lockfile and use that package manager for the change.
+
+    Example of copying a real fetch pattern into shared handlers:
+
+    \`\`\`ts
+    // real app hook
+    const response = await fetch(
+      apiBaseUrl +
+        "/products?" +
+        new URLSearchParams({
+          page: "1",
+          sort: "featured",
+        }),
+    );
+    \`\`\`
+
+    \`\`\`ts
+    // ${configDir}/msw-handlers.ts
+    import { http, HttpResponse } from "msw";
+
+    export const mswHandlers = {
+      products: [
+        http.get("https://api.example.com/products", () =>
+          HttpResponse.json({
+            items: [
+              {
+                id: "product-1",
+                name: "Example product",
+                description: "Mock product description",
+                imageUrl: "https://images.example.com/product.jpg",
+                price: 42,
+              },
+            ],
+          }),
+        ),
+      ],
+    };
+    \`\`\`
+
+    ${getMswPreviewExample(projectInfo)}
+
+    \`\`\`ts
+    // ${configDir}/main.ts
+    import type { StorybookConfig } from "${typeImport}";
+
+    const config: StorybookConfig = {
+      staticDirs: ["../public"],
+    };
+
+    export default config;
+    \`\`\`
+
+    Keep these mocks global.
+    Do not put fetch mocks in individual stories.
+    Only add handlers for requests that the shared preview setup or the stories actually use.
+    Do not add catch-all handlers that can hide unrelated failures.
+    If the defaults are not enough, improve the shared default setup instead.
+    Seed browser state when needed, but do not mock \`window\`, \`document\`, \`navigator\`, observers, or similar runtime APIs.
+    The only exception is \`mockdate\` when date-based rendering exists.
+
+    ### Step 5: Write stories
+
+    Try to find around 10 good candidate components for story files.
+    Write colocated stories for top-level components, from low-level reusable components up to page components.
+    Write up to 10 story files, or fewer only if the codebase clearly has fewer meaningful targets.
+
+    The stories should use JSX copied from real usage patterns in:
+
+    - pages
+    - app shells
+    - routes
+    - tests
+    - existing feature code
+
+    As a rule of thumb, each story file should have around 3 story exports when the component or page has enough meaningful states.
+    It can have more when the real usage supports it, up to 10 story exports in one file.
+
+    Always show all imports explicitly in story and preview files.
+    Do not rely on omitted or implied imports in examples or generated code.
+
+    #### Story tags
+
+    Every story meta must include the \`ai-generated\` tag to identify AI-created stories:
+
+    ${getStoryExample(projectInfo)}
+
+    If a story could not be fully fixed after the self-healing loop (the test still fails
+    or the rendering is incomplete), add the \`needs-work\` tag alongside \`ai-generated\`:
+
+    ${getNeedsWorkTagExample(projectInfo)}
+
+    Keep app mocking and runtime setup in preview, not in the stories.
+    Do not build large story-specific harnesses.
+    Do not write story files for subcomponents, hooks, contexts, or helpers.
+    Do not create new application components.
+    Do not add a custom \`title\`.
+    Do not stop after only a few easy targets if the codebase has more meaningful components or pages available.
+
+    ### Step 6: Write a play function for every story
+
+    Every named story export must have a \`play\` function.
+    The \`play\` function is not optional, even for simple stories.
+
+    The purpose of the \`play\` function is to prove that the story actually works in the copied Storybook environment:
+
+    - the story renders something real and non-empty
+    - the decorators provide the needed context
+    - the CSS is applied well enough for the intended state to be visible
+    - the MSW mocks or seeded browser state are actually being used
+    - important interactions, async loading states, and portals behave correctly
+
+    Use \`play\` functions to verify behavior, not just to click around.
+    A story without assertions is incomplete.
+
+    Use tools from \`storybook/test\` such as:
+
+    - \`expect\`
+    - \`waitFor\`
+
+    Prefer \`canvas\` and \`userEvent\` from the \`play\` context.
+    Do not destructure \`canvasElement\` just to create \`const canvas = within(canvasElement)\`.
+    Do not import \`userEvent\` from \`storybook/test\`; use \`userEvent\` from the \`play\` context instead.
+    Only use \`canvasElement.ownerDocument\` when you need to query outside the canvas, such as for portals.
+
+    Example:
+
+    \`\`\`tsx
+    import type { StoryObj } from "${typeImport}";
+
+    export const FilledForm: Story = {
+      play: async ({ canvas, userEvent }) => {
+        const emailInput = canvas.getByLabelText("email", {
+          selector: "input",
+        });
+
+        await userEvent.type(emailInput, "example-email@email.com", {
+          delay: 100,
+        });
+
+        const passwordInput = canvas.getByLabelText("password", {
+          selector: "input",
+        });
+
+        await userEvent.type(passwordInput, "ExamplePassword", {
+          delay: 100,
+        });
+
+        const submitButton = canvas.getByRole("button");
+        await userEvent.click(submitButton);
+      },
+    };
+    \`\`\`
+
+    The assertions should match the real pattern you copied:
+
+    - for provider-backed stories, assert the provider-dependent UI appears correctly
+    - for mocked-data stories, wait for the mocked data to appear and assert on it
+    - for CSS-sensitive states, assert on visibility, text layout, class-driven states, or meaningful computed styles
+    - for routing or navigation stories, assert the routed state or navigation outcome
+    - for portal stories, query from \`canvasElement.ownerDocument\` when the UI renders outside the canvas
+
+    Examples of useful checks:
+
+    - a themed button has the expected label and is visibly enabled or disabled
+    - a modal opened through a decorator or provider is visible in the portal root
+    - mocked API data appears in the page instead of a loading spinner forever
+    - a selected tab actually shows the selected panel
+    - a toast, alert, or badge has the expected accessible text and visual state
+    - a CSS class or computed style confirms the real state that matters
+
+    ### Step 7: Cover the patterns you found
+
+    Write stories for the real patterns in the codebase, for example:
+
+    - a low-level reusable component in real JSX usage
+    - a provider-backed component
+    - a browser-state-backed component
+    - a fetched-data component
+    - a real page component
+
+    Use \`App.tsx\` to inspect the real provider tree and usage patterns, but do not make a story for \`App\` when the codebase has actual page components.
+
+    Example page story:
+
+    ${getPageStoryExample(projectInfo)}
+
+    ### Step 8: Verify both rendering and types
+
+    As you work, verify the stories with Vitest:
+
+    \`\`\`bash
+    npx vitest --project storybook <path-to-story-file>
+    \`\`\`
+
+    Also verify types so you catch missing required props, broken imports, and preview typing issues. Run the same TypeScript command the project itself uses.
+
+    \`\`\`bash
+    <project-specific-typescript-command>
+    \`\`\`
+
+    After verification passes, review every changed file and remove anything that is not needed for the final solution, especially debug fixes, overly broad mocks, unnecessary dependencies, and eval artifacts.
+
+    Keep iterating until:
+
+    - every story you wrote passes
+    - every story you wrote has a meaningful passing \`play\` function
+    - the changed stories and preview setup pass the project's real TypeScript check
+    - the rendered output looks sensible
+    - the default global mocked environment is strong enough that stories do not need manual fetch overrides
+    - stories no longer fail because the shared preview setup and story JSX are fixed
+    - all passing stories have \`tags: ['ai-generated']\` in their meta
+    - any stories that still need work have \`tags: ['ai-generated', 'needs-work']\` in their meta
+  `;
+}
diff --git a/code/lib/cli-storybook/src/ai/prompts/setup.ts b/code/lib/cli-storybook/src/ai/prompts/setup.ts
new file mode 100644
index 000000000000..33028403bfc6
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/prompts/setup.ts
@@ -0,0 +1,283 @@
+import { dedent } from 'ts-dedent';
+
+import type { ProjectInfo } from '../types.ts';
+
+function getTypeImportSource({ framework, rendererPackage }: ProjectInfo): string {
+  return framework || rendererPackage || '@storybook/react';
+}
+
+function getPreviewDecoratorExample(projectInfo: ProjectInfo): string {
+  const { configDir } = projectInfo;
+  // Without a CSF factory preview: typed `preview` object with a default export.
+  if (!projectInfo.hasCsfFactoryPreview) {
+    const typeImport = getTypeImportSource(projectInfo);
+    return dedent`
+      \`\`\`tsx
+      // ${configDir}/preview.tsx
+      import type { Preview } from '${typeImport}';
+      import '../src/index.css'; // import global styles
+
+      const preview: Preview = {
+        decorators: [
+          (Story) => (
+            <ThemeProvider theme={theme}>
+              <MemoryRouter>
+                <Story />
+              </MemoryRouter>
+            </ThemeProvider>
+          ),
+        ],
+      };
+
+      export default preview;
+      \`\`\`
+    `;
+  }
+
+  // CSF factory preview: the same decorators passed to `definePreview`.
+  return dedent`
+    \`\`\`tsx
+    // ${configDir}/preview.tsx
+    import '../src/index.css'; // import global styles
+
+    import { definePreview } from 'storybook/preview';
+
+    export default definePreview({
+      decorators: [
+        (Story) => (
+          <ThemeProvider theme={theme}>
+            <MemoryRouter>
+              <Story />
+            </MemoryRouter>
+          </ThemeProvider>
+        ),
+      ],
+    });
+    \`\`\`
+  `;
+}
+
+function getSimpleStoryExample(projectInfo: ProjectInfo): string {
+  // Example `Button` story file with the `AI Generated/Simple/...` title and tag.
+  if (!projectInfo.hasCsfFactoryPreview) {
+    const typeImport = getTypeImportSource(projectInfo);
+    return dedent`
+      \`\`\`tsx
+      import type { Meta, StoryObj } from '${typeImport}';
+      import { Button } from './Button';
+
+      const meta = {
+        title: 'AI Generated/Simple/Button',
+        component: Button,
+        tags: ['ai-generated'],
+      } satisfies Meta<typeof Button>;
+
+      export default meta;
+      type Story = StoryObj<typeof meta>;
+
+      export const Default: Story = {
+        args: {
+          label: 'Click me',
+        },
+      };
+
+      export const Disabled: Story = {
+        args: {
+          label: 'Disabled',
+          disabled: true,
+        },
+      };
+      \`\`\`
+    `;
+  }
+
+  return dedent`
+    \`\`\`tsx
+    import preview from '#.storybook/preview';
+    import { Button } from './Button';
+
+    const meta = preview.meta({
+      title: 'AI Generated/Simple/Button',
+      component: Button,
+      tags: ['ai-generated'],
+    });
+
+    export const Default = meta.story({
+      args: {
+        label: 'Click me',
+      },
+    });
+
+    export const Disabled = meta.story({
+      args: {
+        label: 'Disabled',
+        disabled: true,
+      },
+    });
+    \`\`\`
+  `;
+}
+
+function getNeedsWorkTagExample(projectInfo: ProjectInfo): string {
+  // Meta snippet tagged `needs-work`: the `preview.meta` form when
+  // `hasCsfFactoryPreview` is truthy, the plain object form otherwise.
+  return projectInfo.hasCsfFactoryPreview
+    ? dedent`
+        \`\`\`ts
+        const meta = preview.meta({
+          title: 'AI Generated/Simple/Button',
+          component: Button,
+          tags: ['ai-generated', 'needs-work'],
+        });
+        \`\`\`
+      `
+    : dedent`
+        \`\`\`ts
+        const meta = {
+          title: 'AI Generated/Simple/Button',
+          component: Button,
+          tags: ['ai-generated', 'needs-work'],
+        } satisfies Meta<typeof Button>;
+        \`\`\`
+      `;
+}
+
+export function setupInstructions(projectInfo: ProjectInfo): string {
+  const configDir = projectInfo.configDir;
+
+  return dedent`
+    Attention: The following instructions must be followed in order to successfully set up Storybook in this project. Do not skip steps or attempt to do them out of order.
+
+    Your goal is to make Storybook fully functional in this project by analyzing the codebase,
+    configuring the preview with the right decorators, and writing example stories for 9 components.
+
+    Work through these steps in order. After each story file, run Vitest to verify it renders.
+    If the test fails, read the error, fix the issue, and re-run until it passes before moving on.
+
+    ### Step 1: Analyze the codebase
+
+    Before writing any stories, understand what the components need to render:
+
+    - Scan the project for context providers, theme systems, routers, stores, and i18n setups.
+      Look at the app's entry point (e.g. \`App.tsx\`, \`main.tsx\`, \`layout.tsx\`) to see what
+      providers wrap the component tree.
+    - Identify global CSS or style imports required for components to look correct.
+    - Note any path aliases configured in tsconfig or bundler config.
+    - Read \`${configDir}/main.ts\` (or \`main.js\`) to find the \`stories\` glob patterns.
+      Your story files must match those patterns to be picked up by Storybook.
+
+    ### Step 2: Configure \`${configDir}/preview.tsx\` with decorators
+
+    Add decorators that wrap every story with the providers your components need.
+    Without this, most non-trivial components will crash.
+
+    ${getPreviewDecoratorExample(projectInfo)}
+
+    Common decorators to add:
+
+    - **Theme providers** (e.g. ThemeProvider, MUI ThemeProvider, styled-components, Tailwind)
+    - **Router** (e.g. MemoryRouter, BrowserRouter mock)
+    - **State stores** (e.g. Redux Provider, Zustand, Jotai)
+    - **i18n** (e.g. IntlProvider, I18nextProvider)
+    - **Global CSS** — import global stylesheets at the top of \`preview.tsx\`
+
+    ### Step 3: Write stories for 9 components
+
+    Pick 9 real components from the codebase, 3 of each complexity level.
+    Use the title prefix \`AI Generated/<Complexity>/<ComponentName>\` so they are grouped
+    together in the Storybook sidebar.
+
+    **Simple (3 components)** — Presentational with few props, no internal state.
+    Examples: Button, Badge, Avatar, Icon, Label, Chip.
+    Title format: \`AI Generated/Simple/<ComponentName>\`
+
+    **Medium (3 components)** — Multiple visual variants or composed from simpler components.
+    Examples: Card, Alert, Input, Select, Tooltip, Tabs.
+    Title format: \`AI Generated/Medium/<ComponentName>\`
+
+    **Complex (3 components)** — Internal state, side effects, or deep composition.
+    Examples: Modal, DataTable, Form, Dropdown, Accordion, Sidebar.
+    Title format: \`AI Generated/Complex/<ComponentName>\`
+
+    For each component, create a \`<ComponentName>.stories.tsx\` file next to the component.
+    Each file must have at least 2 story exports covering the component's main states.
+    Make sure the file location and naming matches the \`stories\` patterns in \`${configDir}/main.ts\`.
+
+    #### Story tags
+
+    Every story meta must include the \`ai-generated\` tag to identify AI-created stories:
+
+    ${getSimpleStoryExample(projectInfo)}
+
+    If a story could not be fully fixed after the self-healing loop (the test still fails
+    or the rendering is incomplete), add the \`needs-work\` tag alongside \`ai-generated\`:
+
+    ${getNeedsWorkTagExample(projectInfo)}
+
+    Rules:
+
+    - Every named export is a story. Use \`args\` to set props.
+    - Provide all required props via \`args\` — check the component's types.
+    - If a component needs per-story decorators (beyond the global ones), add them in the meta.
+    - Do NOT use \`any\` types. Use the component's prop types for type safety.
+
+    Reference: https://storybook.js.org/docs/writing-stories
+
+    ### Step 4: Verify each story with Vitest
+
+    After writing each story file, immediately verify it:
+
+    \`\`\`bash
+    npx vitest --project storybook <path-to-story-file>
+    \`\`\`
+
+    **Self-healing loop — repeat for every story file:**
+
+    1. Write/update the story file
+    2. Run \`npx vitest --project storybook <path-to-story-file>\`
+    3. If it fails: read the error output carefully
+       - Missing provider → add a decorator in \`${configDir}/preview.tsx\` or in the story meta
+       - Missing prop → add the required prop to \`args\`
+       - Import error → fix the import path
+       - CSS/asset error → add static dirs or import the stylesheet
+    4. Fix the issue and go back to step 2
+    5. Once the test passes, move to the next component
+
+    After all 9 story files pass individually, run the full suite:
+
+    \`\`\`bash
+    npx vitest --project storybook
+    \`\`\`
+
+    Once all stories pass, run a full Storybook build as a final check:
+
+    \`\`\`bash
+    npx storybook build
+    \`\`\`
+
+    If the build fails, fix the issue before finishing.
+
+    Finally, run \`npx storybook doctor\` to check for common issues
+    (version mismatches, duplicated deps, etc.) and fix anything it reports.
+
+    ### Checklist
+
+    - [ ] Analyzed codebase for providers, global styles, and path aliases
+    - [ ] Read story patterns from \`${configDir}/main.ts\`
+    - [ ] Configured \`${configDir}/preview.tsx\` with necessary decorators
+    - [ ] Simple component 1: story written and passing
+    - [ ] Simple component 2: story written and passing
+    - [ ] Simple component 3: story written and passing
+    - [ ] Medium component 1: story written and passing
+    - [ ] Medium component 2: story written and passing
+    - [ ] Medium component 3: story written and passing
+    - [ ] Complex component 1: story written and passing
+    - [ ] Complex component 2: story written and passing
+    - [ ] Complex component 3: story written and passing
+    - [ ] Full Vitest suite passes: \`npx vitest --project storybook\`
+    - [ ] \`npx storybook build\` succeeds
+    - [ ] \`npx storybook doctor\` reports no remaining issues
+    - [ ] All passing stories have \`tags: ['ai-generated']\` in their meta
+    - [ ] Any stories that still need work have \`tags: ['ai-generated', 'needs-work']\` in their meta
+  `;
+}
diff --git a/scripts/eval/README.md b/scripts/eval/README.md
index da658a6ce9a0..3660241b237e 100644
--- a/scripts/eval/README.md
+++ b/scripts/eval/README.md
@@ -31,7 +31,7 @@ Each trial follows this lifecycle:
 All commands run from the repo root.
 
 ```sh
-# Prompt file is required (scripts/eval/prompts/{name}.md). Example: pattern-copy-play
+# --prompt <variant> is required. Example: pattern-copy-play (the default variant of `npx storybook ai setup`)
 node scripts/eval/eval.ts -p mealdrop --prompt pattern-copy-play
 
 # Specific agent
@@ -236,17 +236,39 @@ To benchmark a new app, register it in the harness and sync baselines. Follow th
 
 ## Prompts
 
-Prompts are markdown files in `scripts/eval/prompts/` that tell the agent what to do during a trial. The `--prompt` flag selects one by filename (without `.md`).
+The eval mirrors the real user flow exactly:
+
+1. A real user copies the "Set up Storybook with AI" prompt from the Storybook UI — a one-line nudge (`AI_SETUP_PROMPT`) that just says _"Run `npx storybook ai setup` and follow its instructions precisely."_
+2. The user pastes that into their AI agent.
+3. The **agent** runs `npx storybook ai setup` itself as a tool call.
+4. The agent reads the resulting project-aware markdown and follows it.
+
+The harness gives the trial agent the nudge prompt from step 1 as its task. It never runs `npx storybook ai setup` itself — that's the agent's job, just as it is for real users.
+
+### How variant selection works
+
+Prompt variants live in [`code/lib/cli-storybook/src/ai/prompts/`](../../code/lib/cli-storybook/src/ai/prompts/). Each variant is a self-contained `.ts` file that exports an `instructions(projectInfo)` function. The registry in `prompts/index.ts` lists every variant.
+
+The eval selects a variant by injecting the `EVAL_SETUP_PROMPT` env var into the agent's spawn environment. When the agent later runs `npx storybook ai setup`, the CLI reads that env var and returns the matching variant. Real users never set this env var, so they always get the default (`pattern-copy-play`).
+
+```text
+eval.ts --prompt setup
+  → run-trial.ts calls driver.execute({ env: { EVAL_SETUP_PROMPT: 'setup' } })
+    → agent spawns with that env
+      → agent's `npx storybook ai setup` tool call inherits EVAL_SETUP_PROMPT
+        → CLI's getPrompts() picks the 'setup' variant
+```
 
 ### Available prompts
 
-- `**pattern-copy-play**` — analyze the codebase, copy real usage patterns, configure preview with providers and MSW mocks, write ~10 story files with play functions, verify each with Vitest.
-- `**setup**` — structured step-by-step: analyze, configure preview, write 9 stories (3 simple / 3 medium / 3 complex), verify each with Vitest.
+- **`pattern-copy-play`** *(default)* — analyze the codebase, copy real usage patterns, configure preview with providers and MSW mocks, write ~10 story files with play functions, verify each with Vitest. This is the only prompt users ever see when they run `npx storybook ai setup`.
+- **`setup`** — structured step-by-step: analyze, configure preview, write 9 stories (3 simple / 3 medium / 3 complex), verify each with Vitest. Available only to the eval harness for A/B comparison against the default.
 
-### Writing a new prompt
+### Adding a new prompt variant
 
-1. Create a markdown file in `scripts/eval/prompts/`, e.g. `my-strategy.md`.
-2. Write the instructions the agent should follow. The prompt is passed directly to the agent as its task.
-3. Use it: `node scripts/eval/eval.ts -p mealdrop --prompt my-strategy`
+1. Create `code/lib/cli-storybook/src/ai/prompts/<name>.ts`. Make it fully self-contained — keep its own `getTypeImportSource`, code-example helpers, and any other private utilities so changing one variant can never accidentally change another. Duplication is deliberate here.
+2. Export an `instructions(projectInfo: ProjectInfo): string` function.
+3. Register it in `code/lib/cli-storybook/src/ai/prompts/index.ts` by adding an entry to `PROMPT_BUILDERS`.
+4. Use it from the eval: `node scripts/eval/eval.ts -p mealdrop --prompt <name>`.
 
-The prompt should tell the agent how to analyze the codebase, configure `.storybook/preview.ts`, write story files matching the `stories` glob, and verify with `npx vitest --project storybook`.
\ No newline at end of file
+To promote a variant to be the default users see, change `DEFAULT_PROMPT_NAME` in the same registry file.
\ No newline at end of file
diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts
index e6348614065e..afa3565b26ed 100644
--- a/scripts/eval/eval.ts
+++ b/scripts/eval/eval.ts
@@ -84,7 +84,7 @@ const argsSchema = z
     if (prompt === '') {
       ctx.addIssue({
         code: z.ZodIssueCode.custom,
-        message: `Specify --prompt <name> (markdown file in scripts/eval/prompts/). Example: --prompt ${EXAMPLE_PROMPT_BASENAME}. Run with --list-prompts to see available names.`,
+        message: `Specify --prompt <name>. Example: --prompt ${EXAMPLE_PROMPT_BASENAME}. Run with --list-prompts to see available names.`,
         path: ['prompt'],
       });
     }
@@ -101,7 +101,7 @@ const evalOptions = {
   effort: { type: 'string' as const, short: 'e', description: 'Effort level' },
   prompt: {
     type: 'string' as const,
-    description: `Prompt template name — required with -p (file: prompts/{name}.md; e.g. ${EXAMPLE_PROMPT_BASENAME})`,
+    description: `Prompt variant name — required with -p (e.g. ${EXAMPLE_PROMPT_BASENAME}). Use --list-prompts to see available names.`,
   },
   verbose: { type: 'boolean' as const, short: 'v', description: 'Enable verbose output' },
   manual: {
@@ -192,7 +192,7 @@ if (args.manual) {
   const promptPath = join(workspace.resultsDir, 'prompt.md');
   await writeFile(promptPath, prompt);
 
-  const cliCommand = buildManualCommand(variant, promptPath);
+  const cliCommand = buildManualCommand(variant, promptPath, promptName);
 
   logger.log(pc.bold('\n── Manual mode ──'));
   logger.log(`\n  Trial dir:    ${pc.cyan(workspace.trialDir)}`);
@@ -241,13 +241,20 @@ function inferAgent(model: string): AgentId {
   throw new Error(`No agent found for model: ${model}`);
 }
 
-function buildManualCommand(variant: AgentVariant, promptPath: string): string {
+function buildManualCommand(
+  variant: AgentVariant,
+  promptPath: string,
+  promptName: string
+): string {
+  // EVAL_SETUP_PROMPT must be in the env the agent inherits, so that the
+  // agent's own `npx storybook ai setup` tool call picks the right variant.
+  const envPrefix = `EVAL_SETUP_PROMPT=${promptName} `;
   const promptArg = `"$(cat ${promptPath})"`;
   if (variant.agent === 'claude') {
     const sdkModel = AGENTS.claude.sdkModelIds[variant.model] ?? variant.model;
-    return `claude --model ${sdkModel} ${promptArg}`;
+    return `${envPrefix}claude --model ${sdkModel} ${promptArg}`;
   }
-  return `codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`;
+  return `${envPrefix}codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`;
 }
 
 function toVariant(args: z.infer<typeof argsSchema>): AgentVariant {
diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts
index a403abb1933e..7113a43d4416 100644
--- a/scripts/eval/lib/agents/claude-code.ts
+++ b/scripts/eval/lib/agents/claude-code.ts
@@ -13,7 +13,14 @@ import type { Logger } from '../utils.ts';
 export const claudeAgent: AgentDriver = {
   name: 'claude',
 
-  async execute({ prompt, projectPath, variant, logger, verbose }): Promise<AgentExecutionResult> {
+  async execute({
+    prompt,
+    projectPath,
+    variant,
+    logger,
+    verbose,
+    env,
+  }): Promise<AgentExecutionResult> {
     if (variant.agent !== 'claude') {
       throw new Error(`Claude driver received unsupported variant: ${variant.agent}`);
     }
@@ -37,6 +44,7 @@ export const claudeAgent: AgentDriver = {
           cwd: projectPath,
           env: {
             ...process.env,
+            ...env,
             STORYBOOK_DISABLE_TELEMETRY: '1',
           },
           allowedTools: [...settings.allowedTools],
diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts
index ae10f2e659d0..c3a75c00e488 100644
--- a/scripts/eval/lib/agents/codex.ts
+++ b/scripts/eval/lib/agents/codex.ts
@@ -11,7 +11,14 @@ import { countLines } from '../output-preview.ts';
 export const codexAgent: AgentDriver = {
   name: 'codex',
 
-  async execute({ prompt, projectPath, variant, logger, verbose }): Promise<AgentExecutionResult> {
+  async execute({
+    prompt,
+    projectPath,
+    variant,
+    logger,
+    verbose,
+    env,
+  }): Promise<AgentExecutionResult> {
     if (variant.agent !== 'codex') {
       throw new Error(`Codex driver received unsupported variant: ${variant.agent}`);
     }
@@ -23,6 +30,7 @@ export const codexAgent: AgentDriver = {
     const codex = new Codex({
       env: {
         ...process.env,
+        ...env,
         STORYBOOK_DISABLE_TELEMETRY: '1',
       },
     });
diff --git a/scripts/eval/lib/agents/config.ts b/scripts/eval/lib/agents/config.ts
index 1f49fef48c7a..71ca3986b125 100644
--- a/scripts/eval/lib/agents/config.ts
+++ b/scripts/eval/lib/agents/config.ts
@@ -46,6 +46,14 @@ export interface AgentExecuteParams {
   resultsDir: string;
   logger: Logger;
   verbose?: boolean;
+  /**
+   * Extra env vars to forward to the agent's spawn. Merged on top of
+   * `process.env` and under the driver's fixed entries (e.g.
+   * `STORYBOOK_DISABLE_TELEMETRY`). Used by the harness to inject
+   * `EVAL_SETUP_PROMPT` so that the agent's own `npx storybook ai setup`
+   * tool call resolves to the selected prompt variant.
+   */
+  env?: Record<string, string>;
 }
 
 export interface AgentDriver {
diff --git a/scripts/eval/lib/run-trial.ts b/scripts/eval/lib/run-trial.ts
index 55e2040bcdaf..5616aeeafc0a 100644
--- a/scripts/eval/lib/run-trial.ts
+++ b/scripts/eval/lib/run-trial.ts
@@ -22,7 +22,7 @@ export interface TrialConfig {
   project: Project;
   /** Agent, model, and effort level. */
   variant: AgentVariant;
-  /** Prompt name — maps to `prompts/{name}.md` (e.g. "setup"). */
+  /** Prompt variant name — registered in `code/lib/cli-storybook/src/ai/prompts/` (e.g. "pattern-copy-play"). */
   prompt: string;
   /** Log agent messages to stdout. */
   verbose?: boolean;
@@ -64,11 +64,15 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
     'baseline ghost stories'
   );
 
-  // 4. Load the prompt
+  // 4. Load the nudge prompt the agent will receive. The agent itself runs
+  //    `npx storybook ai setup` as a tool call — mirroring what real users do
+  //    when they copy the "Set up Storybook with AI" prompt from the UI.
   const prompt = loadPrompt(promptName);
   await writeFile(join(workspace.resultsDir, 'prompt.md'), prompt);
 
-  // 5. Execute the agent
+  // 5. Execute the agent. EVAL_SETUP_PROMPT is forwarded into the agent's
+  //    environment so its `ai setup` tool call resolves to the selected
+  //    prompt variant (unset for real users → always the default).
   log.log(`  Running ${agentName} (${model}, effort=${variant.effort})...`);
   const driver = drivers[agentName];
   const { execution, transcript } = await driver.execute({
@@ -78,6 +82,7 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
     resultsDir: workspace.resultsDir,
     logger: log,
     verbose: config.verbose,
+    env: { EVAL_SETUP_PROMPT: promptName },
   });
   log.logSuccess(
     `Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : 'cost N/A'}, ${execution.turns} turns)`
diff --git a/scripts/eval/lib/utils.test.ts b/scripts/eval/lib/utils.test.ts
index db1d8e6ece53..bb197efcfd98 100644
--- a/scripts/eval/lib/utils.test.ts
+++ b/scripts/eval/lib/utils.test.ts
@@ -98,40 +98,33 @@ describe('formatReadableUtcTimestamp', () => {
 });
 
 describe('listPrompts', () => {
-  it('lists available prompt names', () => {
+  it('mirrors the CLI prompt registry', () => {
     const prompts = listPrompts();
     expect(prompts).toContain('pattern-copy-play');
-    expect(prompts).not.toContain('pattern-copy');
     expect(prompts).toContain('setup');
+    expect(prompts).not.toContain('pattern-copy');
   });
 
-  it('returns only names without .md extension', () => {
-    for (const name of listPrompts()) {
-      expect(name).not.toContain('.md');
-    }
+  it('includes the default/example prompt', () => {
+    expect(listPrompts()).toContain(EXAMPLE_PROMPT_BASENAME);
   });
 });
 
 describe('loadPrompt', () => {
-  it('loads setup prompt by name', () => {
-    const prompt = loadPrompt('setup');
-    expect(prompt).toContain('Storybook');
-    expect(prompt).toContain('### Step 1');
-  });
-
-  it('loads the play-driven pattern-copy prompt by name', () => {
+  it('returns the nudge string the agent receives (not the resolved instructions)', () => {
     const prompt = loadPrompt(EXAMPLE_PROMPT_BASENAME);
-    expect(prompt).toContain('play function');
-    expect(prompt).toContain('The purpose of the `play` function is to prove');
+    expect(prompt).toContain('npx storybook ai setup');
+    expect(prompt).not.toContain('### Step 1');
   });
 
-  it('throws for unknown prompt', () => {
+  it('rejects unknown prompt names', () => {
     expect(() => loadPrompt('nonexistent-prompt-xyz')).toThrow('Prompt not found');
   });
 
-  it('returns trimmed content', () => {
-    const prompt = loadPrompt(EXAMPLE_PROMPT_BASENAME);
-    expect(prompt).toBe(prompt.trim());
+  it('accepts every registered prompt name', () => {
+    for (const name of listPrompts()) {
+      expect(() => loadPrompt(name)).not.toThrow();
+    }
   });
 });
 
diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts
index afa49482fcd5..82d7e519b604 100644
--- a/scripts/eval/lib/utils.ts
+++ b/scripts/eval/lib/utils.ts
@@ -1,8 +1,13 @@
-import { readFileSync, existsSync, readdirSync } from 'node:fs';
-import { basename, join, resolve, sep } from 'node:path';
+import { join, resolve, sep } from 'node:path';
 import pc from 'picocolors';
 import { x } from 'tinyexec';
 
+import { AI_SETUP_PROMPT } from '../../../code/core/src/shared/constants/ai-prompts.ts';
+import {
+  DEFAULT_PROMPT_NAME,
+  PROMPT_NAMES,
+} from '../../../code/lib/cli-storybook/src/ai/prompts/index.ts';
+
 export interface Logger {
   log: (msg: string) => void;
   logStep: (msg: string) => void;
@@ -14,9 +19,8 @@ export const REPO_ROOT = resolve(import.meta.dirname, '..', '..', '..');
 export const EVAL_ROOT = resolve(REPO_ROOT, '..', 'storybook-eval');
 export const REPOS_DIR = resolve(EVAL_ROOT, 'repos');
 export const TRIALS_DIR = resolve(EVAL_ROOT, 'trials');
-export const PROMPTS_DIR = resolve(import.meta.dirname, '..', 'prompts');
-/** Basename (no `.md`) used in docs and tests when a concrete prompt must be named. */
-export const EXAMPLE_PROMPT_BASENAME = 'pattern-copy-play';
+/** Name used in docs and tests when a concrete prompt must be named. Tracks the CLI default. */
+export const EXAMPLE_PROMPT_BASENAME = DEFAULT_PROMPT_NAME;
 export const NODE_EVAL_TRIAL_SCRIPT = 'scripts/eval/eval.ts' as const;
 export const NODE_EVAL_RUN_BATCH_SCRIPT = 'scripts/eval/run-batch.ts' as const;
 export const NODE_EVAL_SYNC_BASELINES_SCRIPT = 'scripts/eval/sync-baselines.ts' as const;
@@ -140,22 +144,24 @@ export function formatTable(headers: string[], rows: string[][]): string {
   ].join('\n');
 }
 
-/** Load a prompt by name from prompts/{name}.md. */
+/**
+ * Returns the exact nudge string a real user copies from the Storybook UI —
+ * "Run `npx storybook ai setup` and follow its instructions precisely." The
+ * AGENT then runs `ai setup` itself as a tool call, mirroring the real user
+ * flow. The harness selects a prompt variant via the `EVAL_SETUP_PROMPT` env
+ * var on the agent's spawn (not here); this function only validates the name.
+ */
 export function loadPrompt(name: string): string {
   const available = listPrompts();
   if (!available.includes(name)) {
     throw new Error(`Prompt not found: ${name}\nAvailable: ${available.join(', ')}`);
   }
-  const file = resolve(PROMPTS_DIR, `${name}.md`);
-  return readFileSync(file, 'utf-8').trim();
+  return AI_SETUP_PROMPT;
 }
 
-/** List available prompt names. */
+/** List available prompt names. Mirrors the builder registry in the CLI. */
 export function listPrompts(): string[] {
-  if (!existsSync(PROMPTS_DIR)) return [];
-  return readdirSync(PROMPTS_DIR)
-    .filter((f) => f.endsWith('.md'))
-    .map((f) => basename(f, '.md'));
+  return [...PROMPT_NAMES];
 }
 
 export interface EvalEnvironment {
diff --git a/scripts/eval/prompts/pattern-copy-play.md b/scripts/eval/prompts/pattern-copy-play.md
deleted file mode 100644
index a8ce9b20bd94..000000000000
--- a/scripts/eval/prompts/pattern-copy-play.md
+++ /dev/null
@@ -1,529 +0,0 @@
-# Pattern-Copy Storybook Setup With Play Functions
-
-Your goal is to make Storybook fully functional in this project by analyzing the codebase,
-configuring the preview with the right decorators, and writing stories for some components.
-
-The end state should be a Storybook where any component — from a small button to a full page — can be added without story-specific workarounds. All necessary providers, CSS, browser state, and network mocks should live in the shared preview so that new stories only need the component import and a render call.
-
-After each created story, run Vitest to verify it renders.
-If the test fails, read the error, fix the issue, and re-run until it passes before moving on.
-
-- copy real patterns from the codebase
-- keep the app code unchanged
-- put the default setup in `.storybook/preview.tsx`
-- keep app mocking and runtime setup in `.storybook/preview.tsx`, not in the stories
-
-### Step 1: Analyze the codebase
-
-Read enough of the app to understand the full runtime environment before writing any stories.
-
-Do not stop at `main.tsx` or `App.tsx`.
-Follow imports into providers, pages, hooks, and shared components until you know:
-- which providers exist
-- which CSS files are injected
-- which queries fetch data
-- which browser-state reads happen
-- which portals and portal roots exist
-- which pages and components show the real usage patterns
-
-Example of what to copy:
-
-```tsx
-// src/main.tsx
-import './index.css';
-import App from './App';
-import { SessionProvider } from './contexts/SessionContext';
-
-createRoot(document.getElementById('root')!).render(
-  <SessionProvider>
-    <App />
-  </SessionProvider>
-);
-```
-
-That means Storybook should copy:
-- the `index.css` import
-- the `SessionProvider`
-- the same provider order
-
-Example of tracing the app deeper:
-
-```tsx
-// src/App.tsx
-function App() {
-  const { products, loadMoreProducts } = useProducts();
-  const { currentUser, signOut } = useSession();
-  // ...
-}
-```
-
-```ts
-// src/hooks/useProducts.ts
-const response = await fetch(apiBaseUrl + '/products?page=1');
-```
-
-```ts
-// src/hooks/useTheme.ts
-const savedTheme = localStorage.getItem('theme');
-```
-
-That means the default Storybook setup should discover and prepare:
-- provider state
-- MSW handlers for queries
-- browser-state values that are actually read during render
-
-## 2. Build one default app environment in preview
-
-Set up Storybook once so most stories work without story-specific setup.
-
-Start with the smallest faithful environment:
-- the real provider tree
-- the real root CSS
-- seeded browser state if the app reads it during render
-- MSW for network/data queries
-
-It is fine to seed browser state such as `localStorage`, `sessionStorage`, and cookies when the app reads them during render.
-Seed only the specific app-owned keys and values you need.
-Do not clear all `localStorage`, `sessionStorage`, or cookies, and do not reset Storybook's own state.
-Do not mock or redefine the browser runtime itself.
-The stories run in Vitest browser mode, so the real browser environment should already exist.
-
-Example:
-
-```tsx
-// .storybook/preview.tsx
-import type { Preview } from '@storybook/react-vite';
-import MockDate from 'mockdate';
-import '../src/index.css';
-import { SessionProvider } from '../src/contexts/SessionContext';
-
-const preview: Preview = {
-  decorators: [
-    (Story) => (
-      <SessionProvider>
-        <Story />
-      </SessionProvider>
-    ),
-  ],
-  async beforeEach() {
-    localStorage.setItem('theme', 'dark');
-    localStorage.setItem('sidebar:open', 'true');
-    MockDate.set('2024-04-01T12:00:00Z');
-  },
-};
-
-export default preview;
-```
-
-Use this same idea for:
-- providers
-- root CSS
-- browser state
-- dates, and if the app logic depends on them during render then always use `mockdate`
-
-Example with the `mockdate` package:
-
-```tsx
-import type { Preview } from '@storybook/react-vite';
-import MockDate from 'mockdate';
-
-const preview: Preview = {
-  async beforeEach() {
-    MockDate.set('2024-04-01T12:00:00Z');
-  },
-};
-
-export default preview;
-```
-
-## 3. Support portals with preview-body.html
-
-If the app uses portals, copy that setup into Storybook too.
-
-Look for patterns like:
-- `createPortal(...)`
-- modal, dialog, drawer, popover, tooltip, toast, or dropdown portal components
-- hard-coded roots such as `#portal-root`, `#modal-root`, `#drawer-root`, or `#toast-root`
-
-Example of what to copy:
-
-```tsx
-// real component
-return createPortal(
-  <ModalContent />,
-  document.getElementById('portal-root')!
-);
-```
-
-That means Storybook should create the same portal root in `.storybook/preview-body.html`:
-
-```html
-<!-- .storybook/preview-body.html -->
-<div id="portal-root"></div>
-```
-
-If the app uses multiple portal roots, create all of them there:
-
-```html
-<!-- .storybook/preview-body.html -->
-<div id="modal-root"></div>
-<div id="drawer-root"></div>
-<div id="toast-root"></div>
-```
-
-If a library portals directly to `document.body`, do not add extra roots for it.
-Make sure the copied page shell, CSS, and layout still allow overlays, fixed positioning, and z-index stacking to render correctly.
-
-## 4. Mock side effects globally
-
-All network/data queries should be handled by the default Storybook environment.
-
-- Always use `msw-storybook-addon` for query mocking.
-- If you introduce MSW, run `npx msw init ./public --save` to create the worker file.
-- Make sure Storybook serves `./public` as a static dir so `mockServiceWorker.js` is available.
-- Do not mock `fetch` directly.
-- Network/data queries should return deterministic mock data.
-- If you need to change dependencies, first check the lockfile and use that package manager for the change.
-
-Example of copying a real fetch pattern into shared handlers:
-
-```ts
-// real app hook
-const response = await fetch(
-  apiBaseUrl +
-    '/products?' +
-    new URLSearchParams({
-      page: '1',
-      sort: 'featured',
-    })
-);
-```
-
-```ts
-// .storybook/msw-handlers.ts
-import { http, HttpResponse } from 'msw';
-
-export const mswHandlers = {
-  products: [
-    http.get('https://api.example.com/products', () =>
-      HttpResponse.json({
-        items: [
-          {
-            id: 'product-1',
-            name: 'Example product',
-            description: 'Mock product description',
-            imageUrl: 'https://images.example.com/product.jpg',
-            price: 42,
-          },
-        ],
-      })
-    ),
-  ],
-};
-```
-
-```tsx
-// .storybook/preview.tsx
-import type { Preview } from '@storybook/react-vite';
-import { initialize, mswLoader } from 'msw-storybook-addon';
-import { mswHandlers } from './msw-handlers';
-
-initialize({
-  onUnhandledRequest: 'bypass',
-});
-
-const preview: Preview = {
-  loaders: [mswLoader],
-  parameters: {
-    msw: {
-      handlers: mswHandlers,
-    },
-  },
-};
-
-export default preview;
-```
-
-```ts
-// .storybook/main.ts
-import type { StorybookConfig } from '@storybook/react-vite';
-
-const config: StorybookConfig = {
-  staticDirs: ['../public'],
-};
-
-export default config;
-```
-
-Keep these mocks global.
-Do not put fetch mocks in individual stories.
-Only add handlers for requests that the shared preview setup or the stories actually use.
-Do not add catch-all handlers that can hide unrelated failures.
-If the defaults are not enough, improve the shared default setup instead.
-Seed browser state when needed, but do not mock `window`, `document`, `navigator`, observers, or similar runtime APIs.
-The only exception is `mockdate` when date-based rendering exists.
-
-## 5. Write stories
-
-Try to find around 10 good candidate components for story files.
-Write colocated stories for top-level components, from low-level reusable components up to page components.
-Write up to 10 story files, or fewer only if the codebase clearly has fewer meaningful targets.
-
-The stories should use JSX copied from real usage patterns in:
-- pages
-- app shells
-- routes
-- tests
-- existing feature code
-
-As a rule of thumb, each story file should have around 3 story exports when the component or page has enough meaningful states.
-It can have more when the real usage supports it, up to 10 story exports in one file.
-
-Always show all imports explicitly in story and preview files.
-Do not rely on omitted or implied imports in examples or generated code.
-
-For simple components where props drive the state, prefer `args` stories — no `render` function needed:
-
-```tsx
-import type { Meta, StoryObj } from '@storybook/react-vite';
-import { expect } from 'storybook/test';
-import { Button } from './Button';
-
-const meta = {
-  component: Button,
-} satisfies Meta<typeof Button>;
-
-export default meta;
-type Story = StoryObj<typeof meta>;
-
-export const Primary: Story = {
-  args: {
-    variant: 'primary',
-    children: 'Save',
-  },
-  play: async ({ canvas }) => {
-    await expect(canvas.getByRole('button', { name: /save/i })).toBeVisible();
-  },
-};
-
-export const Disabled: Story = {
-  args: {
-    variant: 'primary',
-    disabled: true,
-    children: 'Save',
-  },
-  play: async ({ canvas }) => {
-    await expect(canvas.getByRole('button')).toBeDisabled();
-  },
-};
-```
-
-Use `render` when the story needs composition — wrapping the component in layout, combining multiple components, or passing children as JSX:
-
-```tsx
-import type { Meta, StoryObj } from '@storybook/react-vite';
-import { expect } from 'storybook/test';
-import { Button } from './Button';
-import { Card } from './Card';
-
-const meta = {
-  component: Button,
-} satisfies Meta<typeof Button>;
-
-export default meta;
-type Story = StoryObj<typeof meta>;
-
-export const InsideCard: Story = {
-  render: () => (
-    <Card>
-      <Button disabled={false}>Save</Button>
-    </Card>
-  ),
-  play: async ({ canvas, userEvent }) => {
-    await expect(canvas.getByRole('button', { name: /save/i })).toBeVisible();
-    await userEvent.click(canvas.getByRole('button', { name: /save/i }));
-  },
-};
-```
-
-Example of copying real page JSX:
-
-```tsx
-// real app
-return (
-  <div className="page-shell">
-    <FiltersPanel />
-    {products.map((product) => (
-      <ProductCard key={product.id} product={product} />
-    ))}
-  </div>
-);
-```
-
-```tsx
-import type { Meta, StoryObj } from '@storybook/react-vite';
-import { expect } from 'storybook/test';
-import { FiltersPanel } from './FiltersPanel';
-import { ProductCard } from './ProductCard';
-import { mockProduct } from './mockProduct';
-
-const meta = {
-  component: ProductCard,
-} satisfies Meta<typeof ProductCard>;
-
-export default meta;
-type Story = StoryObj<typeof meta>;
-
-// story
-export const Default: Story = {
-  render: () => (
-    <div className="page-shell">
-      <FiltersPanel />
-      <ProductCard product={mockProduct} />
-    </div>
-  ),
-  play: async ({ canvas }) => {
-    await expect(canvas.getByText(/example product/i)).toBeVisible();
-  },
-};
-```
-
-Keep app mocking and runtime setup in preview, not in the stories.
-Do not build large story-specific harnesses.
-Do not write story files for subcomponents, hooks, contexts, or helpers.
-Do not create new application components.
-Do not add a custom `title`.
-Do not stop after only a few easy targets if the codebase has more meaningful components or pages available.
-
-## 6. Write a play function for every story
-
-Every named story export must have a `play` function.
-The `play` function is not optional, even for simple stories.
-
-The purpose of the `play` function is to prove that the story actually works in the copied Storybook environment:
-- the story renders something real and non-empty
-- the decorators provide the needed context
-- the CSS is applied well enough for the intended state to be visible
-- the MSW mocks or seeded browser state are actually being used
-- important interactions, async loading states, and portals behave correctly
-
-Use `play` functions to verify behavior, not just to click around.
-A story without assertions is incomplete.
-
-Use tools from `storybook/test` such as:
-- `expect`
-- `waitFor`
-
-Prefer `canvas` and `userEvent` from the `play` context.
-Do not destructure `canvasElement` just to create `const canvas = within(canvasElement)`.
-Do not import `userEvent` from `storybook/test`; use `userEvent` from the `play` context instead.
-Only use `canvasElement.ownerDocument` when you need to query outside the canvas, such as for portals.
-Example:
-
-```tsx
-import type { StoryObj } from '@storybook/react-vite';
-
-export const FilledForm: Story = {
-  play: async ({ canvas, userEvent }) => {
-    const emailInput = canvas.getByLabelText('email', {
-      selector: 'input',
-    });
-
-    await userEvent.type(emailInput, 'example-email@email.com', {
-      delay: 100,
-    });
-
-    const passwordInput = canvas.getByLabelText('password', {
-      selector: 'input',
-    });
-
-    await userEvent.type(passwordInput, 'ExamplePassword', {
-      delay: 100,
-    });
-
-    const submitButton = canvas.getByRole('button');
-    await userEvent.click(submitButton);
-  },
-};
-```
-
-The assertions should match the real pattern you copied:
-- for provider-backed stories, assert the provider-dependent UI appears correctly
-- for mocked-data stories, wait for the mocked data to appear and assert on it
-- for CSS-sensitive states, assert on visibility, text layout, class-driven states, or meaningful computed styles
-- for routing or navigation stories, assert the routed state or navigation outcome
-- for portal stories, query from `canvasElement.ownerDocument` when the UI renders outside the canvas
-
-Examples of useful checks:
-- a themed button has the expected label and is visibly enabled or disabled
-- a modal opened through a decorator or provider is visible in the portal root
-- mocked API data appears in the page instead of a loading spinner forever
-- a selected tab actually shows the selected panel
-- a toast, alert, or badge has the expected accessible text and visual state
-- a CSS class or computed style confirms the real state that matters
-
-## 7. Cover the patterns you found
-
-Write stories for the real patterns in the codebase, for example:
-- a low-level reusable component in real JSX usage
-- a provider-backed component
-- a browser-state-backed component
-- a fetched-data component
-- a real page component
-
-Use `App.tsx` to inspect the real provider tree and usage patterns, but do not make a story for `App` when the codebase has actual page components.
-
-Example page story:
-
-```tsx
-import type { Meta, StoryObj } from '@storybook/react-vite';
-import { expect } from 'storybook/test';
-import { ProductPage } from './ProductPage';
-
-const meta = {
-  component: ProductPage,
-} satisfies Meta<typeof ProductPage>;
-
-export default meta;
-type Story = StoryObj<typeof meta>;
-
-export const Default: Story = {
-  render: () => <ProductPage />,
-  play: async ({ canvas }) => {
-    await expect(canvas.getByRole('heading', { name: /products/i })).toBeVisible();
-  },
-};
-```
-
-## 8. Verify both rendering and types
-
-As you work, verify the stories with Vitest:
-
-```bash
-npx vitest --project storybook <path-to-story-file>
-```
-
-Also verify types so you catch missing required props, broken imports, and preview typing issues. Run the same TypeScript command the project itself uses.
-
-```bash
-<project-specific-typescript-command>
-```
-
-After verification passes, review every changed file and remove anything that is not needed for the final solution, especially debug fixes, overly broad mocks, unnecessary dependencies, and eval artifacts.
-
-Once all stories pass and types are clean, run a full Storybook build as a final check:
-
-```bash
-npx storybook build
-```
-
-If the build fails, fix the issue before finishing. Common build failures include missing dependencies, broken imports that only surface during static analysis, or configuration issues in `.storybook/main.ts`.
-
-Keep iterating until:
-- every story you wrote passes
-- every story you wrote has a meaningful passing `play` function
-- the changed stories and preview setup pass the project's real TypeScript check
-- `storybook build` succeeds
-- the rendered output looks sensible
-- the default global mocked environment is strong enough that stories do not need manual fetch overrides
-- stories no longer fail because the shared preview setup and story JSX are fixed
diff --git a/scripts/eval/prompts/setup.md b/scripts/eval/prompts/setup.md
deleted file mode 100644
index 6ee4d37596c4..000000000000
--- a/scripts/eval/prompts/setup.md
+++ /dev/null
@@ -1,204 +0,0 @@
-Attention: The following instructions must be followed in order to successfully set up Storybook in this project. Do not skip steps or attempt to do them out of order.
-
-Your goal is to make Storybook fully functional in this project by analyzing the codebase,
-configuring the preview with the right decorators, and writing example stories for 9 components.
-
-Work through these steps in order. After each story file, run Vitest to verify it renders.
-If the test fails, read the error, fix the issue, and re-run until it passes before moving on.
-
-### Step 1: Analyze the codebase
-
-Before writing any stories, understand what the components need to render:
-
-- Scan the project for context providers, theme systems, routers, stores, and i18n setups.
-  Look at the app's entry point (e.g. `App.tsx`, `main.tsx`, `layout.tsx`) to see what
-  providers wrap the component tree.
-- Identify global CSS or style imports required for components to look correct.
-- Note any path aliases configured in tsconfig or bundler config.
-- Read `.storybook/main.ts` (or `main.js`) to find the `stories` glob patterns.
-  Your story files must match those patterns to be picked up by Storybook.
-
-### Step 2: Configure `.storybook/preview.ts` with decorators
-
-Add decorators that wrap every story with the providers your components need.
-Without this, most non-trivial components will crash.
-
-If the project uses CSF Factory (look for `definePreview` in `.storybook/preview.ts`):
-```ts
-// .storybook/preview.ts
-import '../src/index.css'; // import global styles
-
-import { definePreview } from 'storybook/preview';
-
-export const config = definePreview({
-  decorators: [
-    (Story) => (
-      <ThemeProvider theme={theme}>
-        <MemoryRouter>
-          <Story />
-        </MemoryRouter>
-      </ThemeProvider>
-    ),
-  ],
-});
-```
-
-Otherwise:
-```ts
-// .storybook/preview.ts
-import '../src/index.css'; // import global styles
-
-const preview = {
-  decorators: [
-    (Story) => (
-      <ThemeProvider theme={theme}>
-        <MemoryRouter>
-          <Story />
-        </MemoryRouter>
-      </ThemeProvider>
-    ),
-  ],
-};
-export default preview;
-```
-
-Common decorators to add:
-- **Theme providers** (e.g. ThemeProvider, MUI ThemeProvider, styled-components, Tailwind)
-- **Router** (e.g. MemoryRouter, BrowserRouter mock)
-- **State stores** (e.g. Redux Provider, Zustand, Jotai)
-- **i18n** (e.g. IntlProvider, I18nextProvider)
-- **Global CSS** — import global stylesheets at the top of preview.ts
-
-### Step 3: Write stories for 9 components
-
-Pick 9 real components from the codebase, 3 of each complexity level.
-Use the title prefix `AI Generated/<Complexity>/<ComponentName>` so they are grouped
-together in the Storybook sidebar.
-
-**Simple (3 components)** — Presentational with few props, no internal state.
-Examples: Button, Badge, Avatar, Icon, Label, Chip.
-Title format: `AI Generated/Simple/<ComponentName>`
-
-**Medium (3 components)** — Multiple visual variants or composed from simpler components.
-Examples: Card, Alert, Input, Select, Tooltip, Tabs.
-Title format: `AI Generated/Medium/<ComponentName>`
-
-**Complex (3 components)** — Internal state, side effects, or deep composition.
-Examples: Modal, DataTable, Form, Dropdown, Accordion, Sidebar.
-Title format: `AI Generated/Complex/<ComponentName>`
-
-For each component, create a `<ComponentName>.stories.ts` file next to the component.
-Each file must have at least 2 story exports covering the component's main states.
-Make sure the file location and naming matches the `stories` patterns in `.storybook/main.ts`.
-
-If the project uses CSF Factory (look for `definePreview` / `config.meta` patterns):
-
-Story format (CSF Factory — this project uses CSF factories):
-```ts
-import { config } from '#.storybook/preview';
-import { Button } from './Button';
-
-const meta = config.meta({
-  title: 'AI Generated/Simple/Button',
-  component: Button,
-});
-
-export const Default = meta.story({
-  args: {
-    label: 'Click me',
-  },
-});
-
-export const Disabled = meta.story({
-  args: {
-    label: 'Disabled',
-    disabled: true,
-  },
-});
-```
-
-Otherwise:
-
-Story format (CSF):
-```ts
-import type { Meta, StoryObj } from '@storybook/react';
-import { Button } from './Button';
-
-const meta = {
-  title: 'AI Generated/Simple/Button',
-  component: Button,
-} satisfies Meta<typeof Button>;
-
-export default meta;
-type Story = StoryObj<typeof meta>;
-
-export const Default: Story = {
-  args: {
-    label: 'Click me',
-  },
-};
-
-export const Disabled: Story = {
-  args: {
-    label: 'Disabled',
-    disabled: true,
-  },
-};
-```
-
-Rules:
-- Every named export is a story. Use `args` to set props.
-- Provide all required props via `args` — check the component's types.
-- If a component needs per-story decorators (beyond the global ones), add them in the meta.
-- Do NOT use `any` types. Use the component's prop types for type safety.
-
-Reference: https://storybook.js.org/docs/latest/writing-stories
-
-### Step 4: Verify each story with Vitest
-
-After writing each story file, immediately verify it:
-
-```bash
-npx vitest --project storybook <path-to-story-file>
-```
-
-**Self-healing loop — repeat for every story file:**
-1. Write/update the story file
-2. Run `npx vitest --project storybook <path-to-story-file>`
-3. If it fails: read the error output carefully
-   - Missing provider → add a decorator in `.storybook/preview.ts` or in the story meta
-   - Missing prop → add the required prop to `args`
-   - Import error → fix the import path
-   - CSS/asset error → add static dirs or import the stylesheet
-4. Fix the issue and go back to step 2
-5. Once the test passes, move to the next component
-
-After all 9 story files pass individually, run the full suite:
-```bash
-npx vitest --project storybook
-```
-
-Once all stories pass, run a full Storybook build as a final check:
-```bash
-npx storybook build
-```
-
-If the build fails, fix the issue before finishing.
-
-### Checklist
-
-- [ ] Analyzed codebase for providers, global styles, and path aliases
-- [ ] Read story patterns from `.storybook/main.ts`
-- [ ] Configured `.storybook/preview.ts` with necessary decorators
-- [ ] Simple component 1: story written and passing
-- [ ] Simple component 2: story written and passing
-- [ ] Simple component 3: story written and passing
-- [ ] Medium component 1: story written and passing
-- [ ] Medium component 2: story written and passing
-- [ ] Medium component 3: story written and passing
-- [ ] Complex component 1: story written and passing
-- [ ] Complex component 2: story written and passing
-- [ ] Complex component 3: story written and passing
-- [ ] Full Vitest suite passes: `npx vitest --project storybook`
-- [ ] `npx storybook build` succeeds
-- [ ] Run `npx storybook doctor` to check for common issues (version mismatches, duplicated deps, etc.)
diff --git a/scripts/eval/run-batch.ts b/scripts/eval/run-batch.ts
index 815d6a663da7..e64bf5d1c824 100644
--- a/scripts/eval/run-batch.ts
+++ b/scripts/eval/run-batch.ts
@@ -86,7 +86,7 @@ export interface RunBatchOptions {
   repoRoot?: string;
   evalRoot?: string;
   batchTimestamp?: string;
-  /** Required when `descriptors` are not provided — prompt template basename (prompts/{name}.md). */
+  /** Required when `descriptors` are not provided — prompt variant name from the CLI registry. */
   prompt?: string;
   /** Skip interactive confirmation (large API / token usage). */
   yes?: boolean;
@@ -437,7 +437,8 @@ const runBatchOptions = {
   concurrency: { type: 'string' as const, description: 'Max concurrent runs (default: 8)' },
   prompt: {
     type: 'string' as const,
-    description: 'Prompt template name (required; file: scripts/eval/prompts/{name}.md)',
+    description:
+      'Prompt variant name (required; registered in code/lib/cli-storybook/src/ai/prompts/)',
   },
   agents: {
     type: 'string' as const,

From 9412695be9a13536ccce4778ed3e21e87ed8a936 Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Mon, 20 Apr 2026 21:44:22 +0700
Subject: [PATCH 06/17] CLI: Port pattern-copy-play prompt improvements from
 #34596

Bring the three prompt-content changes that were about to ship in #34596 onto
the post-refactor layout. They apply to
code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
(previously getSetupInstructions in prompt.ts):

- New end-state paragraph in the intro clarifying that the shared preview should
  own all providers, CSS, browser state, and network mocks so rendering the
  component in the story is enough.
- New "#### Args vs render" subsection under Step 5 with two full examples
  (args-driven Button, render-based composition inside Card), via two new
  self-contained helpers getArgsStoryExample and getRenderCompositionExample.
- New Step 7 "Prove CSS is loaded in exactly one story named CssCheck" asserting
  a component-specific computed style via getComputedStyle to catch "renders but
  CSS never loaded" failures. The former Steps 7 and 8 are renumbered to 8 and 9.

Makes #34596 redundant against this branch.
---
 .../src/ai/prompts/pattern-copy-play.ts       | 171 +++++++++++++++++-
 1 file changed, 169 insertions(+), 2 deletions(-)

diff --git a/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts b/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
index 7f53f05d15bb..f29aa26dbd01 100644
--- a/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
+++ b/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
@@ -265,6 +265,144 @@ function getNeedsWorkTagExample(projectInfo: ProjectInfo): string {
   `;
 }
 
+function getArgsStoryExample(projectInfo: ProjectInfo): string {
+  if (projectInfo.hasCsfFactoryPreview) {
+    return dedent`
+      \`\`\`tsx
+      import preview from '#.storybook/preview';
+      import { expect } from 'storybook/test';
+      import { Button } from './Button';
+
+      const meta = preview.meta({
+        component: Button,
+        tags: ['ai-generated'],
+      });
+
+      export const Primary = meta.story({
+        args: {
+          variant: 'primary',
+          children: 'Save',
+        },
+        play: async ({ canvas }) => {
+          await expect(canvas.getByRole('button', { name: /save/i })).toBeVisible();
+        },
+      });
+
+      export const Disabled = meta.story({
+        args: {
+          variant: 'primary',
+          disabled: true,
+          children: 'Save',
+        },
+        play: async ({ canvas }) => {
+          await expect(canvas.getByRole('button')).toBeDisabled();
+        },
+      });
+      \`\`\`
+    `;
+  }
+
+  const typeImport = getTypeImportSource(projectInfo);
+
+  return dedent`
+    \`\`\`tsx
+    import type { Meta, StoryObj } from '${typeImport}';
+    import { expect } from 'storybook/test';
+    import { Button } from './Button';
+
+    const meta = {
+      component: Button,
+      tags: ['ai-generated'],
+    } satisfies Meta<typeof Button>;
+
+    export default meta;
+    type Story = StoryObj<typeof meta>;
+
+    export const Primary: Story = {
+      args: {
+        variant: 'primary',
+        children: 'Save',
+      },
+      play: async ({ canvas }) => {
+        await expect(canvas.getByRole('button', { name: /save/i })).toBeVisible();
+      },
+    };
+
+    export const Disabled: Story = {
+      args: {
+        variant: 'primary',
+        disabled: true,
+        children: 'Save',
+      },
+      play: async ({ canvas }) => {
+        await expect(canvas.getByRole('button')).toBeDisabled();
+      },
+    };
+    \`\`\`
+  `;
+}
+
+function getRenderCompositionExample(projectInfo: ProjectInfo): string {
+  if (projectInfo.hasCsfFactoryPreview) {
+    return dedent`
+      \`\`\`tsx
+      import preview from '#.storybook/preview';
+      import { expect } from 'storybook/test';
+      import { Button } from './Button';
+      import { Card } from './Card';
+
+      const meta = preview.meta({
+        component: Button,
+        tags: ['ai-generated'],
+      });
+
+      export const InsideCard = meta.story({
+        render: () => (
+          <Card>
+            <Button disabled={false}>Save</Button>
+          </Card>
+        ),
+        play: async ({ canvas, userEvent }) => {
+          await expect(canvas.getByRole('button', { name: /save/i })).toBeVisible();
+          await userEvent.click(canvas.getByRole('button', { name: /save/i }));
+        },
+      });
+      \`\`\`
+    `;
+  }
+
+  const typeImport = getTypeImportSource(projectInfo);
+
+  return dedent`
+    \`\`\`tsx
+    import type { Meta, StoryObj } from '${typeImport}';
+    import { expect } from 'storybook/test';
+    import { Button } from './Button';
+    import { Card } from './Card';
+
+    const meta = {
+      component: Button,
+      tags: ['ai-generated'],
+    } satisfies Meta<typeof Button>;
+
+    export default meta;
+    type Story = StoryObj<typeof meta>;
+
+    export const InsideCard: Story = {
+      render: () => (
+        <Card>
+          <Button disabled={false}>Save</Button>
+        </Card>
+      ),
+      play: async ({ canvas, userEvent }) => {
+        await expect(canvas.getByRole('button', { name: /save/i })).toBeVisible();
+        await userEvent.click(canvas.getByRole('button', { name: /save/i }));
+      },
+    };
+    \`\`\`
+  `;
+}
+
 function getPageStoryExample(projectInfo: ProjectInfo): string {
   if (projectInfo.hasCsfFactoryPreview) {
     return dedent`
@@ -328,6 +466,8 @@ export function patternCopyPlayInstructions(projectInfo: ProjectInfo): string {
     Your goal is to make Storybook fully functional in this project by analyzing the codebase,
     configuring the preview with the right decorators, and writing stories for some components.
 
+    The end state should be a Storybook where any component — from a small button to a full page — can be added without story-specific workarounds. All necessary providers, CSS, browser state, and network mocks should live in the shared preview so that just rendering the component in the story is enough.
+
     After each created story, run Vitest to verify it renders.
     If the test fails, read the error, fix the issue, and re-run until it passes before moving on.
 
@@ -566,6 +706,16 @@ export function patternCopyPlayInstructions(projectInfo: ProjectInfo): string {
 
     ${getNeedsWorkTagExample(projectInfo)}
 
+    #### Args vs render
+
+    For simple components where props drive the state, prefer \`args\` stories — no \`render\` function needed:
+
+    ${getArgsStoryExample(projectInfo)}
+
+    Use \`render\` when the story needs composition — wrapping the component in layout, combining multiple components, or passing children as JSX:
+
+    ${getRenderCompositionExample(projectInfo)}
+
     Keep app mocking and runtime setup in preview, not in the stories.
     Do not build large story-specific harnesses.
     Do not write story files for subcomponents, hooks, contexts, or helpers.
@@ -645,7 +795,24 @@ export function patternCopyPlayInstructions(projectInfo: ProjectInfo): string {
     - a toast, alert, or badge has the expected accessible text and visual state
     - a CSS class or computed style confirms the real state that matters
 
-    ### Step 7: Cover the patterns you found
+    ### Step 7: Prove CSS is loaded in exactly one story named \`CssCheck\`
+
+    In exactly one story, named \`CssCheck\`, assert a component-specific computed style. \`toBeVisible\` passes on an unstyled component; a concrete style value proves the shared preview loaded the app's CSS.
+
+    Pick a visually distinctive component, read a styling value from its source, and assert it with \`getComputedStyle\`:
+
+    \`\`\`tsx
+    export const CssCheck: Story = {
+      args: { children: "Submit" },
+      play: async ({ canvas }) => {
+        const button = canvas.getByRole("button", { name: /submit/i });
+        // PrimaryButton uses bg-blue-600 — fails if Tailwind / global CSS did not load.
+        await expect(getComputedStyle(button).backgroundColor).toBe("rgb(37, 99, 235)");
+      },
+    };
+    \`\`\`
+
+    ### Step 8: Cover the patterns you found
 
     Write stories for the real patterns in the codebase, for example:
 
@@ -661,7 +828,7 @@ export function patternCopyPlayInstructions(projectInfo: ProjectInfo): string {
 
     ${getPageStoryExample(projectInfo)}
 
-    ### Step 8: Verify both rendering and types
+    ### Step 9: Verify both rendering and types
 
     As you work, verify the stories with Vitest:
 

From 741237eb3c87550b9d0f57ca9a8e4475f9fd49c7 Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Mon, 20 Apr 2026 22:16:07 +0700
Subject: [PATCH 07/17] chore: oxfmt scripts/eval/eval.ts (fix CI format-check)

Trivial one-line signature reflow picked up by `oxfmt --check` after
the merge of #34602 into this branch. No behavior change.
---
 scripts/eval/eval.ts | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts
index afa3565b26ed..b66d39148c27 100644
--- a/scripts/eval/eval.ts
+++ b/scripts/eval/eval.ts
@@ -241,11 +241,7 @@ function inferAgent(model: string): AgentId {
   throw new Error(`No agent found for model: ${model}`);
 }
 
-function buildManualCommand(
-  variant: AgentVariant,
-  promptPath: string,
-  promptName: string
-): string {
+function buildManualCommand(variant: AgentVariant, promptPath: string, promptName: string): string {
   // EVAL_SETUP_PROMPT must be in the env the agent inherits, so that the
   // agent's own `npx storybook ai setup` tool call picks the right variant.
   const envPrefix = `EVAL_SETUP_PROMPT=${promptName} `;

From 5be3f0b2806a5bfbbd603f25120dc740f9c7d9af Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Tue, 21 Apr 2026 17:18:45 +0700
Subject: [PATCH 08/17] CLI: Capture real ai-setup markdown in trial records
 and normalize prompt exports

- Spawn `npx storybook ai setup` inside the trial workspace with
  `EVAL_SETUP_PROMPT=<name>` and save its stdout as `prompt.content` so
  `data.json` and transcript docs carry the project-aware instructions
  instead of the one-line nudge.
- Rename every prompt variant's builder to `instructions` and use
  namespace imports in the prompts registry so all variant files share
  one export convention.
- Fix two stale `run-trial.test.ts` assertions that still expected the
  full markdown as the agent prompt; mock `tinyexec` and cover the new
  `prompt.content` field.
- Collapse `buildManualCommand` signature in `eval.ts` onto one line so
  `yarn fmt:check` passes.
---
 .../lib/cli-storybook/src/ai/prompts/index.ts | 12 +++--
 .../src/ai/prompts/pattern-copy-play.ts       |  2 +-
 .../lib/cli-storybook/src/ai/prompts/setup.ts |  2 +-
 scripts/eval/eval.ts                          |  6 +--
 scripts/eval/lib/run-trial.test.ts            | 27 ++++++++--
 scripts/eval/lib/run-trial.ts                 | 53 ++++++++++++++++---
 6 files changed, 81 insertions(+), 21 deletions(-)

diff --git a/code/lib/cli-storybook/src/ai/prompts/index.ts b/code/lib/cli-storybook/src/ai/prompts/index.ts
index 799afe31cd5a..d53612ef5b3d 100644
--- a/code/lib/cli-storybook/src/ai/prompts/index.ts
+++ b/code/lib/cli-storybook/src/ai/prompts/index.ts
@@ -1,16 +1,18 @@
 import type { AiPrompt, ProjectInfo } from '../types.ts';
 
-import { patternCopyPlayInstructions } from './pattern-copy-play.ts';
-import { setupInstructions } from './setup.ts';
+import * as patternCopyPlay from './pattern-copy-play.ts';
+import * as setup from './setup.ts';
 
 /**
  * Registry of all prompt builders. Each key is a prompt identifier used only
  * internally (by the eval harness via `EVAL_SETUP_PROMPT`); users never see
- * these names.
+ * these names. Each variant file exports an `instructions(projectInfo)`
+ * function; namespace imports keep the registry self-describing and make the
+ * convention uniform.
  */
 const PROMPT_BUILDERS = {
-  'pattern-copy-play': patternCopyPlayInstructions,
-  setup: setupInstructions,
+  'pattern-copy-play': patternCopyPlay.instructions,
+  setup: setup.instructions,
 } satisfies Record<string, (projectInfo: ProjectInfo) => string>;
 
 export type PromptName = keyof typeof PROMPT_BUILDERS;
diff --git a/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts b/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
index f29aa26dbd01..573e1fbc55db 100644
--- a/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
+++ b/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
@@ -456,7 +456,7 @@ function getPageStoryExample(projectInfo: ProjectInfo): string {
   `;
 }
 
-export function patternCopyPlayInstructions(projectInfo: ProjectInfo): string {
+export function instructions(projectInfo: ProjectInfo): string {
   const configDir = projectInfo.configDir;
   const typeImport = getTypeImportSource(projectInfo);
 
diff --git a/code/lib/cli-storybook/src/ai/prompts/setup.ts b/code/lib/cli-storybook/src/ai/prompts/setup.ts
index 33028403bfc6..e9cdb3ff710d 100644
--- a/code/lib/cli-storybook/src/ai/prompts/setup.ts
+++ b/code/lib/cli-storybook/src/ai/prompts/setup.ts
@@ -142,7 +142,7 @@ function getNeedsWorkTagExample(projectInfo: ProjectInfo): string {
   `;
 }
 
-export function setupInstructions(projectInfo: ProjectInfo): string {
+export function instructions(projectInfo: ProjectInfo): string {
   const configDir = projectInfo.configDir;
 
   return dedent`
diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts
index afa3565b26ed..b66d39148c27 100644
--- a/scripts/eval/eval.ts
+++ b/scripts/eval/eval.ts
@@ -241,11 +241,7 @@ function inferAgent(model: string): AgentId {
   throw new Error(`No agent found for model: ${model}`);
 }
 
-function buildManualCommand(
-  variant: AgentVariant,
-  promptPath: string,
-  promptName: string
-): string {
+function buildManualCommand(variant: AgentVariant, promptPath: string, promptName: string): string {
   // EVAL_SETUP_PROMPT must be in the env the agent inherits, so that the
   // agent's own `npx storybook ai setup` tool call picks the right variant.
   const envPrefix = `EVAL_SETUP_PROMPT=${promptName} `;
diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts
index 6022b45b5e62..d7bae7ee55d4 100644
--- a/scripts/eval/lib/run-trial.test.ts
+++ b/scripts/eval/lib/run-trial.test.ts
@@ -45,7 +45,15 @@ vi.mock('./agents/claude-code', () => ({
 vi.mock('./agents/codex', () => ({
   codexAgent: { name: 'codex', execute: vi.fn() },
 }));
+vi.mock('tinyexec', () => ({
+  x: vi.fn().mockResolvedValue({
+    exitCode: 0,
+    stdout: '# Storybook Setup\n\nFull project-aware instructions...',
+    stderr: '',
+  }),
+}));
 
+import { x } from 'tinyexec';
 import { claudeAgent } from './agents/claude-code.ts';
 import { collectGhostStoriesGrade, grade } from './grade.ts';
 import { prepareTrial } from './prepare-trial.ts';
@@ -158,13 +166,25 @@ describe('runTrial pipeline', () => {
 
     const params = vi.mocked(claudeAgent.execute).mock.calls[0][0];
     expect(params).toMatchObject({
-      prompt: expect.stringContaining('set up Storybook'),
+      prompt: expect.stringContaining('npx storybook ai setup'),
       projectPath: TMP,
       variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' },
       resultsDir: join(TMP, '.storybook', 'eval-results'),
+      env: { EVAL_SETUP_PROMPT: 'setup' },
     });
     expect(params.logger).toBeDefined();
 
+    expect(vi.mocked(x)).toHaveBeenCalledWith(
+      'npx',
+      ['storybook', 'ai', 'setup'],
+      expect.objectContaining({
+        nodeOptions: expect.objectContaining({
+          cwd: TMP,
+          env: expect.objectContaining({ EVAL_SETUP_PROMPT: 'setup' }),
+        }),
+      })
+    );
+
     const gradeWorkspace = vi.mocked(grade).mock.calls[0][0];
     expect(gradeWorkspace).toMatchObject({
       baselineCommit: 'deadbeef',
@@ -217,6 +237,7 @@ describe('runTrial pipeline', () => {
       },
       prompt: {
         name: 'setup',
+        content: expect.stringContaining('Full project-aware instructions'),
       },
       artifacts: {
         buildOutput: { path: '.storybook/eval-results/build-output.txt', success: true },
@@ -227,7 +248,7 @@ describe('runTrial pipeline', () => {
       },
       docs: {
         transcript: {
-          prompt: expect.stringContaining('set up Storybook'),
+          prompt: expect.stringContaining('Full project-aware instructions'),
         },
       },
     });
@@ -235,7 +256,7 @@ describe('runTrial pipeline', () => {
     expect(data).not.toHaveProperty('artifacts.screenshotOutput');
 
     const promptContent = readFileSync(join(resultsDir, 'prompt.md'), 'utf-8');
-    expect(promptContent).toContain('set up Storybook');
+    expect(promptContent).toContain('npx storybook ai setup');
     expect(() => readFileSync(join(resultsDir, 'summary.json'), 'utf-8')).toThrow();
     expect(() => readFileSync(join(resultsDir, 'transcript.json'), 'utf-8')).toThrow();
   });
diff --git a/scripts/eval/lib/run-trial.ts b/scripts/eval/lib/run-trial.ts
index 5616aeeafc0a..1571696d682a 100644
--- a/scripts/eval/lib/run-trial.ts
+++ b/scripts/eval/lib/run-trial.ts
@@ -1,5 +1,6 @@
 import { writeFile } from 'node:fs/promises';
 import { join } from 'pathe';
+import { x } from 'tinyexec';
 import type { Logger } from './utils.ts';
 import type { AgentId, AgentDriver, AgentVariant } from './agents/config.ts';
 import type { Project } from './projects.ts';
@@ -70,7 +71,13 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
   const prompt = loadPrompt(promptName);
   await writeFile(join(workspace.resultsDir, 'prompt.md'), prompt);
 
-  // 5. Execute the agent. EVAL_SETUP_PROMPT is forwarded into the agent's
+  // 5. Capture the full markdown the agent will receive from `ai setup` so
+  //    the trial record contains a reproducible, project-aware snapshot of
+  //    the instructions (not just the one-line nudge). Runs the same CLI the
+  //    agent will run, in the same workspace, with the same env.
+  const promptContent = await captureAiSetupMarkdown(workspace.projectPath, promptName, log);
+
+  // 6. Execute the agent. EVAL_SETUP_PROMPT is forwarded into the agent's
   //    environment so its `ai setup` tool call resolves to the selected
   //    prompt variant (unset for real users → always the default).
   log.log(`  Running ${agentName} (${model}, effort=${variant.effort})...`);
@@ -99,7 +106,7 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
     },
   };
 
-  // 6. Write provisional data so the baseline-owned MDX files can resolve it during grading.
+  // 7. Write provisional data so the baseline-owned MDX files can resolve it during grading.
   const provisionalData = buildEvalData({
     id: trialId,
     timestamp,
@@ -107,7 +114,7 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
     variant,
     prompt: {
       name: promptName,
-      content: prompt,
+      content: promptContent,
     },
     baselineCommit: workspace.baselineCommit,
     environment,
@@ -136,10 +143,10 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
     JSON.stringify(provisionalData, null, 2)
   );
 
-  // 6. Grade the results using story-render preview gain as the score.
+  // 8. Grade the results using story-render preview gain as the score.
   const { grade: trialGrade, score } = await grade(workspace, log, baselineGhostStories);
 
-  // 7. Rewrite the provisional data with the final grade.
+  // 9. Rewrite the provisional data with the final grade.
   const reportForCommit = buildEvalData({
     ...provisionalData,
     grade: trialGrade,
@@ -162,7 +169,7 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
     JSON.stringify(reportForCommit, null, 2)
   );
 
-  // 8. Commit, push, and open the benchmark PR
+  // 10. Commit, push, and open the benchmark PR
   const publish = await publishTrialBranch({
     data: reportForCommit,
     workspace,
@@ -176,3 +183,37 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
     publish,
   };
 }
+
+/**
+ * Run `npx storybook ai setup` inside the prepared trial workspace and return
+ * its stdout — the exact project-aware markdown the agent will receive from
+ * the same CLI invocation. `EVAL_SETUP_PROMPT` selects the variant.
+ */
+async function captureAiSetupMarkdown(
+  projectPath: string,
+  promptName: string,
+  log: Logger
+): Promise<string> {
+  const result = await x('npx', ['storybook', 'ai', 'setup'], {
+    throwOnError: false,
+    timeout: 60_000,
+    nodeOptions: {
+      cwd: projectPath,
+      env: {
+        ...process.env,
+        STORYBOOK_DISABLE_TELEMETRY: '1',
+        EVAL_SETUP_PROMPT: promptName,
+      },
+    },
+  });
+
+  if (result.exitCode !== 0) {
+    log.logError(
+      `Failed to capture ai setup markdown (exit ${result.exitCode}). Falling back to nudge-only record.`
+    );
+    log.logError(result.stderr.trim() || result.stdout.trim());
+    return '';
+  }
+
+  return result.stdout.trim();
+}

From 42f4ed9fe0b207cb9dfbef59225d7cef3d01ce3a Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Tue, 21 Apr 2026 18:31:12 +0700
Subject: [PATCH 09/17] CLI: Address ai-setup eval review findings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Gate ai-setup preview snapshot + ai-setup-pending cache write on
  !disableTelemetry — the record is only consumed by the ai-setup-evidence
  telemetry event, so it has no consumer when telemetry is off.
- Plumb disableTelemetry through AiSetupOptions and add a unit test covering
  the enabled / disabled / default paths.
- Swap `requested in PROMPT_BUILDERS` for `Object.hasOwn(...)` so prototype
  property names in EVAL_SETUP_PROMPT fall back to the default.
- Wrap captureAiSetupMarkdown in try/catch so spawn or timeout failures log
  and return an empty string instead of aborting the trial; set
  STORYBOOK_DISABLE_TELEMETRY=1 on the subprocess env.
---
 code/lib/cli-storybook/src/ai/index.test.ts   | 89 +++++++++++++++++++
 code/lib/cli-storybook/src/ai/index.ts        | 33 ++++---
 .../lib/cli-storybook/src/ai/prompts/index.ts |  2 +-
 code/lib/cli-storybook/src/ai/types.ts        |  2 +
 docs/configure/integration/eslint-plugin.mdx  | 30 +++----
 scripts/eval/lib/run-trial.test.ts            |  5 +-
 scripts/eval/lib/run-trial.ts                 | 49 ++++++----
 7 files changed, 163 insertions(+), 47 deletions(-)
 create mode 100644 code/lib/cli-storybook/src/ai/index.test.ts

diff --git a/code/lib/cli-storybook/src/ai/index.test.ts b/code/lib/cli-storybook/src/ai/index.test.ts
new file mode 100644
index 000000000000..bafa33bd31b3
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/index.test.ts
@@ -0,0 +1,89 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+vi.mock('storybook/internal/common', async () => {
+  const actual = await vi.importActual<typeof import('storybook/internal/common')>(
+    'storybook/internal/common'
+  );
+  return {
+    ...actual,
+    cache: { set: vi.fn(), get: vi.fn(), remove: vi.fn() },
+  };
+});
+
+vi.mock('storybook/internal/telemetry', () => ({
+  telemetry: vi.fn(),
+  getSessionId: vi.fn().mockResolvedValue('session-xyz'),
+  snapshotPreviewFile: vi
+    .fn()
+    .mockResolvedValue({ previewPath: '/proj/.storybook/preview.ts', previewHash: 'abc' }),
+}));
+
+vi.mock('storybook/internal/node-logger', () => ({
+  logger: { log: vi.fn(), error: vi.fn(), warn: vi.fn(), debug: vi.fn() },
+}));
+
+vi.mock('../../../create-storybook/src/services/ProjectTypeService.ts', () => ({
+  ProjectTypeService: class {
+    async detectLanguage() {
+      return 'ts';
+    }
+  },
+}));
+
+vi.mock('../automigrate/helpers/mainConfigFile.ts', () => ({
+  getStorybookData: vi.fn().mockResolvedValue({
+    versionInstalled: '10.4.0',
+    frameworkPackage: '@storybook/react-vite',
+    rendererPackage: '@storybook/react',
+    renderer: 'react',
+    builderPackage: '@storybook/builder-vite',
+    addons: [],
+    configDir: '/proj/.storybook',
+    storiesPaths: [],
+    hasCsfFactoryPreview: false,
+    packageManager: {},
+  }),
+}));
+
+import { cache } from 'storybook/internal/common';
+import { snapshotPreviewFile, telemetry } from 'storybook/internal/telemetry';
+
+import { aiSetup } from './index.ts';
+
+beforeEach(() => {
+  vi.mocked(cache.set).mockClear();
+  vi.mocked(snapshotPreviewFile).mockClear();
+  vi.mocked(telemetry).mockClear();
+});
+
+describe('aiSetup telemetry gating', () => {
+  it('records ai-setup-pending + preview snapshot when telemetry is enabled', async () => {
+    await aiSetup({ configDir: '/proj/.storybook', disableTelemetry: false });
+
+    expect(vi.mocked(snapshotPreviewFile)).toHaveBeenCalledTimes(1);
+    expect(vi.mocked(cache.set)).toHaveBeenCalledWith(
+      'ai-setup-pending',
+      expect.objectContaining({
+        configDir: expect.stringContaining('.storybook'),
+        sessionId: 'session-xyz',
+        previewPath: '/proj/.storybook/preview.ts',
+        previewHash: 'abc',
+      })
+    );
+    expect(vi.mocked(telemetry)).toHaveBeenCalledWith('ai-setup', expect.any(Object));
+  });
+
+  it('skips snapshot + cache write when telemetry is disabled', async () => {
+    await aiSetup({ configDir: '/proj/.storybook', disableTelemetry: true });
+
+    expect(vi.mocked(snapshotPreviewFile)).not.toHaveBeenCalled();
+    expect(vi.mocked(cache.set)).not.toHaveBeenCalled();
+  });
+
+  it('treats missing disableTelemetry as enabled (backwards compatible default)', async () => {
+    await aiSetup({ configDir: '/proj/.storybook' });
+
+    expect(vi.mocked(snapshotPreviewFile)).toHaveBeenCalledTimes(1);
+    expect(vi.mocked(cache.set)).toHaveBeenCalledWith('ai-setup-pending', expect.any(Object));
+  });
+});
diff --git a/code/lib/cli-storybook/src/ai/index.ts b/code/lib/cli-storybook/src/ai/index.ts
index 4107590ffc68..1e8e765f7852 100644
--- a/code/lib/cli-storybook/src/ai/index.ts
+++ b/code/lib/cli-storybook/src/ai/index.ts
@@ -19,7 +19,12 @@ import { generateMarkdownOutput } from './prompt.ts';
 import type { ProjectInfo, AiSetupOptions } from './types.ts';
 
 export async function aiSetup(options: AiSetupOptions): Promise<void> {
-  const { configDir: userConfigDir, packageManager: packageManagerName, output } = options;
+  const {
+    configDir: userConfigDir,
+    packageManager: packageManagerName,
+    output,
+    disableTelemetry,
+  } = options;
 
   let projectInfo: ProjectInfo;
 
@@ -99,17 +104,21 @@ export async function aiSetup(options: AiSetupOptions): Promise<void> {
 
   // Snapshot the preview file baseline and cache the pending setup record.
   // Subsequent CLI entry points (dev, build, doctor, etc.) read this to
-  // collect evidence of what the agent accomplished.
-  const resolvedConfigDir = resolve(projectInfo.configDir);
-  const previewSnapshot = await snapshotPreviewFile(resolvedConfigDir);
-  const sessionId = await getSessionId();
-  const pendingRecord: AiSetupPendingRecord = {
-    timestamp: Date.now(),
-    sessionId,
-    configDir: resolvedConfigDir,
-    ...previewSnapshot,
-  };
-  await cache.set('ai-setup-pending', pendingRecord);
+  // collect evidence of what the agent accomplished — but only via telemetry
+  // (the `ai-setup-evidence` event). Skip the snapshot + cache write when
+  // telemetry is disabled so there's nobody to read it.
+  if (!disableTelemetry) {
+    const resolvedConfigDir = resolve(projectInfo.configDir);
+    const previewSnapshot = await snapshotPreviewFile(resolvedConfigDir);
+    const sessionId = await getSessionId();
+    const pendingRecord: AiSetupPendingRecord = {
+      timestamp: Date.now(),
+      sessionId,
+      configDir: resolvedConfigDir,
+      ...previewSnapshot,
+    };
+    await cache.set('ai-setup-pending', pendingRecord);
+  }
 
   if (output) {
     const outputPath = resolve(output);
diff --git a/code/lib/cli-storybook/src/ai/prompts/index.ts b/code/lib/cli-storybook/src/ai/prompts/index.ts
index d53612ef5b3d..641e3bfe6f33 100644
--- a/code/lib/cli-storybook/src/ai/prompts/index.ts
+++ b/code/lib/cli-storybook/src/ai/prompts/index.ts
@@ -35,7 +35,7 @@ const EVAL_SETUP_PROMPT_ENV = 'EVAL_SETUP_PROMPT';
 
 function resolvePromptName(): PromptName {
   const requested = process.env[EVAL_SETUP_PROMPT_ENV]?.trim();
-  if (requested && requested in PROMPT_BUILDERS) {
+  if (requested && Object.hasOwn(PROMPT_BUILDERS, requested)) {
     return requested as PromptName;
   }
   return DEFAULT_PROMPT_NAME;
diff --git a/code/lib/cli-storybook/src/ai/types.ts b/code/lib/cli-storybook/src/ai/types.ts
index 3a9a1b52be28..93a3ea88aabd 100644
--- a/code/lib/cli-storybook/src/ai/types.ts
+++ b/code/lib/cli-storybook/src/ai/types.ts
@@ -4,6 +4,8 @@ export interface AiSetupOptions {
   configDir?: string;
   packageManager?: string;
   output?: string;
+  /** Populated from the program-level `--disable-telemetry` flag (defaults from `STORYBOOK_DISABLE_TELEMETRY`). */
+  disableTelemetry?: boolean;
 }
 
 export interface ProjectInfo {
diff --git a/docs/configure/integration/eslint-plugin.mdx b/docs/configure/integration/eslint-plugin.mdx
index 1e20b0ce2714..9990fe924ecc 100644
--- a/docs/configure/integration/eslint-plugin.mdx
+++ b/docs/configure/integration/eslint-plugin.mdx
@@ -39,10 +39,10 @@ For more details on why this line is required in the `.eslintignore` file, refer
 If you are using [flat config style](https://eslint.org/docs/latest/use/configure/configuration-files-new), add this to your configuration file:
 
 ```js title="eslint.config.js"
-import { defineConfig, globalIgnores } from 'eslint/config';
+import { defineConfig, globalIgnores } from "eslint/config";
 
 export default defineConfig([
-  globalIgnores(['!.storybook'], 'Include Storybook Directory'),
+  globalIgnores(["!.storybook"], "Include Storybook Directory"),
   // ...
 ]);
 ```
@@ -91,12 +91,12 @@ Optionally, you can override, add to, or disable individual rules. You likely do
 Use the `eslint.config.js` file to configure rules using the [flat config style](https://eslint.org/docs/latest/use/configure/configuration-files-new). This is the default in ESLint v9, but can be used starting from ESLint v8.57.0. [ESLint docs](https://eslint.org/docs/latest/use/configure/configuration-files-new).
 
 ```js title="eslint.config.js"
-import storybook from 'eslint-plugin-storybook';
+import storybook from "eslint-plugin-storybook";
 // Replace the eslint/config package with @eslint/config-helpers if you're using an older version of ESLint.
-import { defineConfig } from 'eslint/config';
+import { defineConfig } from "eslint/config";
 
 export default defineConfig([
-  ...storybook.configs['flat/recommended'],
+  ...storybook.configs["flat/recommended"],
   // Add more configuration options and generic rulesets here, such as js.configs.recommended
 ]);
 ```
@@ -104,13 +104,13 @@ export default defineConfig([
 In case you are using utility functions from tools like `tseslint`, you might need to register the plugin a little differently:
 
 ```ts title="eslint.config.ts"
-import storybook from 'eslint-plugin-storybook';
-import somePlugin from 'some-plugin';
-import tseslint from 'typescript-eslint';
+import storybook from "eslint-plugin-storybook";
+import somePlugin from "some-plugin";
+import tseslint from "typescript-eslint";
 
 export default tseslint.config(
   somePlugin,
-  storybook.configs['flat/recommended'], // notice that it is not destructured
+  storybook.configs["flat/recommended"], // notice that it is not destructured
 );
 ```
 
@@ -119,19 +119,19 @@ export default tseslint.config(
 Optionally, you can override, add, or disable individual rules. You likely don't want these settings to be applied to every file, so ensure that you add a flat config section in your `eslint.config.js` file that applies the overrides only to your story files.
 
 ```js title="eslint.config.js"
-import storybook from 'eslint-plugin-storybook';
-import { defineConfig } from 'eslint/config';
+import storybook from "eslint-plugin-storybook";
+import { defineConfig } from "eslint/config";
 
 export default defineConfig([
-  ...storybook.configs['flat/recommended'],
+  ...storybook.configs["flat/recommended"],
   {
     // 👇 This should match the `stories` property in .storybook/main.js|ts
-    files: ['**/*.stories.@(ts|tsx|js|jsx|mjs|cjs)'],
+    files: ["**/*.stories.@(ts|tsx|js|jsx|mjs|cjs)"],
     rules: {
       // 👇 Enable this rule
-      'storybook/csf-component': 'error',
+      "storybook/csf-component": "error",
       // 👇 Disable this rule
-      'storybook/default-exports': 'off',
+      "storybook/default-exports": "off",
     },
   },
 ]);
diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts
index d7bae7ee55d4..7db1fc7560ea 100644
--- a/scripts/eval/lib/run-trial.test.ts
+++ b/scripts/eval/lib/run-trial.test.ts
@@ -180,7 +180,10 @@ describe('runTrial pipeline', () => {
       expect.objectContaining({
         nodeOptions: expect.objectContaining({
           cwd: TMP,
-          env: expect.objectContaining({ EVAL_SETUP_PROMPT: 'setup' }),
+          env: expect.objectContaining({
+            EVAL_SETUP_PROMPT: 'setup',
+            STORYBOOK_DISABLE_TELEMETRY: '1',
+          }),
         }),
       })
     );
diff --git a/scripts/eval/lib/run-trial.ts b/scripts/eval/lib/run-trial.ts
index 1571696d682a..6add6624f83f 100644
--- a/scripts/eval/lib/run-trial.ts
+++ b/scripts/eval/lib/run-trial.ts
@@ -187,33 +187,46 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
 /**
  * Run `npx storybook ai setup` inside the prepared trial workspace and return
  * its stdout — the exact project-aware markdown the agent will receive from
- * the same CLI invocation. `EVAL_SETUP_PROMPT` selects the variant.
+ * the same CLI invocation. `EVAL_SETUP_PROMPT` selects the variant;
+ * `STORYBOOK_DISABLE_TELEMETRY` keeps the harness's capture invocation out of
+ * telemetry.
+ *
+ * Failures (spawn errors, timeouts, non-zero exit) are logged and swallowed:
+ * capturing the prompt content is bookkeeping, not the thing being measured,
+ * so it must never abort the trial.
  */
 async function captureAiSetupMarkdown(
   projectPath: string,
   promptName: string,
   log: Logger
 ): Promise<string> {
-  const result = await x('npx', ['storybook', 'ai', 'setup'], {
-    throwOnError: false,
-    timeout: 60_000,
-    nodeOptions: {
-      cwd: projectPath,
-      env: {
-        ...process.env,
-        STORYBOOK_DISABLE_TELEMETRY: '1',
-        EVAL_SETUP_PROMPT: promptName,
+  try {
+    const result = await x('npx', ['storybook', 'ai', 'setup'], {
+      throwOnError: false,
+      timeout: 60_000,
+      nodeOptions: {
+        cwd: projectPath,
+        env: {
+          ...process.env,
+          EVAL_SETUP_PROMPT: promptName,
+          STORYBOOK_DISABLE_TELEMETRY: '1',
+        },
       },
-    },
-  });
-
-  if (result.exitCode !== 0) {
+    });
+
+    if (result.exitCode !== 0) {
+      log.logError(
+        `Failed to capture ai setup markdown (exit ${result.exitCode}). Falling back to nudge-only record.`
+      );
+      log.logError(result.stderr.trim() || result.stdout.trim());
+      return '';
+    }
+
+    return result.stdout.trim();
+  } catch (error) {
     log.logError(
-      `Failed to capture ai setup markdown (exit ${result.exitCode}). Falling back to nudge-only record.`
+      `Failed to capture ai setup markdown (${error instanceof Error ? error.message : String(error)}). Falling back to nudge-only record.`
     );
-    log.logError(result.stderr.trim() || result.stdout.trim());
     return '';
   }
-
-  return result.stdout.trim();
 }

From d86161ff29667ce3898d6d79bcdd375aef757450 Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Tue, 21 Apr 2026 18:56:44 +0700
Subject: [PATCH 10/17] Docs: Drop stray eslint-plugin.mdx quote-style change

The single-to-double quote edit landed by accident in an earlier commit of
this PR (PATCH 09/17 of this series) and doesn't belong to the eval-prompts
refactor. It also broke `yarn fmt:check` (oxfmt enforces single quotes).
Restoring the base-branch state fixes CI and keeps this PR scoped to the
CLI/eval work.
---
 docs/configure/integration/eslint-plugin.mdx | 30 ++++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/configure/integration/eslint-plugin.mdx b/docs/configure/integration/eslint-plugin.mdx
index 9990fe924ecc..1e20b0ce2714 100644
--- a/docs/configure/integration/eslint-plugin.mdx
+++ b/docs/configure/integration/eslint-plugin.mdx
@@ -39,10 +39,10 @@ For more details on why this line is required in the `.eslintignore` file, refer
 If you are using [flat config style](https://eslint.org/docs/latest/use/configure/configuration-files-new), add this to your configuration file:
 
 ```js title="eslint.config.js"
-import { defineConfig, globalIgnores } from "eslint/config";
+import { defineConfig, globalIgnores } from 'eslint/config';
 
 export default defineConfig([
-  globalIgnores(["!.storybook"], "Include Storybook Directory"),
+  globalIgnores(['!.storybook'], 'Include Storybook Directory'),
   // ...
 ]);
 ```
@@ -91,12 +91,12 @@ Optionally, you can override, add to, or disable individual rules. You likely do
 Use the `eslint.config.js` file to configure rules using the [flat config style](https://eslint.org/docs/latest/use/configure/configuration-files-new). This is the default in ESLint v9, but can be used starting from ESLint v8.57.0. [ESLint docs](https://eslint.org/docs/latest/use/configure/configuration-files-new).
 
 ```js title="eslint.config.js"
-import storybook from "eslint-plugin-storybook";
+import storybook from 'eslint-plugin-storybook';
 // Replace the eslint/config package with @eslint/config-helpers if you're using an older version of ESLint.
-import { defineConfig } from "eslint/config";
+import { defineConfig } from 'eslint/config';
 
 export default defineConfig([
-  ...storybook.configs["flat/recommended"],
+  ...storybook.configs['flat/recommended'],
   // Add more configuration options and generic rulesets here, such as js.configs.recommended
 ]);
 ```
@@ -104,13 +104,13 @@ export default defineConfig([
 In case you are using utility functions from tools like `tseslint`, you might need to register the plugin a little differently:
 
 ```ts title="eslint.config.ts"
-import storybook from "eslint-plugin-storybook";
-import somePlugin from "some-plugin";
-import tseslint from "typescript-eslint";
+import storybook from 'eslint-plugin-storybook';
+import somePlugin from 'some-plugin';
+import tseslint from 'typescript-eslint';
 
 export default tseslint.config(
   somePlugin,
-  storybook.configs["flat/recommended"], // notice that it is not destructured
+  storybook.configs['flat/recommended'], // notice that it is not destructured
 );
 ```
 
@@ -119,19 +119,19 @@ export default tseslint.config(
 Optionally, you can override, add, or disable individual rules. You likely don't want these settings to be applied to every file, so ensure that you add a flat config section in your `eslint.config.js` file that applies the overrides only to your story files.
 
 ```js title="eslint.config.js"
-import storybook from "eslint-plugin-storybook";
-import { defineConfig } from "eslint/config";
+import storybook from 'eslint-plugin-storybook';
+import { defineConfig } from 'eslint/config';
 
 export default defineConfig([
-  ...storybook.configs["flat/recommended"],
+  ...storybook.configs['flat/recommended'],
   {
     // 👇 This should match the `stories` property in .storybook/main.js|ts
-    files: ["**/*.stories.@(ts|tsx|js|jsx|mjs|cjs)"],
+    files: ['**/*.stories.@(ts|tsx|js|jsx|mjs|cjs)'],
     rules: {
       // 👇 Enable this rule
-      "storybook/csf-component": "error",
+      'storybook/csf-component': 'error',
       // 👇 Disable this rule
-      "storybook/default-exports": "off",
+      'storybook/default-exports': 'off',
     },
   },
 ]);

From a6aaf4f8869e075f5fb714a78d1efb3a814bcadd Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Wed, 22 Apr 2026 21:19:31 +0700
Subject: [PATCH 11/17] CI: type sync-storybook-version env as
 NodeJS.ProcessEnv

TypeScript narrowed the spread-derived object literal to only the
two explicit string properties, so `delete env.CI` failed the
`scripts:check` nx task on the merge commit. Annotating the local
with `NodeJS.ProcessEnv` restores the expected index signature
and lets the `delete` compile.
---
 scripts/eval/sync-storybook-version.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/eval/sync-storybook-version.ts b/scripts/eval/sync-storybook-version.ts
index d639fc505250..e7ace41eff3d 100644
--- a/scripts/eval/sync-storybook-version.ts
+++ b/scripts/eval/sync-storybook-version.ts
@@ -214,7 +214,7 @@ async function defaultRunUpgrade({
   // `--yes`/`--force` already disable prompts. `CI`, `YARN_ENABLE_IMMUTABLE_INSTALLS`,
   // and `npm_config_frozen_lockfile` would refuse lockfile updates and leave
   // package.json and the lockfile out of sync, so unset them here.
-  const env = {
+  const env: NodeJS.ProcessEnv = {
     ...process.env,
     YARN_ENABLE_IMMUTABLE_INSTALLS: 'false',
     npm_config_frozen_lockfile: 'false',

From d9db8a818e54a633691c88ae473b00b78ae09de2 Mon Sep 17 00:00:00 2001
From: Kasper Peulen <kasperpeulen@gmail.com>
Date: Wed, 22 Apr 2026 21:25:42 +0700
Subject: [PATCH 12/17] Eval: Type-annotate env literal in
 sync-storybook-version

`delete env.CI` failed TypeScript check because spreading `process.env`
into a literal with typed string fields narrowed `env` to only those
literal keys, losing the `[key: string]: string | undefined` index
signature. Annotating as `NodeJS.ProcessEnv` preserves the index
signature so `delete env.CI` type-checks.

Fixes the only non-skipped CI failure on this PR (scripts typecheck).
---
 scripts/eval/sync-storybook-version.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/eval/sync-storybook-version.ts b/scripts/eval/sync-storybook-version.ts
index d639fc505250..e7ace41eff3d 100644
--- a/scripts/eval/sync-storybook-version.ts
+++ b/scripts/eval/sync-storybook-version.ts
@@ -214,7 +214,7 @@ async function defaultRunUpgrade({
   // `--yes`/`--force` already disable prompts. `CI`, `YARN_ENABLE_IMMUTABLE_INSTALLS`,
   // and `npm_config_frozen_lockfile` would refuse lockfile updates and leave
   // package.json and the lockfile out of sync, so unset them here.
-  const env = {
+  const env: NodeJS.ProcessEnv = {
     ...process.env,
     YARN_ENABLE_IMMUTABLE_INSTALLS: 'false',
     npm_config_frozen_lockfile: 'false',

From 14d97d2982185a12eca4fba3b6e214a34fe4b24b Mon Sep 17 00:00:00 2001
From: yannbf <yannbf@gmail.com>
Date: Wed, 29 Apr 2026 00:02:49 +0200
Subject: [PATCH 13/17] simplify css check

---
 scripts/eval/lib/grade.test.ts         | 88 --------------------------
 scripts/eval/lib/grade.ts              | 87 +++++--------------------
 scripts/eval/lib/publish-trial.test.ts |  5 +-
 scripts/eval/lib/result-docs.test.ts   |  1 -
 scripts/eval/lib/run-trial.test.ts     |  4 +-
 scripts/eval/lib/run-trial.ts          |  1 -
 scripts/eval/lib/story-render.ts       |  5 +-
 7 files changed, 24 insertions(+), 167 deletions(-)

diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts
index 737feec93425..1d6cf13b1779 100644
--- a/scripts/eval/lib/grade.test.ts
+++ b/scripts/eval/lib/grade.test.ts
@@ -1,7 +1,6 @@
 import { describe, expect, it } from 'vitest';
 
 import {
-  diffAddsTokenInStoryFiles,
   filterStorybookFiles,
   computeQualityScore,
   countTypeCheckErrors,
@@ -57,93 +56,6 @@ describe('filterStorybookFiles', () => {
   });
 });
 
-describe('diffAddsTokenInStoryFiles', () => {
-  const storyChanges: FileChange[] = [
-    { path: 'src/Button.stories.tsx', gitStatus: 'A' },
-    { path: '.storybook/preview.tsx', gitStatus: 'M' },
-  ];
-
-  it('returns true when the token is added inside a story file', () => {
-    const diff = [
-      'diff --git a/src/Button.stories.tsx b/src/Button.stories.tsx',
-      '--- a/src/Button.stories.tsx',
-      '+++ b/src/Button.stories.tsx',
-      '@@ -0,0 +1,3 @@',
-      '+  const button = canvas.getByRole("button");',
-      '+  await expect(getComputedStyle(button).backgroundColor).toBe("rgb(37, 99, 235)");',
-    ].join('\n');
-    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'getComputedStyle')).toBe(true);
-  });
-
-  it('ignores the token when it only appears in non-story files (prompt.md, data.json)', () => {
-    const diff = [
-      '+++ b/.storybook/eval-results/prompt.md',
-      '+Use getComputedStyle to prove CSS loaded',
-      '+++ b/.storybook/eval-results/data.json',
-      '+"content": "...getComputedStyle..."',
-    ].join('\n');
-    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'getComputedStyle')).toBe(false);
-  });
-
-  it('ignores the token on removed lines', () => {
-    const diff = [
-      '+++ b/src/Button.stories.tsx',
-      '-  await expect(getComputedStyle(button).backgroundColor).toBe("red");',
-    ].join('\n');
-    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'getComputedStyle')).toBe(false);
-  });
-
-  it('does not match the `+++ b/...` file header itself', () => {
-    const diff = ['+++ b/src/getComputedStyle-notes.stories.tsx'].join('\n');
-    // The header mentions the token, but no content line does. Also the file is not in changes.
-    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'getComputedStyle')).toBe(false);
-  });
-
-  it('only considers files present in storybookChanges as story files', () => {
-    const diff = [
-      '+++ b/src/Button.stories.tsx',
-      '+  await expect(getComputedStyle(button).backgroundColor).toBe("red");',
-    ].join('\n');
-    // Pass an empty list — even though the file is named like a story, it is not in changes.
-    expect(diffAddsTokenInStoryFiles(diff, [], 'getComputedStyle')).toBe(false);
-  });
-
-  it('ignores non-story storybook files like .storybook/preview.tsx', () => {
-    const diff = [
-      '+++ b/.storybook/preview.tsx',
-      '+import { initialize } from "getComputedStyle";',
-    ].join('\n');
-    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'getComputedStyle')).toBe(false);
-  });
-
-  it('returns false for an empty diff', () => {
-    expect(diffAddsTokenInStoryFiles('', storyChanges, 'getComputedStyle')).toBe(false);
-  });
-
-  it('detects an `export const CssCheck` story added in a story file', () => {
-    const diff = [
-      '+++ b/src/Button.stories.tsx',
-      '@@ -0,0 +1,6 @@',
-      '+export const CssCheck: Story = {',
-      '+  args: { children: "Submit" },',
-      '+  play: async ({ canvas }) => {',
-      '+    const button = canvas.getByRole("button");',
-      '+    await expect(getComputedStyle(button).backgroundColor).toBe("rgb(37, 99, 235)");',
-      '+  },',
-      '+};',
-    ].join('\n');
-    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'export const CssCheck')).toBe(true);
-  });
-
-  it('ignores `export const CssCheck` added outside of story files (e.g. prompt.md)', () => {
-    const diff = [
-      '+++ b/.storybook/eval-results/prompt.md',
-      '+Name this story `CssCheck`, for example `export const CssCheck: Story = { ... }`.',
-    ].join('\n');
-    expect(diffAddsTokenInStoryFiles(diff, storyChanges, 'export const CssCheck')).toBe(false);
-  });
-});
-
 describe('computeQualityScore', () => {
   it('uses normalized preview gain as the score', () => {
     const result = computeQualityScore({
diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts
index 53a78faa3d5a..592171ec7086 100644
--- a/scripts/eval/lib/grade.ts
+++ b/scripts/eval/lib/grade.ts
@@ -11,7 +11,6 @@ import {
   getGeneratedStoryFiles,
   getScriptRunCommand,
   runStoryRenderPass,
-  STORY_FILE_PATTERN,
   type StoryRenderGrade,
   withBaselinePreviewEnvironment,
 } from './story-render.ts';
@@ -53,11 +52,6 @@ export interface Grade {
   ghostStories?: GhostStoryGrade;
   baselinePreviewStories?: StoryRenderGrade;
   storyRender?: StoryRenderGrade;
-  /**
-   * True when the agent added a story named `CssCheck` (a `play` function that asserts a
-   * component-specific computed style, to prove the shared preview loaded the app's CSS).
-   */
-  hasCssCheckStory: boolean;
 }
 
 /** Filter file changes to only storybook-related ones. */
@@ -99,40 +93,6 @@ export function countTypeCheckErrors(tscOutput: string): number {
   return (tscOutput.match(/error TS\d+/g) || []).length;
 }
 
-/**
- * Walks a unified `git diff` patch and returns true if any added line (`+`, not the `+++` header)
- * inside a story file contains `token`.
- *
- * Guards against false positives from the prompt markdown, the agent transcript, and other
- * artifacts that end up in the diff because we stage every file in the trial worktree.
- */
-export function diffAddsTokenInStoryFiles(
-  rawDiff: string,
-  storybookChanges: FileChange[],
-  token: string
-): boolean {
-  const changedStoryPaths = new Set(
-    storybookChanges
-      .filter((change) => change.gitStatus !== 'D' && STORY_FILE_PATTERN.test(change.path))
-      .map((change) => change.path)
-  );
-  if (changedStoryPaths.size === 0) return false;
-
-  let currentPathIsStory = false;
-  for (const line of rawDiff.split('\n')) {
-    if (line.startsWith('+++ ')) {
-      // `+++ b/<path>` (or `+++ /dev/null` for deletions). Track whether we're now inside a story file.
-      const path = line.slice(4).replace(/^b\//, '');
-      currentPathIsStory = changedStoryPaths.has(path);
-      continue;
-    }
-    if (!currentPathIsStory) continue;
-    if (!line.startsWith('+') || line.startsWith('+++')) continue;
-    if (line.includes(token)) return true;
-  }
-  return false;
-}
-
 /** Parse git diff --name-status output into FileChange objects. */
 export function parseChangedFiles(gitOutput: string): FileChange[] {
   return gitOutput
@@ -161,23 +121,12 @@ export async function grade(
 
   // Changed files
   logger.logStep('Collecting agent changes...');
-  const { changes: fileChanges, rawDiff } = await getChangedFiles(repoRoot, baselineCommit);
+  const fileChanges = await getChangedFiles(repoRoot, baselineCommit);
   const storybookChanges = filterStorybookFiles(fileChanges);
   logger.logSuccess(
     `${fileChanges.length} files changed (${storybookChanges.length} storybook-related)`
   );
 
-  const hasCssCheckStory = diffAddsTokenInStoryFiles(
-    rawDiff,
-    storybookChanges,
-    'export const CssCheck'
-  );
-  if (hasCssCheckStory) {
-    logger.logSuccess('CssCheck story present (export const CssCheck added in a story file)');
-  } else {
-    logger.logError('CssCheck story missing (no export const CssCheck added in a story file)');
-  }
-
   // Storybook build + TypeScript check in parallel
   logger.logStep('Running storybook build + typecheck...');
   const [build, tsc] = await Promise.all([
@@ -231,6 +180,15 @@ export async function grade(
     logger,
   });
 
+  const cssCheck = storyRenderRun.summary?.cssCheck ?? 'not-run';
+  if (cssCheck === 'pass') {
+    logger.logSuccess('CssCheck story passed');
+  } else if (cssCheck === 'fail') {
+    logger.logError('CssCheck story failed');
+  } else {
+    logger.logError('CssCheck story missing or not run');
+  }
+
   const baselinePreviewRun = await withBaselinePreviewEnvironment({
     repoRoot,
     baselineCommit,
@@ -257,7 +215,6 @@ export async function grade(
     ghostStories,
     baselinePreviewStories: baselinePreviewRun.summary,
     storyRender: storyRenderRun.summary,
-    hasCssCheckStory,
   };
 
   const score = computeQualityScore({
@@ -292,27 +249,15 @@ function parseGitDiffStatus(rawStatus?: string): GitDiffStatus {
     : 'M';
 }
 
-async function getChangedFiles(
-  repoRoot: string,
-  baseline: string
-): Promise<{ changes: FileChange[]; rawDiff: string }> {
+async function getChangedFiles(repoRoot: string, baseline: string): Promise<FileChange[]> {
   // Stage all files so `git diff --cached` picks up new files the agent created.
   // Safe: this runs on an ephemeral trial copy, not the real repo.
   await x('git', ['add', '-A'], { nodeOptions: { cwd: repoRoot } });
-  const [nameStatus, patch] = await Promise.all([
-    x('git', ['diff', '--cached', '--name-status', baseline], {
-      throwOnError: false,
-      nodeOptions: { cwd: repoRoot },
-    }),
-    x('git', ['diff', '--cached', baseline], {
-      throwOnError: false,
-      nodeOptions: { cwd: repoRoot },
-    }),
-  ]);
-  return {
-    changes: parseChangedFiles(nameStatus.stdout),
-    rawDiff: patch.stdout,
-  };
+  const { stdout } = await x('git', ['diff', '--cached', '--name-status', baseline], {
+    throwOnError: false,
+    nodeOptions: { cwd: repoRoot },
+  });
+  return parseChangedFiles(stdout);
 }
 
 export async function collectGhostStoriesGrade(
diff --git a/scripts/eval/lib/publish-trial.test.ts b/scripts/eval/lib/publish-trial.test.ts
index a9400c7bf277..177fadeeefe3 100644
--- a/scripts/eval/lib/publish-trial.test.ts
+++ b/scripts/eval/lib/publish-trial.test.ts
@@ -173,12 +173,12 @@ describe('publishTrialBranch', () => {
             total: 8,
             passed: 4,
             storyFiles: 3,
+            cssCheck: 'not-run' as const,
           },
           buildSuccess: true,
           typeCheckErrors: 0,
           fileChanges: [],
           storybookChanges: [],
-          hasCssCheckStory: false,
           ghostStories: {
             candidateCount: 6,
             total: 4,
@@ -189,6 +189,7 @@ describe('publishTrialBranch', () => {
             total: 8,
             passed: 6,
             storyFiles: 3,
+            cssCheck: 'not-run' as const,
           },
         },
         score: {
@@ -342,7 +343,6 @@ describe('publishTrialBranch', () => {
             typeCheckErrors: 0,
             fileChanges: [],
             storybookChanges: [],
-            hasCssCheckStory: false,
           },
           score: {
             score: 1,
@@ -462,7 +462,6 @@ describe('publishTrialBranch', () => {
           typeCheckErrors: 0,
           fileChanges: [],
           storybookChanges: [],
-          hasCssCheckStory: false,
         },
         score: {
           score: 1,
diff --git a/scripts/eval/lib/result-docs.test.ts b/scripts/eval/lib/result-docs.test.ts
index 878a9d4ceb1a..f53b31a1536e 100644
--- a/scripts/eval/lib/result-docs.test.ts
+++ b/scripts/eval/lib/result-docs.test.ts
@@ -262,7 +262,6 @@ describe('normalizeTranscriptForDocs', () => {
         typeCheckErrors: 0,
         fileChanges: [],
         storybookChanges: [],
-        hasCssCheckStory: false,
       },
       score: {
         score: 1,
diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts
index 5932d80c1670..ef042b423172 100644
--- a/scripts/eval/lib/run-trial.test.ts
+++ b/scripts/eval/lib/run-trial.test.ts
@@ -352,7 +352,6 @@ describe('runTrial pipeline', () => {
           typeCheckErrors: 0,
           fileChanges: [],
           storybookChanges: [],
-          hasCssCheckStory: false,
         },
         score: {
           score: 0,
@@ -433,6 +432,7 @@ function setupMocks(overrides?: {
         total: 6,
         passed: 2,
         storyFiles: 3,
+        cssCheck: 'not-run' as const,
       },
       buildSuccess,
       typeCheckErrors,
@@ -444,13 +444,13 @@ function setupMocks(overrides?: {
         { path: '.storybook/preview.tsx', gitStatus: 'A' },
         { path: 'src/Button.stories.tsx', gitStatus: 'A' },
       ],
-      hasCssCheckStory: true,
       ...(buildSuccess
         ? {
             storyRender: {
               total: 6,
               passed: 4,
               storyFiles: 3,
+              cssCheck: 'pass' as const,
             },
           }
         : {}),
diff --git a/scripts/eval/lib/run-trial.ts b/scripts/eval/lib/run-trial.ts
index 2d60c09ac09c..6add6624f83f 100644
--- a/scripts/eval/lib/run-trial.ts
+++ b/scripts/eval/lib/run-trial.ts
@@ -125,7 +125,6 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
       typeCheckErrors: 0,
       fileChanges: [],
       storybookChanges: [],
-      hasCssCheckStory: false,
     },
     score: {
       score: 0,
diff --git a/scripts/eval/lib/story-render.ts b/scripts/eval/lib/story-render.ts
index f8c6ef3473a2..2530878da5ae 100644
--- a/scripts/eval/lib/story-render.ts
+++ b/scripts/eval/lib/story-render.ts
@@ -7,12 +7,13 @@ import type { FileChange } from './grade.ts';
 import { detectPackageManager, resolveInstallRoot } from './package-manager.ts';
 import type { Logger } from './utils.ts';
 
-export const STORY_FILE_PATTERN = /\.(stories|story)\.[tj]sx?$/;
+const STORY_FILE_PATTERN = /\.(stories|story)\.[tj]sx?$/;
 
 export interface StoryRenderGrade {
   total: number;
   passed: number;
   storyFiles: number;
+  cssCheck: 'pass' | 'fail' | 'not-run';
 }
 
 export interface StoryRenderRunResult {
@@ -82,6 +83,7 @@ export async function runStoryRenderPass(opts: {
         total: 0,
         passed: 0,
         storyFiles: 0,
+        cssCheck: 'not-run' as const,
       },
     };
   }
@@ -181,6 +183,7 @@ async function readStoryRenderSummary(reportPath: string, storyFiles: number) {
     total: parsed.total,
     passed: parsed.passed,
     storyFiles,
+    cssCheck: parsed.cssCheck,
   } satisfies StoryRenderGrade;
 }
 

From fd6a2b051b51194ecf4ce305f91abe5c48070d60 Mon Sep 17 00:00:00 2001
From: yannbf <yannbf@gmail.com>
Date: Wed, 29 Apr 2026 10:38:25 +0200
Subject: [PATCH 14/17] add csscheck to pr body

---
 scripts/eval/lib/publish-trial.test.ts | 1 +
 scripts/eval/lib/publish-trial.ts      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/scripts/eval/lib/publish-trial.test.ts b/scripts/eval/lib/publish-trial.test.ts
index 177fadeeefe3..b8eb583859aa 100644
--- a/scripts/eval/lib/publish-trial.test.ts
+++ b/scripts/eval/lib/publish-trial.test.ts
@@ -249,6 +249,7 @@ describe('publishTrialBranch', () => {
     expect(prBody).toContain('Ghost stories after: `3/4 (75%)`');
     expect(prBody).toContain('Vitest pass rate before preview changes: `4/8 (50%)`');
     expect(prBody).toContain('Vitest pass rate after preview changes: `6/8 (75%)`');
+    expect(prBody).toContain('CssCheck: `not-run`');
     expect(prBody).toContain('[.storybook/eval-results/data.json](');
     expect(prBody).toContain('<summary>Full prompt</summary>');
     expect(prBody.match(/<details>/g)).toHaveLength(1);
diff --git a/scripts/eval/lib/publish-trial.ts b/scripts/eval/lib/publish-trial.ts
index ea9104dc8b66..330e6189fb57 100644
--- a/scripts/eval/lib/publish-trial.ts
+++ b/scripts/eval/lib/publish-trial.ts
@@ -260,6 +260,7 @@ function renderPrBody(opts: { branch: string; data: EvalData }) {
     `- Ghost stories after: \`${postAgentGhostStories}\``,
     `- Vitest pass rate before preview changes: \`${baselinePreviewStories}\``,
     `- Vitest pass rate after preview changes: \`${postAgentStoryRender}\``,
+    `- CssCheck: \`${opts.data.grade.storyRender?.cssCheck ?? 'not-run'}\``,
     `- Duration: \`${formatDuration(opts.data.execution.duration)}\``,
     `- Cost: \`${formatCost(opts.data.execution.cost)}\``,
     `- Raw data: [${getEvalResultsRelativePath('data.json', opts.data.project.projectDir)}](${dataUrl})`,

From 8e7c3972c5aa7a30ce71512f47af17a7357434be Mon Sep 17 00:00:00 2001
From: Steve Dodier-Lazaro <Sidnioulz@users.noreply.github.com>
Date: Wed, 29 Apr 2026 14:36:03 +0200
Subject: [PATCH 15/17] Fix inconsistencies in eval README

Co-authored-by: Steve Dodier-Lazaro <Sidnioulz@users.noreply.github.com>
---
 scripts/eval/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/eval/README.md b/scripts/eval/README.md
index 18c8c413eb64..6d983a94282f 100644
--- a/scripts/eval/README.md
+++ b/scripts/eval/README.md
@@ -4,7 +4,7 @@ The eval harness benchmarks how well AI coding agents (Claude, Codex) can set up
 
 ## Prerequisites
 
-- `**gh` CLI\*\* — installed and authenticated (`gh auth login`)
+- **`gh` CLI** — installed and authenticated (`gh auth login`)
 - **Claude Code CLI** and/or **Codex CLI** — installed with an active subscription
 
 ## How it works
@@ -13,7 +13,7 @@ The eval harness benchmarks how well AI coding agents (Claude, Codex) can set up
 
 The system forms a cycle:
 
-1. `**sync-baselines.ts`\*\* pushes a canonical `.storybook` config to each benchmark repo so every trial starts from the same known-good baseline.
+1. `**sync-baselines.ts**` pushes a canonical `.storybook` config to each benchmark repo so every trial starts from the same known-good baseline.
 2. `**eval.ts**` (single trial) or `**run-batch.ts**` (batch) creates a git worktree from a benchmark repo, runs an agent inside it, grades the output, and publishes a draft PR with structured result data.
 3. `**collect-pr-data.ts**` scrapes those draft PRs via the GitHub API and loads the results into a local SQLite database for analysis.
 
@@ -275,7 +275,7 @@ The eval mirrors the real user flow exactly:
 3. The **agent** runs `npx storybook ai setup` itself as a tool call.
 4. The agent reads the resulting project-aware markdown and follows it.
 
-The harness hands step (1) to the trial agent as its task. It never spawns `ai setup` itself — that's the agent's job, just like with real users.
+The harness hands steps (1) and (2) to the trial agent as its task. Eval starts at step (3).
 
 ### How variant selection works
 

From af1479ab4ec1f9e29f098edd94d434722efb5383 Mon Sep 17 00:00:00 2001
From: yannbf <yannbf@gmail.com>
Date: Wed, 29 Apr 2026 17:36:00 +0200
Subject: [PATCH 16/17] Account for PR feedback

---
 code/lib/cli-storybook/src/ai/index.test.ts   |  1 +
 code/lib/cli-storybook/src/ai/index.ts        | 12 ++--
 code/lib/cli-storybook/src/ai/prompt.ts       |  8 +--
 .../lib/cli-storybook/src/ai/prompts/index.ts | 55 ---------------
 .../src/ai/setup-prompts/index.ts             | 68 +++++++++++++++++++
 .../pattern-copy-play.ts                      | 12 ++++
 .../ai/{prompts => setup-prompts}/setup.ts    | 12 ++++
 scripts/eval/eval.ts                          | 17 +++--
 scripts/eval/lib/run-trial.test.ts            |  6 +-
 scripts/eval/lib/run-trial.ts                 |  7 +-
 scripts/eval/lib/utils.ts                     |  2 +-
 11 files changed, 125 insertions(+), 75 deletions(-)
 delete mode 100644 code/lib/cli-storybook/src/ai/prompts/index.ts
 create mode 100644 code/lib/cli-storybook/src/ai/setup-prompts/index.ts
 rename code/lib/cli-storybook/src/ai/{prompts => setup-prompts}/pattern-copy-play.ts (97%)
 rename code/lib/cli-storybook/src/ai/{prompts => setup-prompts}/setup.ts (94%)

diff --git a/code/lib/cli-storybook/src/ai/index.test.ts b/code/lib/cli-storybook/src/ai/index.test.ts
index bafa33bd31b3..21dd70b2898a 100644
--- a/code/lib/cli-storybook/src/ai/index.test.ts
+++ b/code/lib/cli-storybook/src/ai/index.test.ts
@@ -16,6 +16,7 @@ vi.mock('storybook/internal/telemetry', () => ({
   snapshotPreviewFile: vi
     .fn()
     .mockResolvedValue({ previewPath: '/proj/.storybook/preview.ts', previewHash: 'abc' }),
+  isTelemetryModuleEnabled: vi.fn(() => true),
 }));
 
 vi.mock('storybook/internal/node-logger', () => ({
diff --git a/code/lib/cli-storybook/src/ai/index.ts b/code/lib/cli-storybook/src/ai/index.ts
index 1e8e765f7852..ad08be694cba 100644
--- a/code/lib/cli-storybook/src/ai/index.ts
+++ b/code/lib/cli-storybook/src/ai/index.ts
@@ -6,6 +6,7 @@ import { cache } from 'storybook/internal/common';
 import { logger } from 'storybook/internal/node-logger';
 import {
   getSessionId,
+  isTelemetryModuleEnabled,
   snapshotPreviewFile,
   telemetry,
   type AiSetupPendingRecord,
@@ -19,12 +20,7 @@ import { generateMarkdownOutput } from './prompt.ts';
 import type { ProjectInfo, AiSetupOptions } from './types.ts';
 
 export async function aiSetup(options: AiSetupOptions): Promise<void> {
-  const {
-    configDir: userConfigDir,
-    packageManager: packageManagerName,
-    output,
-    disableTelemetry,
-  } = options;
+  const { configDir: userConfigDir, packageManager: packageManagerName, output } = options;
 
   let projectInfo: ProjectInfo;
 
@@ -84,7 +80,7 @@ export async function aiSetup(options: AiSetupOptions): Promise<void> {
     return;
   }
 
-  const result = generateMarkdownOutput(projectInfo);
+  const result = await generateMarkdownOutput(projectInfo);
   const markdownOutput = result.markdown;
 
   await telemetry('ai-setup', {
@@ -107,7 +103,7 @@ export async function aiSetup(options: AiSetupOptions): Promise<void> {
   // collect evidence of what the agent accomplished — but only via telemetry
   // (the `ai-setup-evidence` event). Skip the snapshot + cache write when
   // telemetry is disabled so there's nobody to read it.
-  if (!disableTelemetry) {
+  if (isTelemetryModuleEnabled()) {
     const resolvedConfigDir = resolve(projectInfo.configDir);
     const previewSnapshot = await snapshotPreviewFile(resolvedConfigDir);
     const sessionId = await getSessionId();
diff --git a/code/lib/cli-storybook/src/ai/prompt.ts b/code/lib/cli-storybook/src/ai/prompt.ts
index 86c3258554af..55d9a5a2b52b 100644
--- a/code/lib/cli-storybook/src/ai/prompt.ts
+++ b/code/lib/cli-storybook/src/ai/prompt.ts
@@ -1,7 +1,7 @@
 import { dedent } from 'ts-dedent';
 
 import type { ProjectInfo } from './types.ts';
-import { getPrompts } from './prompts/index.ts';
+import { getPrompts } from './setup-prompts/index.ts';
 
 function getProjectOverview(projectInfo: ProjectInfo): string {
   return dedent`
@@ -19,10 +19,10 @@ function getProjectOverview(projectInfo: ProjectInfo): string {
   `;
 }
 
-export function generateMarkdownOutput(projectInfo: ProjectInfo): {
+export async function generateMarkdownOutput(projectInfo: ProjectInfo): Promise<{
   markdown: string;
-} {
-  const { prompts: aiPrompts } = getPrompts(projectInfo);
+}> {
+  const { prompts: aiPrompts } = await getPrompts(projectInfo);
 
   const sections: string[] = [];
 
diff --git a/code/lib/cli-storybook/src/ai/prompts/index.ts b/code/lib/cli-storybook/src/ai/prompts/index.ts
deleted file mode 100644
index 641e3bfe6f33..000000000000
--- a/code/lib/cli-storybook/src/ai/prompts/index.ts
+++ /dev/null
@@ -1,55 +0,0 @@
-import type { AiPrompt, ProjectInfo } from '../types.ts';
-
-import * as patternCopyPlay from './pattern-copy-play.ts';
-import * as setup from './setup.ts';
-
-/**
- * Registry of all prompt builders. Each key is a prompt identifier used only
- * internally (by the eval harness via `EVAL_SETUP_PROMPT`); users never see
- * these names. Each variant file exports an `instructions(projectInfo)`
- * function; namespace imports keep the registry self-describing and make the
- * convention uniform.
- */
-const PROMPT_BUILDERS = {
-  'pattern-copy-play': patternCopyPlay.instructions,
-  setup: setup.instructions,
-} satisfies Record<string, (projectInfo: ProjectInfo) => string>;
-
-export type PromptName = keyof typeof PROMPT_BUILDERS;
-
-export const PROMPT_NAMES = Object.keys(PROMPT_BUILDERS) as PromptName[];
-
-/**
- * The single prompt variant that ships to real users. Running
- * `npx storybook ai setup` without any overrides always produces this prompt.
- */
-export const DEFAULT_PROMPT_NAME: PromptName = 'pattern-copy-play';
-
-/**
- * Internal env var read only by `getPrompts`. The eval harness sets this
- * before spawning `ai setup` to select a non-default prompt variant for A/B
- * comparison. Unknown values fall back to the default so a typo never breaks
- * the CLI for real users.
- */
-const EVAL_SETUP_PROMPT_ENV = 'EVAL_SETUP_PROMPT';
-
-function resolvePromptName(): PromptName {
-  const requested = process.env[EVAL_SETUP_PROMPT_ENV]?.trim();
-  if (requested && Object.hasOwn(PROMPT_BUILDERS, requested)) {
-    return requested as PromptName;
-  }
-  return DEFAULT_PROMPT_NAME;
-}
-
-export function getPrompts(projectInfo: ProjectInfo): { prompts: AiPrompt[] } {
-  const name = resolvePromptName();
-  return {
-    prompts: [
-      {
-        name,
-        description: 'Set up Storybook for success',
-        instructions: PROMPT_BUILDERS[name](projectInfo),
-      },
-    ],
-  };
-}
diff --git a/code/lib/cli-storybook/src/ai/setup-prompts/index.ts b/code/lib/cli-storybook/src/ai/setup-prompts/index.ts
new file mode 100644
index 000000000000..ab8d1a7c3905
--- /dev/null
+++ b/code/lib/cli-storybook/src/ai/setup-prompts/index.ts
@@ -0,0 +1,68 @@
+import type { AiPrompt, ProjectInfo } from '../types.ts';
+
+import * as patternCopyPlay from './pattern-copy-play.ts';
+
+/**
+ * The prompt currently used by the `npx storybook ai setup` command. When promoting a new prompt to be the default, move the current entry into the FORMERLY_USED_PROMPTS object below.
+ */
+const CURRENTLY_USED_PROMPT: Record<string, (projectInfo: ProjectInfo) => string> = {
+  'pattern-copy-play': patternCopyPlay.instructions,
+};
+
+/**
+ * Lazy loaders for the non-default variants selectable via `EVAL_SETUP_PROMPT`,
+ * keyed by prompt name. Each is imported on demand from a sibling file so the
+ * bundler can code-split it away from the default-only path that real users hit.
+ */
+const FORMERLY_USED_PROMPTS: Record<string, () => Promise<(projectInfo: ProjectInfo) => string>> = {
+  setup: async () => (await import('./setup.ts')).instructions,
+};
+
+export type PromptName = string;
+
+/** Names available to the eval harness — the current default plus the formerly used variants. */
+export const PROMPT_NAMES: PromptName[] = [
+  ...Object.keys(CURRENTLY_USED_PROMPT),
+  ...Object.keys(FORMERLY_USED_PROMPTS),
+];
+
+/**
+ * The single prompt variant that ships to real users. Running
+ * `npx storybook ai setup` without any overrides always produces this prompt.
+ */
+export const DEFAULT_PROMPT_NAME: PromptName = 'pattern-copy-play';
+
+/**
+ * Internal env var read only by `getPrompts`. The eval harness sets this
+ * before spawning `ai setup` to select a non-default prompt variant for A/B
+ * comparison. Unknown values fall back to the default so a typo never breaks
+ * the CLI for real users.
+ */
+const EVAL_SETUP_PROMPT_ENV = 'EVAL_SETUP_PROMPT';
+
+function resolvePromptName(): PromptName {
+  const requested = process.env[EVAL_SETUP_PROMPT_ENV]?.trim();
+  if (
+    requested &&
+    (Object.hasOwn(CURRENTLY_USED_PROMPT, requested) ||
+      Object.hasOwn(FORMERLY_USED_PROMPTS, requested))
+  ) {
+    return requested;
+  }
+  return DEFAULT_PROMPT_NAME;
+}
+
+export async function getPrompts(projectInfo: ProjectInfo): Promise<{ prompts: AiPrompt[] }> {
+  const name = resolvePromptName();
+  const builder = CURRENTLY_USED_PROMPT[name] ?? (await FORMERLY_USED_PROMPTS[name]());
+
+  return {
+    prompts: [
+      {
+        name,
+        description: 'Set up Storybook for success',
+        instructions: builder(projectInfo),
+      },
+    ],
+  };
+}
diff --git a/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts b/code/lib/cli-storybook/src/ai/setup-prompts/pattern-copy-play.ts
similarity index 97%
rename from code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
rename to code/lib/cli-storybook/src/ai/setup-prompts/pattern-copy-play.ts
index 573e1fbc55db..a8abe0493f15 100644
--- a/code/lib/cli-storybook/src/ai/prompts/pattern-copy-play.ts
+++ b/code/lib/cli-storybook/src/ai/setup-prompts/pattern-copy-play.ts
@@ -1,3 +1,15 @@
+/**
+ * Prompt variant: `pattern-copy-play` (current default for `npx storybook ai setup`)
+ *
+ * - Created: 2026-04-22 (eval iteration 2, default since this PR)
+ * - Status: shipping default — produced by every `ai setup` invocation
+ *   without `EVAL_SETUP_PROMPT` set.
+ * - Reference eval results:
+ *   https://github.com/search?q=is:pr label:"prompt:pattern-copy-play" org:storybook-tmp&type=pullrequests
+ *
+ * Update this header when iterating: bump the iteration number and link the
+ * latest eval run so reviewers can compare variants without spelunking git.
+ */
 import { dedent } from 'ts-dedent';
 
 import type { ProjectInfo } from '../types.ts';
diff --git a/code/lib/cli-storybook/src/ai/prompts/setup.ts b/code/lib/cli-storybook/src/ai/setup-prompts/setup.ts
similarity index 94%
rename from code/lib/cli-storybook/src/ai/prompts/setup.ts
rename to code/lib/cli-storybook/src/ai/setup-prompts/setup.ts
index e9cdb3ff710d..916028087d38 100644
--- a/code/lib/cli-storybook/src/ai/prompts/setup.ts
+++ b/code/lib/cli-storybook/src/ai/setup-prompts/setup.ts
@@ -1,3 +1,15 @@
+/**
+ * Prompt variant: `setup`
+ *
+ * - Created: 2026-04-15 (eval iteration 1, baseline before `pattern-copy-play`)
+ * - Status: experimental — not the default. Selected only when the eval
+ *   harness sets `EVAL_SETUP_PROMPT=setup`.
+ * - Reference eval results:
+ *   https://github.com/search?q=is:pr label:"prompt:setup" org:storybook-tmp&type=pullrequests
+ *
+ * Update this header when iterating: bump the iteration number and link the
+ * latest eval run so reviewers can compare variants without spelunking git.
+ */
 import { dedent } from 'ts-dedent';
 
 import type { ProjectInfo } from '../types.ts';
diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts
index b66d39148c27..3e5d3b27fcff 100644
--- a/scripts/eval/eval.ts
+++ b/scripts/eval/eval.ts
@@ -31,7 +31,7 @@ import {
 } from './lib/agents/config.ts';
 import { prepareTrial } from './lib/prepare-trial.ts';
 import { PROJECTS } from './lib/projects.ts';
-import { runTrial, type TrialConfig } from './lib/run-trial.ts';
+import { captureAiSetupMarkdown, runTrial, type TrialConfig } from './lib/run-trial.ts';
 import {
   captureEnvironment,
   createLogger,
@@ -192,12 +192,21 @@ if (args.manual) {
   const promptPath = join(workspace.resultsDir, 'prompt.md');
   await writeFile(promptPath, prompt);
 
+  const setupPromptPath = join(workspace.resultsDir, 'setup-prompt.md');
+  const setupPromptContent = await captureAiSetupMarkdown(
+    workspace.projectPath,
+    promptName,
+    logger
+  );
+  await writeFile(setupPromptPath, setupPromptContent);
+
   const cliCommand = buildManualCommand(variant, promptPath, promptName);
 
   logger.log(pc.bold('\n── Manual mode ──'));
-  logger.log(`\n  Trial dir:    ${pc.cyan(workspace.trialDir)}`);
-  logger.log(`  Project dir:  ${pc.cyan(workspace.projectPath)}`);
-  logger.log(`  Prompt file:  ${pc.cyan(promptPath)}`);
+  logger.log(`\n  Trial dir:        ${pc.cyan(workspace.trialDir)}`);
+  logger.log(`  Project dir:      ${pc.cyan(workspace.projectPath)}`);
+  logger.log(`  Prompt file:      ${pc.cyan(promptPath)}`);
+  logger.log(`  Setup prompt:     ${pc.cyan(setupPromptPath)}`);
   logger.log(pc.bold('\nRun the agent yourself:\n'));
   logger.log(`  ${pc.green('cd')} ${workspace.projectPath}`);
   logger.log(`  ${pc.green(cliCommand)}\n`);
diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts
index ef042b423172..957492069b2d 100644
--- a/scripts/eval/lib/run-trial.test.ts
+++ b/scripts/eval/lib/run-trial.test.ts
@@ -217,7 +217,7 @@ describe('runTrial pipeline', () => {
     });
   });
 
-  it('writes data.json and prompt.md to results dir', async () => {
+  it('writes data.json, prompt.md, and setup-prompt.md to results dir', async () => {
     setupMocks();
 
     await runTrial(baseConfig);
@@ -260,6 +260,10 @@ describe('runTrial pipeline', () => {
 
     const promptContent = readFileSync(join(resultsDir, 'prompt.md'), 'utf-8');
     expect(promptContent).toContain('npx storybook ai setup');
+
+    const setupPromptContent = readFileSync(join(resultsDir, 'setup-prompt.md'), 'utf-8');
+    expect(setupPromptContent).toContain('Full project-aware instructions');
+
     expect(() => readFileSync(join(resultsDir, 'summary.json'), 'utf-8')).toThrow();
     expect(() => readFileSync(join(resultsDir, 'transcript.json'), 'utf-8')).toThrow();
   });
diff --git a/scripts/eval/lib/run-trial.ts b/scripts/eval/lib/run-trial.ts
index 6add6624f83f..03152c03a1c2 100644
--- a/scripts/eval/lib/run-trial.ts
+++ b/scripts/eval/lib/run-trial.ts
@@ -74,8 +74,11 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
   // 5. Capture the full markdown the agent will receive from `ai setup` so
   //    the trial record contains a reproducible, project-aware snapshot of
   //    the instructions (not just the one-line nudge). Runs the same CLI the
-  //    agent will run, in the same workspace, with the same env.
+  //    agent will run, in the same workspace, with the same env. Persisted as
+  //    a separate file so the resulting PR diff shows the exact instructions
+  //    the agent was given for this trial.
   const promptContent = await captureAiSetupMarkdown(workspace.projectPath, promptName, log);
+  await writeFile(join(workspace.resultsDir, 'setup-prompt.md'), promptContent);
 
   // 6. Execute the agent. EVAL_SETUP_PROMPT is forwarded into the agent's
   //    environment so its `ai setup` tool call resolves to the selected
@@ -195,7 +198,7 @@ export async function runTrial(config: TrialConfig, logger?: Logger): Promise<Ru
  * capturing the prompt content is bookkeeping, not the thing being measured,
  * so it must never abort the trial.
  */
-async function captureAiSetupMarkdown(
+export async function captureAiSetupMarkdown(
   projectPath: string,
   promptName: string,
   log: Logger
diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts
index db4b961c7feb..ab8c16a79d6e 100644
--- a/scripts/eval/lib/utils.ts
+++ b/scripts/eval/lib/utils.ts
@@ -6,7 +6,7 @@ import { AI_SETUP_PROMPT } from '../../../code/core/src/shared/constants/ai-prom
 import {
   DEFAULT_PROMPT_NAME,
   PROMPT_NAMES,
-} from '../../../code/lib/cli-storybook/src/ai/prompts/index.ts';
+} from '../../../code/lib/cli-storybook/src/ai/setup-prompts/index.ts';
 
 export interface Logger {
   log: (msg: string) => void;

From b35ff84fac7deb59dee1ba5a38f0b9303ecd007a Mon Sep 17 00:00:00 2001
From: yannbf <yannbf@gmail.com>
Date: Wed, 29 Apr 2026 20:14:38 +0200
Subject: [PATCH 17/17] fix test

---
 code/lib/cli-storybook/src/ai/index.test.ts | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/code/lib/cli-storybook/src/ai/index.test.ts b/code/lib/cli-storybook/src/ai/index.test.ts
index 21dd70b2898a..d483b5833cef 100644
--- a/code/lib/cli-storybook/src/ai/index.test.ts
+++ b/code/lib/cli-storybook/src/ai/index.test.ts
@@ -47,7 +47,11 @@ vi.mock('../automigrate/helpers/mainConfigFile.ts', () => ({
 }));
 
 import { cache } from 'storybook/internal/common';
-import { snapshotPreviewFile, telemetry } from 'storybook/internal/telemetry';
+import {
+  isTelemetryModuleEnabled,
+  snapshotPreviewFile,
+  telemetry,
+} from 'storybook/internal/telemetry';
 
 import { aiSetup } from './index.ts';
 
@@ -75,6 +79,8 @@ describe('aiSetup telemetry gating', () => {
   });
 
   it('skips snapshot + cache write when telemetry is disabled', async () => {
+    vi.mocked(isTelemetryModuleEnabled).mockReturnValueOnce(false);
+
     await aiSetup({ configDir: '/proj/.storybook', disableTelemetry: true });
 
     expect(vi.mocked(snapshotPreviewFile)).not.toHaveBeenCalled();