storybookjs · kasperpeulen · Mar 31, 2026 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026
diff --git a/.agents/skills/review-pr/SKILL.md b/.agents/skills/review-pr/SKILL.md
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -32,7 +32,7 @@ jobs:
   generate-and-run-config:
     executor: 
       name: node/default
-      resource_class: small
+      resource_class: large
     steps:
       - node/install:
           install-yarn: true

diff --git a/.gitignore b/.gitignore
@@ -79,4 +79,11 @@ CLAUDE.local.md
 .cursor/mcp.json
 .vscode/mcp.json
 .mcp.json
-.nx/polygraph
+.nx/polygraph
+
+# Eval system
+scripts/eval/.cache
+scripts/eval/results
+
+# review-pr skill output
+.pr-review
diff --git a/AGENTS.md b/AGENTS.md
@@ -9,10 +9,11 @@ This file is the canonical instruction source for coding agents. Files like `CLA
 Storybook is a large TypeScript monorepo. The git root is the repo root, the main code lives in `code/`, and build tooling lives in `scripts/`. The default branch is `next`.
 
 - **Base branch**: `next` (all PRs should target `next`, not `main`)
-- **Node.js**: `22.21.1` (see `.nvmrc`)
+- **Node.js**: `22.22.1` (see `.nvmrc`) — supports `.ts` natively via type stripping (no loader needed)
 - **Package Manager**: Yarn Berry
 - **Task orchestration**: NX plus the custom `yarn task` runner
 - **CI environment**: Linux and Windows
+- **TS execution**: Migrating from `jiti` to native `node` for running `.ts` files. New scripts should use `node ./path/file.ts` with explicit `.ts` import extensions (enabled by `allowImportingTsExtensions` in tsconfig). Legacy scripts still use `jiti` but should be migrated over time.
 
 ## Repository Structure
 
@@ -234,7 +235,7 @@ When writing tests:
 
 After changing files:
 
-1. Format with `cd code && oxfmt`
+1. Format with `yarn fmt:write` (run from the repo root)
 2. Lint with `yarn --cwd code lint:js:cmd <file-relative-to-code-folder> --fix` or `cd code && yarn lint:js:cmd <file-relative-to-code-folder>`
 3. Run relevant tests before submitting a PR
 

diff --git a/code/core/src/core-server/index.ts b/code/core/src/core-server/index.ts
@@ -32,3 +32,6 @@ export {
 } from './stores/test-provider';
 
 export { getServerPort } from './utils/server-address';
+
+export { getComponentCandidates } from './utils/ghost-stories/get-candidates';
+export { runGhostStories } from './utils/ghost-stories/run-story-tests';
diff --git a/code/core/src/core-server/server-channel/ghost-stories-channel.ts b/code/core/src/core-server/server-channel/ghost-stories-channel.ts
@@ -9,7 +9,7 @@ import {
 import type { CoreConfig, Options } from 'storybook/internal/types';
 
 import { getComponentCandidates } from '../utils/ghost-stories/get-candidates';
-import { runStoryTests } from '../utils/ghost-stories/run-story-tests';
+import { runGhostStories } from '../utils/ghost-stories/run-story-tests';
 
 export function initGhostStoriesChannel(
   channel: Channel,
@@ -91,7 +91,7 @@ export function initGhostStoriesChannel(
 
       // Phase 2: Run tests on those candidates Vitest. The components will be transformed directly to tests
       // If they pass, it means that creating a story file for them would succeed.
-      const testRunResult = await runStoryTests(candidatesResult.candidates);
+      const testRunResult = await runGhostStories(candidatesResult.candidates);
       stats.totalRunDuration = Date.now() - ghostRunStart;
       stats.testRunDuration = testRunResult.duration;
       if (testRunResult.runError) {

diff --git a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts
@@ -1,12 +1,11 @@
 import { readFile } from 'node:fs/promises';
 
 import { babelParse, traverse } from 'storybook/internal/babel';
-import { logger } from 'storybook/internal/node-logger';
 
 // eslint-disable-next-line depend/ban-dependencies
 import { glob } from 'glob';
 
-import { getComponentComplexity } from './component-analyzer';
+import { getComponentComplexity } from './component-analyzer.ts';
 
 // A valid candidate includes React code and at least one export
 function isValidCandidate(source: string): boolean {
@@ -128,9 +127,12 @@ export async function getCandidatesForStorybook(
 export async function getComponentCandidates({
   sampleSize = 20,
   globPattern = '**/*.{tsx,jsx}',
+  cwd = process.cwd(),
 }: {
   sampleSize?: number;
   globPattern?: string;
+  /** Working directory for glob. Defaults to process.cwd(). */
+  cwd?: string;
 } = {}): Promise<{
   candidates: string[];
   error?: string;
@@ -145,7 +147,7 @@ export async function getComponentCandidates({
 
     // Find files matching the glob pattern
     files = await glob(globPattern, {
-      cwd: process.cwd(),
+      cwd,
       absolute: true,
       ignore: [
         '**/node_modules/**',

diff --git a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts
@@ -1,6 +1,10 @@
-import type { ErrorCategory } from '../../../shared/utils/categorize-render-errors';
-import { categorizeError } from '../../../shared/utils/categorize-render-errors';
-import { type ErrorCategorizationResult, type StoryTestResult, type TestRunSummary } from './types';
+import type { ErrorCategory } from '../../../shared/utils/categorize-render-errors.ts';
+import { categorizeError } from '../../../shared/utils/categorize-render-errors.ts';
+import {
+  type ErrorCategorizationResult,
+  type StoryTestResult,
+  type TestRunSummary,
+} from './types.ts';
 
 /**
  * For a given list of test results:

diff --git a/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts b/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts
@@ -5,10 +5,21 @@ import { executeCommand, resolvePathInStorybookCache } from 'storybook/internal/
 
 import { join } from 'pathe';
 
-import { parseVitestResults } from './parse-vitest-report';
-import type { TestRunSummary } from './types';
+import { parseVitestResults } from './parse-vitest-report.ts';
+import type { TestRunSummary } from './types.ts';
 
-export async function runStoryTests(componentFilePaths: string[]): Promise<TestRunSummary> {
+/**
+ * Run ghost stories: execute vitest on component file paths to auto-generate
+ * and test stories that don't exist on disk.
+ *
+ * @param componentFilePaths - Absolute paths to component files to test.
+ * @param options.cwd - Working directory for vitest. Defaults to process.cwd().
+ */
+export async function runGhostStories(
+  componentFilePaths: string[],
+  options?: { cwd?: string }
+): Promise<TestRunSummary> {
+  const cwd = options?.cwd;
   try {
     // Create the cache directory for story discovery tests
     const cacheDir = resolvePathInStorybookCache('ghost-stories-tests');
@@ -34,6 +45,7 @@ export async function runStoryTests(componentFilePaths: string[]): Promise<TestR
           `--outputFile=${outputFile}`,
           ...componentFilePaths,
         ],
+        cwd,
         stdio: 'pipe',
         env: {
           STORYBOOK_COMPONENT_PATHS: componentFilePaths.join(';'),

diff --git a/code/core/src/shared/utils/categorize-render-errors.ts b/code/core/src/shared/utils/categorize-render-errors.ts
@@ -3,7 +3,7 @@ import {
   isRouterPackage,
   isStateManagementPackage,
   isStylingPackage,
-} from './ecosystem-identifier';
+} from './ecosystem-identifier.ts';
 
 export const ERROR_CATEGORIES = {
   MISSING_PROVIDER: 'MISSING_PROVIDER',

diff --git a/code/tsconfig.json b/code/tsconfig.json
@@ -13,6 +13,8 @@
     "lib": ["dom", "dom.iterable", "esnext"],
     "module": "Preserve",
     "moduleResolution": "bundler",
+    // Required for explicit .ts import extensions — migrating toward native Node TS execution
+    "allowImportingTsExtensions": true,
     "noImplicitAny": true,
     "noUnusedLocals": false,
     "skipLibCheck": true,

diff --git a/scripts/ci/common-jobs.ts b/scripts/ci/common-jobs.ts
@@ -67,7 +67,7 @@ export const build_linux = defineJob('Build (linux)', (workflowName) => ({
 export const fmt = defineJob('Format check', () => ({
   executor: {
     name: 'sb_node_22_classic',
-    class: 'medium+',
+    class: 'xlarge',
   },
   steps: [
     git.checkout(),

diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts
@@ -0,0 +1,201 @@
+/**
+ * Eval harness entry point.
+ *
+ * Runs with `node eval/eval.ts` from `scripts/` (no jiti). Node 22.18+ runs .ts natively
+ * via type stripping. Import specifiers use explicit .ts extensions.
+ *
+ * Usage:
+ *   node eval/eval.ts -p mealdrop                       # claude defaults
+ *   node eval/eval.ts -p mealdrop -a codex             # codex defaults
+ *   node eval/eval.ts -p mealdrop -m gpt-5.4           # codex (inferred)
+ *   node eval/eval.ts -p mealdrop -a claude -e max     # claude with max effort
+ *   node eval/eval.ts -p mealdrop --manual             # prepare only, print instructions
+ *   node eval/eval.ts --list-projects
+ *   node eval/eval.ts --list-models
+ *   node eval/eval.ts --list-prompts
+ */
+import { writeFile } from 'node:fs/promises';
+import { join } from 'node:path';
+import { parseArgs } from 'node:util';
+import { z } from 'zod';
+import pc from 'picocolors';
+import {
+  AGENT_IDS,
+  AGENTS,
+  CLAUDE_EFFORTS,
+  CLAUDE_MODELS,
+  CODEX_EFFORTS,
+  CODEX_MODELS,
+  type AgentId,
+  type AgentVariant,
+} from './lib/agents/config.ts';
+import { prepareTrial } from './lib/prepare-trial.ts';
+import { PROJECTS } from './lib/projects.ts';
+import { runTrial, type TrialConfig } from './lib/run-trial.ts';
+import {
+  captureEnvironment,
+  createLogger,
+  formatCost,
+  formatDuration,
+  generateTrialId,
+  listPrompts,
+  loadPrompt,
+} from './lib/utils.ts';
+
+const PROJECT_NAMES = PROJECTS.map((p) => p.name) as [string, ...string[]];
+// Fields shared by both agent variants; spread into each branch of `argsSchema`.
+const base = {
+  project: z.enum(PROJECT_NAMES).optional(),
+  prompt: z.string().default('setup'),
+  verbose: z.boolean().default(false),
+  manual: z.boolean().default(false),
+  listProjects: z.boolean().default(false),
+  listModels: z.boolean().default(false),
+  listPrompts: z.boolean().default(false),
+};
+// Discriminated on `agent`, so `model` and `effort` are validated against that agent's own lists.
+const argsSchema = z.discriminatedUnion('agent', [
+  z.object({
+    ...base,
+    agent: z.literal('claude'),
+    model: z.enum(CLAUDE_MODELS).default(AGENTS.claude.defaultModel),
+    effort: z.enum(CLAUDE_EFFORTS).default(AGENTS.claude.defaultEffort),
+  }),
+  z.object({
+    ...base,
+    agent: z.literal('codex'),
+    model: z.enum(CODEX_MODELS).default(AGENTS.codex.defaultModel),
+    effort: z.enum(CODEX_EFFORTS).default(AGENTS.codex.defaultEffort),
+  }),
+]);
+// Raw CLI flags. Per the node:util docs, `strict: true` makes parseArgs throw on unknown options.
+const { values } = parseArgs({
+  options: {
+    project: { type: 'string', short: 'p' },
+    agent: { type: 'string', short: 'a' },
+    model: { type: 'string', short: 'm' },
+    effort: { type: 'string', short: 'e' },
+    prompt: { type: 'string' },
+    verbose: { type: 'boolean', short: 'v' },
+    manual: { type: 'boolean' },
+    'list-projects': { type: 'boolean' },
+    'list-models': { type: 'boolean' },
+    'list-prompts': { type: 'boolean' },
+  },
+  args: process.argv.slice(2),
+  strict: true,
+});
+// `values.agent` is an unchecked string here; inferAgent throws for a model no agent lists.
+// Resolve the discriminator: explicit --agent, inferred from --model, or default to claude.
+const agent = values.agent ?? (values.model ? inferAgent(values.model) : 'claude');
+// The list flags arrive under kebab-case keys; re-key them to the camelCase names the schema uses.
+const parsed = argsSchema.safeParse({
+  ...values,
+  agent,
+  listProjects: values['list-projects'],
+  listModels: values['list-models'],
+  listPrompts: values['list-prompts'],
+});
+// On validation failure, print each issue as `path: message` to stderr and exit with status 1.
+if (!parsed.success) {
+  for (const issue of parsed.error.issues) {
+    console.error(pc.red(`  ${issue.path.join('.')}: ${issue.message}`));
+  }
+  process.exit(1);
+}
+
+const args = parsed.data;
+const logger = createLogger();
+// Each --list-* flag prints its listing and exits 0 before the project check below is reached.
+if (args.listProjects) {
+  for (const project of PROJECTS) {
+    logger.log(`  ${pc.bold(project.name)} — ${project.description}`);
+  }
+  process.exit(0);
+}
+if (args.listModels) {
+  for (const [name, { models }] of Object.entries(AGENTS)) {
+    logger.log(`\n  ${pc.bold(name)}`);
+    for (const model of models) logger.log(`    ${model}`);
+  }
+  process.exit(0);
+}
+if (args.listPrompts) {
+  for (const name of listPrompts()) logger.log(`  ${pc.bold(name)}`);
+  process.exit(0);
+}
+// A project is required from here on. It already passed z.enum(PROJECT_NAMES), hence the `!` on find.
+if (!args.project) {
+  logger.log(pc.red(`Specify a project with -p. Available: ${PROJECT_NAMES.join(', ')}`));
+  process.exit(1);
+}
+const project = PROJECTS.find((p) => p.name === args.project)!;
+const variant = toVariant(args);
+// Banner showing the resolved project, agent, model, effort and prompt.
+logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`));
+logger.log(
+  `Agent: ${variant.agent} | Model: ${variant.model} | Effort: ${variant.effort} | Prompt: ${args.prompt}\n`
+);
+// --manual: prepare the workspace and print instructions only. Otherwise: run the trial and print its result.
+if (args.manual) {
+  const trialId = generateTrialId(project.name, variant.agent, variant.model, args.prompt);
+  const workspace = await prepareTrial(project, trialId, logger);
+  await captureEnvironment(workspace.resultsDir);
+  // Write the prompt text into the results dir so the printed command can `cat` it.
+  const prompt = loadPrompt(args.prompt);
+  const promptPath = join(workspace.resultsDir, 'prompt.md');
+  await writeFile(promptPath, prompt);
+
+  const cliCommand = buildManualCommand(variant, promptPath);
+  // Show the prepared locations and the commands for running the agent by hand.
+  logger.log(pc.bold('\n── Manual mode ──'));
+  logger.log(`\n  Trial dir:    ${pc.cyan(workspace.trialDir)}`);
+  logger.log(`  Project dir:  ${pc.cyan(workspace.projectPath)}`);
+  logger.log(`  Prompt file:  ${pc.cyan(promptPath)}`);
+  logger.log(pc.bold('\nRun the agent yourself:\n'));
+  logger.log(`  ${pc.green('cd')} ${workspace.projectPath}`);
+  logger.log(`  ${pc.green(cliCommand)}\n`);
+} else {
+  const result = await runTrial(
+    { project, variant, prompt: args.prompt, verbose: args.verbose } satisfies TrialConfig,
+    logger
+  );
+  // `grade.ghostStories` may be missing; print '-' instead of a pass ratio in that case.
+  const ghost = result.grade.ghostStories;
+  const ghostStr = ghost
+    ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)`
+    : '-';
+
+  logger.log(pc.bold('\nResult'));
+  logger.log(`  Build:   ${result.grade.buildSuccess ? pc.green('PASS') : pc.red('FAIL')}`);
+  logger.log(`  Ghost:   ${ghostStr}`);
+  logger.log(`  TS Err:  ${result.grade.typeCheckErrors}`);
+  logger.log(`  Score:   ${result.score.score}`);
+  logger.log(`  Cost:    ${formatCost(result.execution.cost)}`);
+  logger.log(`  Time:    ${formatDuration(result.execution.duration)}`);
+  logger.log(`  Turns:   ${result.execution.turns}`);
+
+  logger.log('\nDone.');
+}
+
+function inferAgent(model: string): AgentId {
+  // Agents are checked in AGENT_IDS order; the first one listing `model` is the match.
+  const match = [...AGENT_IDS].find((id) => AGENTS[id].models.some((known) => known === model));
+  if (match === undefined) throw new Error(`No agent found for model: ${model}`);
+  return match;
+}
+
+function buildManualCommand(variant: AgentVariant, promptPath: string): string {
+  // Command printed in manual mode. The path is single-quoted (embedded ' escaped) so spaces or `$` survive.
+  const promptArg = `"$(cat '${promptPath.replace(/'/g, `'\\''`)}')"`;
+  if (variant.agent === 'claude') {
+    return `claude --model ${AGENTS.claude.sdkModelIds[variant.model] ?? variant.model} ${promptArg}`;
+  }
+  return `codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`;
+}
+
+function toVariant(args: z.infer<typeof argsSchema>): AgentVariant {
+  // Branch on the discriminant so `model`/`effort` narrow to the matching agent's types.
+  if (args.agent === 'codex') return { agent: 'codex', model: args.model, effort: args.effort };
+  return { agent: 'claude', model: args.model, effort: args.effort };
+}