From 1303e343b48b43ebafa3b9908abb6a93c7ae72f8 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 18:11:30 +0700 Subject: [PATCH 01/63] Add LLM eval system for Storybook agentic setup (M0) Eval system to test how well AI agents complete Storybook setup after `npx storybook@latest init --yes` on real-world projects. Features: - Multi-LLM support: Claude Code (Opus/Sonnet/Haiku), GitHub Copilot CLI (Claude models + GPT-5.2-codex, GPT-5.2, GPT-5.1-codex-max) - 6 test projects covering different tech stacks: styled-components/Redux, Tailwind/HeadlessUI, Zustand, ECharts, GraphQL - Structured JSON output with execution metrics (cost, duration, turns) and grading (build success, TypeScript errors, quality score) - CLI with project/model/agent selection, iterations, custom prompts Usage: npx jiti scripts/eval/eval.ts --project wikitok --model claude-sonnet-4-6 Refs: https://github.com/storybookjs/storybook/issues/34295 --- .gitignore | 6 +- scripts/eval/config.ts | 48 ++++++ scripts/eval/eval.ts | 157 ++++++++++++++++++++ scripts/eval/lib/agents/claude-code.ts | 81 ++++++++++ scripts/eval/lib/agents/copilot.ts | 50 +++++++ scripts/eval/lib/generate-prompt.ts | 19 +++ scripts/eval/lib/grade.ts | 87 +++++++++++ scripts/eval/lib/prepare-trial.ts | 198 +++++++++++++++++++++++++ scripts/eval/lib/run-task.ts | 58 ++++++++ scripts/eval/lib/utils.ts | 118 +++++++++++++++ scripts/eval/prompts/setup.md | 33 +++++ scripts/eval/types.ts | 140 +++++++++++++++++ scripts/package.json | 1 + 13 files changed, 995 insertions(+), 1 deletion(-) create mode 100644 scripts/eval/config.ts create mode 100644 scripts/eval/eval.ts create mode 100644 scripts/eval/lib/agents/claude-code.ts create mode 100644 scripts/eval/lib/agents/copilot.ts create mode 100644 scripts/eval/lib/generate-prompt.ts create mode 100644 scripts/eval/lib/grade.ts create mode 100644 scripts/eval/lib/prepare-trial.ts create mode 100644 scripts/eval/lib/run-task.ts create mode 100644 scripts/eval/lib/utils.ts 
create mode 100644 scripts/eval/prompts/setup.md create mode 100644 scripts/eval/types.ts diff --git a/.gitignore b/.gitignore index 43107a4f3e07..1afe80035d56 100644 --- a/.gitignore +++ b/.gitignore @@ -79,4 +79,8 @@ CLAUDE.local.md .cursor/mcp.json .vscode/mcp.json .mcp.json -.nx/polygraph \ No newline at end of file +.nx/polygraph + +# Eval system +scripts/eval/.cache +scripts/eval/results \ No newline at end of file diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts new file mode 100644 index 000000000000..dac1ec502f3f --- /dev/null +++ b/scripts/eval/config.ts @@ -0,0 +1,48 @@ +import type { Project, AgentName, Agent } from './types'; +import { claudeCodeAgent } from './lib/agents/claude-code'; +import { copilotAgent } from './lib/agents/copilot'; + +export const PROJECTS: Project[] = [ + { + name: 'mealdrop', + repo: 'https://github.com/yannbf/mealdrop', + branch: 'without-storybook', + description: 'Styled components, Redux, React Router', + }, + { + name: 'edgy', + repo: 'https://github.com/catherineisonline/edgy', + description: 'Tailwind, HeadlessUI, React Router', + }, + { + name: 'wikitok', + repo: 'https://github.com/IsaacGemal/wikitok', + projectDir: 'frontend', + description: 'Simple project with Tailwind', + }, + { + name: 'baklava', + repo: 'https://github.com/fortanix/baklava', + branch: 'master', + description: 'Component library with Zustand', + }, + { + name: 'echarts', + repo: 'https://github.com/tmkx/echarts-react', + description: 'ECharts React wrapper', + }, + { + name: 'evergreen-ci', + repo: 'https://github.com/evergreen-ci/ui', + projectDir: 'packages/lib', + description: 'GraphQL', + }, +]; + +export const agents: Record = { + 'claude-code': claudeCodeAgent, + 'copilot-cli': copilotAgent, +}; + +export const DEFAULT_AGENT: AgentName = 'claude-code'; +export const DEFAULT_MODEL = 'claude-sonnet-4-6' as const; diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts new file mode 100644 index 000000000000..82ebaab8a2fa --- 
/dev/null +++ b/scripts/eval/eval.ts @@ -0,0 +1,157 @@ +import { Command } from 'commander'; +import pc from 'picocolors'; +import type { TrialConfig, TrialResult, AgentName, SupportedModel } from './types'; +import { SUPPORTED_MODELS_BY_AGENT } from './types'; +import { PROJECTS, DEFAULT_AGENT, DEFAULT_MODEL } from './config'; +import { runTask } from './lib/run-task'; +import { log, formatDuration, formatCost } from './lib/utils'; + +const program = new Command() + .name('eval') + .description('Evaluate AI agents on Storybook setup tasks') + .option('-p, --project ', 'run only this project (by name)') + .option('-a, --agent ', 'agent to use', DEFAULT_AGENT) + .option('-m, --model ', 'model to use', DEFAULT_MODEL) + .option('--prompt ', 'custom prompt file path') + .option('-n, --iterations ', 'number of iterations per project', '1') + .option('-v, --verbose', 'verbose output') + .option('--list-projects', 'list available projects and exit') + .option('--list-models', 'list supported models and exit'); + +program.parse(); + +const opts = program.opts(); + +// --- List commands --- + +if (opts.listProjects) { + log('Available projects:'); + for (const p of PROJECTS) { + log(` ${pc.bold(p.name)} - ${p.description || p.repo}`); + } + process.exit(0); +} + +if (opts.listModels) { + log('Supported models by agent:'); + for (const [agent, models] of Object.entries(SUPPORTED_MODELS_BY_AGENT)) { + log(`\n ${pc.bold(agent)}:`); + for (const m of models) { + log(` - ${m}`); + } + } + process.exit(0); +} + +// --- Validate inputs --- + +const agentName = opts.agent as AgentName; +const model = opts.model as SupportedModel; +const iterations = parseInt(opts.iterations as string, 10); + +const supportedModels = SUPPORTED_MODELS_BY_AGENT[agentName]; +if (!supportedModels) { + log(pc.red(`Unknown agent: ${agentName}. 
Use --list-models to see available agents.`)); + process.exit(1); +} +if (!supportedModels.includes(model)) { + log(pc.red(`Model ${model} is not supported by agent ${agentName}. Use --list-models to see options.`)); + process.exit(1); +} + +// Filter projects +const projects = opts.project + ? PROJECTS.filter((p) => p.name === opts.project) + : PROJECTS; + +if (projects.length === 0) { + log(pc.red(`Project not found: ${opts.project}. Use --list-projects to see available projects.`)); + process.exit(1); +} + +// --- Run evals --- + +log(pc.bold('\nStorybook Setup Eval')); +log(`Agent: ${pc.cyan(agentName)} | Model: ${pc.cyan(model)} | Iterations: ${iterations}`); +log(`Projects: ${projects.map((p) => p.name).join(', ')}`); + +const allResults: TrialResult[] = []; + +for (const project of projects) { + for (let i = 0; i < iterations; i++) { + const suffix = iterations > 1 ? ` (iteration ${i + 1}/${iterations})` : ''; + log(pc.bold(`\n${'='.repeat(60)}`)); + log(pc.bold(`${project.name}${suffix}`)); + log(`${project.description || ''}`); + log(pc.bold('='.repeat(60))); + + const config: TrialConfig = { + project, + agent: agentName, + model, + promptFile: opts.prompt as string | undefined, + verbose: opts.verbose as boolean | undefined, + }; + + try { + const result = await runTask(config); + allResults.push(result); + } catch (error) { + log(pc.red(`\nFailed to evaluate ${project.name}: ${error instanceof Error ? 
error.message : error}`)); + if (opts.verbose && error instanceof Error) { + log(error.stack || ''); + } + } + } +} + +// --- Print summary table --- + +if (allResults.length > 0) { + log(pc.bold('\n\nResults Summary')); + log('='.repeat(100)); + + // Header + const header = [ + 'Project'.padEnd(15), + 'Build'.padEnd(7), + 'TS Err'.padEnd(8), + 'Quality'.padEnd(9), + 'Cost'.padEnd(8), + 'Time'.padEnd(8), + 'Turns'.padEnd(7), + ].join(' | '); + log(header); + log('-'.repeat(100)); + + // Rows + for (const r of allResults) { + const buildStr = r.grading.buildSuccess ? 'PASS' : 'FAIL'; + const buildColored = r.grading.buildSuccess ? pc.green(buildStr) : pc.red(buildStr); + const row = [ + r.project.padEnd(15), + buildStr.padEnd(7).replace(buildStr, buildColored), + String(r.grading.typeCheckErrors).padEnd(8), + String(r.quality.score).padEnd(9), + formatCost(r.execution.cost).padEnd(8), + formatDuration(r.execution.duration).padEnd(8), + String(r.execution.turns).padEnd(7), + ].join(' | '); + log(row); + } + + log('-'.repeat(100)); + + // Aggregate + const avgQuality = + allResults.reduce((sum, r) => sum + r.quality.score, 0) / allResults.length; + const totalCost = allResults.reduce((sum, r) => sum + (r.execution.cost || 0), 0); + const passRate = + allResults.filter((r) => r.grading.buildSuccess).length / allResults.length; + + log(`\nBuild pass rate: ${pc.bold(`${Math.round(passRate * 100)}%`)}`); + log(`Average quality: ${pc.bold(avgQuality.toFixed(2))}`); + log(`Total cost: ${pc.bold(formatCost(totalCost))}`); +} + +log('\nDone.'); diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts new file mode 100644 index 000000000000..c10250f80333 --- /dev/null +++ b/scripts/eval/lib/agents/claude-code.ts @@ -0,0 +1,81 @@ +import { writeFileSync } from 'node:fs'; +import { join } from 'node:path'; +import type { Agent, ExecutionResult, SupportedModel } from '../../types'; +import { exec } from '../utils'; + +export const 
claudeCodeAgent: Agent = { + name: 'claude-code', + + async execute( + prompt: string, + projectPath: string, + model: SupportedModel, + options?: { verbose?: boolean; resultsDir?: string } + ): Promise { + const { verbose, resultsDir } = options ?? {}; + const startTime = Date.now(); + + const args = [ + '--print', + '--model', + model, + '--output-format', + 'stream-json', + '--verbose', + '--max-turns', + '50', + prompt, + ]; + + const result = await exec('claude', args, { + cwd: projectPath, + timeout: 600_000, // 10 minutes + throwOnError: false, + stdin: 'ignore', + env: { + ...process.env, + CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: '1', + }, + }); + + const duration = (Date.now() - startTime) / 1000; + + // Save raw output for debugging + if (resultsDir) { + writeFileSync(join(resultsDir, 'agent-stdout.txt'), result.stdout); + writeFileSync(join(resultsDir, 'agent-stderr.txt'), result.stderr); + } + + // Parse stream-json output for metrics + let cost: number | undefined; + let turns = 0; + let durationApi: number | undefined; + + const lines = result.stdout.split('\n').filter(Boolean); + for (const line of lines) { + try { + const msg = JSON.parse(line); + if (msg.type === 'result') { + cost = msg.total_cost_usd; + turns = msg.num_turns ?? 0; + durationApi = msg.duration_api_ms ? 
msg.duration_api_ms / 1000 : undefined; + } + } catch { + // Skip non-JSON lines + } + } + + if (verbose && result.stderr) { + process.stderr.write(result.stderr); + } + + return { + agent: 'claude-code', + model, + cost, + duration, + durationApi, + turns, + }; + }, +}; diff --git a/scripts/eval/lib/agents/copilot.ts b/scripts/eval/lib/agents/copilot.ts new file mode 100644 index 000000000000..23754f0d4d0a --- /dev/null +++ b/scripts/eval/lib/agents/copilot.ts @@ -0,0 +1,50 @@ +import { writeFileSync } from 'node:fs'; +import { join } from 'node:path'; +import type { Agent, ExecutionResult, SupportedModel } from '../../types'; +import { exec } from '../utils'; + +export const copilotAgent: Agent = { + name: 'copilot-cli', + + async execute( + prompt: string, + projectPath: string, + model: SupportedModel, + options?: { verbose?: boolean; resultsDir?: string } + ): Promise { + const { verbose, resultsDir } = options ?? {}; + const startTime = Date.now(); + + const args = ['--model', model, prompt]; + + const result = await exec('copilot', args, { + cwd: projectPath, + timeout: 600_000, + throwOnError: false, + stdin: 'ignore', + }); + + const duration = (Date.now() - startTime) / 1000; + + if (resultsDir) { + writeFileSync(join(resultsDir, 'agent-stdout.txt'), result.stdout); + writeFileSync(join(resultsDir, 'agent-stderr.txt'), result.stderr); + } + + // Count tool execution markers as a proxy for turns + const toolMarkers = result.stdout.match(/[✓✗]/g) || []; + const turns = toolMarkers.length; + + if (verbose) { + if (result.stdout) process.stdout.write(result.stdout); + if (result.stderr) process.stderr.write(result.stderr); + } + + return { + agent: 'copilot-cli', + model, + duration, + turns, + }; + }, +}; diff --git a/scripts/eval/lib/generate-prompt.ts b/scripts/eval/lib/generate-prompt.ts new file mode 100644 index 000000000000..93ce7b94ce37 --- /dev/null +++ b/scripts/eval/lib/generate-prompt.ts @@ -0,0 +1,19 @@ +import { readFileSync, existsSync } from 
'node:fs'; +import { resolve } from 'node:path'; +import { PROMPTS_DIR } from './utils'; + +/** + * Load and return the setup prompt. + * + * If a custom prompt file is specified, it takes precedence. + * Otherwise, the built-in `prompts/setup.md` is used. + */ +export function generatePrompt(promptFile?: string): string { + const file = promptFile ? resolve(promptFile) : resolve(PROMPTS_DIR, 'setup.md'); + + if (!existsSync(file)) { + throw new Error(`Prompt file not found: ${file}`); + } + + return readFileSync(file, 'utf-8'); +} diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts new file mode 100644 index 000000000000..1266f90c6e45 --- /dev/null +++ b/scripts/eval/lib/grade.ts @@ -0,0 +1,87 @@ +import { writeFileSync } from 'node:fs'; +import { join } from 'node:path'; +import type { GradingResult, QualityResult, TrialPaths } from '../types'; +import { logStep, logSuccess, logError, exec } from './utils'; + +/** + * Grade a trial by running storybook build and typecheck. 
+ */ +export async function grade( + paths: TrialPaths +): Promise<{ grading: GradingResult; quality: QualityResult }> { + const { projectPath, resultsDir } = paths; + + // --- Storybook Build --- + logStep('Running storybook build...'); + const buildResult = await exec('npx', ['storybook', 'build', '--quiet'], { + cwd: projectPath, + timeout: 300_000, + throwOnError: false, + env: { + STORYBOOK_DISABLE_TELEMETRY: '1', + NODE_OPTIONS: '--max_old_space_size=4096', + PATH: process.env.PATH, + HOME: process.env.HOME, + npm_config_registry: 'https://registry.npmjs.org/', + }, + }); + + const buildSuccess = buildResult.exitCode === 0; + const buildOutput = buildResult.stdout + '\n' + buildResult.stderr; + writeFileSync(join(resultsDir, 'build-output.txt'), buildOutput); + + if (buildSuccess) { + logSuccess('Storybook build succeeded'); + } else { + logError(`Storybook build failed (exit code ${buildResult.exitCode})`); + } + + // --- TypeScript Check --- + logStep('Running typecheck...'); + const tscResult = await exec('npx', ['tsc', '--noEmit'], { + cwd: projectPath, + timeout: 120_000, + throwOnError: false, + env: { + PATH: process.env.PATH, + HOME: process.env.HOME, + npm_config_registry: 'https://registry.npmjs.org/', + }, + }); + + const typeCheckOutput = tscResult.stdout + '\n' + tscResult.stderr; + writeFileSync(join(resultsDir, 'typecheck-output.txt'), typeCheckOutput); + + const typeCheckErrors = (typeCheckOutput.match(/error TS\d+/g) || []).length; + + if (typeCheckErrors === 0) { + logSuccess('No TypeScript errors'); + } else { + logError(`${typeCheckErrors} TypeScript error(s)`); + } + + const grading: GradingResult = { + buildSuccess, + buildError: buildSuccess ? undefined : buildOutput.slice(-2000), + typeCheckErrors, + typeCheckOutput: typeCheckErrors > 0 ? 
typeCheckOutput.slice(-2000) : undefined, + }; + + const quality = calculateQuality(grading); + + return { grading, quality }; +} + +function calculateQuality(grading: GradingResult): QualityResult { + const buildScore = grading.buildSuccess ? 1 : 0; + const typeCheckScore = Math.max(0, 1 - grading.typeCheckErrors / 20); + const score = buildScore * 0.7 + typeCheckScore * 0.3; + + return { + score: Math.round(score * 100) / 100, + breakdown: { + build: buildScore, + typecheck: Math.round(typeCheckScore * 100) / 100, + }, + }; +} diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts new file mode 100644 index 000000000000..25e2ddd6ead3 --- /dev/null +++ b/scripts/eval/lib/prepare-trial.ts @@ -0,0 +1,198 @@ +import { existsSync, mkdirSync, cpSync, rmSync, readdirSync } from 'node:fs'; +import { join } from 'node:path'; +import type { Project, TrialPaths } from '../types'; +import { CACHE_DIR, TRIALS_DIR, logStep, logSuccess, exec } from './utils'; + +/** + * Clean environment for npm/npx commands that should use the public registry. + * The storybook monorepo has a .npmrc pointing to localhost:6002 (verdaccio), + * which leaks through process.env and breaks commands outside the monorepo. + */ +function cleanNpmEnv(): Record { + const env = { ...process.env }; + // Force public registry + env.npm_config_registry = 'https://registry.npmjs.org/'; + // Remove any verdaccio-related env vars + for (const key of Object.keys(env)) { + if (key.startsWith('npm_config_') && key !== 'npm_config_registry') { + delete env[key]; + } + } + return env; +} + +/** + * Clone a repo to the cache (or reset to the target branch if already cached). 
+ */ +async function ensureRepoClone(project: Project): Promise { + const cacheDir = join(CACHE_DIR, project.name); + + if (existsSync(cacheDir)) { + logStep(`Updating cached clone for ${project.name}...`); + const branch = project.branch || (await getDefaultBranch(cacheDir)); + await exec('git', ['fetch', 'origin'], { cwd: cacheDir }); + await exec('git', ['checkout', branch], { cwd: cacheDir }); + await exec('git', ['reset', '--hard', `origin/${branch}`], { cwd: cacheDir }); + await exec('git', ['clean', '-fdx', '-e', 'node_modules'], { cwd: cacheDir }); + return cacheDir; + } + + logStep(`Cloning ${project.repo}...`); + mkdirSync(CACHE_DIR, { recursive: true }); + const args = ['clone', '--depth', '1', project.repo, cacheDir]; + if (project.branch) { + args.splice(1, 0, '--branch', project.branch); + } + await exec('git', args, { timeout: 120_000 }); + + return cacheDir; +} + +async function getDefaultBranch(repoDir: string): Promise { + const result = await exec('git', ['rev-parse', '--abbrev-ref', 'HEAD'], { cwd: repoDir }); + return result.stdout.trim(); +} + +/** + * Recursively remove storybook-related files from a directory. 
+ */ +function cleanStorybookFiles(dir: string) { + // Remove .storybook directory + const storybookDir = join(dir, '.storybook'); + if (existsSync(storybookDir)) { + rmSync(storybookDir, { recursive: true }); + logStep('Removed .storybook/'); + } + + // Remove story files + const removedCount = removeStoryFiles(dir); + if (removedCount > 0) { + logStep(`Removed ${removedCount} story file(s)`); + } +} + +function removeStoryFiles(dir: string, count = 0): number { + let removed = count; + try { + const entries = readdirSync(dir, { withFileTypes: true }); + for (const entry of entries) { + if (['node_modules', '.git', 'dist', 'build'].includes(entry.name)) continue; + + const fullPath = join(dir, entry.name); + if (entry.isDirectory()) { + if (entry.name === 'stories' || entry.name === '__stories__') { + rmSync(fullPath, { recursive: true }); + removed++; + } else { + removed = removeStoryFiles(fullPath, removed); + } + } else if (/\.(stories|story)\.(tsx?|jsx?|mdx?)$/.test(entry.name)) { + rmSync(fullPath); + removed++; + } + } + } catch { + // Skip directories we can't read + } + return removed; +} + +/** + * Detect the package manager and install dependencies. 
+ */ +async function installDependencies(dir: string) { + const env = cleanNpmEnv(); + if (existsSync(join(dir, 'pnpm-lock.yaml')) || existsSync(join(dir, 'pnpm-workspace.yaml'))) { + logStep('Installing dependencies with pnpm...'); + await exec('pnpm', ['install', '--no-frozen-lockfile'], { cwd: dir, timeout: 300_000, env }); + } else if (existsSync(join(dir, 'yarn.lock'))) { + logStep('Installing dependencies with yarn...'); + if (existsSync(join(dir, '.yarnrc.yml'))) { + await exec('yarn', ['install', '--no-immutable'], { cwd: dir, timeout: 300_000, env }); + } else { + await exec('yarn', ['install'], { cwd: dir, timeout: 300_000, env }); + } + } else if (existsSync(join(dir, 'bun.lockb')) || existsSync(join(dir, 'bun.lock'))) { + logStep('Installing dependencies with bun...'); + await exec('bun', ['install'], { cwd: dir, timeout: 300_000, env }); + } else { + logStep('Installing dependencies with npm...'); + await exec('npm', ['install'], { cwd: dir, timeout: 300_000, env }); + } +} + +/** + * Run `npx storybook@latest init --yes` to scaffold Storybook. + */ +async function initStorybook(dir: string) { + logStep('Running storybook init...'); + const env = cleanNpmEnv(); + await exec('npx', ['storybook@latest', 'init', '--yes', '--no-dev'], { + cwd: dir, + timeout: 300_000, + env: { + ...env, + STORYBOOK_DISABLE_TELEMETRY: '1', + }, + }); +} + +/** + * Prepare a trial: clone repo, clean storybook files, install deps, init storybook. + */ +export async function prepareTrial(project: Project, trialId: string): Promise { + // 1. Ensure repo is in the cache + const cacheDir = await ensureRepoClone(project); + logSuccess(`Repo cached at ${cacheDir}`); + + // 2. 
Create trial directory and copy project + const trialDir = join(TRIALS_DIR, trialId); + const projectDest = join(trialDir, 'project'); + mkdirSync(trialDir, { recursive: true }); + + logStep('Copying project to trial directory...'); + cpSync(cacheDir, projectDest, { + recursive: true, + filter: (src) => { + // Exclude .git and node_modules from copy + const relative = src.slice(cacheDir.length); + if (relative.includes('node_modules')) return false; + if (relative.startsWith('/.git') || relative === '/.git') return false; + return true; + }, + }); + + // Initialize a fresh git repo so the agent can use git + await exec('git', ['init'], { cwd: projectDest }); + await exec('git', ['add', '.'], { cwd: projectDest }); + await exec('git', ['commit', '-m', 'Initial commit', '--allow-empty'], { + cwd: projectDest, + env: { + ...process.env, + GIT_AUTHOR_NAME: 'eval', + GIT_AUTHOR_EMAIL: 'eval@storybook.js.org', + GIT_COMMITTER_NAME: 'eval', + GIT_COMMITTER_EMAIL: 'eval@storybook.js.org', + }, + }); + + // 3. Determine the working path (handle monorepo projectDir) + const projectPath = project.projectDir ? join(projectDest, project.projectDir) : projectDest; + + // 4. Clean storybook files + cleanStorybookFiles(projectPath); + + // 5. Install dependencies + await installDependencies(projectPath); + logSuccess('Dependencies installed'); + + // 6. Run storybook init + await initStorybook(projectPath); + logSuccess('Storybook initialized'); + + // 7. 
Create results directory + const resultsDir = join(trialDir, 'results'); + mkdirSync(resultsDir, { recursive: true }); + + return { trialDir, projectPath, resultsDir }; +} diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts new file mode 100644 index 000000000000..c6f6c3dbe56f --- /dev/null +++ b/scripts/eval/lib/run-task.ts @@ -0,0 +1,58 @@ +import { writeFileSync } from 'node:fs'; +import { join } from 'node:path'; +import type { TrialConfig, TrialResult } from '../types'; +import { MODEL_TIERS } from '../types'; +import { agents } from '../config'; +import { prepareTrial } from './prepare-trial'; +import { generatePrompt } from './generate-prompt'; +import { grade } from './grade'; +import { generateTrialId, log, logSuccess } from './utils'; + +/** + * Run a full eval trial: prepare -> execute agent -> grade. + */ +export async function runTask(config: TrialConfig): Promise { + const { project, agent: agentName, model, promptFile, verbose } = config; + const trialId = generateTrialId(project.name, agentName, model); + const timestamp = new Date().toISOString(); + + log(`\nPreparing ${project.name}...`); + + // 1. Prepare the trial (clone, clean, init storybook) + const paths = await prepareTrial(project, trialId); + + // 2. Generate the prompt + const prompt = generatePrompt(promptFile); + writeFileSync(join(paths.resultsDir, 'prompt.md'), prompt); + + // 3. Execute the agent + log(` Running ${agentName} (${model})...`); + const agent = agents[agentName]; + const execution = await agent.execute(prompt, paths.projectPath, model, { + verbose, + resultsDir: paths.resultsDir, + }); + logSuccess(`Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : 'cost N/A'}, ${execution.turns} turns)`); + + // 4. Grade the results + const { grading, quality } = await grade(paths); + + // 5. 
Assemble final result + const result: TrialResult = { + project: project.name, + agent: agentName, + model, + modelTier: MODEL_TIERS[model], + timestamp, + promptFile: promptFile || 'setup.md', + execution, + grading, + quality, + }; + + // Save summary + writeFileSync(join(paths.resultsDir, 'summary.json'), JSON.stringify(result, null, 2)); + logSuccess(`Results saved to ${paths.resultsDir}`); + + return result; +} diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts new file mode 100644 index 000000000000..59006bbd39e3 --- /dev/null +++ b/scripts/eval/lib/utils.ts @@ -0,0 +1,118 @@ +import { resolve } from 'node:path'; +import pc from 'picocolors'; +import { x } from 'tinyexec'; + +/** Root of the storybook monorepo */ +export const REPO_ROOT = resolve(import.meta.dirname, '..', '..', '..'); + +/** Directory for eval trials and caches (outside the monorepo to avoid workspace interference) */ +export const EVAL_ROOT = resolve(REPO_ROOT, '..', 'storybook-eval'); + +/** Cached repo clones */ +export const CACHE_DIR = resolve(EVAL_ROOT, '.cache', 'repos'); + +/** Trial output base directory */ +export const TRIALS_DIR = resolve(EVAL_ROOT, 'trials'); + +/** Built-in prompts directory */ +export const PROMPTS_DIR = resolve(import.meta.dirname, '..', 'prompts'); + +export function log(msg: string) { + console.log(msg); +} + +export function logStep(msg: string) { + console.log(` ${pc.cyan('>')} ${msg}`); +} + +export function logSuccess(msg: string) { + console.log(` ${pc.green('✓')} ${msg}`); +} + +export function logError(msg: string) { + console.log(` ${pc.red('✗')} ${msg}`); +} + +export function logWarn(msg: string) { + console.log(` ${pc.yellow('!')} ${msg}`); +} + +export function formatDuration(seconds: number): string { + if (seconds < 60) return `${Math.round(seconds)}s`; + const mins = Math.floor(seconds / 60); + const secs = Math.round(seconds % 60); + return `${mins}m${secs}s`; +} + +export function formatCost(cost?: number): string { + if 
(cost == null) return '-'; + return `$${cost.toFixed(2)}`; +} + +export function generateTrialId(projectName: string, agent: string, model: string): string { + const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); + return `${timestamp}-${projectName}-${agent}-${model}`; +} + +/** Options for the exec helper */ +interface ExecOptions { + cwd?: string; + env?: Record; + timeout?: number; + /** If true, don't throw on non-zero exit code (default: true = throw) */ + throwOnError?: boolean; + /** Set to 'ignore' to suppress stdin */ + stdin?: 'ignore'; +} + +/** Result from exec helper */ +export interface ExecResult { + stdout: string; + stderr: string; + exitCode: number | null; +} + +/** + * Thin wrapper around tinyexec's `x()` with timeout support via AbortController. + */ +export async function exec( + command: string, + args: string[], + options: ExecOptions = {} +): Promise { + const { cwd, env, timeout, throwOnError = true, stdin } = options; + + const controller = timeout ? new AbortController() : undefined; + const timer = timeout + ? setTimeout(() => controller!.abort(), timeout) + : undefined; + + const stdio = stdin === 'ignore' + ? (['ignore', 'pipe', 'pipe'] as const) + : undefined; + + try { + const result = await x(command, args, { + throwOnError: false, + nodeOptions: { + cwd, + env: env as NodeJS.ProcessEnv, + signal: controller?.signal, + ...(stdio ? 
{ stdio } : {}), + }, + }); + + if (throwOnError && result.exitCode !== 0) { + const msg = `Command failed: ${command} ${args.join(' ')}\n${result.stderr}`; + throw new Error(msg); + } + + return { + stdout: result.stdout, + stderr: result.stderr, + exitCode: result.exitCode, + }; + } finally { + if (timer) clearTimeout(timer); + } +} diff --git a/scripts/eval/prompts/setup.md b/scripts/eval/prompts/setup.md new file mode 100644 index 000000000000..95b8aeb31b8c --- /dev/null +++ b/scripts/eval/prompts/setup.md @@ -0,0 +1,33 @@ +# Complete Storybook Setup + +Storybook has just been initialized in this project with `npx storybook@latest init --yes`. +The basic scaffolding is in place but the setup needs to be completed so that stories render correctly. + +## Steps + +1. **Analyze the project**: Read `package.json` and source code to understand the tech stack — CSS framework, state management, routing, theming, and any global providers. + +2. **Configure `.storybook/preview.ts`**: Make stories render like the real app by adding: + - Global CSS imports (Tailwind CSS, global stylesheets, CSS resets, font imports) + - Provider decorators wrapping every story (Redux store, React Router, Theme providers, i18n, etc.) + - Appropriate `parameters` (viewport, backgrounds, etc.) + +3. **Configure `.storybook/main.ts`**: Adjust if needed: + - `staticDirs` for public assets (images, fonts) + - Framework-specific overrides (e.g., `viteFinal` or `webpackFinal`) + - Autodocs if the project uses JSDoc or TSDoc + +4. **Verify the setup**: Run `npx storybook build` to check for errors. If it fails: + - Read the error output carefully + - Fix the root cause (missing import, wrong config, etc.) + - Run the build again + - Repeat until the build succeeds + +## Guidelines + +- Look at the app's entry point (`main.tsx`, `index.tsx`, `App.tsx`) to find providers and global setup +- Check for CSS framework config files (`tailwind.config.*`, `postcss.config.*`, etc.) 
+- Keep changes minimal — only modify what is needed to make stories render +- Do NOT create new stories or components +- Do NOT remove existing stories +- Prefer importing existing app utilities over re-implementing them diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts new file mode 100644 index 000000000000..c74d0a2a0ff6 --- /dev/null +++ b/scripts/eval/types.ts @@ -0,0 +1,140 @@ +/** + * Core types for the Storybook setup eval system. + * + * The eval tests how well an AI agent can complete a Storybook setup + * (after `npx storybook@latest init --yes`) across real-world projects. + */ + +// --- Agent & Model Types --- + +export type AgentName = 'claude-code' | 'copilot-cli'; + +export const CLAUDE_MODELS = ['claude-opus-4-6', 'claude-sonnet-4-6', 'claude-haiku-4-5'] as const; + +export const COPILOT_MODELS = [ + ...CLAUDE_MODELS, + 'gpt-5.2-codex', + 'gpt-5.2', + 'gpt-5.1-codex-max', +] as const; + +export type ClaudeModel = (typeof CLAUDE_MODELS)[number]; +export type CopilotModel = (typeof COPILOT_MODELS)[number]; +export type SupportedModel = CopilotModel; + +export type ModelTier = 'opus' | 'sonnet' | 'haiku' | 'codex'; + +export const MODEL_TIERS: Record = { + 'claude-opus-4-6': 'opus', + 'claude-sonnet-4-6': 'sonnet', + 'claude-haiku-4-5': 'haiku', + 'gpt-5.2-codex': 'codex', + 'gpt-5.2': 'codex', + 'gpt-5.1-codex-max': 'codex', +}; + +export const SUPPORTED_MODELS_BY_AGENT: Record = { + 'claude-code': CLAUDE_MODELS, + 'copilot-cli': COPILOT_MODELS, +}; + +// --- Project Types --- + +export interface Project { + /** Display name */ + name: string; + /** Git repo URL */ + repo: string; + /** Branch to clone (defaults to repo default) */ + branch?: string; + /** Subdirectory within the repo where the project lives */ + projectDir?: string; + /** Human-readable description of the project's tech stack */ + description?: string; +} + +// --- Trial Types --- + +export interface TrialConfig { + project: Project; + agent: AgentName; + model: 
SupportedModel; + /** Path to a custom prompt file (defaults to built-in setup.md) */ + promptFile?: string; + verbose?: boolean; +} + +export interface TrialPaths { + /** Root directory for this trial */ + trialDir: string; + /** Path to the project within the trial (where storybook is initialized) */ + projectPath: string; + /** Path where grading outputs are saved */ + resultsDir: string; +} + +// --- Execution Types --- + +export interface ExecutionResult { + agent: string; + model: string; + /** Total API cost in USD */ + cost?: number; + /** Wall-clock duration in seconds */ + duration: number; + /** API-only duration in seconds */ + durationApi?: number; + /** Number of agent turns */ + turns: number; +} + +// --- Grading Types --- + +export interface GradingResult { + /** Did `storybook build` exit with code 0? */ + buildSuccess: boolean; + /** Build error output (if failed) */ + buildError?: string; + /** Number of TypeScript errors from `tsc --noEmit` */ + typeCheckErrors: number; + /** TypeScript error output */ + typeCheckOutput?: string; +} + +// --- Quality Score --- + +export interface QualityResult { + /** Composite score from 0 to 1 */ + score: number; + /** Individual metric scores */ + breakdown: { + build: number; + typecheck: number; + }; +} + +// --- Final Result --- + +export interface TrialResult { + project: string; + agent: string; + model: string; + modelTier: ModelTier; + timestamp: string; + promptFile: string; + execution: ExecutionResult; + grading: GradingResult; + quality: QualityResult; +} + +// --- Agent Interface --- + +export interface Agent { + name: AgentName; + execute( + prompt: string, + projectPath: string, + model: SupportedModel, + options?: { verbose?: boolean; resultsDir?: string } + ): Promise; +} diff --git a/scripts/package.json b/scripts/package.json index 11ca41bd541d..bbfc84292fe1 100644 --- a/scripts/package.json +++ b/scripts/package.json @@ -9,6 +9,7 @@ "check": "jiti ./check/check-package.ts", 
"check-package": "jiti ./check-package.ts", "docs:codemod": "jiti ./snippets/codemod.ts", + "eval": "jiti ./eval/eval.ts", "generate-sandboxes": "jiti ./sandbox/generate.ts", "get-report-message": "jiti ./get-report-message.ts", "get-sandbox-dir": "jiti ./get-sandbox-dir.ts", From 143577e375e1a4a97d0879292297e32de4d08bec Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 19:01:55 +0700 Subject: [PATCH 02/63] Migrate eval agents from CLI to SDK Replace CLI process spawning with proper SDKs: - Claude: @anthropic-ai/claude-agent-sdk with query() API - Codex: @openai/codex-sdk with thread streaming API Benefits: structured responses, proper cost tracking, no stream-json parsing, no CLI installation dependency, full conversation transcript. --- scripts/eval/config.ts | 4 +- scripts/eval/lib/agents/claude-code.ts | 83 +++++++-------- scripts/eval/lib/agents/codex.ts | 54 ++++++++++ scripts/eval/lib/agents/copilot.ts | 50 --------- scripts/eval/types.ts | 21 ++-- scripts/package.json | 2 + yarn.lock | 138 +++++++++++++++++++++++-- 7 files changed, 233 insertions(+), 119 deletions(-) create mode 100644 scripts/eval/lib/agents/codex.ts delete mode 100644 scripts/eval/lib/agents/copilot.ts diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index dac1ec502f3f..b7a18779730c 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -1,6 +1,6 @@ import type { Project, AgentName, Agent } from './types'; import { claudeCodeAgent } from './lib/agents/claude-code'; -import { copilotAgent } from './lib/agents/copilot'; +import { codexAgent } from './lib/agents/codex'; export const PROJECTS: Project[] = [ { @@ -41,7 +41,7 @@ export const PROJECTS: Project[] = [ export const agents: Record = { 'claude-code': claudeCodeAgent, - 'copilot-cli': copilotAgent, + codex: codexAgent, }; export const DEFAULT_AGENT: AgentName = 'claude-code'; diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index 
c10250f80333..ae02e7e6a224 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -1,7 +1,7 @@ +import { query } from '@anthropic-ai/claude-agent-sdk'; import { writeFileSync } from 'node:fs'; import { join } from 'node:path'; import type { Agent, ExecutionResult, SupportedModel } from '../../types'; -import { exec } from '../utils'; export const claudeCodeAgent: Agent = { name: 'claude-code', @@ -15,58 +15,51 @@ export const claudeCodeAgent: Agent = { const { verbose, resultsDir } = options ?? {}; const startTime = Date.now(); - const args = [ - '--print', - '--model', - model, - '--output-format', - 'stream-json', - '--verbose', - '--max-turns', - '50', - prompt, - ]; - - const result = await exec('claude', args, { - cwd: projectPath, - timeout: 600_000, // 10 minutes - throwOnError: false, - stdin: 'ignore', - env: { - ...process.env, - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: '1', - }, - }); - - const duration = (Date.now() - startTime) / 1000; - - // Save raw output for debugging - if (resultsDir) { - writeFileSync(join(resultsDir, 'agent-stdout.txt'), result.stdout); - writeFileSync(join(resultsDir, 'agent-stderr.txt'), result.stderr); - } - - // Parse stream-json output for metrics let cost: number | undefined; let turns = 0; let durationApi: number | undefined; + const messages: unknown[] = []; - const lines = result.stdout.split('\n').filter(Boolean); - for (const line of lines) { - try { - const msg = JSON.parse(line); - if (msg.type === 'result') { - cost = msg.total_cost_usd; - turns = msg.num_turns ?? 0; - durationApi = msg.duration_api_ms ? 
msg.duration_api_ms / 1000 : undefined; + for await (const message of query({ + prompt, + options: { + model, + cwd: projectPath, + allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep'], + maxTurns: 50, + systemPrompt: { type: 'preset', preset: 'claude_code' }, + }, + })) { + messages.push(message); + + if (verbose && 'type' in message && message.type === 'assistant') { + const content = (message as Record).content; + if (Array.isArray(content)) { + for (const block of content) { + if (block.type === 'text') { + process.stderr.write(block.text + '\n'); + } + } + } + } + + if ('type' in message && message.type === 'result') { + const result = message as Record; + if (result.subtype === 'success') { + cost = result.total_cost_usd as number | undefined; + turns = (result.num_turns as number) ?? 0; + durationApi = + typeof result.duration_api_ms === 'number' + ? result.duration_api_ms / 1000 + : undefined; } - } catch { - // Skip non-JSON lines } } - if (verbose && result.stderr) { - process.stderr.write(result.stderr); + const duration = (Date.now() - startTime) / 1000; + + if (resultsDir) { + writeFileSync(join(resultsDir, 'transcript.json'), JSON.stringify(messages, null, 2)); } return { diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts new file mode 100644 index 000000000000..883043cb2a9f --- /dev/null +++ b/scripts/eval/lib/agents/codex.ts @@ -0,0 +1,54 @@ +import { Codex } from '@openai/codex-sdk'; +import { writeFileSync } from 'node:fs'; +import { join } from 'node:path'; +import type { Agent, ExecutionResult, SupportedModel } from '../../types'; + +export const codexAgent: Agent = { + name: 'codex', + + async execute( + prompt: string, + projectPath: string, + model: SupportedModel, + options?: { verbose?: boolean; resultsDir?: string } + ): Promise { + const { verbose, resultsDir } = options ?? 
{}; + const startTime = Date.now(); + + const codex = new Codex({ model }); + const thread = codex.startThread({ workingDirectory: projectPath }); + const { events } = await thread.runStreamed(prompt); + + const items: unknown[] = []; + + for await (const event of events) { + if (verbose && event.type === 'item.completed') { + const item = event.item as Record; + if (item.type === 'message' && Array.isArray(item.content)) { + for (const block of item.content) { + if (typeof block === 'object' && block !== null && 'text' in block) { + process.stderr.write(`${(block as { text: string }).text}\n`); + } + } + } + } + + if (event.type === 'item.completed') { + items.push(event.item); + } + } + + const duration = (Date.now() - startTime) / 1000; + + if (resultsDir) { + writeFileSync(join(resultsDir, 'transcript.json'), JSON.stringify(items, null, 2)); + } + + return { + agent: 'codex', + model, + duration, + turns: items.length, + }; + }, +}; diff --git a/scripts/eval/lib/agents/copilot.ts b/scripts/eval/lib/agents/copilot.ts deleted file mode 100644 index 23754f0d4d0a..000000000000 --- a/scripts/eval/lib/agents/copilot.ts +++ /dev/null @@ -1,50 +0,0 @@ -import { writeFileSync } from 'node:fs'; -import { join } from 'node:path'; -import type { Agent, ExecutionResult, SupportedModel } from '../../types'; -import { exec } from '../utils'; - -export const copilotAgent: Agent = { - name: 'copilot-cli', - - async execute( - prompt: string, - projectPath: string, - model: SupportedModel, - options?: { verbose?: boolean; resultsDir?: string } - ): Promise { - const { verbose, resultsDir } = options ?? 
{}; - const startTime = Date.now(); - - const args = ['--model', model, prompt]; - - const result = await exec('copilot', args, { - cwd: projectPath, - timeout: 600_000, - throwOnError: false, - stdin: 'ignore', - }); - - const duration = (Date.now() - startTime) / 1000; - - if (resultsDir) { - writeFileSync(join(resultsDir, 'agent-stdout.txt'), result.stdout); - writeFileSync(join(resultsDir, 'agent-stderr.txt'), result.stderr); - } - - // Count tool execution markers as a proxy for turns - const toolMarkers = result.stdout.match(/[✓✗]/g) || []; - const turns = toolMarkers.length; - - if (verbose) { - if (result.stdout) process.stdout.write(result.stdout); - if (result.stderr) process.stderr.write(result.stderr); - } - - return { - agent: 'copilot-cli', - model, - duration, - turns, - }; - }, -}; diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index c74d0a2a0ff6..5d1439e9c0d3 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -7,20 +7,15 @@ // --- Agent & Model Types --- -export type AgentName = 'claude-code' | 'copilot-cli'; +export type AgentName = 'claude-code' | 'codex'; export const CLAUDE_MODELS = ['claude-opus-4-6', 'claude-sonnet-4-6', 'claude-haiku-4-5'] as const; -export const COPILOT_MODELS = [ - ...CLAUDE_MODELS, - 'gpt-5.2-codex', - 'gpt-5.2', - 'gpt-5.1-codex-max', -] as const; +export const CODEX_MODELS = ['o4-mini', 'o3', 'gpt-4.1'] as const; export type ClaudeModel = (typeof CLAUDE_MODELS)[number]; -export type CopilotModel = (typeof COPILOT_MODELS)[number]; -export type SupportedModel = CopilotModel; +export type CodexModel = (typeof CODEX_MODELS)[number]; +export type SupportedModel = ClaudeModel | CodexModel; export type ModelTier = 'opus' | 'sonnet' | 'haiku' | 'codex'; @@ -28,14 +23,14 @@ export const MODEL_TIERS: Record = { 'claude-opus-4-6': 'opus', 'claude-sonnet-4-6': 'sonnet', 'claude-haiku-4-5': 'haiku', - 'gpt-5.2-codex': 'codex', - 'gpt-5.2': 'codex', - 'gpt-5.1-codex-max': 'codex', + 'o4-mini': 'codex', + 
'o3': 'codex', + 'gpt-4.1': 'codex', }; export const SUPPORTED_MODELS_BY_AGENT: Record = { 'claude-code': CLAUDE_MODELS, - 'copilot-cli': COPILOT_MODELS, + codex: CODEX_MODELS, }; // --- Project Types --- diff --git a/scripts/package.json b/scripts/package.json index bbfc84292fe1..f3031bcf8178 100644 --- a/scripts/package.json +++ b/scripts/package.json @@ -42,10 +42,12 @@ }, "dependencies": { "@actions/core": "^1.11.1", + "@anthropic-ai/claude-agent-sdk": "^0.2.85", "@fal-works/esbuild-plugin-global-externals": "^2.1.2", "@google-cloud/bigquery": "^6.2.1", "@octokit/graphql": "^5.0.6", "@octokit/request": "^8.4.1", + "@openai/codex-sdk": "^0.117.0", "@polka/parse": "^1.0.0-next.28", "@testing-library/dom": "^10.4.0", "@testing-library/jest-dom": "^6.9.1", diff --git a/yarn.lock b/yarn.lock index cee8c18346d7..719ae2d9c2e5 100644 --- a/yarn.lock +++ b/yarn.lock @@ -436,6 +436,44 @@ __metadata: languageName: node linkType: hard +"@anthropic-ai/claude-agent-sdk@npm:^0.2.85": + version: 0.2.85 + resolution: "@anthropic-ai/claude-agent-sdk@npm:0.2.85" + dependencies: + "@img/sharp-darwin-arm64": "npm:^0.34.2" + "@img/sharp-darwin-x64": "npm:^0.34.2" + "@img/sharp-linux-arm": "npm:^0.34.2" + "@img/sharp-linux-arm64": "npm:^0.34.2" + "@img/sharp-linux-x64": "npm:^0.34.2" + "@img/sharp-linuxmusl-arm64": "npm:^0.34.2" + "@img/sharp-linuxmusl-x64": "npm:^0.34.2" + "@img/sharp-win32-arm64": "npm:^0.34.2" + "@img/sharp-win32-x64": "npm:^0.34.2" + peerDependencies: + zod: ^4.0.0 + dependenciesMeta: + "@img/sharp-darwin-arm64": + optional: true + "@img/sharp-darwin-x64": + optional: true + "@img/sharp-linux-arm": + optional: true + "@img/sharp-linux-arm64": + optional: true + "@img/sharp-linux-x64": + optional: true + "@img/sharp-linuxmusl-arm64": + optional: true + "@img/sharp-linuxmusl-x64": + optional: true + "@img/sharp-win32-arm64": + optional: true + "@img/sharp-win32-x64": + optional: true + checksum: 
10c0/5bb31712460b03b264b489c38a2ddcac62ba60aad50da8cd6d3cebdaf46fae84c37473f25b7a4e20a6bda6f2310b4cc9f3574bc3f2e8f73a4a6e6bd0e04bd827 + languageName: node + linkType: hard + "@aw-web-design/x-default-browser@npm:1.4.126": version: 1.4.126 resolution: "@aw-web-design/x-default-browser@npm:1.4.126" @@ -2972,7 +3010,7 @@ __metadata: languageName: node linkType: hard -"@img/sharp-darwin-arm64@npm:0.34.5": +"@img/sharp-darwin-arm64@npm:0.34.5, @img/sharp-darwin-arm64@npm:^0.34.2": version: 0.34.5 resolution: "@img/sharp-darwin-arm64@npm:0.34.5" dependencies: @@ -2984,7 +3022,7 @@ __metadata: languageName: node linkType: hard -"@img/sharp-darwin-x64@npm:0.34.5": +"@img/sharp-darwin-x64@npm:0.34.5, @img/sharp-darwin-x64@npm:^0.34.2": version: 0.34.5 resolution: "@img/sharp-darwin-x64@npm:0.34.5" dependencies: @@ -3066,7 +3104,7 @@ __metadata: languageName: node linkType: hard -"@img/sharp-linux-arm64@npm:0.34.5": +"@img/sharp-linux-arm64@npm:0.34.5, @img/sharp-linux-arm64@npm:^0.34.2": version: 0.34.5 resolution: "@img/sharp-linux-arm64@npm:0.34.5" dependencies: @@ -3078,7 +3116,7 @@ __metadata: languageName: node linkType: hard -"@img/sharp-linux-arm@npm:0.34.5": +"@img/sharp-linux-arm@npm:0.34.5, @img/sharp-linux-arm@npm:^0.34.2": version: 0.34.5 resolution: "@img/sharp-linux-arm@npm:0.34.5" dependencies: @@ -3126,7 +3164,7 @@ __metadata: languageName: node linkType: hard -"@img/sharp-linux-x64@npm:0.34.5": +"@img/sharp-linux-x64@npm:0.34.5, @img/sharp-linux-x64@npm:^0.34.2": version: 0.34.5 resolution: "@img/sharp-linux-x64@npm:0.34.5" dependencies: @@ -3138,7 +3176,7 @@ __metadata: languageName: node linkType: hard -"@img/sharp-linuxmusl-arm64@npm:0.34.5": +"@img/sharp-linuxmusl-arm64@npm:0.34.5, @img/sharp-linuxmusl-arm64@npm:^0.34.2": version: 0.34.5 resolution: "@img/sharp-linuxmusl-arm64@npm:0.34.5" dependencies: @@ -3150,7 +3188,7 @@ __metadata: languageName: node linkType: hard -"@img/sharp-linuxmusl-x64@npm:0.34.5": +"@img/sharp-linuxmusl-x64@npm:0.34.5, 
@img/sharp-linuxmusl-x64@npm:^0.34.2": version: 0.34.5 resolution: "@img/sharp-linuxmusl-x64@npm:0.34.5" dependencies: @@ -3171,7 +3209,7 @@ __metadata: languageName: node linkType: hard -"@img/sharp-win32-arm64@npm:0.34.5": +"@img/sharp-win32-arm64@npm:0.34.5, @img/sharp-win32-arm64@npm:^0.34.2": version: 0.34.5 resolution: "@img/sharp-win32-arm64@npm:0.34.5" conditions: os=win32 & cpu=arm64 @@ -3185,7 +3223,7 @@ __metadata: languageName: node linkType: hard -"@img/sharp-win32-x64@npm:0.34.5": +"@img/sharp-win32-x64@npm:0.34.5, @img/sharp-win32-x64@npm:^0.34.2": version: 0.34.5 resolution: "@img/sharp-win32-x64@npm:0.34.5" conditions: os=win32 & cpu=x64 @@ -4665,6 +4703,86 @@ __metadata: languageName: node linkType: hard +"@openai/codex-darwin-arm64@npm:@openai/codex@0.117.0-darwin-arm64": + version: 0.117.0-darwin-arm64 + resolution: "@openai/codex@npm:0.117.0-darwin-arm64" + conditions: os=darwin & cpu=arm64 + languageName: node + linkType: hard + +"@openai/codex-darwin-x64@npm:@openai/codex@0.117.0-darwin-x64": + version: 0.117.0-darwin-x64 + resolution: "@openai/codex@npm:0.117.0-darwin-x64" + conditions: os=darwin & cpu=x64 + languageName: node + linkType: hard + +"@openai/codex-linux-arm64@npm:@openai/codex@0.117.0-linux-arm64": + version: 0.117.0-linux-arm64 + resolution: "@openai/codex@npm:0.117.0-linux-arm64" + conditions: os=linux & cpu=arm64 + languageName: node + linkType: hard + +"@openai/codex-linux-x64@npm:@openai/codex@0.117.0-linux-x64": + version: 0.117.0-linux-x64 + resolution: "@openai/codex@npm:0.117.0-linux-x64" + conditions: os=linux & cpu=x64 + languageName: node + linkType: hard + +"@openai/codex-sdk@npm:^0.117.0": + version: 0.117.0 + resolution: "@openai/codex-sdk@npm:0.117.0" + dependencies: + "@openai/codex": "npm:0.117.0" + checksum: 10c0/96f86890fd45a4030a8e9b6f8466389a015d0ee534b1661b56463a1fd210c6fc3af0ea1f3ce57306a13a9b6ff6197d6409a4d5af7f6d7c90e672009eee15e3fd + languageName: node + linkType: hard + 
+"@openai/codex-win32-arm64@npm:@openai/codex@0.117.0-win32-arm64": + version: 0.117.0-win32-arm64 + resolution: "@openai/codex@npm:0.117.0-win32-arm64" + conditions: os=win32 & cpu=arm64 + languageName: node + linkType: hard + +"@openai/codex-win32-x64@npm:@openai/codex@0.117.0-win32-x64": + version: 0.117.0-win32-x64 + resolution: "@openai/codex@npm:0.117.0-win32-x64" + conditions: os=win32 & cpu=x64 + languageName: node + linkType: hard + +"@openai/codex@npm:0.117.0": + version: 0.117.0 + resolution: "@openai/codex@npm:0.117.0" + dependencies: + "@openai/codex-darwin-arm64": "npm:@openai/codex@0.117.0-darwin-arm64" + "@openai/codex-darwin-x64": "npm:@openai/codex@0.117.0-darwin-x64" + "@openai/codex-linux-arm64": "npm:@openai/codex@0.117.0-linux-arm64" + "@openai/codex-linux-x64": "npm:@openai/codex@0.117.0-linux-x64" + "@openai/codex-win32-arm64": "npm:@openai/codex@0.117.0-win32-arm64" + "@openai/codex-win32-x64": "npm:@openai/codex@0.117.0-win32-x64" + dependenciesMeta: + "@openai/codex-darwin-arm64": + optional: true + "@openai/codex-darwin-x64": + optional: true + "@openai/codex-linux-arm64": + optional: true + "@openai/codex-linux-x64": + optional: true + "@openai/codex-win32-arm64": + optional: true + "@openai/codex-win32-x64": + optional: true + bin: + codex: bin/codex.js + checksum: 10c0/a5104a396f0f33558c9a402012bf2dd954f5d3465d3b0bb5fe780d265760a3c72b64af4a2d42a0012f661b7e4a274a42c5d4f5582de115613557f480dbec3b5b + languageName: node + linkType: hard + "@oxc-project/runtime@npm:0.115.0": version: 0.115.0 resolution: "@oxc-project/runtime@npm:0.115.0" @@ -8717,10 +8835,12 @@ __metadata: resolution: "@storybook/scripts@workspace:scripts" dependencies: "@actions/core": "npm:^1.11.1" + "@anthropic-ai/claude-agent-sdk": "npm:^0.2.85" "@fal-works/esbuild-plugin-global-externals": "npm:^2.1.2" "@google-cloud/bigquery": "npm:^6.2.1" "@octokit/graphql": "npm:^5.0.6" "@octokit/request": "npm:^8.4.1" + "@openai/codex-sdk": "npm:^0.117.0" "@polka/parse": 
"npm:^1.0.0-next.28" "@testing-library/dom": "npm:^10.4.0" "@testing-library/jest-dom": "npm:^6.9.1" From 20cc6b9b51b339590a329a0955940d355c4e51b4 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 20:11:42 +0700 Subject: [PATCH 03/63] Incorporate improvements from PR review + pre-prepared repos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pre-prepared eval-baseline branches on forked repos (kasperpeulen/*) eliminates storybook init during trials - Cache system: first run clones + installs, subsequent runs copy from cache — agent starts immediately - Post-init baseline commit for clean git diffs - Richer result schema: changed files, setup patterns, ghost stories - Ghost stories grading via STORYBOOK_COMPONENT_PATHS + Vitest - Setup pattern detection (tailwind, redux, router, etc.) - Better prompt: allows story creation, focuses on real components - Smarter cleanup: only removes starter stories, not project stories Tested on wikitok: quality 1.0, build pass, 7/7 ghost stories, $0.78 --- scripts/eval/config.ts | 30 +++- scripts/eval/eval.ts | 13 +- scripts/eval/lib/agents/claude-code.ts | 40 +++-- scripts/eval/lib/ghost-stories.ts | 167 +++++++++++++++++ scripts/eval/lib/grade.ts | 54 +++++- scripts/eval/lib/prepare-trial.ts | 215 ++++++---------------- scripts/eval/lib/run-task.ts | 9 +- scripts/eval/lib/setup-patterns.ts | 116 ++++++++++++ scripts/eval/prepare-repos.ts | 240 +++++++++++++++++++++++++ scripts/eval/prompts/setup.md | 43 ++--- scripts/eval/types.ts | 70 +++++--- 11 files changed, 750 insertions(+), 247 deletions(-) create mode 100644 scripts/eval/lib/ghost-stories.ts create mode 100644 scripts/eval/lib/setup-patterns.ts create mode 100644 scripts/eval/prepare-repos.ts diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index b7a18779730c..62cd086d5bbb 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -2,38 +2,52 @@ import type { Project, AgentName, Agent } from 
'./types'; import { claudeCodeAgent } from './lib/agents/claude-code'; import { codexAgent } from './lib/agents/codex'; +/** + * Pre-prepared eval baseline repos. + * + * Each repo is a fork with an `eval-baseline` branch where: + * - Storybook files were cleaned + * - `npx storybook@latest init --yes --no-dev` was run + * - All deps installed and committed + * + * To regenerate: `npx jiti scripts/eval/prepare-repos.ts` + */ export const PROJECTS: Project[] = [ { name: 'mealdrop', - repo: 'https://github.com/yannbf/mealdrop', - branch: 'without-storybook', + repo: 'https://github.com/kasperpeulen/mealdrop', + branch: 'eval-baseline', description: 'Styled components, Redux, React Router', }, { name: 'edgy', - repo: 'https://github.com/catherineisonline/edgy', + repo: 'https://github.com/kasperpeulen/edgy', + branch: 'eval-baseline', description: 'Tailwind, HeadlessUI, React Router', }, { name: 'wikitok', - repo: 'https://github.com/IsaacGemal/wikitok', + repo: 'https://github.com/kasperpeulen/wikitok', + branch: 'eval-baseline', projectDir: 'frontend', description: 'Simple project with Tailwind', }, { name: 'baklava', - repo: 'https://github.com/fortanix/baklava', - branch: 'master', + repo: 'https://github.com/kasperpeulen/baklava', + branch: 'eval-baseline', description: 'Component library with Zustand', }, { name: 'echarts', - repo: 'https://github.com/tmkx/echarts-react', + repo: 'https://github.com/kasperpeulen/echarts-react', + branch: 'eval-baseline', description: 'ECharts React wrapper', }, { name: 'evergreen-ci', - repo: 'https://github.com/evergreen-ci/ui', + repo: 'https://github.com/kasperpeulen/ui', + branch: 'eval-baseline', projectDir: 'packages/lib', description: 'GraphQL', }, diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 82ebaab8a2fa..7be3a63a82c8 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -109,29 +109,36 @@ for (const project of projects) { if (allResults.length > 0) { log(pc.bold('\n\nResults Summary')); - 
log('='.repeat(100)); + log('='.repeat(120)); // Header const header = [ 'Project'.padEnd(15), 'Build'.padEnd(7), 'TS Err'.padEnd(8), + 'Ghost'.padEnd(12), + 'Patterns'.padEnd(10), 'Quality'.padEnd(9), 'Cost'.padEnd(8), 'Time'.padEnd(8), 'Turns'.padEnd(7), ].join(' | '); log(header); - log('-'.repeat(100)); + log('-'.repeat(120)); // Rows for (const r of allResults) { const buildStr = r.grading.buildSuccess ? 'PASS' : 'FAIL'; const buildColored = r.grading.buildSuccess ? pc.green(buildStr) : pc.red(buildStr); + const ghost = r.grading.ghostStories; + const ghostStr = ghost ? `${ghost.passed}/${ghost.total}` : '-'; + const patternsStr = String(r.grading.setupPatterns.length); const row = [ r.project.padEnd(15), buildStr.padEnd(7).replace(buildStr, buildColored), String(r.grading.typeCheckErrors).padEnd(8), + ghostStr.padEnd(12), + patternsStr.padEnd(10), String(r.quality.score).padEnd(9), formatCost(r.execution.cost).padEnd(8), formatDuration(r.execution.duration).padEnd(8), @@ -140,7 +147,7 @@ if (allResults.length > 0) { log(row); } - log('-'.repeat(100)); + log('-'.repeat(120)); // Aggregate const avgQuality = diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index ae02e7e6a224..bd52d4ee280c 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -1,16 +1,16 @@ -import { query } from '@anthropic-ai/claude-agent-sdk'; -import { writeFileSync } from 'node:fs'; -import { join } from 'node:path'; -import type { Agent, ExecutionResult, SupportedModel } from '../../types'; +import { query } from "@anthropic-ai/claude-agent-sdk"; +import { writeFileSync } from "node:fs"; +import { join } from "node:path"; +import type { Agent, ExecutionResult, SupportedModel } from "../../types"; export const claudeCodeAgent: Agent = { - name: 'claude-code', + name: "claude-code", async execute( prompt: string, projectPath: string, model: SupportedModel, - options?: { verbose?: boolean; resultsDir?: 
string } + options?: { verbose?: boolean; resultsDir?: string }, ): Promise { const { verbose, resultsDir } = options ?? {}; const startTime = Date.now(); @@ -25,33 +25,37 @@ export const claudeCodeAgent: Agent = { options: { model, cwd: projectPath, - allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep'], + allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"], maxTurns: 50, - systemPrompt: { type: 'preset', preset: 'claude_code' }, + debug: true, + systemPrompt: { type: "preset", preset: "claude_code" }, }, })) { + console.log(message); messages.push(message); - if (verbose && 'type' in message && message.type === 'assistant') { + if ("type" in message && message.type === "assistant") { const content = (message as Record).content; if (Array.isArray(content)) { for (const block of content) { - if (block.type === 'text') { - process.stderr.write(block.text + '\n'); + console.log(block.text); + if (block.type === "text") { + process.stderr.write(block.text + "\n"); + } else if (block.type === "tool_use") { + const tool = block as { name?: string; input?: unknown }; + process.stderr.write(` [tool] ${tool.name}\n`); } } } } - if ('type' in message && message.type === 'result') { + if ("type" in message && message.type === "result") { const result = message as Record; - if (result.subtype === 'success') { + if (result.subtype === "success") { cost = result.total_cost_usd as number | undefined; turns = (result.num_turns as number) ?? 0; durationApi = - typeof result.duration_api_ms === 'number' - ? result.duration_api_ms / 1000 - : undefined; + typeof result.duration_api_ms === "number" ? 
result.duration_api_ms / 1000 : undefined; } } } @@ -59,11 +63,11 @@ export const claudeCodeAgent: Agent = { const duration = (Date.now() - startTime) / 1000; if (resultsDir) { - writeFileSync(join(resultsDir, 'transcript.json'), JSON.stringify(messages, null, 2)); + writeFileSync(join(resultsDir, "transcript.json"), JSON.stringify(messages, null, 2)); } return { - agent: 'claude-code', + agent: "claude-code", model, cost, duration, diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts new file mode 100644 index 000000000000..bf616bde7f6d --- /dev/null +++ b/scripts/eval/lib/ghost-stories.ts @@ -0,0 +1,167 @@ +import { readFileSync, existsSync } from 'node:fs'; +import { join, relative } from 'node:path'; +import { exec } from './utils'; +import type { GhostStoriesResult } from '../types'; +import { logStep, logSuccess, logError } from './utils'; + +/** + * Run ghost stories: discover candidate components, auto-generate stories + * via the Vitest component transform, and measure rendering success. + * + * This leverages the existing @storybook/addon-vitest componentTransform which + * activates when `STORYBOOK_COMPONENT_PATHS` env var is set. After `storybook init`, + * the addon is already configured. + */ +export async function runGhostStories( + projectPath: string, + resultsDir: string +): Promise { + logStep('Running ghost stories...'); + + // 1. Find candidate React components + const candidates = await findCandidateComponents(projectPath); + if (candidates.length === 0) { + logError('No candidate components found'); + return undefined; + } + logStep(`Found ${candidates.length} candidate component(s)`); + + // 2. 
Run vitest with STORYBOOK_COMPONENT_PATHS to trigger componentTransform + const reportPath = join(resultsDir, 'ghost-stories-report.json'); + await exec( + 'npx', + [ + 'vitest', + 'run', + '--project=storybook', + '--reporter=json', + `--outputFile=${reportPath}`, + '--testTimeout=10000', + ], + { + cwd: projectPath, + timeout: 120_000, + throwOnError: false, + env: { + PATH: process.env.PATH, + HOME: process.env.HOME, + npm_config_registry: 'https://registry.npmjs.org/', + STORYBOOK_COMPONENT_PATHS: candidates.join(','), + }, + } + ); + + // 3. Parse results + if (!existsSync(reportPath)) { + logError('Ghost stories: no Vitest report generated'); + return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; + } + + try { + const report = JSON.parse(readFileSync(reportPath, 'utf-8')); + const testResults = report.testResults || []; + + let total = 0; + let passed = 0; + for (const suite of testResults) { + for (const test of suite.assertionResults || []) { + total++; + if (test.status === 'passed') passed++; + } + } + + const successRate = total > 0 ? Math.round((passed / total) * 100) / 100 : 0; + + if (total > 0) { + logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); + } else { + logError('Ghost stories: no tests found in report'); + } + + return { candidateCount: candidates.length, total, passed, successRate }; + } catch { + logError('Ghost stories: failed to parse Vitest report'); + return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; + } +} + +/** + * Find candidate React component files in the project. + * + * Looks for .tsx/.jsx files that contain JSX and exports, + * excluding tests, stories, config files, and node_modules. 
+ */ +async function findCandidateComponents(projectPath: string): Promise { + const result = await exec( + 'find', + [ + projectPath, + '-type', + 'f', + '(', + '-name', + '*.tsx', + '-o', + '-name', + '*.jsx', + ')', + '-not', + '-path', + '*/node_modules/*', + '-not', + '-path', + '*/.storybook/*', + '-not', + '-path', + '*/dist/*', + '-not', + '-path', + '*/build/*', + '-not', + '-name', + '*.test.*', + '-not', + '-name', + '*.spec.*', + '-not', + '-name', + '*.stories.*', + '-not', + '-name', + '*.story.*', + '-not', + '-name', + 'vite.config.*', + '-not', + '-name', + 'vitest.config.*', + ], + { cwd: projectPath, throwOnError: false } + ); + + const files = result.stdout.trim().split('\n').filter(Boolean); + + // Filter for files that look like React components (have JSX + exports) + const candidates: Array<{ path: string; complexity: number }> = []; + for (const file of files) { + try { + const content = readFileSync(file, 'utf-8'); + const hasExport = /export\s+(default\s+)?/.test(content); + const hasJsx = /<[A-Z]/.test(content) || /return\s*\(?\s* l.trim()).length; + const imports = (content.match(/^import\s/gm) || []).length; + const complexity = Math.min(1, (lines + imports * 0.5) / 100); + + candidates.push({ path: relative(projectPath, file), complexity }); + } catch { + // skip unreadable files + } + } + + // Sort by complexity (simplest first), take top 20 + candidates.sort((a, b) => a.complexity - b.complexity); + return candidates.slice(0, 20).map((c) => c.path); +} diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 1266f90c6e45..925712b8b904 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,15 +1,31 @@ import { writeFileSync } from 'node:fs'; import { join } from 'node:path'; -import type { GradingResult, QualityResult, TrialPaths } from '../types'; +import type { GradingResult, QualityResult, TrialPaths, ChangedFile } from '../types'; import { logStep, logSuccess, logError, exec } from './utils'; 
+import { detectSetupPatterns } from './setup-patterns'; +import { runGhostStories } from './ghost-stories'; /** - * Grade a trial by running storybook build and typecheck. + * Grade a trial by measuring what the agent changed and whether the build works. */ export async function grade( paths: TrialPaths ): Promise<{ grading: GradingResult; quality: QualityResult }> { - const { projectPath, resultsDir } = paths; + const { repoRoot, projectPath, resultsDir, baselineCommit } = paths; + + // --- Changed Files (diff from baseline) --- + logStep('Collecting agent changes...'); + const changedFiles = await getChangedFiles(repoRoot, baselineCommit); + const storybookFiles = changedFiles.filter( + (f) => f.path.includes('.storybook/') || /\.(stories|story)\.[tj]sx?$/.test(f.path) + ); + logSuccess(`${changedFiles.length} files changed (${storybookFiles.length} storybook-related)`); + + // --- Setup Patterns --- + const setupPatterns = detectSetupPatterns(projectPath); + if (setupPatterns.length > 0) { + logSuccess(`Detected patterns: ${setupPatterns.map((p) => p.label).join(', ')}`); + } // --- Storybook Build --- logStep('Running storybook build...'); @@ -51,7 +67,6 @@ export async function grade( const typeCheckOutput = tscResult.stdout + '\n' + tscResult.stderr; writeFileSync(join(resultsDir, 'typecheck-output.txt'), typeCheckOutput); - const typeCheckErrors = (typeCheckOutput.match(/error TS\d+/g) || []).length; if (typeCheckErrors === 0) { @@ -60,18 +75,47 @@ export async function grade( logError(`${typeCheckErrors} TypeScript error(s)`); } + // --- Ghost Stories --- + const ghostStories = buildSuccess + ? await runGhostStories(projectPath, resultsDir) + : undefined; + const grading: GradingResult = { buildSuccess, buildError: buildSuccess ? undefined : buildOutput.slice(-2000), typeCheckErrors, typeCheckOutput: typeCheckErrors > 0 ? 
typeCheckOutput.slice(-2000) : undefined, + changedFiles, + storybookFiles, + setupPatterns, + ghostStories, }; const quality = calculateQuality(grading); - return { grading, quality }; } +async function getChangedFiles(repoRoot: string, baselineCommit: string): Promise { + // Stage everything so diff sees new files too + await exec('git', ['add', '-A'], { cwd: repoRoot }); + const result = await exec('git', ['diff', '--cached', '--name-status', baselineCommit], { + cwd: repoRoot, + throwOnError: false, + }); + + return result.stdout + .trim() + .split('\n') + .filter(Boolean) + .map((line) => { + const [status, ...pathParts] = line.split('\t'); + return { + path: pathParts.join('\t'), + status: (status?.charAt(0) || 'M') as ChangedFile['status'], + }; + }); +} + function calculateQuality(grading: GradingResult): QualityResult { const buildScore = grading.buildSuccess ? 1 : 0; const typeCheckScore = Math.max(0, 1 - grading.typeCheckErrors / 20); diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index 25e2ddd6ead3..173640d9c35a 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -1,18 +1,11 @@ -import { existsSync, mkdirSync, cpSync, rmSync, readdirSync } from 'node:fs'; +import { existsSync, mkdirSync, cpSync } from 'node:fs'; import { join } from 'node:path'; import type { Project, TrialPaths } from '../types'; import { CACHE_DIR, TRIALS_DIR, logStep, logSuccess, exec } from './utils'; -/** - * Clean environment for npm/npx commands that should use the public registry. - * The storybook monorepo has a .npmrc pointing to localhost:6002 (verdaccio), - * which leaks through process.env and breaks commands outside the monorepo. 
- */ function cleanNpmEnv(): Record { const env = { ...process.env }; - // Force public registry env.npm_config_registry = 'https://registry.npmjs.org/'; - // Remove any verdaccio-related env vars for (const key of Object.keys(env)) { if (key.startsWith('npm_config_') && key !== 'npm_config_registry') { delete env[key]; @@ -21,178 +14,70 @@ function cleanNpmEnv(): Record { return env; } -/** - * Clone a repo to the cache (or reset to the target branch if already cached). - */ -async function ensureRepoClone(project: Project): Promise { - const cacheDir = join(CACHE_DIR, project.name); - - if (existsSync(cacheDir)) { - logStep(`Updating cached clone for ${project.name}...`); - const branch = project.branch || (await getDefaultBranch(cacheDir)); - await exec('git', ['fetch', 'origin'], { cwd: cacheDir }); - await exec('git', ['checkout', branch], { cwd: cacheDir }); - await exec('git', ['reset', '--hard', `origin/${branch}`], { cwd: cacheDir }); - await exec('git', ['clean', '-fdx', '-e', 'node_modules'], { cwd: cacheDir }); - return cacheDir; - } - - logStep(`Cloning ${project.repo}...`); - mkdirSync(CACHE_DIR, { recursive: true }); - const args = ['clone', '--depth', '1', project.repo, cacheDir]; - if (project.branch) { - args.splice(1, 0, '--branch', project.branch); - } - await exec('git', args, { timeout: 120_000 }); - - return cacheDir; -} - -async function getDefaultBranch(repoDir: string): Promise { - const result = await exec('git', ['rev-parse', '--abbrev-ref', 'HEAD'], { cwd: repoDir }); - return result.stdout.trim(); -} - -/** - * Recursively remove storybook-related files from a directory. 
- */ -function cleanStorybookFiles(dir: string) { - // Remove .storybook directory - const storybookDir = join(dir, '.storybook'); - if (existsSync(storybookDir)) { - rmSync(storybookDir, { recursive: true }); - logStep('Removed .storybook/'); - } - - // Remove story files - const removedCount = removeStoryFiles(dir); - if (removedCount > 0) { - logStep(`Removed ${removedCount} story file(s)`); - } -} - -function removeStoryFiles(dir: string, count = 0): number { - let removed = count; - try { - const entries = readdirSync(dir, { withFileTypes: true }); - for (const entry of entries) { - if (['node_modules', '.git', 'dist', 'build'].includes(entry.name)) continue; - - const fullPath = join(dir, entry.name); - if (entry.isDirectory()) { - if (entry.name === 'stories' || entry.name === '__stories__') { - rmSync(fullPath, { recursive: true }); - removed++; - } else { - removed = removeStoryFiles(fullPath, removed); - } - } else if (/\.(stories|story)\.(tsx?|jsx?|mdx?)$/.test(entry.name)) { - rmSync(fullPath); - removed++; - } - } - } catch { - // Skip directories we can't read - } - return removed; -} - -/** - * Detect the package manager and install dependencies. 
- */ -async function installDependencies(dir: string) { +async function installDeps(dir: string) { const env = cleanNpmEnv(); - if (existsSync(join(dir, 'pnpm-lock.yaml')) || existsSync(join(dir, 'pnpm-workspace.yaml'))) { - logStep('Installing dependencies with pnpm...'); - await exec('pnpm', ['install', '--no-frozen-lockfile'], { cwd: dir, timeout: 300_000, env }); - } else if (existsSync(join(dir, 'yarn.lock'))) { - logStep('Installing dependencies with yarn...'); - if (existsSync(join(dir, '.yarnrc.yml'))) { - await exec('yarn', ['install', '--no-immutable'], { cwd: dir, timeout: 300_000, env }); - } else { - await exec('yarn', ['install'], { cwd: dir, timeout: 300_000, env }); - } - } else if (existsSync(join(dir, 'bun.lockb')) || existsSync(join(dir, 'bun.lock'))) { - logStep('Installing dependencies with bun...'); - await exec('bun', ['install'], { cwd: dir, timeout: 300_000, env }); + const p = (f: string) => existsSync(join(dir, f)); + + let cmd: string; + let args: string[]; + if (p('pnpm-lock.yaml') || p('pnpm-workspace.yaml')) { + cmd = 'pnpm'; + args = ['install', '--no-frozen-lockfile']; + } else if (p('yarn.lock')) { + cmd = 'yarn'; + args = p('.yarnrc.yml') ? ['install', '--no-immutable'] : ['install']; + } else if (p('bun.lockb') || p('bun.lock')) { + cmd = 'bun'; + args = ['install']; } else { - logStep('Installing dependencies with npm...'); - await exec('npm', ['install'], { cwd: dir, timeout: 300_000, env }); + cmd = 'npm'; + args = ['install', '--ignore-scripts']; } -} -/** - * Run `npx storybook@latest init --yes` to scaffold Storybook. 
- */ -async function initStorybook(dir: string) { - logStep('Running storybook init...'); - const env = cleanNpmEnv(); - await exec('npx', ['storybook@latest', 'init', '--yes', '--no-dev'], { - cwd: dir, - timeout: 300_000, - env: { - ...env, - STORYBOOK_DISABLE_TELEMETRY: '1', - }, - }); + logStep(`Installing with ${cmd}...`); + await exec(cmd, args, { cwd: dir, timeout: 300_000, env }); } /** - * Prepare a trial: clone repo, clean storybook files, install deps, init storybook. + * Prepare a trial directory. + * + * First run per project: clone eval-baseline branch → install deps → cache it. + * Subsequent runs: copy from cache. Agent starts immediately. */ export async function prepareTrial(project: Project, trialId: string): Promise { - // 1. Ensure repo is in the cache - const cacheDir = await ensureRepoClone(project); - logSuccess(`Repo cached at ${cacheDir}`); - - // 2. Create trial directory and copy project + const cacheDir = join(CACHE_DIR, project.name); const trialDir = join(TRIALS_DIR, trialId); - const projectDest = join(trialDir, 'project'); + const repoRoot = join(trialDir, 'project'); mkdirSync(trialDir, { recursive: true }); - logStep('Copying project to trial directory...'); - cpSync(cacheDir, projectDest, { - recursive: true, - filter: (src) => { - // Exclude .git and node_modules from copy - const relative = src.slice(cacheDir.length); - if (relative.includes('node_modules')) return false; - if (relative.startsWith('/.git') || relative === '/.git') return false; - return true; - }, - }); - - // Initialize a fresh git repo so the agent can use git - await exec('git', ['init'], { cwd: projectDest }); - await exec('git', ['add', '.'], { cwd: projectDest }); - await exec('git', ['commit', '-m', 'Initial commit', '--allow-empty'], { - cwd: projectDest, - env: { - ...process.env, - GIT_AUTHOR_NAME: 'eval', - GIT_AUTHOR_EMAIL: 'eval@storybook.js.org', - GIT_COMMITTER_NAME: 'eval', - GIT_COMMITTER_EMAIL: 'eval@storybook.js.org', - }, - }); - - // 3. 
Determine the working path (handle monorepo projectDir) - const projectPath = project.projectDir ? join(projectDest, project.projectDir) : projectDest; - - // 4. Clean storybook files - cleanStorybookFiles(projectPath); - - // 5. Install dependencies - await installDependencies(projectPath); - logSuccess('Dependencies installed'); + if (existsSync(join(cacheDir, '.git'))) { + // Fast path: copy from cache + logStep('Copying from cache...'); + cpSync(cacheDir, repoRoot, { recursive: true }); + } else { + // First run: clone directly to trial dir, install, then cache + logStep(`Cloning ${project.repo}#${project.branch}...`); + mkdirSync(CACHE_DIR, { recursive: true }); + await exec('git', ['clone', '--depth', '1', '--branch', project.branch!, project.repo, repoRoot], { + timeout: 120_000, + }); + + const projectPath = project.projectDir ? join(repoRoot, project.projectDir) : repoRoot; + await installDeps(projectPath); + logSuccess('Dependencies installed'); + + // Save to cache for next time + logStep('Caching for future runs...'); + cpSync(repoRoot, cacheDir, { recursive: true }); + } - // 6. Run storybook init - await initStorybook(projectPath); - logSuccess('Storybook initialized'); + const baselineResult = await exec('git', ['rev-parse', 'HEAD'], { cwd: repoRoot }); + const baselineCommit = baselineResult.stdout.trim(); - // 7. Create results directory + const projectPath = project.projectDir ? 
join(repoRoot, project.projectDir) : repoRoot; const resultsDir = join(trialDir, 'results'); mkdirSync(resultsDir, { recursive: true }); - return { trialDir, projectPath, resultsDir }; + logSuccess('Trial ready'); + return { trialDir, repoRoot, projectPath, resultsDir, baselineCommit }; } diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index c6f6c3dbe56f..33c9dd9cc9a0 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -18,7 +18,7 @@ export async function runTask(config: TrialConfig): Promise { log(`\nPreparing ${project.name}...`); - // 1. Prepare the trial (clone, clean, init storybook) + // 1. Prepare the trial (clone, clean, init storybook, baseline commit) const paths = await prepareTrial(project, trialId); // 2. Generate the prompt @@ -32,25 +32,28 @@ export async function runTask(config: TrialConfig): Promise { verbose, resultsDir: paths.resultsDir, }); - logSuccess(`Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : 'cost N/A'}, ${execution.turns} turns)`); + logSuccess( + `Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : 'cost N/A'}, ${execution.turns} turns)` + ); // 4. Grade the results const { grading, quality } = await grade(paths); // 5. 
Assemble final result const result: TrialResult = { + schemaVersion: 1, project: project.name, agent: agentName, model, modelTier: MODEL_TIERS[model], timestamp, promptFile: promptFile || 'setup.md', + baselineCommit: paths.baselineCommit, execution, grading, quality, }; - // Save summary writeFileSync(join(paths.resultsDir, 'summary.json'), JSON.stringify(result, null, 2)); logSuccess(`Results saved to ${paths.resultsDir}`); diff --git a/scripts/eval/lib/setup-patterns.ts b/scripts/eval/lib/setup-patterns.ts new file mode 100644 index 000000000000..4343ada7c526 --- /dev/null +++ b/scripts/eval/lib/setup-patterns.ts @@ -0,0 +1,116 @@ +import { readFileSync, existsSync, readdirSync } from 'node:fs'; +import { join, relative } from 'node:path'; +import type { SetupPattern } from '../types'; + +interface PatternRule { + id: string; + label: string; + /** Regex to match in file contents */ + pattern: RegExp; + /** Only check files matching these extensions */ + extensions?: string[]; +} + +const RULES: PatternRule[] = [ + { + id: 'global-css', + label: 'Global CSS import', + pattern: /import\s+['"][^'"]+\.(css|scss|sass|less)['"]|import\s+['"]tailwindcss/, + extensions: ['.ts', '.tsx', '.js', '.jsx'], + }, + { + id: 'tailwind', + label: 'Tailwind CSS', + pattern: /@tailwind|tailwindcss|tailwind\.css/, + }, + { + id: 'styled-components', + label: 'Styled Components', + pattern: /styled-components|ThemeProvider.*styled|createGlobalStyle/, + }, + { + id: 'router-provider', + label: 'React Router provider', + pattern: /MemoryRouter|BrowserRouter|RouterProvider|createMemoryRouter/, + }, + { + id: 'redux-provider', + label: 'Redux Provider', + pattern: /react-redux.*Provider| filePath.endsWith(ext))) { + continue; + } + try { + const content = readFileSync(filePath, 'utf-8'); + if (rule.pattern.test(content)) { + matches.push(relative(projectPath, filePath)); + } + } catch { + // skip unreadable files + } + } + if (matches.length > 0) { + results.push({ id: rule.id, label: 
rule.label, sourceFiles: matches }); + } + } + + return results; +} + +function collectFiles(dir: string): string[] { + const files: string[] = []; + try { + for (const entry of readdirSync(dir, { withFileTypes: true })) { + const full = join(dir, entry.name); + if (entry.isDirectory()) { + files.push(...collectFiles(full)); + } else { + files.push(full); + } + } + } catch { + // skip + } + return files; +} diff --git a/scripts/eval/prepare-repos.ts b/scripts/eval/prepare-repos.ts new file mode 100644 index 000000000000..1372ed993a09 --- /dev/null +++ b/scripts/eval/prepare-repos.ts @@ -0,0 +1,240 @@ +/** + * One-time script to prepare eval baseline repos. + * + * For each benchmark project: + * 1. Fork the repo to your GitHub account + * 2. Clone the fork + * 3. Clean storybook files, install deps, run `storybook init` + * 4. Commit and push as `eval-baseline` branch + * + * After this, each eval trial just does a fast shallow clone of the + * prepared branch — no more storybook init during trials. + * + * Usage: npx jiti scripts/eval/prepare-repos.ts + */ + +import { existsSync, mkdirSync, readFileSync, writeFileSync, rmSync, readdirSync } from 'node:fs'; +import { join } from 'node:path'; +import pc from 'picocolors'; + +const EVAL_ROOT = join(import.meta.dirname, '..', '..', '..', '..', 'storybook-eval'); +const PREP_DIR = join(EVAL_ROOT, 'prepared-repos'); +const BASELINE_BRANCH = 'eval-baseline'; + +/** Known storybook init starter files that are safe to remove. 
*/ +const STARTER_FILES = new Set([ + 'button.stories.ts', 'button.stories.tsx', 'button.stories.js', 'button.stories.jsx', + 'header.stories.ts', 'header.stories.tsx', 'header.stories.js', 'header.stories.jsx', + 'page.stories.ts', 'page.stories.tsx', 'page.stories.js', 'page.stories.jsx', + 'button.tsx', 'button.jsx', 'button.ts', 'button.js', 'button.css', + 'header.tsx', 'header.jsx', 'header.ts', 'header.js', 'header.css', + 'page.tsx', 'page.jsx', 'page.ts', 'page.js', 'page.css', + 'configure-your-project.mdx', +]); + +interface BenchmarkRepo { + name: string; + repo: string; + branch?: string; + projectDir?: string; +} + +const REPOS: BenchmarkRepo[] = [ + { name: 'mealdrop', repo: 'yannbf/mealdrop', branch: 'without-storybook' }, + { name: 'edgy', repo: 'catherineisonline/edgy' }, + { name: 'wikitok', repo: 'IsaacGemal/wikitok', projectDir: 'frontend' }, + { name: 'baklava', repo: 'fortanix/baklava', branch: 'master' }, + { name: 'echarts', repo: 'tmkx/echarts-react' }, + { name: 'evergreen-ci', repo: 'evergreen-ci/ui', projectDir: 'packages/lib' }, +]; + +function cleanNpmEnv(): Record { + const env: Record = {}; + for (const [k, v] of Object.entries(process.env)) { + if (v != null && !k.startsWith('npm_config_')) env[k] = v; + } + env.npm_config_registry = 'https://registry.npmjs.org/'; + return env; +} + +const GIT_ENV = { + GIT_AUTHOR_NAME: 'eval', + GIT_AUTHOR_EMAIL: 'eval@storybook.js.org', + GIT_COMMITTER_NAME: 'eval', + GIT_COMMITTER_EMAIL: 'eval@storybook.js.org', +}; + +async function run(cmd: string, args: string[], opts: { cwd?: string; env?: Record; timeout?: number } = {}) { + const { x } = await import('tinyexec'); + const result = await x(cmd, args, { + throwOnError: false, + nodeOptions: { + cwd: opts.cwd, + env: (opts.env ?? 
process.env) as NodeJS.ProcessEnv, + timeout: opts.timeout, + }, + }); + if (result.exitCode !== 0) { + throw new Error(`${cmd} ${args.join(' ')} failed (${result.exitCode}):\n${result.stderr}`); + } + return result; +} + +function stripStorybookDeps(pkgPath: string) { + if (!existsSync(pkgPath)) return; + const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8')); + let changed = false; + for (const field of ['dependencies', 'devDependencies', 'peerDependencies']) { + const deps = pkg[field]; + if (!deps) continue; + for (const key of Object.keys(deps)) { + if (key === 'storybook' || key.startsWith('@storybook/') || key === 'eslint-plugin-storybook') { + delete deps[key]; + changed = true; + } + } + } + if (pkg.scripts) { + for (const key of Object.keys(pkg.scripts)) { + if (key === 'storybook' || key === 'build-storybook') { + delete pkg.scripts[key]; + changed = true; + } + } + } + if (changed) writeFileSync(pkgPath, JSON.stringify(pkg, null, 2) + '\n'); +} + +function cleanStorybookFiles(dir: string) { + for (const name of ['.storybook', 'storybook-static']) { + const target = join(dir, name); + if (existsSync(target)) rmSync(target, { recursive: true }); + } + for (const storiesDir of ['stories', join('src', 'stories')]) { + const target = join(dir, storiesDir); + if (existsSync(target) && isStarterDirectory(target)) { + rmSync(target, { recursive: true }); + } + } + stripStorybookDeps(join(dir, 'package.json')); +} + +function isStarterDirectory(dir: string): boolean { + try { + return readdirSync(dir, { withFileTypes: true }).every( + (e) => !e.isDirectory() && STARTER_FILES.has(e.name.toLowerCase()) + ); + } catch { + return false; + } +} + +function detectPM(dir: string): string { + if (existsSync(join(dir, 'pnpm-lock.yaml'))) return 'pnpm'; + if (existsSync(join(dir, 'yarn.lock'))) return 'yarn'; + if (existsSync(join(dir, 'bun.lockb')) || existsSync(join(dir, 'bun.lock'))) return 'bun'; + return 'npm'; +} + +async function installDeps(dir: string) { + const 
env = cleanNpmEnv(); + const pm = detectPM(dir); + console.log(` > Installing with ${pm}...`); + const args = pm === 'pnpm' ? ['install', '--no-frozen-lockfile'] + : pm === 'yarn' && existsSync(join(dir, '.yarnrc.yml')) ? ['install', '--no-immutable'] + : ['install']; + await run(pm, args, { cwd: dir, env, timeout: 300_000 }); +} + +async function prepareRepo(repo: BenchmarkRepo) { + console.log(pc.bold(`\n=== ${repo.name} ===`)); + const repoDir = join(PREP_DIR, repo.name); + + // 1. Fork (idempotent — gh fork is a no-op if already forked) + console.log(` > Forking ${repo.repo}...`); + try { + await run('gh', ['repo', 'fork', repo.repo, '--clone=false']); + } catch { + console.log(` ! Fork may already exist, continuing...`); + } + + // Figure out the fork name (gh forks to authenticated user) + const whoami = (await run('gh', ['api', 'user', '--jq', '.login'])).stdout.trim(); + const forkSlug = `${whoami}/${repo.repo.split('/')[1]}`; + console.log(` > Fork: ${forkSlug}`); + + // 2. Clone (or pull) the fork + if (existsSync(repoDir)) { + console.log(` > Updating existing clone...`); + await run('git', ['fetch', 'origin'], { cwd: repoDir }); + const branch = repo.branch || (await run('git', ['remote', 'show', 'origin'], { cwd: repoDir })) + .stdout.match(/HEAD branch:\s*(\S+)/)?.[1] || 'main'; + await run('git', ['checkout', branch], { cwd: repoDir }); + await run('git', ['reset', '--hard', `origin/${branch}`], { cwd: repoDir }); + await run('git', ['clean', '-fdx', '-e', 'node_modules'], { cwd: repoDir }); + } else { + console.log(` > Cloning ${forkSlug}...`); + const cloneArgs = ['clone', `https://github.com/${forkSlug}.git`, repoDir]; + if (repo.branch) cloneArgs.splice(1, 0, '--branch', repo.branch); + await run('git', cloneArgs, { timeout: 120_000 }); + } + + // 3. Create eval-baseline branch + console.log(` > Creating ${BASELINE_BRANCH} branch...`); + await run('git', ['checkout', '-B', BASELINE_BRANCH], { cwd: repoDir }); + + // 4. 
Clean storybook files + const projectDir = repo.projectDir ? join(repoDir, repo.projectDir) : repoDir; + cleanStorybookFiles(projectDir); + + // 5. Install dependencies + await installDeps(projectDir); + + // 6. Run storybook init + console.log(` > Running storybook init...`); + const env = cleanNpmEnv(); + await run('npx', ['storybook@latest', 'init', '--yes', '--no-dev'], { + cwd: projectDir, + env: { ...env, STORYBOOK_DISABLE_TELEMETRY: '1' }, + timeout: 300_000, + }); + + // 7. Post-init install + await installDeps(projectDir); + + // 8. Commit everything + console.log(` > Committing baseline...`); + await run('git', ['add', '-A'], { cwd: repoDir, env: { ...cleanNpmEnv(), ...GIT_ENV } }); + await run('git', ['commit', '-m', 'eval baseline after storybook init', '--allow-empty'], { + cwd: repoDir, + env: { ...cleanNpmEnv(), ...GIT_ENV }, + }); + + // 9. Force-push the baseline branch + console.log(` > Pushing ${BASELINE_BRANCH}...`); + await run('git', ['push', '-f', 'origin', BASELINE_BRANCH], { cwd: repoDir }); + + console.log(pc.green(` ✓ ${repo.name} ready at ${forkSlug}#${BASELINE_BRANCH}`)); + return { name: repo.name, forkRepo: `https://github.com/${forkSlug}`, branch: BASELINE_BRANCH, projectDir: repo.projectDir }; +} + +// --- Main --- +mkdirSync(PREP_DIR, { recursive: true }); + +console.log(pc.bold('Preparing eval baseline repos')); +console.log(`Output: ${PREP_DIR}\n`); + +const results = []; +for (const repo of REPOS) { + try { + const result = await prepareRepo(repo); + results.push(result); + } catch (error) { + console.log(pc.red(` ✗ Failed: ${error instanceof Error ? error.message : error}`)); + } +} + +console.log(pc.bold('\n\nPrepared repos:')); +for (const r of results) { + console.log(` ${r.name}: ${r.forkRepo}#${r.branch}${r.projectDir ? 
` (${r.projectDir})` : ''}`); +} diff --git a/scripts/eval/prompts/setup.md b/scripts/eval/prompts/setup.md index 95b8aeb31b8c..f1e9aeb84726 100644 --- a/scripts/eval/prompts/setup.md +++ b/scripts/eval/prompts/setup.md @@ -1,33 +1,26 @@ -# Complete Storybook Setup +You are finishing Storybook setup for an existing React + Vite codebase. -Storybook has just been initialized in this project with `npx storybook@latest init --yes`. -The basic scaffolding is in place but the setup needs to be completed so that stories render correctly. +## Starting state -## Steps +- Storybook was already installed with `npx storybook@latest init --yes`. +- Do not rerun `storybook init`. +- The goal is not to create a demo app. The goal is to make Storybook work for the actual project code. -1. **Analyze the project**: Read `package.json` and source code to understand the tech stack — CSS framework, state management, routing, theming, and any global providers. +## Objectives -2. **Configure `.storybook/preview.ts`**: Make stories render like the real app by adding: - - Global CSS imports (Tailwind CSS, global stylesheets, CSS resets, font imports) - - Provider decorators wrapping every story (Redux store, React Router, Theme providers, i18n, etc.) - - Appropriate `parameters` (viewport, backgrounds, etc.) +1. Make Storybook render the project's real components with the providers, globals, aliases, styles, mocks, and environment they need. +2. Replace or remove init placeholder stories/components when they stop being useful. +3. Add or update a small representative set of stories for existing components from the project. +4. Prefer reusable setup in `.storybook` over per-story hacks. -3. **Configure `.storybook/main.ts`**: Adjust if needed: - - `staticDirs` for public assets (images, fonts) - - Framework-specific overrides (e.g., `viteFinal` or `webpackFinal`) - - Autodocs if the project uses JSDoc or TSDoc +## Constraints -4. 
**Verify the setup**: Run `npx storybook build` to check for errors. If it fails: - - Read the error output carefully - - Fix the root cause (missing import, wrong config, etc.) - - Run the build again - - Repeat until the build succeeds +- Keep changes focused on Storybook setup and the minimum related support files. +- Avoid changing product source unless genuinely required. +- Reuse existing app providers and styling entry points when possible. -## Guidelines +## Verification -- Look at the app's entry point (`main.tsx`, `index.tsx`, `App.tsx`) to find providers and global setup -- Check for CSS framework config files (`tailwind.config.*`, `postcss.config.*`, etc.) -- Keep changes minimal — only modify what is needed to make stories render -- Do NOT create new stories or components -- Do NOT remove existing stories -- Prefer importing existing app utilities over re-implementing them +- Run your own non-interactive verification commands. +- Fix the highest-signal Storybook problem first. +- Iterate until the setup is stable enough that another user can keep writing stories without additional setup work. 
diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index 5d1439e9c0d3..dbddcc27442d 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -24,7 +24,7 @@ export const MODEL_TIERS: Record = { 'claude-sonnet-4-6': 'sonnet', 'claude-haiku-4-5': 'haiku', 'o4-mini': 'codex', - 'o3': 'codex', + o3: 'codex', 'gpt-4.1': 'codex', }; @@ -36,15 +36,10 @@ export const SUPPORTED_MODELS_BY_AGENT: Record Date: Fri, 27 Mar 2026 20:28:49 +0700 Subject: [PATCH 04/63] Add Google Sheets upload, run IDs, and environment capture - Google Sheets integration via Apps Script webhook (set EVAL_GOOGLE_SHEETS_URL) - Run ID (per session) and upload ID (for grouping) like MCP eval - Environment capture (node version, git branch/commit) - Included google-apps-script.js for setting up the spreadsheet --- scripts/eval/eval.ts | 8 +- scripts/eval/google-apps-script.js | 85 ++++++++++++++++ scripts/eval/lib/agents/claude-code.ts | 100 +++++++++++++----- scripts/eval/lib/run-task.ts | 51 ++++++---- scripts/eval/lib/save.ts | 134 +++++++++++++++++++++++++ scripts/eval/lib/utils.ts | 4 +- 6 files changed, 333 insertions(+), 49 deletions(-) create mode 100644 scripts/eval/google-apps-script.js create mode 100644 scripts/eval/lib/save.ts diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 7be3a63a82c8..62ad6e6c4913 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -1,3 +1,4 @@ +import { randomUUID } from 'node:crypto'; import { Command } from 'commander'; import pc from 'picocolors'; import type { TrialConfig, TrialResult, AgentName, SupportedModel } from './types'; @@ -15,6 +16,7 @@ const program = new Command() .option('--prompt ', 'custom prompt file path') .option('-n, --iterations ', 'number of iterations per project', '1') .option('-v, --verbose', 'verbose output') + .option('-u, --upload-id ', 'upload ID for grouping results in Google Sheets') .option('--list-projects', 'list available projects and exit') .option('--list-models', 'list supported models 
and exit'); @@ -71,9 +73,13 @@ if (projects.length === 0) { // --- Run evals --- +const runId = randomUUID().slice(0, 8); +const uploadId = (opts.uploadId as string) || `eval-${runId}`; + log(pc.bold('\nStorybook Setup Eval')); log(`Agent: ${pc.cyan(agentName)} | Model: ${pc.cyan(model)} | Iterations: ${iterations}`); log(`Projects: ${projects.map((p) => p.name).join(', ')}`); +log(`Run: ${runId} | Upload: ${uploadId}`); const allResults: TrialResult[] = []; @@ -94,7 +100,7 @@ for (const project of projects) { }; try { - const result = await runTask(config); + const result = await runTask(config, runId, uploadId); allResults.push(result); } catch (error) { log(pc.red(`\nFailed to evaluate ${project.name}: ${error instanceof Error ? error.message : error}`)); diff --git a/scripts/eval/google-apps-script.js b/scripts/eval/google-apps-script.js new file mode 100644 index 000000000000..164d8a059bdd --- /dev/null +++ b/scripts/eval/google-apps-script.js @@ -0,0 +1,85 @@ +/** + * Google Apps Script for Storybook Setup Evaluations + * + * Instructions: + * 1. Create a new Google Sheet for eval results + * 2. Go to Extensions > Apps Script + * 3. Replace the contents with this code + * 4. Click "Deploy" > "New deployment" + * 5. Select type: "Web app" + * 6. Execute as: "Me" + * 7. Who has access: "Anyone" + * 8. Click "Deploy" and copy the web app URL + * 9. Set EVAL_GOOGLE_SHEETS_URL= in your environment + * + * Authorization: + * Run authorize() from the editor to trigger the authorization prompt. 
+ * Click "Review Permissions" → Select account → "Advanced" → "Go to [project] (unsafe)" → "Allow" + */ + +function toTitleCase(key) { + return key + .replace(/([A-Z])/g, " $1") + .replace(/^./, (str) => str.toUpperCase()) + .trim(); +} + +function ensureHeaders(sheet, keys) { + const firstHeaderCell = sheet.getRange(1, 1).getValue(); + if (firstHeaderCell === "") { + const headers = keys.map(toTitleCase); + sheet.getRange(1, 1, 1, headers.length).setValues([headers]); + sheet.getRange(1, 1, 1, headers.length).setFontWeight("bold"); + } +} + +function appendRow(sheet, keys, rowData) { + var lock = LockService.getScriptLock(); + try { + lock.waitLock(120000); + var lastRow = sheet.getLastRow(); + var targetRow = lastRow < 1 ? 2 : lastRow + 1; + sheet.getRange(targetRow, 1, 1, rowData.length).setValues([rowData]); + SpreadsheetApp.flush(); + return targetRow; + } finally { + lock.releaseLock(); + } +} + +function prepareRowData(keys, data) { + return keys.map(function (key) { + var value = data[key]; + if (typeof value === "boolean") return value ? 
"TRUE" : "FALSE"; + if (value === null || value === undefined) return ""; + return value; + }); +} + +// eslint-disable-next-line @typescript-eslint/no-unused-vars +function doPost(e) { + try { + var data = JSON.parse(e.postData.contents); + var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet(); + var keys = Object.keys(data); + var rowData = prepareRowData(keys, data); + + ensureHeaders(sheet, keys); + var targetRow = appendRow(sheet, keys, rowData); + + return ContentService.createTextOutput( + JSON.stringify({ success: true, row: targetRow }), + ).setMimeType(ContentService.MimeType.JSON); + } catch (error) { + return ContentService.createTextOutput( + JSON.stringify({ success: false, error: error.toString() }), + ).setMimeType(ContentService.MimeType.JSON); + } +} + +// eslint-disable-next-line @typescript-eslint/no-unused-vars +function authorize() { + var spreadsheet = SpreadsheetApp.getActiveSpreadsheet(); + var file = DriveApp.getFileById(spreadsheet.getId()); + console.log("Authorized! 
File:", file.getName()); +} diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index bd52d4ee280c..c55f2bb46bec 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -1,8 +1,74 @@ +import type { SDKMessage } from "@anthropic-ai/claude-agent-sdk"; import { query } from "@anthropic-ai/claude-agent-sdk"; import { writeFileSync } from "node:fs"; import { join } from "node:path"; import type { Agent, ExecutionResult, SupportedModel } from "../../types"; +function logMessage(message: SDKMessage) { + const log = (prefix: string, text: string) => process.stderr.write(`${prefix} ${text}\n`); + + switch (message.type) { + case "assistant": { + for (const block of message.message.content) { + if (block.type === "text") { + log("💬", block.text); + } else if (block.type === "tool_use") { + log("🔧", `${block.name}(${JSON.stringify(block.input).slice(0, 200)})`); + } + } + if (message.error) { + log("❌", `Assistant error: ${message.error}`); + } + break; + } + case "user": { + const content = message.message.content; + if (!Array.isArray(content)) break; + for (const block of content) { + if (block.type === "tool_result") { + const text = + typeof block.content === "string" + ? block.content.slice(0, 200) + : Array.isArray(block.content) + ? block.content + .map((b: { type: string; text?: string }) => + "text" in b ? 
b.text : `[${b.type}]`, + ) + .join("") + .slice(0, 200) + : "[no content]"; + log("📎", `tool_result(${block.tool_use_id?.slice(-8)}): ${text}`); + } + } + break; + } + case "result": + if (message.subtype === "success") { + log("✅", `Done — ${message.num_turns} turns, $${message.total_cost_usd?.toFixed(4)}`); + } else { + log("❌", `Error (${message.subtype}): ${message.errors?.join(", ")}`); + } + break; + case "system": + if (message.subtype === "init") { + log("🚀", `Session started — model: ${message.model}`); + } else if (message.subtype === "api_retry") { + log("🔄", `API retry: attempt ${message.attempt}/${message.max_retries}`); + } else if (message.subtype === "status") { + log("📊", `status: ${message.status ?? "unknown"}`); + } + break; + case "tool_use_summary": + log("📋", message.summary.slice(0, 200)); + break; + case "rate_limit_event": + log("⏳", `Rate limited — status: ${message.rate_limit_info?.status}, resets at: ${message.rate_limit_info?.resetsAt}`); + break; + default: + break; + } +} + export const claudeCodeAgent: Agent = { name: "claude-code", @@ -10,9 +76,9 @@ export const claudeCodeAgent: Agent = { prompt: string, projectPath: string, model: SupportedModel, - options?: { verbose?: boolean; resultsDir?: string }, + options?: { resultsDir?: string }, ): Promise { - const { verbose, resultsDir } = options ?? {}; + const { resultsDir } = options ?? 
{}; const startTime = Date.now(); let cost: number | undefined; @@ -31,32 +97,14 @@ export const claudeCodeAgent: Agent = { systemPrompt: { type: "preset", preset: "claude_code" }, }, })) { - console.log(message); + logMessage(message); messages.push(message); - if ("type" in message && message.type === "assistant") { - const content = (message as Record).content; - if (Array.isArray(content)) { - for (const block of content) { - console.log(block.text); - if (block.type === "text") { - process.stderr.write(block.text + "\n"); - } else if (block.type === "tool_use") { - const tool = block as { name?: string; input?: unknown }; - process.stderr.write(` [tool] ${tool.name}\n`); - } - } - } - } - - if ("type" in message && message.type === "result") { - const result = message as Record; - if (result.subtype === "success") { - cost = result.total_cost_usd as number | undefined; - turns = (result.num_turns as number) ?? 0; - durationApi = - typeof result.duration_api_ms === "number" ? result.duration_api_ms / 1000 : undefined; - } + if (message.type === "result" && message.subtype === "success") { + cost = message.total_cost_usd as number | undefined; + turns = (message.num_turns as number) ?? 0; + durationApi = + typeof message.duration_api_ms === "number" ? 
message.duration_api_ms / 1000 : undefined; } } diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index 33c9dd9cc9a0..38a1789016b1 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -1,31 +1,39 @@ -import { writeFileSync } from 'node:fs'; -import { join } from 'node:path'; -import type { TrialConfig, TrialResult } from '../types'; -import { MODEL_TIERS } from '../types'; -import { agents } from '../config'; -import { prepareTrial } from './prepare-trial'; -import { generatePrompt } from './generate-prompt'; -import { grade } from './grade'; -import { generateTrialId, log, logSuccess } from './utils'; +import { writeFileSync } from "node:fs"; +import { join } from "node:path"; +import type { TrialConfig, TrialResult } from "../types"; +import { MODEL_TIERS } from "../types"; +import { agents } from "../config"; +import { prepareTrial } from "./prepare-trial"; +import { generatePrompt } from "./generate-prompt"; +import { grade } from "./grade"; +import { captureEnvironment, saveToGoogleSheets } from "./save"; +import { generateTrialId, log, logSuccess } from "./utils"; /** - * Run a full eval trial: prepare -> execute agent -> grade. + * Run a full eval trial: prepare -> execute agent -> grade -> save. */ -export async function runTask(config: TrialConfig): Promise { +export async function runTask( + config: TrialConfig, + runId: string, + uploadId: string, +): Promise { const { project, agent: agentName, model, promptFile, verbose } = config; const trialId = generateTrialId(project.name, agentName, model); const timestamp = new Date().toISOString(); log(`\nPreparing ${project.name}...`); - // 1. Prepare the trial (clone, clean, init storybook, baseline commit) + // 1. Prepare the trial const paths = await prepareTrial(project, trialId); - // 2. Generate the prompt + // 2. Capture environment + const environment = await captureEnvironment(paths.resultsDir); + + // 3. 
Generate the prompt const prompt = generatePrompt(promptFile); - writeFileSync(join(paths.resultsDir, 'prompt.md'), prompt); + writeFileSync(join(paths.resultsDir, "prompt.md"), prompt); - // 3. Execute the agent + // 4. Execute the agent log(` Running ${agentName} (${model})...`); const agent = agents[agentName]; const execution = await agent.execute(prompt, paths.projectPath, model, { @@ -33,13 +41,13 @@ export async function runTask(config: TrialConfig): Promise { resultsDir: paths.resultsDir, }); logSuccess( - `Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : 'cost N/A'}, ${execution.turns} turns)` + `Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : "cost N/A"}, ${execution.turns} turns)`, ); - // 4. Grade the results + // 5. Grade the results const { grading, quality } = await grade(paths); - // 5. Assemble final result + // 6. Assemble final result const result: TrialResult = { schemaVersion: 1, project: project.name, @@ -47,15 +55,18 @@ export async function runTask(config: TrialConfig): Promise { model, modelTier: MODEL_TIERS[model], timestamp, - promptFile: promptFile || 'setup.md', + promptFile: promptFile || "setup.md", baselineCommit: paths.baselineCommit, execution, grading, quality, }; - writeFileSync(join(paths.resultsDir, 'summary.json'), JSON.stringify(result, null, 2)); + writeFileSync(join(paths.resultsDir, "summary.json"), JSON.stringify(result, null, 2)); logSuccess(`Results saved to ${paths.resultsDir}`); + // 7. 
Upload to Google Sheets + await saveToGoogleSheets(result, environment, runId, uploadId); + return result; } diff --git a/scripts/eval/lib/save.ts b/scripts/eval/lib/save.ts new file mode 100644 index 000000000000..a6083524a9df --- /dev/null +++ b/scripts/eval/lib/save.ts @@ -0,0 +1,134 @@ +import { writeFileSync } from "node:fs"; +import { join, relative } from "node:path"; +import type { TrialResult } from "../types"; +import { logStep, logSuccess, logError, exec, EVAL_ROOT } from "./utils"; + +// Google Apps Script webhook URL for the Storybook Setup Eval spreadsheet. +// To set up your own: see google-apps-script.js in this directory. +const GOOGLE_SHEETS_URL = process.env.EVAL_GOOGLE_SHEETS_URL; + +interface SheetsData { + uploadId: string; + runId: string; + timestamp: string; + project: string; + agent: string; + model: string; + modelTier: string; + promptFile: string; + buildSuccess: boolean; + typeCheckErrors: number; + ghostStoriesPassed: number | null; + ghostStoriesTotal: number | null; + ghostStoriesRate: number | null; + setupPatterns: string; + changedFiles: number; + storybookFiles: number; + qualityScore: number; + cost: number | "unknown"; + duration: number; + turns: number; + gitBranch: string; + gitCommit: string; + trialPath: string; +} + +export interface Environment { + nodeVersion: string; + gitBranch: string; + gitCommit: string; + timestamp: string; +} + +/** + * Capture environment info for reproducibility. 
+ */ +export async function captureEnvironment(resultsDir: string): Promise { + const nodeVersion = process.version; + + let gitBranch = "unknown"; + let gitCommit = "unknown"; + try { + gitBranch = (await exec("git", ["rev-parse", "--abbrev-ref", "HEAD"])).stdout.trim(); + gitCommit = (await exec("git", ["rev-parse", "HEAD"])).stdout.trim(); + } catch { + // Not in a git repo + } + + const env: Environment = { + nodeVersion, + gitBranch, + gitCommit, + timestamp: new Date().toISOString(), + }; + + writeFileSync(join(resultsDir, "environment.json"), JSON.stringify(env, null, 2)); + return env; +} + +/** + * Upload a trial result to Google Sheets. + */ +export async function saveToGoogleSheets( + result: TrialResult, + environment: Environment, + runId: string, + uploadId: string, +): Promise { + if (!GOOGLE_SHEETS_URL) { + logStep("Skipping Google Sheets (set EVAL_GOOGLE_SHEETS_URL to enable)"); + return; + } + + logStep("Uploading to Google Sheets..."); + + const ghost = result.grading.ghostStories; + const data: SheetsData = { + uploadId, + runId, + timestamp: result.timestamp, + project: result.project, + agent: result.agent, + model: result.model, + modelTier: result.modelTier, + promptFile: result.promptFile, + buildSuccess: result.grading.buildSuccess, + typeCheckErrors: result.grading.typeCheckErrors, + ghostStoriesPassed: ghost?.passed ?? null, + ghostStoriesTotal: ghost?.total ?? null, + ghostStoriesRate: ghost?.successRate ?? null, + setupPatterns: result.grading.setupPatterns.map((p) => p.id).join(", "), + changedFiles: result.grading.changedFiles.length, + storybookFiles: result.grading.storybookFiles.length, + qualityScore: result.quality.score, + cost: result.execution.cost ?? 
"unknown", + duration: result.execution.duration, + turns: result.execution.turns, + gitBranch: environment.gitBranch, + gitCommit: environment.gitCommit, + trialPath: relative(EVAL_ROOT, ""), + }; + + try { + const response = await fetch(GOOGLE_SHEETS_URL, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(data), + redirect: "manual", + }); + + // Google Apps Script may return HTML on redirect — treat as success + const contentType = response.headers.get("content-type"); + if (contentType?.includes("application/json")) { + const body = (await response.json()) as { success: boolean; error?: string }; + if (!body.success) { + logError(`Google Sheets error: ${body.error}`); + return; + } + } + + logSuccess("Uploaded to Google Sheets"); + } catch (error) { + logError(`Google Sheets upload failed: ${error instanceof Error ? error.message : error}`); + } +} diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index 59006bbd39e3..f0eafb8c079d 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -87,8 +87,8 @@ export async function exec( ? setTimeout(() => controller!.abort(), timeout) : undefined; - const stdio = stdin === 'ignore' - ? (['ignore', 'pipe', 'pipe'] as const) + const stdio: ['ignore', 'pipe', 'pipe'] | undefined = stdin === 'ignore' + ? ['ignore', 'pipe', 'pipe'] : undefined; try { From 3e74467eea7a734a86c862915c921f8c2bff8ca6 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 20:37:04 +0700 Subject: [PATCH 05/63] Add composable prompt variants and vitest-based self-heal Prompts are now composable: --prompt setup self-heal doctor Each name maps to prompts/{name}.md, concatenated in order. 
Available prompts: - setup: base setup prompt (default) - self-heal: iterative fix loop using vitest --project=storybook - doctor: run diagnostics before large config changes Updated verification to prefer vitest over storybook build since storybook init creates the vitest integration automatically. --- scripts/eval/eval.ts | 16 +++++++++--- scripts/eval/lib/generate-prompt.ts | 39 +++++++++++++++++++++-------- scripts/eval/lib/run-task.ts | 6 ++--- scripts/eval/lib/save.ts | 4 +-- scripts/eval/prompts/doctor.md | 3 +++ scripts/eval/prompts/self-heal.md | 11 ++++++++ scripts/eval/prompts/setup.md | 3 ++- scripts/eval/types.ts | 5 ++-- 8 files changed, 65 insertions(+), 22 deletions(-) create mode 100644 scripts/eval/prompts/doctor.md create mode 100644 scripts/eval/prompts/self-heal.md diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 62ad6e6c4913..004de6b9e4ed 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -5,6 +5,7 @@ import type { TrialConfig, TrialResult, AgentName, SupportedModel } from './type import { SUPPORTED_MODELS_BY_AGENT } from './types'; import { PROJECTS, DEFAULT_AGENT, DEFAULT_MODEL } from './config'; import { runTask } from './lib/run-task'; +import { listPrompts } from './lib/generate-prompt'; import { log, formatDuration, formatCost } from './lib/utils'; const program = new Command() @@ -13,12 +14,13 @@ const program = new Command() .option('-p, --project ', 'run only this project (by name)') .option('-a, --agent ', 'agent to use', DEFAULT_AGENT) .option('-m, --model ', 'model to use', DEFAULT_MODEL) - .option('--prompt ', 'custom prompt file path') + .option('--prompt ', 'prompt names to compose (from prompts/ dir)', ['setup']) .option('-n, --iterations ', 'number of iterations per project', '1') .option('-v, --verbose', 'verbose output') .option('-u, --upload-id ', 'upload ID for grouping results in Google Sheets') .option('--list-projects', 'list available projects and exit') - .option('--list-models', 'list 
supported models and exit'); + .option('--list-models', 'list supported models and exit') + .option('--list-prompts', 'list available prompts and exit'); program.parse(); @@ -34,6 +36,14 @@ if (opts.listProjects) { process.exit(0); } +if (opts.listPrompts) { + log('Available prompts (compose with --prompt name1 name2):'); + for (const name of listPrompts()) { + log(` ${pc.bold(name)}`); + } + process.exit(0); +} + if (opts.listModels) { log('Supported models by agent:'); for (const [agent, models] of Object.entries(SUPPORTED_MODELS_BY_AGENT)) { @@ -95,7 +105,7 @@ for (const project of projects) { project, agent: agentName, model, - promptFile: opts.prompt as string | undefined, + prompts: opts.prompt as string[], verbose: opts.verbose as boolean | undefined, }; diff --git a/scripts/eval/lib/generate-prompt.ts b/scripts/eval/lib/generate-prompt.ts index 93ce7b94ce37..779c395c75b2 100644 --- a/scripts/eval/lib/generate-prompt.ts +++ b/scripts/eval/lib/generate-prompt.ts @@ -1,19 +1,36 @@ -import { readFileSync, existsSync } from 'node:fs'; -import { resolve } from 'node:path'; -import { PROMPTS_DIR } from './utils'; +import { readFileSync, existsSync, readdirSync } from "node:fs"; +import { resolve, basename } from "node:path"; +import { PROMPTS_DIR } from "./utils"; /** - * Load and return the setup prompt. + * Build a prompt by concatenating one or more markdown files from prompts/. * - * If a custom prompt file is specified, it takes precedence. - * Otherwise, the built-in `prompts/setup.md` is used. + * Names are resolved as `prompts/{name}.md`. Multiple names are joined + * with a blank line, so you can compose: `["setup", "self-heal"]`. + * + * If no names are given, defaults to `["setup"]`. */ -export function generatePrompt(promptFile?: string): string { - const file = promptFile ? resolve(promptFile) : resolve(PROMPTS_DIR, 'setup.md'); +export function generatePrompt(names?: string[]): string { + const promptNames = names && names.length > 0 ? 
names : ["setup"]; - if (!existsSync(file)) { - throw new Error(`Prompt file not found: ${file}`); + const parts: string[] = []; + for (const name of promptNames) { + const file = resolve(PROMPTS_DIR, `${name}.md`); + if (!existsSync(file)) { + throw new Error(`Prompt not found: ${file}\nAvailable: ${listPrompts().join(", ")}`); + } + parts.push(readFileSync(file, "utf-8").trim()); } - return readFileSync(file, 'utf-8'); + return parts.join("\n\n"); +} + +/** + * List available prompt names (without .md extension). + */ +export function listPrompts(): string[] { + if (!existsSync(PROMPTS_DIR)) return []; + return readdirSync(PROMPTS_DIR) + .filter((f) => f.endsWith(".md")) + .map((f) => basename(f, ".md")); } diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index 38a1789016b1..e6e5c9c9ad84 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -17,7 +17,7 @@ export async function runTask( runId: string, uploadId: string, ): Promise { - const { project, agent: agentName, model, promptFile, verbose } = config; + const { project, agent: agentName, model, prompts: promptNames, verbose } = config; const trialId = generateTrialId(project.name, agentName, model); const timestamp = new Date().toISOString(); @@ -30,7 +30,7 @@ export async function runTask( const environment = await captureEnvironment(paths.resultsDir); // 3. Generate the prompt - const prompt = generatePrompt(promptFile); + const prompt = generatePrompt(promptNames); writeFileSync(join(paths.resultsDir, "prompt.md"), prompt); // 4. 
Execute the agent @@ -55,7 +55,7 @@ export async function runTask( model, modelTier: MODEL_TIERS[model], timestamp, - promptFile: promptFile || "setup.md", + prompts: promptNames || ["setup"], baselineCommit: paths.baselineCommit, execution, grading, diff --git a/scripts/eval/lib/save.ts b/scripts/eval/lib/save.ts index a6083524a9df..fa741828afbd 100644 --- a/scripts/eval/lib/save.ts +++ b/scripts/eval/lib/save.ts @@ -15,7 +15,7 @@ interface SheetsData { agent: string; model: string; modelTier: string; - promptFile: string; + prompts: string; buildSuccess: boolean; typeCheckErrors: number; ghostStoriesPassed: number | null; @@ -91,7 +91,7 @@ export async function saveToGoogleSheets( agent: result.agent, model: result.model, modelTier: result.modelTier, - promptFile: result.promptFile, + prompts: result.prompts.join("+"), buildSuccess: result.grading.buildSuccess, typeCheckErrors: result.grading.typeCheckErrors, ghostStoriesPassed: ghost?.passed ?? null, diff --git a/scripts/eval/prompts/doctor.md b/scripts/eval/prompts/doctor.md new file mode 100644 index 000000000000..9fc75e4d2d41 --- /dev/null +++ b/scripts/eval/prompts/doctor.md @@ -0,0 +1,3 @@ +## Diagnostics first + +Before making large configuration changes, prefer a fast diagnostic command if it can reduce ambiguity. Use Storybook-specific diagnostics when available (e.g. `npx storybook doctor`). diff --git a/scripts/eval/prompts/self-heal.md b/scripts/eval/prompts/self-heal.md new file mode 100644 index 000000000000..a5950f8263de --- /dev/null +++ b/scripts/eval/prompts/self-heal.md @@ -0,0 +1,11 @@ +## Self-healing loop + +Storybook init created a Vitest integration (`npx vitest --project=storybook`). Use it to verify your setup: + +1. Run `npx vitest run --project=storybook` to test if stories render. +2. Read the error output carefully — it tells you exactly which stories fail and why. +3. Make the smallest fix that addresses the root cause (missing provider, missing CSS, wrong alias, etc.). +4. 
Re-run `npx vitest run --project=storybook`. +5. Repeat until all stories pass or remaining failures are clearly outside Storybook setup scope. + +Do not stop after the first partial improvement. Keep iterating. diff --git a/scripts/eval/prompts/setup.md b/scripts/eval/prompts/setup.md index f1e9aeb84726..77958ce0219e 100644 --- a/scripts/eval/prompts/setup.md +++ b/scripts/eval/prompts/setup.md @@ -21,6 +21,7 @@ You are finishing Storybook setup for an existing React + Vite codebase. ## Verification -- Run your own non-interactive verification commands. +- Use `npx vitest run --project=storybook` to verify stories render correctly. +- Use `npx storybook build` as a secondary check. - Fix the highest-signal Storybook problem first. - Iterate until the setup is stable enough that another user can keep writing stories without additional setup work. diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index dbddcc27442d..f00849836e38 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -49,7 +49,8 @@ export interface TrialConfig { project: Project; agent: AgentName; model: SupportedModel; - promptFile?: string; + /** Prompt names to compose (from prompts/ dir). Defaults to ["setup"]. 
*/ + prompts?: string[]; verbose?: boolean; } @@ -145,7 +146,7 @@ export interface TrialResult { model: string; modelTier: ModelTier; timestamp: string; - promptFile: string; + prompts: string[]; baselineCommit: string; execution: ExecutionResult; grading: GradingResult; From 7a8d08b3ecd526067ed9312e5fd47fb94e477283 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 20:50:30 +0700 Subject: [PATCH 06/63] Simplify eval codebase (-308 lines) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move cleanEnv to utils (was duplicated in prepare-trial and grade) - Replace fast-glob/glob with Node 22 built-in fs.globSync - Compact setup-patterns rules into tuple array - Remove manual file recursion in setup-patterns and ghost-stories - Fix save.ts bug (relative(EVAL_ROOT, "") → removed trialPath) - Remove unused logWarn, simplify logging helpers - Tighten prepare-trial install detection into single expression --- scripts/eval/lib/ghost-stories.ts | 173 ++++++++--------------------- scripts/eval/lib/grade.ts | 138 ++++++++--------------- scripts/eval/lib/prepare-trial.ts | 75 ++++--------- scripts/eval/lib/save.ts | 65 ++--------- scripts/eval/lib/setup-patterns.ts | 124 ++++----------------- scripts/eval/lib/utils.ts | 121 ++++++++------------ 6 files changed, 194 insertions(+), 502 deletions(-) diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index bf616bde7f6d..c02da00c7926 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -1,167 +1,84 @@ -import { readFileSync, existsSync } from 'node:fs'; -import { join, relative } from 'node:path'; -import { exec } from './utils'; -import type { GhostStoriesResult } from '../types'; -import { logStep, logSuccess, logError } from './utils'; +import { readFileSync, existsSync, globSync } from "node:fs"; +import { join } from "node:path"; +import type { GhostStoriesResult } from "../types"; +import { logStep, 
logSuccess, logError, exec, cleanEnv } from "./utils"; /** * Run ghost stories: discover candidate components, auto-generate stories - * via the Vitest component transform, and measure rendering success. - * - * This leverages the existing @storybook/addon-vitest componentTransform which - * activates when `STORYBOOK_COMPONENT_PATHS` env var is set. After `storybook init`, - * the addon is already configured. + * via the addon-vitest componentTransform, and measure rendering success. */ export async function runGhostStories( projectPath: string, - resultsDir: string + resultsDir: string, ): Promise { - logStep('Running ghost stories...'); + logStep("Running ghost stories..."); - // 1. Find candidate React components - const candidates = await findCandidateComponents(projectPath); + const candidates = findCandidates(projectPath); if (candidates.length === 0) { - logError('No candidate components found'); + logError("No candidate components found"); return undefined; } logStep(`Found ${candidates.length} candidate component(s)`); - // 2. Run vitest with STORYBOOK_COMPONENT_PATHS to trigger componentTransform - const reportPath = join(resultsDir, 'ghost-stories-report.json'); + const reportPath = join(resultsDir, "ghost-stories-report.json"); await exec( - 'npx', - [ - 'vitest', - 'run', - '--project=storybook', - '--reporter=json', - `--outputFile=${reportPath}`, - '--testTimeout=10000', - ], + "npx", + ["vitest", "run", "--project=storybook", "--reporter=json", `--outputFile=${reportPath}`, "--testTimeout=10000"], { cwd: projectPath, timeout: 120_000, throwOnError: false, - env: { - PATH: process.env.PATH, - HOME: process.env.HOME, - npm_config_registry: 'https://registry.npmjs.org/', - STORYBOOK_COMPONENT_PATHS: candidates.join(','), - }, - } + env: { ...cleanEnv(), STORYBOOK_COMPONENT_PATHS: candidates.join(",") }, + }, ); - // 3. 
Parse results if (!existsSync(reportPath)) { - logError('Ghost stories: no Vitest report generated'); + logError("Ghost stories: no Vitest report generated"); return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; } try { - const report = JSON.parse(readFileSync(reportPath, 'utf-8')); - const testResults = report.testResults || []; - + const report = JSON.parse(readFileSync(reportPath, "utf-8")); let total = 0; let passed = 0; - for (const suite of testResults) { - for (const test of suite.assertionResults || []) { + for (const suite of report.testResults ?? []) { + for (const test of suite.assertionResults ?? []) { total++; - if (test.status === 'passed') passed++; + if (test.status === "passed") passed++; } } - const successRate = total > 0 ? Math.round((passed / total) * 100) / 100 : 0; - - if (total > 0) { - logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); - } else { - logError('Ghost stories: no tests found in report'); - } - + if (total > 0) logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); return { candidateCount: candidates.length, total, passed, successRate }; } catch { - logError('Ghost stories: failed to parse Vitest report'); + logError("Ghost stories: failed to parse Vitest report"); return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; } } -/** - * Find candidate React component files in the project. - * - * Looks for .tsx/.jsx files that contain JSX and exports, - * excluding tests, stories, config files, and node_modules. 
- */ -async function findCandidateComponents(projectPath: string): Promise { - const result = await exec( - 'find', - [ - projectPath, - '-type', - 'f', - '(', - '-name', - '*.tsx', - '-o', - '-name', - '*.jsx', - ')', - '-not', - '-path', - '*/node_modules/*', - '-not', - '-path', - '*/.storybook/*', - '-not', - '-path', - '*/dist/*', - '-not', - '-path', - '*/build/*', - '-not', - '-name', - '*.test.*', - '-not', - '-name', - '*.spec.*', - '-not', - '-name', - '*.stories.*', - '-not', - '-name', - '*.story.*', - '-not', - '-name', - 'vite.config.*', - '-not', - '-name', - 'vitest.config.*', - ], - { cwd: projectPath, throwOnError: false } - ); - - const files = result.stdout.trim().split('\n').filter(Boolean); - - // Filter for files that look like React components (have JSX + exports) - const candidates: Array<{ path: string; complexity: number }> = []; - for (const file of files) { - try { - const content = readFileSync(file, 'utf-8'); - const hasExport = /export\s+(default\s+)?/.test(content); - const hasJsx = /<[A-Z]/.test(content) || /return\s*\(?\s* l.trim()).length; - const imports = (content.match(/^import\s/gm) || []).length; - const complexity = Math.min(1, (lines + imports * 0.5) / 100); +/** Find .tsx/.jsx files that look like React components, sorted by simplicity. 
*/ +function findCandidates(projectPath: string): string[] { + const SKIP = new Set(["node_modules", ".storybook", "dist", "build", ".git"]); + const files = globSync("**/*.{tsx,jsx}", { + cwd: projectPath, + exclude: (f) => SKIP.has(f.name), + }); - candidates.push({ path: relative(projectPath, file), complexity }); - } catch { - // skip unreadable files - } - } - - // Sort by complexity (simplest first), take top 20 - candidates.sort((a, b) => a.complexity - b.complexity); - return candidates.slice(0, 20).map((c) => c.path); + return files + .filter((f) => !/\.(test|spec|stories|story)\./.test(f) && !/config\./.test(f)) + .map((f) => { + try { + const content = readFileSync(join(projectPath, f), "utf-8"); + if (!/export\s/.test(content)) return null; + if (!/<[A-Z]/.test(content) && !/return\s*\(?\s* l.trim()).length; + return { path: f, complexity: Math.min(1, lines / 100) }; + } catch { + return null; + } + }) + .filter(Boolean) + .sort((a, b) => a!.complexity - b!.complexity) + .slice(0, 20) + .map((c) => c!.path); } diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 925712b8b904..b03d307ef79d 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,131 +1,91 @@ -import { writeFileSync } from 'node:fs'; -import { join } from 'node:path'; -import type { GradingResult, QualityResult, TrialPaths, ChangedFile } from '../types'; -import { logStep, logSuccess, logError, exec } from './utils'; -import { detectSetupPatterns } from './setup-patterns'; -import { runGhostStories } from './ghost-stories'; - -/** - * Grade a trial by measuring what the agent changed and whether the build works. 
- */ -export async function grade( - paths: TrialPaths -): Promise<{ grading: GradingResult; quality: QualityResult }> { +import { writeFileSync } from "node:fs"; +import { join } from "node:path"; +import type { GradingResult, QualityResult, TrialPaths, ChangedFile } from "../types"; +import { logStep, logSuccess, logError, exec, cleanEnv } from "./utils"; +import { detectSetupPatterns } from "./setup-patterns"; +import { runGhostStories } from "./ghost-stories"; + +export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult; quality: QualityResult }> { const { repoRoot, projectPath, resultsDir, baselineCommit } = paths; + const env = cleanEnv(); - // --- Changed Files (diff from baseline) --- - logStep('Collecting agent changes...'); + // Changed files + logStep("Collecting agent changes..."); const changedFiles = await getChangedFiles(repoRoot, baselineCommit); const storybookFiles = changedFiles.filter( - (f) => f.path.includes('.storybook/') || /\.(stories|story)\.[tj]sx?$/.test(f.path) + (f) => f.path.includes(".storybook/") || /\.(stories|story)\.[tj]sx?$/.test(f.path), ); logSuccess(`${changedFiles.length} files changed (${storybookFiles.length} storybook-related)`); - // --- Setup Patterns --- + // Setup patterns const setupPatterns = detectSetupPatterns(projectPath); - if (setupPatterns.length > 0) { - logSuccess(`Detected patterns: ${setupPatterns.map((p) => p.label).join(', ')}`); - } + if (setupPatterns.length > 0) logSuccess(`Detected patterns: ${setupPatterns.map((p) => p.label).join(", ")}`); - // --- Storybook Build --- - logStep('Running storybook build...'); - const buildResult = await exec('npx', ['storybook', 'build', '--quiet'], { + // Storybook build + logStep("Running storybook build..."); + const build = await exec("npx", ["storybook", "build", "--quiet"], { cwd: projectPath, timeout: 300_000, throwOnError: false, - env: { - STORYBOOK_DISABLE_TELEMETRY: '1', - NODE_OPTIONS: '--max_old_space_size=4096', - PATH: 
process.env.PATH, - HOME: process.env.HOME, - npm_config_registry: 'https://registry.npmjs.org/', - }, + env: { ...env, STORYBOOK_DISABLE_TELEMETRY: "1", NODE_OPTIONS: "--max_old_space_size=4096" }, }); - - const buildSuccess = buildResult.exitCode === 0; - const buildOutput = buildResult.stdout + '\n' + buildResult.stderr; - writeFileSync(join(resultsDir, 'build-output.txt'), buildOutput); - + const buildSuccess = build.exitCode === 0; + writeFileSync(join(resultsDir, "build-output.txt"), build.stdout + "\n" + build.stderr); if (buildSuccess) { - logSuccess('Storybook build succeeded'); + logSuccess("Storybook build succeeded"); } else { - logError(`Storybook build failed (exit code ${buildResult.exitCode})`); + logError(`Storybook build failed (exit ${build.exitCode})`); } - // --- TypeScript Check --- - logStep('Running typecheck...'); - const tscResult = await exec('npx', ['tsc', '--noEmit'], { - cwd: projectPath, - timeout: 120_000, - throwOnError: false, - env: { - PATH: process.env.PATH, - HOME: process.env.HOME, - npm_config_registry: 'https://registry.npmjs.org/', - }, - }); - - const typeCheckOutput = tscResult.stdout + '\n' + tscResult.stderr; - writeFileSync(join(resultsDir, 'typecheck-output.txt'), typeCheckOutput); - const typeCheckErrors = (typeCheckOutput.match(/error TS\d+/g) || []).length; - + // TypeScript check + logStep("Running typecheck..."); + const tsc = await exec("npx", ["tsc", "--noEmit"], { cwd: projectPath, timeout: 120_000, throwOnError: false, env }); + const tscOutput = tsc.stdout + "\n" + tsc.stderr; + writeFileSync(join(resultsDir, "typecheck-output.txt"), tscOutput); + const typeCheckErrors = (tscOutput.match(/error TS\d+/g) || []).length; if (typeCheckErrors === 0) { - logSuccess('No TypeScript errors'); + logSuccess("No TypeScript errors"); } else { logError(`${typeCheckErrors} TypeScript error(s)`); } - // --- Ghost Stories --- - const ghostStories = buildSuccess - ? 
await runGhostStories(projectPath, resultsDir) - : undefined; + // Ghost stories (only if build passed) + const ghostStories = buildSuccess ? await runGhostStories(projectPath, resultsDir) : undefined; const grading: GradingResult = { buildSuccess, - buildError: buildSuccess ? undefined : buildOutput.slice(-2000), + buildError: buildSuccess ? undefined : (build.stdout + "\n" + build.stderr).slice(-2000), typeCheckErrors, - typeCheckOutput: typeCheckErrors > 0 ? typeCheckOutput.slice(-2000) : undefined, + typeCheckOutput: typeCheckErrors > 0 ? tscOutput.slice(-2000) : undefined, changedFiles, storybookFiles, setupPatterns, ghostStories, }; - const quality = calculateQuality(grading); - return { grading, quality }; + // Quality = 70% build + 30% typecheck + const buildScore = buildSuccess ? 1 : 0; + const tcScore = Math.max(0, 1 - typeCheckErrors / 20); + const score = Math.round((buildScore * 0.7 + tcScore * 0.3) * 100) / 100; + + return { + grading, + quality: { score, breakdown: { build: buildScore, typecheck: Math.round(tcScore * 100) / 100 } }, + }; } -async function getChangedFiles(repoRoot: string, baselineCommit: string): Promise { - // Stage everything so diff sees new files too - await exec('git', ['add', '-A'], { cwd: repoRoot }); - const result = await exec('git', ['diff', '--cached', '--name-status', baselineCommit], { +async function getChangedFiles(repoRoot: string, baseline: string): Promise { + await exec("git", ["add", "-A"], { cwd: repoRoot }); + const { stdout } = await exec("git", ["diff", "--cached", "--name-status", baseline], { cwd: repoRoot, throwOnError: false, }); - - return result.stdout + return stdout .trim() - .split('\n') + .split("\n") .filter(Boolean) .map((line) => { - const [status, ...pathParts] = line.split('\t'); - return { - path: pathParts.join('\t'), - status: (status?.charAt(0) || 'M') as ChangedFile['status'], - }; + const [status, ...parts] = line.split("\t"); + return { path: parts.join("\t"), status: (status?.charAt(0) 
|| "M") as ChangedFile["status"] }; }); } - -function calculateQuality(grading: GradingResult): QualityResult { - const buildScore = grading.buildSuccess ? 1 : 0; - const typeCheckScore = Math.max(0, 1 - grading.typeCheckErrors / 20); - const score = buildScore * 0.7 + typeCheckScore * 0.3; - - return { - score: Math.round(score * 100) / 100, - breakdown: { - build: buildScore, - typecheck: Math.round(typeCheckScore * 100) / 100, - }, - }; -} diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index 173640d9c35a..549928f9290d 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -1,83 +1,54 @@ -import { existsSync, mkdirSync, cpSync } from 'node:fs'; -import { join } from 'node:path'; -import type { Project, TrialPaths } from '../types'; -import { CACHE_DIR, TRIALS_DIR, logStep, logSuccess, exec } from './utils'; - -function cleanNpmEnv(): Record { - const env = { ...process.env }; - env.npm_config_registry = 'https://registry.npmjs.org/'; - for (const key of Object.keys(env)) { - if (key.startsWith('npm_config_') && key !== 'npm_config_registry') { - delete env[key]; - } - } - return env; -} +import { existsSync, mkdirSync, cpSync } from "node:fs"; +import { join } from "node:path"; +import type { Project, TrialPaths } from "../types"; +import { CACHE_DIR, TRIALS_DIR, logStep, logSuccess, exec, cleanEnv } from "./utils"; async function installDeps(dir: string) { - const env = cleanNpmEnv(); - const p = (f: string) => existsSync(join(dir, f)); - - let cmd: string; - let args: string[]; - if (p('pnpm-lock.yaml') || p('pnpm-workspace.yaml')) { - cmd = 'pnpm'; - args = ['install', '--no-frozen-lockfile']; - } else if (p('yarn.lock')) { - cmd = 'yarn'; - args = p('.yarnrc.yml') ? 
['install', '--no-immutable'] : ['install']; - } else if (p('bun.lockb') || p('bun.lock')) { - cmd = 'bun'; - args = ['install']; - } else { - cmd = 'npm'; - args = ['install', '--ignore-scripts']; - } + const env = cleanEnv(); + const has = (f: string) => existsSync(join(dir, f)); + const [cmd, args]: [string, string[]] = has("pnpm-lock.yaml") || has("pnpm-workspace.yaml") + ? ["pnpm", ["install", "--no-frozen-lockfile"]] + : has("yarn.lock") + ? ["yarn", has(".yarnrc.yml") ? ["install", "--no-immutable"] : ["install"]] + : has("bun.lockb") || has("bun.lock") + ? ["bun", ["install"]] + : ["npm", ["install", "--ignore-scripts"]]; logStep(`Installing with ${cmd}...`); await exec(cmd, args, { cwd: dir, timeout: 300_000, env }); } /** - * Prepare a trial directory. - * - * First run per project: clone eval-baseline branch → install deps → cache it. + * First run: clone eval-baseline → install deps → cache it. * Subsequent runs: copy from cache. Agent starts immediately. */ export async function prepareTrial(project: Project, trialId: string): Promise { const cacheDir = join(CACHE_DIR, project.name); const trialDir = join(TRIALS_DIR, trialId); - const repoRoot = join(trialDir, 'project'); + const repoRoot = join(trialDir, "project"); mkdirSync(trialDir, { recursive: true }); - if (existsSync(join(cacheDir, '.git'))) { - // Fast path: copy from cache - logStep('Copying from cache...'); + if (existsSync(join(cacheDir, ".git"))) { + logStep("Copying from cache..."); cpSync(cacheDir, repoRoot, { recursive: true }); } else { - // First run: clone directly to trial dir, install, then cache logStep(`Cloning ${project.repo}#${project.branch}...`); mkdirSync(CACHE_DIR, { recursive: true }); - await exec('git', ['clone', '--depth', '1', '--branch', project.branch!, project.repo, repoRoot], { + await exec("git", ["clone", "--depth", "1", "--branch", project.branch!, project.repo, repoRoot], { timeout: 120_000, }); - const projectPath = project.projectDir ? 
join(repoRoot, project.projectDir) : repoRoot; await installDeps(projectPath); - logSuccess('Dependencies installed'); - - // Save to cache for next time - logStep('Caching for future runs...'); + logSuccess("Dependencies installed"); + logStep("Caching for future runs..."); cpSync(repoRoot, cacheDir, { recursive: true }); } - const baselineResult = await exec('git', ['rev-parse', 'HEAD'], { cwd: repoRoot }); - const baselineCommit = baselineResult.stdout.trim(); - + const baselineCommit = (await exec("git", ["rev-parse", "HEAD"], { cwd: repoRoot })).stdout.trim(); const projectPath = project.projectDir ? join(repoRoot, project.projectDir) : repoRoot; - const resultsDir = join(trialDir, 'results'); + const resultsDir = join(trialDir, "results"); mkdirSync(resultsDir, { recursive: true }); - logSuccess('Trial ready'); + logSuccess("Trial ready"); return { trialDir, repoRoot, projectPath, resultsDir, baselineCommit }; } diff --git a/scripts/eval/lib/save.ts b/scripts/eval/lib/save.ts index fa741828afbd..7f1e3ef6b698 100644 --- a/scripts/eval/lib/save.ts +++ b/scripts/eval/lib/save.ts @@ -1,77 +1,33 @@ import { writeFileSync } from "node:fs"; -import { join, relative } from "node:path"; +import { join } from "node:path"; import type { TrialResult } from "../types"; -import { logStep, logSuccess, logError, exec, EVAL_ROOT } from "./utils"; +import { logStep, logSuccess, logError, exec } from "./utils"; -// Google Apps Script webhook URL for the Storybook Setup Eval spreadsheet. -// To set up your own: see google-apps-script.js in this directory. 
const GOOGLE_SHEETS_URL = process.env.EVAL_GOOGLE_SHEETS_URL; -interface SheetsData { - uploadId: string; - runId: string; - timestamp: string; - project: string; - agent: string; - model: string; - modelTier: string; - prompts: string; - buildSuccess: boolean; - typeCheckErrors: number; - ghostStoriesPassed: number | null; - ghostStoriesTotal: number | null; - ghostStoriesRate: number | null; - setupPatterns: string; - changedFiles: number; - storybookFiles: number; - qualityScore: number; - cost: number | "unknown"; - duration: number; - turns: number; - gitBranch: string; - gitCommit: string; - trialPath: string; -} - export interface Environment { nodeVersion: string; gitBranch: string; gitCommit: string; - timestamp: string; } -/** - * Capture environment info for reproducibility. - */ export async function captureEnvironment(resultsDir: string): Promise { - const nodeVersion = process.version; - let gitBranch = "unknown"; let gitCommit = "unknown"; try { gitBranch = (await exec("git", ["rev-parse", "--abbrev-ref", "HEAD"])).stdout.trim(); gitCommit = (await exec("git", ["rev-parse", "HEAD"])).stdout.trim(); } catch { - // Not in a git repo + /* not in a git repo */ } - - const env: Environment = { - nodeVersion, - gitBranch, - gitCommit, - timestamp: new Date().toISOString(), - }; - + const env = { nodeVersion: process.version, gitBranch, gitCommit }; writeFileSync(join(resultsDir, "environment.json"), JSON.stringify(env, null, 2)); return env; } -/** - * Upload a trial result to Google Sheets. 
- */ export async function saveToGoogleSheets( result: TrialResult, - environment: Environment, + env: Environment, runId: string, uploadId: string, ): Promise { @@ -79,11 +35,10 @@ export async function saveToGoogleSheets( logStep("Skipping Google Sheets (set EVAL_GOOGLE_SHEETS_URL to enable)"); return; } - logStep("Uploading to Google Sheets..."); const ghost = result.grading.ghostStories; - const data: SheetsData = { + const data = { uploadId, runId, timestamp: result.timestamp, @@ -104,9 +59,8 @@ export async function saveToGoogleSheets( cost: result.execution.cost ?? "unknown", duration: result.execution.duration, turns: result.execution.turns, - gitBranch: environment.gitBranch, - gitCommit: environment.gitCommit, - trialPath: relative(EVAL_ROOT, ""), + gitBranch: env.gitBranch, + gitCommit: env.gitCommit, }; try { @@ -116,8 +70,6 @@ export async function saveToGoogleSheets( body: JSON.stringify(data), redirect: "manual", }); - - // Google Apps Script may return HTML on redirect — treat as success const contentType = response.headers.get("content-type"); if (contentType?.includes("application/json")) { const body = (await response.json()) as { success: boolean; error?: string }; @@ -126,7 +78,6 @@ export async function saveToGoogleSheets( return; } } - logSuccess("Uploaded to Google Sheets"); } catch (error) { logError(`Google Sheets upload failed: ${error instanceof Error ? 
error.message : error}`); diff --git a/scripts/eval/lib/setup-patterns.ts b/scripts/eval/lib/setup-patterns.ts index 4343ada7c526..105a14ba59e9 100644 --- a/scripts/eval/lib/setup-patterns.ts +++ b/scripts/eval/lib/setup-patterns.ts @@ -1,116 +1,40 @@ -import { readFileSync, existsSync, readdirSync } from 'node:fs'; -import { join, relative } from 'node:path'; -import type { SetupPattern } from '../types'; +import { readFileSync, existsSync, globSync } from "node:fs"; +import { join, relative } from "node:path"; +import type { SetupPattern } from "../types"; -interface PatternRule { - id: string; - label: string; - /** Regex to match in file contents */ - pattern: RegExp; - /** Only check files matching these extensions */ - extensions?: string[]; -} - -const RULES: PatternRule[] = [ - { - id: 'global-css', - label: 'Global CSS import', - pattern: /import\s+['"][^'"]+\.(css|scss|sass|less)['"]|import\s+['"]tailwindcss/, - extensions: ['.ts', '.tsx', '.js', '.jsx'], - }, - { - id: 'tailwind', - label: 'Tailwind CSS', - pattern: /@tailwind|tailwindcss|tailwind\.css/, - }, - { - id: 'styled-components', - label: 'Styled Components', - pattern: /styled-components|ThemeProvider.*styled|createGlobalStyle/, - }, - { - id: 'router-provider', - label: 'React Router provider', - pattern: /MemoryRouter|BrowserRouter|RouterProvider|createMemoryRouter/, - }, - { - id: 'redux-provider', - label: 'Redux Provider', - pattern: /react-redux.*Provider| = [ + ["global-css", "Global CSS import", /import\s+['"][^'"]+\.(css|scss|sass|less)['"]|import\s+['"]tailwindcss/], + ["tailwind", "Tailwind CSS", /@tailwind|tailwindcss|tailwind\.css/], + ["styled-components", "Styled Components", /styled-components|createGlobalStyle/], + ["router-provider", "React Router", /MemoryRouter|BrowserRouter|RouterProvider/], + ["redux-provider", "Redux Provider", /react-redux.*Provider| join(dir, f)); const results: SetupPattern[] = []; - for (const rule of RULES) { - const matches: string[] = []; - for 
(const filePath of filesToScan) { - if (rule.extensions && !rule.extensions.some((ext) => filePath.endsWith(ext))) { - continue; - } + for (const [id, label, pattern] of RULES) { + const matches = files.filter((f) => { try { - const content = readFileSync(filePath, 'utf-8'); - if (rule.pattern.test(content)) { - matches.push(relative(projectPath, filePath)); - } + return pattern.test(readFileSync(f, "utf-8")); } catch { - // skip unreadable files + return false; } - } + }); if (matches.length > 0) { - results.push({ id: rule.id, label: rule.label, sourceFiles: matches }); + results.push({ id, label, sourceFiles: matches.map((f) => relative(projectPath, f)) }); } } return results; } - -function collectFiles(dir: string): string[] { - const files: string[] = []; - try { - for (const entry of readdirSync(dir, { withFileTypes: true })) { - const full = join(dir, entry.name); - if (entry.isDirectory()) { - files.push(...collectFiles(full)); - } else { - files.push(full); - } - } - } catch { - // skip - } - return files; -} diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index f0eafb8c079d..4e68f05d8df2 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -1,95 +1,70 @@ -import { resolve } from 'node:path'; -import pc from 'picocolors'; -import { x } from 'tinyexec'; +import { resolve } from "node:path"; +import pc from "picocolors"; +import { x } from "tinyexec"; -/** Root of the storybook monorepo */ -export const REPO_ROOT = resolve(import.meta.dirname, '..', '..', '..'); +export const REPO_ROOT = resolve(import.meta.dirname, "..", "..", ".."); +export const EVAL_ROOT = resolve(REPO_ROOT, "..", "storybook-eval"); +export const CACHE_DIR = resolve(EVAL_ROOT, ".cache", "repos"); +export const TRIALS_DIR = resolve(EVAL_ROOT, "trials"); +export const PROMPTS_DIR = resolve(import.meta.dirname, "..", "prompts"); -/** Directory for eval trials and caches (outside the monorepo to avoid workspace interference) */ -export const EVAL_ROOT 
= resolve(REPO_ROOT, '..', 'storybook-eval'); +// --- Logging --- -/** Cached repo clones */ -export const CACHE_DIR = resolve(EVAL_ROOT, '.cache', 'repos'); +export const log = (msg: string) => console.log(msg); +export const logStep = (msg: string) => console.log(` ${pc.cyan(">")} ${msg}`); +export const logSuccess = (msg: string) => console.log(` ${pc.green("✓")} ${msg}`); +export const logError = (msg: string) => console.log(` ${pc.red("✗")} ${msg}`); -/** Trial output base directory */ -export const TRIALS_DIR = resolve(EVAL_ROOT, 'trials'); +export const formatDuration = (s: number) => + s < 60 ? `${Math.round(s)}s` : `${Math.floor(s / 60)}m${Math.round(s % 60)}s`; -/** Built-in prompts directory */ -export const PROMPTS_DIR = resolve(import.meta.dirname, '..', 'prompts'); +export const formatCost = (cost?: number) => (cost == null ? "-" : `$${cost.toFixed(2)}`); -export function log(msg: string) { - console.log(msg); +export function generateTrialId(project: string, agent: string, model: string) { + return `${new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19)}-${project}-${agent}-${model}`; } -export function logStep(msg: string) { - console.log(` ${pc.cyan('>')} ${msg}`); -} - -export function logSuccess(msg: string) { - console.log(` ${pc.green('✓')} ${msg}`); -} - -export function logError(msg: string) { - console.log(` ${pc.red('✗')} ${msg}`); -} - -export function logWarn(msg: string) { - console.log(` ${pc.yellow('!')} ${msg}`); -} - -export function formatDuration(seconds: number): string { - if (seconds < 60) return `${Math.round(seconds)}s`; - const mins = Math.floor(seconds / 60); - const secs = Math.round(seconds % 60); - return `${mins}m${secs}s`; -} - -export function formatCost(cost?: number): string { - if (cost == null) return '-'; - return `$${cost.toFixed(2)}`; -} +// --- Clean npm env --- -export function generateTrialId(projectName: string, agent: string, model: string): string { - const timestamp = new 
Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); - return `${timestamp}-${projectName}-${agent}-${model}`; +/** + * Process env with verdaccio registry overrides stripped. + * The storybook monorepo's .npmrc points to localhost:6002. + */ +export function cleanEnv(): Record { + const env = { ...process.env }; + env.npm_config_registry = "https://registry.npmjs.org/"; + for (const key of Object.keys(env)) { + if (key.startsWith("npm_config_") && key !== "npm_config_registry") { + delete env[key]; + } + } + return env; } -/** Options for the exec helper */ -interface ExecOptions { - cwd?: string; - env?: Record; - timeout?: number; - /** If true, don't throw on non-zero exit code (default: true = throw) */ - throwOnError?: boolean; - /** Set to 'ignore' to suppress stdin */ - stdin?: 'ignore'; -} +// --- Exec --- -/** Result from exec helper */ export interface ExecResult { stdout: string; stderr: string; exitCode: number | null; } -/** - * Thin wrapper around tinyexec's `x()` with timeout support via AbortController. - */ export async function exec( command: string, args: string[], - options: ExecOptions = {} + options: { + cwd?: string; + env?: Record; + timeout?: number; + throwOnError?: boolean; + stdin?: "ignore"; + } = {}, ): Promise { const { cwd, env, timeout, throwOnError = true, stdin } = options; - const controller = timeout ? new AbortController() : undefined; - const timer = timeout - ? setTimeout(() => controller!.abort(), timeout) - : undefined; - - const stdio: ['ignore', 'pipe', 'pipe'] | undefined = stdin === 'ignore' - ? ['ignore', 'pipe', 'pipe'] - : undefined; + const timer = timeout ? setTimeout(() => controller!.abort(), timeout) : undefined; + const stdio: ["ignore", "pipe", "pipe"] | undefined = + stdin === "ignore" ? 
["ignore", "pipe", "pipe"] : undefined; try { const result = await x(command, args, { @@ -103,15 +78,9 @@ export async function exec( }); if (throwOnError && result.exitCode !== 0) { - const msg = `Command failed: ${command} ${args.join(' ')}\n${result.stderr}`; - throw new Error(msg); + throw new Error(`Command failed: ${command} ${args.join(" ")}\n${result.stderr}`); } - - return { - stdout: result.stdout, - stderr: result.stderr, - exitCode: result.exitCode, - }; + return { stdout: result.stdout, stderr: result.stderr, exitCode: result.exitCode }; } finally { if (timer) clearTimeout(timer); } From 6c3e7163a6ded7899894c9de4d6423eb9f8dcf74 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 21:54:39 +0700 Subject: [PATCH 07/63] =?UTF-8?q?Remove=20cleanEnv=20from=20grading=20?= =?UTF-8?q?=E2=80=94=20only=20needed=20for=20installDeps?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/eval/lib/ghost-stories.ts | 4 ++-- scripts/eval/lib/grade.ts | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index c02da00c7926..79832b381554 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -1,7 +1,7 @@ import { readFileSync, existsSync, globSync } from "node:fs"; import { join } from "node:path"; import type { GhostStoriesResult } from "../types"; -import { logStep, logSuccess, logError, exec, cleanEnv } from "./utils"; +import { logStep, logSuccess, logError, exec } from "./utils"; /** * Run ghost stories: discover candidate components, auto-generate stories @@ -28,7 +28,7 @@ export async function runGhostStories( cwd: projectPath, timeout: 120_000, throwOnError: false, - env: { ...cleanEnv(), STORYBOOK_COMPONENT_PATHS: candidates.join(",") }, + env: { ...process.env, STORYBOOK_COMPONENT_PATHS: candidates.join(",") }, }, ); diff --git a/scripts/eval/lib/grade.ts 
b/scripts/eval/lib/grade.ts index b03d307ef79d..5e0c45aefb9c 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,13 +1,12 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; import type { GradingResult, QualityResult, TrialPaths, ChangedFile } from "../types"; -import { logStep, logSuccess, logError, exec, cleanEnv } from "./utils"; +import { logStep, logSuccess, logError, exec } from "./utils"; import { detectSetupPatterns } from "./setup-patterns"; import { runGhostStories } from "./ghost-stories"; export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult; quality: QualityResult }> { const { repoRoot, projectPath, resultsDir, baselineCommit } = paths; - const env = cleanEnv(); // Changed files logStep("Collecting agent changes..."); @@ -27,7 +26,7 @@ export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult cwd: projectPath, timeout: 300_000, throwOnError: false, - env: { ...env, STORYBOOK_DISABLE_TELEMETRY: "1", NODE_OPTIONS: "--max_old_space_size=4096" }, + env: { ...process.env, STORYBOOK_DISABLE_TELEMETRY: "1", NODE_OPTIONS: "--max_old_space_size=4096" }, }); const buildSuccess = build.exitCode === 0; writeFileSync(join(resultsDir, "build-output.txt"), build.stdout + "\n" + build.stderr); @@ -39,7 +38,7 @@ export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult // TypeScript check logStep("Running typecheck..."); - const tsc = await exec("npx", ["tsc", "--noEmit"], { cwd: projectPath, timeout: 120_000, throwOnError: false, env }); + const tsc = await exec("npx", ["tsc", "--noEmit"], { cwd: projectPath, timeout: 120_000, throwOnError: false }); const tscOutput = tsc.stdout + "\n" + tsc.stderr; writeFileSync(join(resultsDir, "typecheck-output.txt"), tscOutput); const typeCheckErrors = (tscOutput.match(/error TS\d+/g) || []).length; From e11b9bdfcea2f2baeeee4761f43c7070d1e12783 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 
2026 21:56:19 +0700 Subject: [PATCH 08/63] =?UTF-8?q?Remove=20cleanEnv=20entirely=20=E2=80=94?= =?UTF-8?q?=20.npmrc=20is=20only=20in=20the=20monorepo,=20not=20in=20trial?= =?UTF-8?q?=20dirs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/eval/lib/prepare-trial.ts | 5 ++--- scripts/eval/lib/utils.ts | 17 ----------------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index 549928f9290d..1d5b80ce686b 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -1,10 +1,9 @@ import { existsSync, mkdirSync, cpSync } from "node:fs"; import { join } from "node:path"; import type { Project, TrialPaths } from "../types"; -import { CACHE_DIR, TRIALS_DIR, logStep, logSuccess, exec, cleanEnv } from "./utils"; +import { CACHE_DIR, TRIALS_DIR, logStep, logSuccess, exec } from "./utils"; async function installDeps(dir: string) { - const env = cleanEnv(); const has = (f: string) => existsSync(join(dir, f)); const [cmd, args]: [string, string[]] = has("pnpm-lock.yaml") || has("pnpm-workspace.yaml") ? ["pnpm", ["install", "--no-frozen-lockfile"]] @@ -15,7 +14,7 @@ async function installDeps(dir: string) { : ["npm", ["install", "--ignore-scripts"]]; logStep(`Installing with ${cmd}...`); - await exec(cmd, args, { cwd: dir, timeout: 300_000, env }); + await exec(cmd, args, { cwd: dir, timeout: 300_000 }); } /** diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index 4e68f05d8df2..f44f43f763c3 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -24,23 +24,6 @@ export function generateTrialId(project: string, agent: string, model: string) { return `${new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19)}-${project}-${agent}-${model}`; } -// --- Clean npm env --- - -/** - * Process env with verdaccio registry overrides stripped. 
- * The storybook monorepo's .npmrc points to localhost:6002. - */ -export function cleanEnv(): Record { - const env = { ...process.env }; - env.npm_config_registry = "https://registry.npmjs.org/"; - for (const key of Object.keys(env)) { - if (key.startsWith("npm_config_") && key !== "npm_config_registry") { - delete env[key]; - } - } - return env; -} - // --- Exec --- export interface ExecResult { From 2be54f4d9487ba1ab2dfdad15294230391f7af54 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:04:39 +0700 Subject: [PATCH 09/63] Switch from jiti to native Node TS support, add .ts extensions to all imports --- scripts/eval/config.ts | 6 +++--- scripts/eval/eval.ts | 12 ++++++------ scripts/eval/lib/agents/claude-code.ts | 2 +- scripts/eval/lib/agents/codex.ts | 2 +- scripts/eval/lib/generate-prompt.ts | 2 +- scripts/eval/lib/ghost-stories.ts | 4 ++-- scripts/eval/lib/grade.ts | 8 ++++---- scripts/eval/lib/prepare-trial.ts | 4 ++-- scripts/eval/lib/run-task.ts | 16 ++++++++-------- scripts/eval/lib/save.ts | 4 ++-- scripts/eval/lib/setup-patterns.ts | 2 +- scripts/package.json | 2 +- 12 files changed, 32 insertions(+), 32 deletions(-) diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index 62cd086d5bbb..613f2ff50ee1 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -1,6 +1,6 @@ -import type { Project, AgentName, Agent } from './types'; -import { claudeCodeAgent } from './lib/agents/claude-code'; -import { codexAgent } from './lib/agents/codex'; +import type { Project, AgentName, Agent } from './types.ts'; +import { claudeCodeAgent } from './lib/agents/claude-code.ts'; +import { codexAgent } from './lib/agents/codex.ts'; /** * Pre-prepared eval baseline repos. 
diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 004de6b9e4ed..733fa1321d9d 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -1,12 +1,12 @@ import { randomUUID } from 'node:crypto'; import { Command } from 'commander'; import pc from 'picocolors'; -import type { TrialConfig, TrialResult, AgentName, SupportedModel } from './types'; -import { SUPPORTED_MODELS_BY_AGENT } from './types'; -import { PROJECTS, DEFAULT_AGENT, DEFAULT_MODEL } from './config'; -import { runTask } from './lib/run-task'; -import { listPrompts } from './lib/generate-prompt'; -import { log, formatDuration, formatCost } from './lib/utils'; +import type { TrialConfig, TrialResult, AgentName, SupportedModel } from './types.ts'; +import { SUPPORTED_MODELS_BY_AGENT } from './types.ts'; +import { PROJECTS, DEFAULT_AGENT, DEFAULT_MODEL } from './config.ts'; +import { runTask } from './lib/run-task.ts'; +import { listPrompts } from './lib/generate-prompt.ts'; +import { log, formatDuration, formatCost } from './lib/utils.ts'; const program = new Command() .name('eval') diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index c55f2bb46bec..d13c9241ad56 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -2,7 +2,7 @@ import type { SDKMessage } from "@anthropic-ai/claude-agent-sdk"; import { query } from "@anthropic-ai/claude-agent-sdk"; import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { Agent, ExecutionResult, SupportedModel } from "../../types"; +import type { Agent, ExecutionResult, SupportedModel } from "../../types.ts"; function logMessage(message: SDKMessage) { const log = (prefix: string, text: string) => process.stderr.write(`${prefix} ${text}\n`); diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index 883043cb2a9f..549ee4d2087d 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts 
@@ -1,7 +1,7 @@ import { Codex } from '@openai/codex-sdk'; import { writeFileSync } from 'node:fs'; import { join } from 'node:path'; -import type { Agent, ExecutionResult, SupportedModel } from '../../types'; +import type { Agent, ExecutionResult, SupportedModel } from '../../types.ts'; export const codexAgent: Agent = { name: 'codex', diff --git a/scripts/eval/lib/generate-prompt.ts b/scripts/eval/lib/generate-prompt.ts index 779c395c75b2..dfac03a4a426 100644 --- a/scripts/eval/lib/generate-prompt.ts +++ b/scripts/eval/lib/generate-prompt.ts @@ -1,6 +1,6 @@ import { readFileSync, existsSync, readdirSync } from "node:fs"; import { resolve, basename } from "node:path"; -import { PROMPTS_DIR } from "./utils"; +import { PROMPTS_DIR } from "./utils.ts"; /** * Build a prompt by concatenating one or more markdown files from prompts/. diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index 79832b381554..f7c20b3652f2 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -1,7 +1,7 @@ import { readFileSync, existsSync, globSync } from "node:fs"; import { join } from "node:path"; -import type { GhostStoriesResult } from "../types"; -import { logStep, logSuccess, logError, exec } from "./utils"; +import type { GhostStoriesResult } from "../types.ts"; +import { logStep, logSuccess, logError, exec } from "./utils.ts"; /** * Run ghost stories: discover candidate components, auto-generate stories diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 5e0c45aefb9c..9bfd6e5ab986 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,9 +1,9 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { GradingResult, QualityResult, TrialPaths, ChangedFile } from "../types"; -import { logStep, logSuccess, logError, exec } from "./utils"; -import { detectSetupPatterns } from "./setup-patterns"; -import { runGhostStories } from "./ghost-stories"; 
+import type { GradingResult, QualityResult, TrialPaths, ChangedFile } from "../types.ts"; +import { logStep, logSuccess, logError, exec } from "./utils.ts"; +import { detectSetupPatterns } from "./setup-patterns.ts"; +import { runGhostStories } from "./ghost-stories.ts"; export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult; quality: QualityResult }> { const { repoRoot, projectPath, resultsDir, baselineCommit } = paths; diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index 1d5b80ce686b..53ac3dd0b0ab 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -1,7 +1,7 @@ import { existsSync, mkdirSync, cpSync } from "node:fs"; import { join } from "node:path"; -import type { Project, TrialPaths } from "../types"; -import { CACHE_DIR, TRIALS_DIR, logStep, logSuccess, exec } from "./utils"; +import type { Project, TrialPaths } from "../types.ts"; +import { CACHE_DIR, TRIALS_DIR, logStep, logSuccess, exec } from "./utils.ts"; async function installDeps(dir: string) { const has = (f: string) => existsSync(join(dir, f)); diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index e6e5c9c9ad84..849db788f945 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -1,13 +1,13 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { TrialConfig, TrialResult } from "../types"; -import { MODEL_TIERS } from "../types"; -import { agents } from "../config"; -import { prepareTrial } from "./prepare-trial"; -import { generatePrompt } from "./generate-prompt"; -import { grade } from "./grade"; -import { captureEnvironment, saveToGoogleSheets } from "./save"; -import { generateTrialId, log, logSuccess } from "./utils"; +import type { TrialConfig, TrialResult } from "../types.ts"; +import { MODEL_TIERS } from "../types.ts"; +import { agents } from "../config.ts"; +import { prepareTrial } from "./prepare-trial.ts"; 
+import { generatePrompt } from "./generate-prompt.ts"; +import { grade } from "./grade.ts"; +import { captureEnvironment, saveToGoogleSheets } from "./save.ts"; +import { generateTrialId, log, logSuccess } from "./utils.ts"; /** * Run a full eval trial: prepare -> execute agent -> grade -> save. diff --git a/scripts/eval/lib/save.ts b/scripts/eval/lib/save.ts index 7f1e3ef6b698..a644737e3758 100644 --- a/scripts/eval/lib/save.ts +++ b/scripts/eval/lib/save.ts @@ -1,7 +1,7 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { TrialResult } from "../types"; -import { logStep, logSuccess, logError, exec } from "./utils"; +import type { TrialResult } from "../types.ts"; +import { logStep, logSuccess, logError, exec } from "./utils.ts"; const GOOGLE_SHEETS_URL = process.env.EVAL_GOOGLE_SHEETS_URL; diff --git a/scripts/eval/lib/setup-patterns.ts b/scripts/eval/lib/setup-patterns.ts index 105a14ba59e9..d7a022f891f7 100644 --- a/scripts/eval/lib/setup-patterns.ts +++ b/scripts/eval/lib/setup-patterns.ts @@ -1,6 +1,6 @@ import { readFileSync, existsSync, globSync } from "node:fs"; import { join, relative } from "node:path"; -import type { SetupPattern } from "../types"; +import type { SetupPattern } from "../types.ts"; const RULES: Array<[id: string, label: string, pattern: RegExp]> = [ ["global-css", "Global CSS import", /import\s+['"][^'"]+\.(css|scss|sass|less)['"]|import\s+['"]tailwindcss/], diff --git a/scripts/package.json b/scripts/package.json index f3031bcf8178..abb5e6360401 100644 --- a/scripts/package.json +++ b/scripts/package.json @@ -9,7 +9,7 @@ "check": "jiti ./check/check-package.ts", "check-package": "jiti ./check-package.ts", "docs:codemod": "jiti ./snippets/codemod.ts", - "eval": "jiti ./eval/eval.ts", + "eval": "node ./eval/eval.ts", "generate-sandboxes": "jiti ./sandbox/generate.ts", "get-report-message": "jiti ./get-report-message.ts", "get-sandbox-dir": "jiti ./get-sandbox-dir.ts", From 
5aabbda5220c2a96ee10925d8a9fe044697a959f Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:09:11 +0700 Subject: [PATCH 10/63] Update models: Sonnet 4.6, Opus 4.6, Haiku 4.5, GPT 5.4 Medium/High --- scripts/eval/types.ts | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index f00849836e38..c0cb155d16c6 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -9,23 +9,22 @@ export type AgentName = 'claude-code' | 'codex'; -export const CLAUDE_MODELS = ['claude-opus-4-6', 'claude-sonnet-4-6', 'claude-haiku-4-5'] as const; +export const CLAUDE_MODELS = ['claude-sonnet-4-6', 'claude-opus-4-6', 'claude-haiku-4-5'] as const; -export const CODEX_MODELS = ['o4-mini', 'o3', 'gpt-4.1'] as const; +export const CODEX_MODELS = ['gpt-5.4-medium', 'gpt-5.4-high'] as const; export type ClaudeModel = (typeof CLAUDE_MODELS)[number]; export type CodexModel = (typeof CODEX_MODELS)[number]; export type SupportedModel = ClaudeModel | CodexModel; -export type ModelTier = 'opus' | 'sonnet' | 'haiku' | 'codex'; +export type ModelTier = 'opus' | 'sonnet' | 'haiku' | 'codex-medium' | 'codex-high'; export const MODEL_TIERS: Record = { - 'claude-opus-4-6': 'opus', 'claude-sonnet-4-6': 'sonnet', + 'claude-opus-4-6': 'opus', 'claude-haiku-4-5': 'haiku', - 'o4-mini': 'codex', - o3: 'codex', - 'gpt-4.1': 'codex', + 'gpt-5.4-medium': 'codex-medium', + 'gpt-5.4-high': 'codex-high', }; export const SUPPORTED_MODELS_BY_AGENT: Record = { From 986988a7fd6a1666a60485637dcdbfe0f7065858 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:15:20 +0700 Subject: [PATCH 11/63] =?UTF-8?q?Decouple=20agent=20=C3=97=20model=20?= =?UTF-8?q?=C3=97=20effort=20as=20three=20independent=20axes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/eval/eval.ts | 24 ++++++---- scripts/eval/lib/agents/claude-code.ts | 8 ++-- 
scripts/eval/lib/agents/codex.ts | 47 ++++++++++--------- scripts/eval/lib/run-task.ts | 8 ++-- scripts/eval/lib/save.ts | 2 +- scripts/eval/types.ts | 62 +++++++------------------- 6 files changed, 66 insertions(+), 85 deletions(-) diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 733fa1321d9d..85c0de69887f 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -1,8 +1,8 @@ import { randomUUID } from 'node:crypto'; import { Command } from 'commander'; import pc from 'picocolors'; -import type { TrialConfig, TrialResult, AgentName, SupportedModel } from './types.ts'; -import { SUPPORTED_MODELS_BY_AGENT } from './types.ts'; +import type { TrialConfig, TrialResult, AgentName, SupportedModel, Effort } from './types.ts'; +import { MODELS_BY_AGENT, EFFORTS } from './types.ts'; import { PROJECTS, DEFAULT_AGENT, DEFAULT_MODEL } from './config.ts'; import { runTask } from './lib/run-task.ts'; import { listPrompts } from './lib/generate-prompt.ts'; @@ -14,6 +14,7 @@ const program = new Command() .option('-p, --project ', 'run only this project (by name)') .option('-a, --agent ', 'agent to use', DEFAULT_AGENT) .option('-m, --model ', 'model to use', DEFAULT_MODEL) + .option('-e, --effort ', 'effort level: low, medium, high, max', 'high') .option('--prompt ', 'prompt names to compose (from prompts/ dir)', ['setup']) .option('-n, --iterations ', 'number of iterations per project', '1') .option('-v, --verbose', 'verbose output') @@ -45,13 +46,12 @@ if (opts.listPrompts) { } if (opts.listModels) { - log('Supported models by agent:'); - for (const [agent, models] of Object.entries(SUPPORTED_MODELS_BY_AGENT)) { + log('Models by agent:'); + for (const [agent, models] of Object.entries(MODELS_BY_AGENT)) { log(`\n ${pc.bold(agent)}:`); - for (const m of models) { - log(` - ${m}`); - } + for (const m of models) log(` - ${m}`); } + log(`\n Effort levels: ${EFFORTS.join(', ')}`); process.exit(0); } @@ -59,9 +59,10 @@ if (opts.listModels) { const agentName = 
opts.agent as AgentName; const model = opts.model as SupportedModel; +const effort = opts.effort as Effort; const iterations = parseInt(opts.iterations as string, 10); -const supportedModels = SUPPORTED_MODELS_BY_AGENT[agentName]; +const supportedModels = MODELS_BY_AGENT[agentName]; if (!supportedModels) { log(pc.red(`Unknown agent: ${agentName}. Use --list-models to see available agents.`)); process.exit(1); @@ -70,6 +71,10 @@ if (!supportedModels.includes(model)) { log(pc.red(`Model ${model} is not supported by agent ${agentName}. Use --list-models to see options.`)); process.exit(1); } +if (!EFFORTS.includes(effort)) { + log(pc.red(`Unknown effort: ${effort}. Options: ${EFFORTS.join(', ')}`)); + process.exit(1); +} // Filter projects const projects = opts.project @@ -87,7 +92,7 @@ const runId = randomUUID().slice(0, 8); const uploadId = (opts.uploadId as string) || `eval-${runId}`; log(pc.bold('\nStorybook Setup Eval')); -log(`Agent: ${pc.cyan(agentName)} | Model: ${pc.cyan(model)} | Iterations: ${iterations}`); +log(`Agent: ${pc.cyan(agentName)} | Model: ${pc.cyan(model)} | Effort: ${pc.cyan(effort)} | Iterations: ${iterations}`); log(`Projects: ${projects.map((p) => p.name).join(', ')}`); log(`Run: ${runId} | Upload: ${uploadId}`); @@ -105,6 +110,7 @@ for (const project of projects) { project, agent: agentName, model, + effort, prompts: opts.prompt as string[], verbose: opts.verbose as boolean | undefined, }; diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index d13c9241ad56..b1208fc8e9df 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -2,7 +2,7 @@ import type { SDKMessage } from "@anthropic-ai/claude-agent-sdk"; import { query } from "@anthropic-ai/claude-agent-sdk"; import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { Agent, ExecutionResult, SupportedModel } from "../../types.ts"; +import type { Agent, Effort, ExecutionResult, 
SupportedModel } from "../../types.ts"; function logMessage(message: SDKMessage) { const log = (prefix: string, text: string) => process.stderr.write(`${prefix} ${text}\n`); @@ -76,9 +76,9 @@ export const claudeCodeAgent: Agent = { prompt: string, projectPath: string, model: SupportedModel, - options?: { resultsDir?: string }, + options?: { effort?: Effort; resultsDir?: string }, ): Promise { - const { resultsDir } = options ?? {}; + const { effort = "high", resultsDir } = options ?? {}; const startTime = Date.now(); let cost: number | undefined; @@ -93,6 +93,7 @@ export const claudeCodeAgent: Agent = { cwd: projectPath, allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"], maxTurns: 50, + effort, debug: true, systemPrompt: { type: "preset", preset: "claude_code" }, }, @@ -117,6 +118,7 @@ export const claudeCodeAgent: Agent = { return { agent: "claude-code", model, + effort, cost, duration, durationApi, diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index 549ee4d2087d..ee3999c06cad 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -1,54 +1,57 @@ -import { Codex } from '@openai/codex-sdk'; -import { writeFileSync } from 'node:fs'; -import { join } from 'node:path'; -import type { Agent, ExecutionResult, SupportedModel } from '../../types.ts'; +import { Codex } from "@openai/codex-sdk"; +import { writeFileSync } from "node:fs"; +import { join } from "node:path"; +import type { Agent, Effort, ExecutionResult, SupportedModel } from "../../types.ts"; + +/** Map our unified effort to Codex's model_reasoning_effort values. 
*/ +const CODEX_EFFORT: Record = { + low: "low", + medium: "medium", + high: "high", + max: "xhigh", +}; export const codexAgent: Agent = { - name: 'codex', + name: "codex", async execute( prompt: string, projectPath: string, model: SupportedModel, - options?: { verbose?: boolean; resultsDir?: string } + options?: { effort?: Effort; verbose?: boolean; resultsDir?: string }, ): Promise { - const { verbose, resultsDir } = options ?? {}; + const { effort = "high", resultsDir } = options ?? {}; const startTime = Date.now(); - const codex = new Codex({ model }); + const codex = new Codex({ + model, + config: { model_reasoning_effort: CODEX_EFFORT[effort] }, + }); const thread = codex.startThread({ workingDirectory: projectPath }); const { events } = await thread.runStreamed(prompt); const items: unknown[] = []; for await (const event of events) { - if (verbose && event.type === 'item.completed') { + if (event.type === "item.completed") { const item = event.item as Record; - if (item.type === 'message' && Array.isArray(item.content)) { + items.push(item); + if (item.type === "message" && Array.isArray(item.content)) { for (const block of item.content) { - if (typeof block === 'object' && block !== null && 'text' in block) { + if (typeof block === "object" && block !== null && "text" in block) { process.stderr.write(`${(block as { text: string }).text}\n`); } } } } - - if (event.type === 'item.completed') { - items.push(event.item); - } } const duration = (Date.now() - startTime) / 1000; if (resultsDir) { - writeFileSync(join(resultsDir, 'transcript.json'), JSON.stringify(items, null, 2)); + writeFileSync(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); } - return { - agent: 'codex', - model, - duration, - turns: items.length, - }; + return { agent: "codex", model, effort, duration, turns: items.length }; }, }; diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index 849db788f945..b3e3cb01574e 100644 --- 
a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -1,7 +1,6 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; import type { TrialConfig, TrialResult } from "../types.ts"; -import { MODEL_TIERS } from "../types.ts"; import { agents } from "../config.ts"; import { prepareTrial } from "./prepare-trial.ts"; import { generatePrompt } from "./generate-prompt.ts"; @@ -17,7 +16,7 @@ export async function runTask( runId: string, uploadId: string, ): Promise { - const { project, agent: agentName, model, prompts: promptNames, verbose } = config; + const { project, agent: agentName, model, effort, prompts: promptNames, verbose } = config; const trialId = generateTrialId(project.name, agentName, model); const timestamp = new Date().toISOString(); @@ -34,9 +33,10 @@ export async function runTask( writeFileSync(join(paths.resultsDir, "prompt.md"), prompt); // 4. Execute the agent - log(` Running ${agentName} (${model})...`); + log(` Running ${agentName} (${model}, effort=${effort})...`); const agent = agents[agentName]; const execution = await agent.execute(prompt, paths.projectPath, model, { + effort, verbose, resultsDir: paths.resultsDir, }); @@ -53,7 +53,7 @@ export async function runTask( project: project.name, agent: agentName, model, - modelTier: MODEL_TIERS[model], + effort, timestamp, prompts: promptNames || ["setup"], baselineCommit: paths.baselineCommit, diff --git a/scripts/eval/lib/save.ts b/scripts/eval/lib/save.ts index a644737e3758..6fb8f392e9fb 100644 --- a/scripts/eval/lib/save.ts +++ b/scripts/eval/lib/save.ts @@ -45,7 +45,7 @@ export async function saveToGoogleSheets( project: result.project, agent: result.agent, model: result.model, - modelTier: result.modelTier, + effort: result.effort, prompts: result.prompts.join("+"), buildSuccess: result.grading.buildSuccess, typeCheckErrors: result.grading.typeCheckErrors, diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index c0cb155d16c6..979b4f6ea83c 100644 --- 
a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -1,34 +1,25 @@ /** * Core types for the Storybook setup eval system. * - * The eval tests how well an AI agent can complete a Storybook setup - * (after `npx storybook@latest init --yes`) across real-world projects. + * Three independent axes: agent × model × effort */ -// --- Agent & Model Types --- +// --- Agent, Model, Effort --- -export type AgentName = 'claude-code' | 'codex'; +export type AgentName = "claude-code" | "codex"; +export type Effort = "low" | "medium" | "high" | "max"; -export const CLAUDE_MODELS = ['claude-sonnet-4-6', 'claude-opus-4-6', 'claude-haiku-4-5'] as const; - -export const CODEX_MODELS = ['gpt-5.4-medium', 'gpt-5.4-high'] as const; +export const CLAUDE_MODELS = ["claude-sonnet-4-6", "claude-opus-4-6", "claude-haiku-4-5"] as const; +export const CODEX_MODELS = ["gpt-5.4"] as const; +export const ALL_MODELS = [...CLAUDE_MODELS, ...CODEX_MODELS] as const; +export const EFFORTS: Effort[] = ["low", "medium", "high", "max"]; export type ClaudeModel = (typeof CLAUDE_MODELS)[number]; export type CodexModel = (typeof CODEX_MODELS)[number]; export type SupportedModel = ClaudeModel | CodexModel; -export type ModelTier = 'opus' | 'sonnet' | 'haiku' | 'codex-medium' | 'codex-high'; - -export const MODEL_TIERS: Record = { - 'claude-sonnet-4-6': 'sonnet', - 'claude-opus-4-6': 'opus', - 'claude-haiku-4-5': 'haiku', - 'gpt-5.4-medium': 'codex-medium', - 'gpt-5.4-high': 'codex-high', -}; - -export const SUPPORTED_MODELS_BY_AGENT: Record = { - 'claude-code': CLAUDE_MODELS, +export const MODELS_BY_AGENT: Record = { + "claude-code": CLAUDE_MODELS, codex: CODEX_MODELS, }; @@ -48,19 +39,16 @@ export interface TrialConfig { project: Project; agent: AgentName; model: SupportedModel; - /** Prompt names to compose (from prompts/ dir). Defaults to ["setup"]. 
*/ + effort: Effort; prompts?: string[]; verbose?: boolean; } export interface TrialPaths { trialDir: string; - /** Root of the cloned repo (git root) */ repoRoot: string; - /** Working path where storybook lives (may differ from repoRoot for monorepos) */ projectPath: string; resultsDir: string; - /** The git commit hash of the post-init baseline */ baselineCommit: string; } @@ -69,6 +57,7 @@ export interface TrialPaths { export interface ExecutionResult { agent: string; model: string; + effort: string; cost?: number; duration: number; durationApi?: number; @@ -79,7 +68,7 @@ export interface ExecutionResult { export interface ChangedFile { path: string; - status: 'A' | 'M' | 'D' | 'R'; + status: "A" | "M" | "D" | "R"; } // --- Setup Patterns --- @@ -87,7 +76,6 @@ export interface ChangedFile { export interface SetupPattern { id: string; label: string; - /** Files where this pattern was detected */ sourceFiles: string[]; } @@ -98,31 +86,16 @@ export interface GradingResult { buildError?: string; typeCheckErrors: number; typeCheckOutput?: string; - /** Files changed by the agent (diff from post-init baseline) */ changedFiles: ChangedFile[]; - /** Storybook-related files changed by the agent */ storybookFiles: ChangedFile[]; - /** Setup patterns the agent configured */ setupPatterns: SetupPattern[]; - /** Ghost stories grading (placeholder until CLI command exists) */ ghostStories?: GhostStoriesResult; } -/** - * Ghost stories result - measures actual story rendering success. - * - * Currently a placeholder. The Storybook ghost stories feature is triggered - * via channel events (ghostStoriesRequest), not a CLI command. - * A `storybook ghost-stories` command needs to be built first. 
- */ export interface GhostStoriesResult { - /** How many candidate components were found */ candidateCount: number; - /** How many stories were generated and tested */ total: number; - /** How many stories rendered successfully */ passed: number; - /** Success rate (passed / total) */ successRate: number; } @@ -130,10 +103,7 @@ export interface GhostStoriesResult { export interface QualityResult { score: number; - breakdown: { - build: number; - typecheck: number; - }; + breakdown: { build: number; typecheck: number }; } // --- Final Result --- @@ -143,7 +113,7 @@ export interface TrialResult { project: string; agent: string; model: string; - modelTier: ModelTier; + effort: string; timestamp: string; prompts: string[]; baselineCommit: string; @@ -160,6 +130,6 @@ export interface Agent { prompt: string, projectPath: string, model: SupportedModel, - options?: { verbose?: boolean; resultsDir?: string } + options?: { effort?: Effort; verbose?: boolean; resultsDir?: string }, ): Promise; } From 1ee462d97f8b5faebb61ba34d6ce10929f14a184 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:20:25 +0700 Subject: [PATCH 12/63] Simplify prompt to single name, add per-agent default model --- scripts/eval/config.ts | 6 +++++- scripts/eval/eval.ts | 10 +++++----- scripts/eval/lib/generate-prompt.ts | 30 +++++++---------------------- scripts/eval/lib/run-task.ts | 6 +++--- scripts/eval/lib/save.ts | 2 +- scripts/eval/types.ts | 4 ++-- 6 files changed, 23 insertions(+), 35 deletions(-) diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index 613f2ff50ee1..df665668d75e 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -59,4 +59,8 @@ export const agents: Record = { }; export const DEFAULT_AGENT: AgentName = 'claude-code'; -export const DEFAULT_MODEL = 'claude-sonnet-4-6' as const; + +export const DEFAULT_MODEL: Record = { + 'claude-code': 'claude-sonnet-4-6', + codex: 'gpt-5.4', +}; diff --git a/scripts/eval/eval.ts 
b/scripts/eval/eval.ts index 85c0de69887f..9ed15f6665bf 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -13,9 +13,9 @@ const program = new Command() .description('Evaluate AI agents on Storybook setup tasks') .option('-p, --project ', 'run only this project (by name)') .option('-a, --agent ', 'agent to use', DEFAULT_AGENT) - .option('-m, --model ', 'model to use', DEFAULT_MODEL) + .option('-m, --model ', 'model to use (default: per agent)') .option('-e, --effort ', 'effort level: low, medium, high, max', 'high') - .option('--prompt ', 'prompt names to compose (from prompts/ dir)', ['setup']) + .option('--prompt ', 'prompt name (from prompts/ dir)', 'setup') .option('-n, --iterations ', 'number of iterations per project', '1') .option('-v, --verbose', 'verbose output') .option('-u, --upload-id ', 'upload ID for grouping results in Google Sheets') @@ -38,7 +38,7 @@ if (opts.listProjects) { } if (opts.listPrompts) { - log('Available prompts (compose with --prompt name1 name2):'); + log('Available prompts (use with --prompt ):'); for (const name of listPrompts()) { log(` ${pc.bold(name)}`); } @@ -58,7 +58,7 @@ if (opts.listModels) { // --- Validate inputs --- const agentName = opts.agent as AgentName; -const model = opts.model as SupportedModel; +const model = (opts.model ?? 
DEFAULT_MODEL[agentName]) as SupportedModel; const effort = opts.effort as Effort; const iterations = parseInt(opts.iterations as string, 10); @@ -111,7 +111,7 @@ for (const project of projects) { agent: agentName, model, effort, - prompts: opts.prompt as string[], + prompt: opts.prompt as string, verbose: opts.verbose as boolean | undefined, }; diff --git a/scripts/eval/lib/generate-prompt.ts b/scripts/eval/lib/generate-prompt.ts index dfac03a4a426..1debad6620ba 100644 --- a/scripts/eval/lib/generate-prompt.ts +++ b/scripts/eval/lib/generate-prompt.ts @@ -2,32 +2,16 @@ import { readFileSync, existsSync, readdirSync } from "node:fs"; import { resolve, basename } from "node:path"; import { PROMPTS_DIR } from "./utils.ts"; -/** - * Build a prompt by concatenating one or more markdown files from prompts/. - * - * Names are resolved as `prompts/{name}.md`. Multiple names are joined - * with a blank line, so you can compose: `["setup", "self-heal"]`. - * - * If no names are given, defaults to `["setup"]`. - */ -export function generatePrompt(names?: string[]): string { - const promptNames = names && names.length > 0 ? names : ["setup"]; - - const parts: string[] = []; - for (const name of promptNames) { - const file = resolve(PROMPTS_DIR, `${name}.md`); - if (!existsSync(file)) { - throw new Error(`Prompt not found: ${file}\nAvailable: ${listPrompts().join(", ")}`); - } - parts.push(readFileSync(file, "utf-8").trim()); +/** Load a prompt by name from prompts/{name}.md. Defaults to "setup". */ +export function generatePrompt(name = "setup"): string { + const file = resolve(PROMPTS_DIR, `${name}.md`); + if (!existsSync(file)) { + throw new Error(`Prompt not found: ${file}\nAvailable: ${listPrompts().join(", ")}`); } - - return parts.join("\n\n"); + return readFileSync(file, "utf-8").trim(); } -/** - * List available prompt names (without .md extension). - */ +/** List available prompt names. 
*/ export function listPrompts(): string[] { if (!existsSync(PROMPTS_DIR)) return []; return readdirSync(PROMPTS_DIR) diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index b3e3cb01574e..52960c67abfa 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -16,7 +16,7 @@ export async function runTask( runId: string, uploadId: string, ): Promise { - const { project, agent: agentName, model, effort, prompts: promptNames, verbose } = config; + const { project, agent: agentName, model, effort, prompt: promptName, verbose } = config; const trialId = generateTrialId(project.name, agentName, model); const timestamp = new Date().toISOString(); @@ -29,7 +29,7 @@ export async function runTask( const environment = await captureEnvironment(paths.resultsDir); // 3. Generate the prompt - const prompt = generatePrompt(promptNames); + const prompt = generatePrompt(promptName); writeFileSync(join(paths.resultsDir, "prompt.md"), prompt); // 4. Execute the agent @@ -55,7 +55,7 @@ export async function runTask( model, effort, timestamp, - prompts: promptNames || ["setup"], + prompt: promptName || "setup", baselineCommit: paths.baselineCommit, execution, grading, diff --git a/scripts/eval/lib/save.ts b/scripts/eval/lib/save.ts index 6fb8f392e9fb..266461cee4e6 100644 --- a/scripts/eval/lib/save.ts +++ b/scripts/eval/lib/save.ts @@ -46,7 +46,7 @@ export async function saveToGoogleSheets( agent: result.agent, model: result.model, effort: result.effort, - prompts: result.prompts.join("+"), + prompt: result.prompt, buildSuccess: result.grading.buildSuccess, typeCheckErrors: result.grading.typeCheckErrors, ghostStoriesPassed: ghost?.passed ?? 
null, diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index 979b4f6ea83c..1823c194191b 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -40,7 +40,7 @@ export interface TrialConfig { agent: AgentName; model: SupportedModel; effort: Effort; - prompts?: string[]; + prompt?: string; verbose?: boolean; } @@ -115,7 +115,7 @@ export interface TrialResult { model: string; effort: string; timestamp: string; - prompts: string[]; + prompt: string; baselineCommit: string; execution: ExecutionResult; grading: GradingResult; From 06c5f9ad274a18bbe0f4ff3f22094b890804f5ea Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:30:57 +0700 Subject: [PATCH 13/63] Split into eval.ts (single run) and eval-parallel.ts (8 runs) --- scripts/eval/config.ts | 78 +++++------- scripts/eval/eval-parallel.ts | 106 ++++++++++++++++ scripts/eval/eval.ts | 223 +++++++++------------------------ scripts/eval/prompts/doctor.md | 3 - scripts/eval/types.ts | 35 +++--- 5 files changed, 216 insertions(+), 229 deletions(-) create mode 100644 scripts/eval/eval-parallel.ts delete mode 100644 scripts/eval/prompts/doctor.md diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index df665668d75e..fe9a1059719d 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -1,66 +1,50 @@ -import type { Project, AgentName, Agent } from './types.ts'; -import { claudeCodeAgent } from './lib/agents/claude-code.ts'; -import { codexAgent } from './lib/agents/codex.ts'; +import type { AgentName, Agent } from "./types.ts"; +import type { Project } from "./types.ts"; +import { claudeCodeAgent } from "./lib/agents/claude-code.ts"; +import { codexAgent } from "./lib/agents/codex.ts"; -/** - * Pre-prepared eval baseline repos. 
- * - * Each repo is a fork with an `eval-baseline` branch where: - * - Storybook files were cleaned - * - `npx storybook@latest init --yes --no-dev` was run - * - All deps installed and committed - * - * To regenerate: `npx jiti scripts/eval/prepare-repos.ts` - */ export const PROJECTS: Project[] = [ { - name: 'mealdrop', - repo: 'https://github.com/kasperpeulen/mealdrop', - branch: 'eval-baseline', - description: 'Styled components, Redux, React Router', + name: "mealdrop", + repo: "https://github.com/kasperpeulen/mealdrop", + branch: "eval-baseline", + description: "Styled components, Redux, React Router", }, { - name: 'edgy', - repo: 'https://github.com/kasperpeulen/edgy', - branch: 'eval-baseline', - description: 'Tailwind, HeadlessUI, React Router', + name: "edgy", + repo: "https://github.com/kasperpeulen/edgy", + branch: "eval-baseline", + description: "Tailwind, HeadlessUI, React Router", }, { - name: 'wikitok', - repo: 'https://github.com/kasperpeulen/wikitok', - branch: 'eval-baseline', - projectDir: 'frontend', - description: 'Simple project with Tailwind', + name: "wikitok", + repo: "https://github.com/kasperpeulen/wikitok", + branch: "eval-baseline", + projectDir: "frontend", + description: "Simple project with Tailwind", }, { - name: 'baklava', - repo: 'https://github.com/kasperpeulen/baklava', - branch: 'eval-baseline', - description: 'Component library with Zustand', + name: "baklava", + repo: "https://github.com/kasperpeulen/baklava", + branch: "eval-baseline", + description: "Component library with Zustand", }, { - name: 'echarts', - repo: 'https://github.com/kasperpeulen/echarts-react', - branch: 'eval-baseline', - description: 'ECharts React wrapper', + name: "echarts", + repo: "https://github.com/kasperpeulen/echarts-react", + branch: "eval-baseline", + description: "ECharts React wrapper", }, { - name: 'evergreen-ci', - repo: 'https://github.com/kasperpeulen/ui', - branch: 'eval-baseline', - projectDir: 'packages/lib', - description: 
'GraphQL', + name: "evergreen-ci", + repo: "https://github.com/kasperpeulen/ui", + branch: "eval-baseline", + projectDir: "packages/lib", + description: "GraphQL", }, ]; export const agents: Record = { - 'claude-code': claudeCodeAgent, + "claude-code": claudeCodeAgent, codex: codexAgent, }; - -export const DEFAULT_AGENT: AgentName = 'claude-code'; - -export const DEFAULT_MODEL: Record = { - 'claude-code': 'claude-sonnet-4-6', - codex: 'gpt-5.4', -}; diff --git a/scripts/eval/eval-parallel.ts b/scripts/eval/eval-parallel.ts new file mode 100644 index 000000000000..11e459a04c4a --- /dev/null +++ b/scripts/eval/eval-parallel.ts @@ -0,0 +1,106 @@ +import { randomUUID } from "node:crypto"; +import { Command } from "commander"; +import pc from "picocolors"; +import type { TrialConfig, TrialResult } from "./types.ts"; +import { MODELS, effortForModel } from "./types.ts"; +import { PROJECTS } from "./config.ts"; +import { runTask } from "./lib/run-task.ts"; +import { listPrompts } from "./lib/generate-prompt.ts"; +import { log, formatDuration, formatCost } from "./lib/utils.ts"; + +const program = new Command() + .name("eval-parallel") + .description("Run all 4 models × 2 prompts = 8 evals in parallel for one project") + .option("-p, --project ", "project to evaluate") + .option("-u, --upload-id ", "upload ID for Google Sheets"); + +program.parse(); +const opts = program.opts(); + +const project = PROJECTS.find((p) => p.name === opts.project); +if (!project) { + log(pc.red(`Specify a project with -p. 
Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); + process.exit(1); +} + +const prompts = listPrompts(); +const runId = randomUUID().slice(0, 8); +const uploadId = (opts.uploadId as string) || `eval-${runId}`; + +// Build all 4 models × 2 prompts = 8 configs +const configs: TrialConfig[] = []; +for (const m of MODELS) { + for (const prompt of prompts) { + configs.push({ + project, + agent: m.agent, + model: m.id, + effort: effortForModel(m.id, "high"), + prompt, + }); + } +} + +log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); +log(`${configs.length} parallel runs: ${MODELS.map((m) => m.label).join(", ")} × ${prompts.join(", ")}`); +log(`Run: ${runId}\n`); + +// Run all in parallel +const settled = await Promise.allSettled(configs.map((c) => runTask(c, runId, uploadId))); + +const results: TrialResult[] = []; +for (let i = 0; i < settled.length; i++) { + const s = settled[i]!; + if (s.status === "fulfilled") { + results.push(s.value); + } else { + const c = configs[i]!; + log(pc.red(`\n✗ ${c.model} + ${c.prompt}: ${s.reason}`)); + } +} + +// Summary table sorted by ghost stories rate +if (results.length > 0) { + results.sort((a, b) => { + const ga = a.grading.ghostStories?.successRate ?? -1; + const gb = b.grading.ghostStories?.successRate ?? -1; + return gb - ga; + }); + + log(pc.bold("\n\nResults (sorted by ghost stories rate)")); + log("=".repeat(110)); + log( + ["Model", "Prompt", "Build", "Ghost", "TS Err", "Cost", "Time", "Turns"] + .map((h, i) => h.padEnd(i === 0 ? 22 : i === 1 ? 12 : 10)) + .join(" | "), + ); + log("-".repeat(110)); + + for (const r of results) { + const build = r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL"); + const ghost = r.grading.ghostStories; + const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; + log( + [ + r.model.padEnd(22), + r.prompt.padEnd(12), + (r.grading.buildSuccess ? 
"PASS" : "FAIL").padEnd(10).replace(/PASS|FAIL/, build), + ghostStr.padEnd(10), + String(r.grading.typeCheckErrors).padEnd(10), + formatCost(r.execution.cost).padEnd(10), + formatDuration(r.execution.duration).padEnd(10), + String(r.execution.turns).padEnd(10), + ].join(" | "), + ); + } + + log("-".repeat(110)); + const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); + const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r) => r != null); + const avgGhost = ghostRates.length > 0 ? ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; + + log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); + log(`Total cost: ${pc.bold(formatCost(totalCost))}`); +} + +log("\nDone."); diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 9ed15f6665bf..440300d57c0c 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -1,186 +1,79 @@ -import { randomUUID } from 'node:crypto'; -import { Command } from 'commander'; -import pc from 'picocolors'; -import type { TrialConfig, TrialResult, AgentName, SupportedModel, Effort } from './types.ts'; -import { MODELS_BY_AGENT, EFFORTS } from './types.ts'; -import { PROJECTS, DEFAULT_AGENT, DEFAULT_MODEL } from './config.ts'; -import { runTask } from './lib/run-task.ts'; -import { listPrompts } from './lib/generate-prompt.ts'; -import { log, formatDuration, formatCost } from './lib/utils.ts'; +import { randomUUID } from "node:crypto"; +import { Command } from "commander"; +import pc from "picocolors"; +import type { TrialConfig, Effort } from "./types.ts"; +import { MODELS, agentForModel, effortForModel } from "./types.ts"; +import { PROJECTS } from "./config.ts"; +import { runTask } from "./lib/run-task.ts"; +import { listPrompts } from "./lib/generate-prompt.ts"; +import { log, formatDuration, formatCost } from "./lib/utils.ts"; const program = new Command() - .name('eval') - .description('Evaluate AI agents on Storybook setup tasks') - 
.option('-p, --project ', 'run only this project (by name)') - .option('-a, --agent ', 'agent to use', DEFAULT_AGENT) - .option('-m, --model ', 'model to use (default: per agent)') - .option('-e, --effort ', 'effort level: low, medium, high, max', 'high') - .option('--prompt ', 'prompt name (from prompts/ dir)', 'setup') - .option('-n, --iterations ', 'number of iterations per project', '1') - .option('-v, --verbose', 'verbose output') - .option('-u, --upload-id ', 'upload ID for grouping results in Google Sheets') - .option('--list-projects', 'list available projects and exit') - .option('--list-models', 'list supported models and exit') - .option('--list-prompts', 'list available prompts and exit'); + .name("eval") + .description("Run a single Storybook setup eval") + .option("-p, --project ", "project to evaluate") + .option("-m, --model ", "model to use", "claude-sonnet-4-6") + .option("-e, --effort ", "effort: low, medium, high, max", "high") + .option("--prompt ", "prompt name", "setup") + .option("-v, --verbose", "verbose output") + .option("-u, --upload-id ", "upload ID for Google Sheets") + .option("--list-projects", "list projects") + .option("--list-models", "list models") + .option("--list-prompts", "list prompts"); program.parse(); - const opts = program.opts(); -// --- List commands --- - if (opts.listProjects) { - log('Available projects:'); - for (const p of PROJECTS) { - log(` ${pc.bold(p.name)} - ${p.description || p.repo}`); - } + for (const p of PROJECTS) log(` ${pc.bold(p.name)} — ${p.description}`); process.exit(0); } - -if (opts.listPrompts) { - log('Available prompts (use with --prompt ):'); - for (const name of listPrompts()) { - log(` ${pc.bold(name)}`); - } +if (opts.listModels) { + for (const m of MODELS) log(` ${pc.bold(m.id)} (${m.agent}) — ${m.label}`); process.exit(0); } - -if (opts.listModels) { - log('Models by agent:'); - for (const [agent, models] of Object.entries(MODELS_BY_AGENT)) { - log(`\n ${pc.bold(agent)}:`); - for (const 
m of models) log(` - ${m}`); - } - log(`\n Effort levels: ${EFFORTS.join(', ')}`); +if (opts.listPrompts) { + for (const name of listPrompts()) log(` ${pc.bold(name)}`); process.exit(0); } -// --- Validate inputs --- - -const agentName = opts.agent as AgentName; -const model = (opts.model ?? DEFAULT_MODEL[agentName]) as SupportedModel; -const effort = opts.effort as Effort; -const iterations = parseInt(opts.iterations as string, 10); - -const supportedModels = MODELS_BY_AGENT[agentName]; -if (!supportedModels) { - log(pc.red(`Unknown agent: ${agentName}. Use --list-models to see available agents.`)); - process.exit(1); -} -if (!supportedModels.includes(model)) { - log(pc.red(`Model ${model} is not supported by agent ${agentName}. Use --list-models to see options.`)); - process.exit(1); -} -if (!EFFORTS.includes(effort)) { - log(pc.red(`Unknown effort: ${effort}. Options: ${EFFORTS.join(', ')}`)); +const project = PROJECTS.find((p) => p.name === opts.project); +if (!project) { + log(pc.red(`Specify a project with -p. Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); process.exit(1); } -// Filter projects -const projects = opts.project - ? PROJECTS.filter((p) => p.name === opts.project) - : PROJECTS; - -if (projects.length === 0) { - log(pc.red(`Project not found: ${opts.project}. 
Use --list-projects to see available projects.`)); - process.exit(1); -} - -// --- Run evals --- - +const model = opts.model as string; +const effort = effortForModel(model, opts.effort as Effort); const runId = randomUUID().slice(0, 8); const uploadId = (opts.uploadId as string) || `eval-${runId}`; -log(pc.bold('\nStorybook Setup Eval')); -log(`Agent: ${pc.cyan(agentName)} | Model: ${pc.cyan(model)} | Effort: ${pc.cyan(effort)} | Iterations: ${iterations}`); -log(`Projects: ${projects.map((p) => p.name).join(', ')}`); -log(`Run: ${runId} | Upload: ${uploadId}`); - -const allResults: TrialResult[] = []; - -for (const project of projects) { - for (let i = 0; i < iterations; i++) { - const suffix = iterations > 1 ? ` (iteration ${i + 1}/${iterations})` : ''; - log(pc.bold(`\n${'='.repeat(60)}`)); - log(pc.bold(`${project.name}${suffix}`)); - log(`${project.description || ''}`); - log(pc.bold('='.repeat(60))); - - const config: TrialConfig = { - project, - agent: agentName, - model, - effort, - prompt: opts.prompt as string, - verbose: opts.verbose as boolean | undefined, - }; - - try { - const result = await runTask(config, runId, uploadId); - allResults.push(result); - } catch (error) { - log(pc.red(`\nFailed to evaluate ${project.name}: ${error instanceof Error ? error.message : error}`)); - if (opts.verbose && error instanceof Error) { - log(error.stack || ''); - } - } - } -} - -// --- Print summary table --- - -if (allResults.length > 0) { - log(pc.bold('\n\nResults Summary')); - log('='.repeat(120)); - - // Header - const header = [ - 'Project'.padEnd(15), - 'Build'.padEnd(7), - 'TS Err'.padEnd(8), - 'Ghost'.padEnd(12), - 'Patterns'.padEnd(10), - 'Quality'.padEnd(9), - 'Cost'.padEnd(8), - 'Time'.padEnd(8), - 'Turns'.padEnd(7), - ].join(' | '); - log(header); - log('-'.repeat(120)); - - // Rows - for (const r of allResults) { - const buildStr = r.grading.buildSuccess ? 'PASS' : 'FAIL'; - const buildColored = r.grading.buildSuccess ? 
pc.green(buildStr) : pc.red(buildStr); - const ghost = r.grading.ghostStories; - const ghostStr = ghost ? `${ghost.passed}/${ghost.total}` : '-'; - const patternsStr = String(r.grading.setupPatterns.length); - const row = [ - r.project.padEnd(15), - buildStr.padEnd(7).replace(buildStr, buildColored), - String(r.grading.typeCheckErrors).padEnd(8), - ghostStr.padEnd(12), - patternsStr.padEnd(10), - String(r.quality.score).padEnd(9), - formatCost(r.execution.cost).padEnd(8), - formatDuration(r.execution.duration).padEnd(8), - String(r.execution.turns).padEnd(7), - ].join(' | '); - log(row); - } - - log('-'.repeat(120)); - - // Aggregate - const avgQuality = - allResults.reduce((sum, r) => sum + r.quality.score, 0) / allResults.length; - const totalCost = allResults.reduce((sum, r) => sum + (r.execution.cost || 0), 0); - const passRate = - allResults.filter((r) => r.grading.buildSuccess).length / allResults.length; - - log(`\nBuild pass rate: ${pc.bold(`${Math.round(passRate * 100)}%`)}`); - log(`Average quality: ${pc.bold(avgQuality.toFixed(2))}`); - log(`Total cost: ${pc.bold(formatCost(totalCost))}`); +const config: TrialConfig = { + project, + agent: agentForModel(model), + model, + effort, + prompt: opts.prompt as string, + verbose: opts.verbose as boolean | undefined, +}; + +log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); +log(`Model: ${model} | Effort: ${effort} | Prompt: ${config.prompt}`); +log(`Run: ${runId}\n`); + +try { + const result = await runTask(config, runId, uploadId); + const ghost = result.grading.ghostStories; + const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; + + log(pc.bold("\nResult")); + log(` Build: ${result.grading.buildSuccess ? 
pc.green("PASS") : pc.red("FAIL")}`); + log(` Ghost: ${ghostStr}`); + log(` TS Err: ${result.grading.typeCheckErrors}`); + log(` Cost: ${formatCost(result.execution.cost)}`); + log(` Time: ${formatDuration(result.execution.duration)}`); + log(` Turns: ${result.execution.turns}`); +} catch (error) { + log(pc.red(`\nFailed: ${error instanceof Error ? error.message : error}`)); + process.exit(1); } - -log('\nDone.'); diff --git a/scripts/eval/prompts/doctor.md b/scripts/eval/prompts/doctor.md deleted file mode 100644 index 9fc75e4d2d41..000000000000 --- a/scripts/eval/prompts/doctor.md +++ /dev/null @@ -1,3 +0,0 @@ -## Diagnostics first - -Before making large configuration changes, prefer a fast diagnostic command if it can reduce ambiguity. Use Storybook-specific diagnostics when available (e.g. `npx storybook doctor`). diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index 1823c194191b..f7d3237e4970 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -1,7 +1,8 @@ /** * Core types for the Storybook setup eval system. * - * Three independent axes: agent × model × effort + * Three independent axes: model × effort × prompt + * Agent is derived from the model. 
*/ // --- Agent, Model, Effort --- @@ -9,19 +10,25 @@ export type AgentName = "claude-code" | "codex"; export type Effort = "low" | "medium" | "high" | "max"; -export const CLAUDE_MODELS = ["claude-sonnet-4-6", "claude-opus-4-6", "claude-haiku-4-5"] as const; -export const CODEX_MODELS = ["gpt-5.4"] as const; -export const ALL_MODELS = [...CLAUDE_MODELS, ...CODEX_MODELS] as const; -export const EFFORTS: Effort[] = ["low", "medium", "high", "max"]; +export const MODELS = [ + { id: "claude-sonnet-4-6", agent: "claude-code" as AgentName, label: "Sonnet 4.6" }, + { id: "claude-opus-4-6", agent: "claude-code" as AgentName, label: "Opus 4.6" }, + { id: "gpt-5.4-medium", agent: "codex" as AgentName, label: "GPT 5.4 Medium", effort: "medium" as Effort }, + { id: "gpt-5.4-high", agent: "codex" as AgentName, label: "GPT 5.4 High", effort: "high" as Effort }, +] as const; -export type ClaudeModel = (typeof CLAUDE_MODELS)[number]; -export type CodexModel = (typeof CODEX_MODELS)[number]; -export type SupportedModel = ClaudeModel | CodexModel; +export type SupportedModel = (typeof MODELS)[number]["id"]; -export const MODELS_BY_AGENT: Record = { - "claude-code": CLAUDE_MODELS, - codex: CODEX_MODELS, -}; +export function agentForModel(model: string): AgentName { + const entry = MODELS.find((m) => m.id === model); + if (!entry) throw new Error(`Unknown model: ${model}`); + return entry.agent; +} + +export function effortForModel(model: string, defaultEffort: Effort): Effort { + const entry = MODELS.find((m) => m.id === model); + return (entry as { effort?: Effort })?.effort ?? 
defaultEffort; +} // --- Project Types --- @@ -40,7 +47,7 @@ export interface TrialConfig { agent: AgentName; model: SupportedModel; effort: Effort; - prompt?: string; + prompt: string; verbose?: boolean; } @@ -114,8 +121,8 @@ export interface TrialResult { agent: string; model: string; effort: string; - timestamp: string; prompt: string; + timestamp: string; baselineCommit: string; execution: ExecutionResult; grading: GradingResult; From 2336c46a2f1dd0dbf5fe613bf530ced68f31adbe Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:36:14 +0700 Subject: [PATCH 14/63] Add prefixed logging for parallel runs --- scripts/eval/eval-parallel.ts | 9 +++++++-- scripts/eval/lib/run-task.ts | 7 +++++-- scripts/eval/lib/utils.ts | 22 ++++++++++++++++++---- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/scripts/eval/eval-parallel.ts b/scripts/eval/eval-parallel.ts index 11e459a04c4a..6217c428480d 100644 --- a/scripts/eval/eval-parallel.ts +++ b/scripts/eval/eval-parallel.ts @@ -6,7 +6,7 @@ import { MODELS, effortForModel } from "./types.ts"; import { PROJECTS } from "./config.ts"; import { runTask } from "./lib/run-task.ts"; import { listPrompts } from "./lib/generate-prompt.ts"; -import { log, formatDuration, formatCost } from "./lib/utils.ts"; +import { log, formatDuration, formatCost, createLogger } from "./lib/utils.ts"; const program = new Command() .name("eval-parallel") @@ -46,7 +46,12 @@ log(`${configs.length} parallel runs: ${MODELS.map((m) => m.label).join(", ")} log(`Run: ${runId}\n`); // Run all in parallel -const settled = await Promise.allSettled(configs.map((c) => runTask(c, runId, uploadId))); +const settled = await Promise.allSettled( + configs.map((c) => { + const tag = `${c.model.replace("claude-", "")}+${c.prompt}`; + return runTask(c, runId, uploadId, createLogger(tag)); + }), +); const results: TrialResult[] = []; for (let i = 0; i < settled.length; i++) { diff --git a/scripts/eval/lib/run-task.ts 
b/scripts/eval/lib/run-task.ts index 52960c67abfa..e612d384b7ab 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -6,7 +6,8 @@ import { prepareTrial } from "./prepare-trial.ts"; import { generatePrompt } from "./generate-prompt.ts"; import { grade } from "./grade.ts"; import { captureEnvironment, saveToGoogleSheets } from "./save.ts"; -import { generateTrialId, log, logSuccess } from "./utils.ts"; +import { generateTrialId, createLogger } from "./utils.ts"; +import type { Logger } from "./utils.ts"; /** * Run a full eval trial: prepare -> execute agent -> grade -> save. @@ -15,12 +16,14 @@ export async function runTask( config: TrialConfig, runId: string, uploadId: string, + logger?: Logger, ): Promise { const { project, agent: agentName, model, effort, prompt: promptName, verbose } = config; + const { log, logSuccess } = logger ?? createLogger(); const trialId = generateTrialId(project.name, agentName, model); const timestamp = new Date().toISOString(); - log(`\nPreparing ${project.name}...`); + log(`Preparing ${project.name}...`); // 1. Prepare the trial const paths = await prepareTrial(project, trialId); diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index f44f43f763c3..fa97ba01e29c 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -10,10 +10,24 @@ export const PROMPTS_DIR = resolve(import.meta.dirname, "..", "prompts"); // --- Logging --- -export const log = (msg: string) => console.log(msg); -export const logStep = (msg: string) => console.log(` ${pc.cyan(">")} ${msg}`); -export const logSuccess = (msg: string) => console.log(` ${pc.green("✓")} ${msg}`); -export const logError = (msg: string) => console.log(` ${pc.red("✗")} ${msg}`); +export function createLogger(prefix?: string) { + const p = prefix ? 
pc.dim(`[${prefix}]`) + " " : ""; + return { + log: (msg: string) => console.log(`${p}${msg}`), + logStep: (msg: string) => console.log(`${p} ${pc.cyan(">")} ${msg}`), + logSuccess: (msg: string) => console.log(`${p} ${pc.green("✓")} ${msg}`), + logError: (msg: string) => console.log(`${p} ${pc.red("✗")} ${msg}`), + }; +} + +export type Logger = ReturnType; + +// Default logger (no prefix) for single-run mode +const defaultLogger = createLogger(); +export const log = defaultLogger.log; +export const logStep = defaultLogger.logStep; +export const logSuccess = defaultLogger.logSuccess; +export const logError = defaultLogger.logError; export const formatDuration = (s: number) => s < 60 ? `${Math.round(s)}s` : `${Math.floor(s / 60)}m${Math.round(s % 60)}s`; From ca03d7cfcafbe2e037febe957e3207618527d4d5 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:41:30 +0700 Subject: [PATCH 15/63] Spawn separate node processes in eval-parallel for multi-core CPU usage --- scripts/eval/eval-parallel.ts | 109 +++++++++++++++++----------------- scripts/eval/eval.ts | 3 + 2 files changed, 59 insertions(+), 53 deletions(-) diff --git a/scripts/eval/eval-parallel.ts b/scripts/eval/eval-parallel.ts index 6217c428480d..11c4d14eb336 100644 --- a/scripts/eval/eval-parallel.ts +++ b/scripts/eval/eval-parallel.ts @@ -1,16 +1,17 @@ import { randomUUID } from "node:crypto"; +import { resolve } from "node:path"; import { Command } from "commander"; import pc from "picocolors"; -import type { TrialConfig, TrialResult } from "./types.ts"; +import { x } from "tinyexec"; import { MODELS, effortForModel } from "./types.ts"; import { PROJECTS } from "./config.ts"; -import { runTask } from "./lib/run-task.ts"; import { listPrompts } from "./lib/generate-prompt.ts"; -import { log, formatDuration, formatCost, createLogger } from "./lib/utils.ts"; +import { formatDuration, formatCost } from "./lib/utils.ts"; +import type { TrialResult } from "./types.ts"; const program = new Command() 
.name("eval-parallel") - .description("Run all 4 models × 2 prompts = 8 evals in parallel for one project") + .description("Run all 4 models × 2 prompts = 8 evals in parallel (separate processes)") .option("-p, --project ", "project to evaluate") .option("-u, --upload-id ", "upload ID for Google Sheets"); @@ -19,77 +20,79 @@ const opts = program.opts(); const project = PROJECTS.find((p) => p.name === opts.project); if (!project) { - log(pc.red(`Specify a project with -p. Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); + console.log(pc.red(`Specify a project with -p. Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); process.exit(1); } const prompts = listPrompts(); const runId = randomUUID().slice(0, 8); const uploadId = (opts.uploadId as string) || `eval-${runId}`; +const evalScript = resolve(import.meta.dirname, "eval.ts"); -// Build all 4 models × 2 prompts = 8 configs -const configs: TrialConfig[] = []; +const runs: Array<{ model: string; prompt: string; label: string }> = []; for (const m of MODELS) { for (const prompt of prompts) { - configs.push({ - project, - agent: m.agent, - model: m.id, - effort: effortForModel(m.id, "high"), - prompt, - }); + runs.push({ model: m.id, prompt, label: `${m.label}+${prompt}` }); } } -log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); -log(`${configs.length} parallel runs: ${MODELS.map((m) => m.label).join(", ")} × ${prompts.join(", ")}`); -log(`Run: ${runId}\n`); - -// Run all in parallel -const settled = await Promise.allSettled( - configs.map((c) => { - const tag = `${c.model.replace("claude-", "")}+${c.prompt}`; - return runTask(c, runId, uploadId, createLogger(tag)); - }), -); - -const results: TrialResult[] = []; -for (let i = 0; i < settled.length; i++) { - const s = settled[i]!; - if (s.status === "fulfilled") { - results.push(s.value); - } else { - const c = configs[i]!; - log(pc.red(`\n✗ ${c.model} + ${c.prompt}: ${s.reason}`)); - } -} +console.log(pc.bold(`\nStorybook Setup Eval — 
${project.name}`)); +console.log(`${runs.length} parallel processes: ${MODELS.map((m) => m.label).join(", ")} × ${prompts.join(", ")}`); +console.log(`Run: ${runId}\n`); + +// Spawn each as a separate node process +const promises = runs.map(({ model, prompt, label }) => { + const effort = effortForModel(model, "high"); + const tag = pc.dim(`[${label}]`); + + return x("node", [evalScript, "-p", project.name, "-m", model, "-e", effort, "--prompt", prompt, "-u", uploadId], { + throwOnError: false, + nodeOptions: { stdio: ["ignore", "pipe", "pipe"] }, + }).then((proc) => { + // Stream output with prefix + for (const line of proc.stdout.split("\n").filter(Boolean)) { + if (!line.startsWith("__RESULT__")) console.log(`${tag} ${line}`); + } + for (const line of proc.stderr.split("\n").filter(Boolean)) { + console.log(`${tag} ${pc.dim(line)}`); + } + + // Extract the result JSON + const resultLine = proc.stdout.split("\n").find((l) => l.startsWith("__RESULT__")); + if (resultLine) { + return JSON.parse(resultLine.slice("__RESULT__".length)) as TrialResult; + } + console.log(pc.red(`${tag} No result (exit ${proc.exitCode})`)); + return null; + }); +}); + +const settled = await Promise.allSettled(promises); +const results = settled + .map((s) => (s.status === "fulfilled" ? s.value : null)) + .filter((r): r is TrialResult => r != null); // Summary table sorted by ghost stories rate if (results.length > 0) { - results.sort((a, b) => { - const ga = a.grading.ghostStories?.successRate ?? -1; - const gb = b.grading.ghostStories?.successRate ?? -1; - return gb - ga; - }); + results.sort((a, b) => (b.grading.ghostStories?.successRate ?? -1) - (a.grading.ghostStories?.successRate ?? 
-1)); - log(pc.bold("\n\nResults (sorted by ghost stories rate)")); - log("=".repeat(110)); - log( + console.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); + console.log("=".repeat(110)); + console.log( ["Model", "Prompt", "Build", "Ghost", "TS Err", "Cost", "Time", "Turns"] .map((h, i) => h.padEnd(i === 0 ? 22 : i === 1 ? 12 : 10)) .join(" | "), ); - log("-".repeat(110)); + console.log("-".repeat(110)); for (const r of results) { - const build = r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL"); const ghost = r.grading.ghostStories; const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; - log( + console.log( [ r.model.padEnd(22), r.prompt.padEnd(12), - (r.grading.buildSuccess ? "PASS" : "FAIL").padEnd(10).replace(/PASS|FAIL/, build), + (r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL")).padEnd(10 + 10), ghostStr.padEnd(10), String(r.grading.typeCheckErrors).padEnd(10), formatCost(r.execution.cost).padEnd(10), @@ -99,13 +102,13 @@ if (results.length > 0) { ); } - log("-".repeat(110)); + console.log("-".repeat(110)); const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); - const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r) => r != null); + const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r): r is number => r != null); const avgGhost = ghostRates.length > 0 ? 
ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; - log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); - log(`Total cost: ${pc.bold(formatCost(totalCost))}`); + console.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); + console.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); } -log("\nDone."); +console.log("\nDone."); diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 440300d57c0c..e0bb164fed53 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -73,6 +73,9 @@ try { log(` Cost: ${formatCost(result.execution.cost)}`); log(` Time: ${formatDuration(result.execution.duration)}`); log(` Turns: ${result.execution.turns}`); + + // Machine-readable output for eval-parallel to parse + console.log(`__RESULT__${JSON.stringify(result)}`); } catch (error) { log(pc.red(`\nFailed: ${error instanceof Error ? error.message : error}`)); process.exit(1); From 862994868e8bbb111b4dbfaadcce8889c3a573e3 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:44:34 +0700 Subject: [PATCH 16/63] Live-stream prefixed logs from child processes, improve Codex agent logging --- scripts/eval/eval-parallel.ts | 69 ++++++++++++++++++-------------- scripts/eval/lib/agents/codex.ts | 19 +++++++-- 2 files changed, 55 insertions(+), 33 deletions(-) diff --git a/scripts/eval/eval-parallel.ts b/scripts/eval/eval-parallel.ts index 11c4d14eb336..1af29cd62555 100644 --- a/scripts/eval/eval-parallel.ts +++ b/scripts/eval/eval-parallel.ts @@ -1,8 +1,9 @@ import { randomUUID } from "node:crypto"; import { resolve } from "node:path"; +import { spawn } from "node:child_process"; +import { createInterface } from "node:readline"; import { Command } from "commander"; import pc from "picocolors"; -import { x } from "tinyexec"; import { MODELS, effortForModel } from "./types.ts"; import { PROJECTS } from "./config.ts"; import { listPrompts } from "./lib/generate-prompt.ts"; @@ -40,37 +41,47 @@ 
console.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); console.log(`${runs.length} parallel processes: ${MODELS.map((m) => m.label).join(", ")} × ${prompts.join(", ")}`); console.log(`Run: ${runId}\n`); -// Spawn each as a separate node process -const promises = runs.map(({ model, prompt, label }) => { - const effort = effortForModel(model, "high"); - const tag = pc.dim(`[${label}]`); - - return x("node", [evalScript, "-p", project.name, "-m", model, "-e", effort, "--prompt", prompt, "-u", uploadId], { - throwOnError: false, - nodeOptions: { stdio: ["ignore", "pipe", "pipe"] }, - }).then((proc) => { - // Stream output with prefix - for (const line of proc.stdout.split("\n").filter(Boolean)) { - if (!line.startsWith("__RESULT__")) console.log(`${tag} ${line}`); - } - for (const line of proc.stderr.split("\n").filter(Boolean)) { +/** Spawn a child, stream its output line-by-line with a prefix, return the TrialResult. */ +function spawnRun(model: string, prompt: string, label: string): Promise { + return new Promise((resolve) => { + const effort = effortForModel(model, "high"); + const tag = pc.dim(`[${label}]`); + + const child = spawn("node", [evalScript, "-p", project!.name, "-m", model, "-e", effort, "--prompt", prompt, "-u", uploadId], { + stdio: ["ignore", "pipe", "pipe"], + }); + + let result: TrialResult | null = null; + + // Live-stream stdout with prefix + const outRL = createInterface({ input: child.stdout! }); + outRL.on("line", (line) => { + if (line.startsWith("__RESULT__")) { + try { result = JSON.parse(line.slice("__RESULT__".length)); } catch { /* skip */ } + } else { + console.log(`${tag} ${line}`); + } + }); + + // Live-stream stderr with prefix (agent logs go here) + const errRL = createInterface({ input: child.stderr! 
}); + errRL.on("line", (line) => { console.log(`${tag} ${pc.dim(line)}`); - } - - // Extract the result JSON - const resultLine = proc.stdout.split("\n").find((l) => l.startsWith("__RESULT__")); - if (resultLine) { - return JSON.parse(resultLine.slice("__RESULT__".length)) as TrialResult; - } - console.log(pc.red(`${tag} No result (exit ${proc.exitCode})`)); - return null; + }); + + child.on("close", (code) => { + if (code !== 0 && !result) { + console.log(pc.red(`${tag} exited with code ${code}`)); + } + resolve(result); + }); }); -}); +} -const settled = await Promise.allSettled(promises); -const results = settled - .map((s) => (s.status === "fulfilled" ? s.value : null)) - .filter((r): r is TrialResult => r != null); +// Run all in parallel +const results = (await Promise.all(runs.map((r) => spawnRun(r.model, r.prompt, r.label)))).filter( + (r): r is TrialResult => r != null, +); // Summary table sorted by ghost stories rate if (results.length > 0) { diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index ee3999c06cad..30ffe472a2b8 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -3,7 +3,6 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; import type { Agent, Effort, ExecutionResult, SupportedModel } from "../../types.ts"; -/** Map our unified effort to Codex's model_reasoning_effort values. */ const CODEX_EFFORT: Record = { low: "low", medium: "medium", @@ -22,6 +21,7 @@ export const codexAgent: Agent = { ): Promise { const { effort = "high", resultsDir } = options ?? 
{}; const startTime = Date.now(); + const log = (prefix: string, text: string) => process.stderr.write(`${prefix} ${text}\n`); const codex = new Codex({ model, @@ -36,17 +36,28 @@ export const codexAgent: Agent = { if (event.type === "item.completed") { const item = event.item as Record; items.push(item); + if (item.type === "message" && Array.isArray(item.content)) { - for (const block of item.content) { - if (typeof block === "object" && block !== null && "text" in block) { - process.stderr.write(`${(block as { text: string }).text}\n`); + for (const block of item.content as Array>) { + if (block.type === "output_text" && typeof block.text === "string") { + log("💬", block.text.slice(0, 300)); } } + } else if (item.type === "command_execution") { + const cmd = item.command as string | undefined; + const exit = item.exit_code as number | undefined; + log("🔧", `${cmd ?? "?"} → exit ${exit ?? "?"}`); + } + } else if (event.type === "turn.completed") { + const usage = event.usage as { input_tokens?: number; output_tokens?: number } | undefined; + if (usage) { + log("📊", `tokens: ${usage.input_tokens ?? 0}in / ${usage.output_tokens ?? 
0}out`); } } } const duration = (Date.now() - startTime) / 1000; + log("✅", `Done — ${items.length} items, ${Math.round(duration)}s`); if (resultsDir) { writeFileSync(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); From 16060256482c68e0bcd1a6b00eeca7d35bb0e81d Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:52:56 +0700 Subject: [PATCH 17/63] Fix Codex agent logging to match actual SDK event/item types --- scripts/eval/lib/agents/codex.ts | 60 ++++++++++++++++++++----------- scripts/eval/lib/ghost-stories.ts | 4 +-- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index 30ffe472a2b8..da7523839f28 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -23,36 +23,54 @@ export const codexAgent: Agent = { const startTime = Date.now(); const log = (prefix: string, text: string) => process.stderr.write(`${prefix} ${text}\n`); - const codex = new Codex({ + const codex = new Codex(); + const thread = codex.startThread({ model, - config: { model_reasoning_effort: CODEX_EFFORT[effort] }, + modelReasoningEffort: CODEX_EFFORT[effort], + workingDirectory: projectPath, + approvalPolicy: "never", }); - const thread = codex.startThread({ workingDirectory: projectPath }); const { events } = await thread.runStreamed(prompt); const items: unknown[] = []; + // Token tracking not yet exposed in result — logged per-turn for visibility for await (const event of events) { - if (event.type === "item.completed") { - const item = event.item as Record; - items.push(item); - - if (item.type === "message" && Array.isArray(item.content)) { - for (const block of item.content as Array>) { - if (block.type === "output_text" && typeof block.text === "string") { - log("💬", block.text.slice(0, 300)); - } + switch (event.type) { + case "item.completed": { + const item = event.item; + items.push(item); + switch (item.type) { + case 
"agent_message": + log("💬", item.text.slice(0, 300)); + break; + case "command_execution": + log("🔧", `$ ${item.command} → exit ${item.exit_code ?? "?"}`); + if (item.exit_code !== 0 && item.aggregated_output) { + log(" ", item.aggregated_output.slice(-200)); + } + break; + case "file_change": + for (const c of item.changes) log("📝", `${c.kind} ${c.path}`); + break; + case "reasoning": + log("🧠", item.text.slice(0, 200)); + break; + case "error": + log("❌", item.message); + break; } - } else if (item.type === "command_execution") { - const cmd = item.command as string | undefined; - const exit = item.exit_code as number | undefined; - log("🔧", `${cmd ?? "?"} → exit ${exit ?? "?"}`); - } - } else if (event.type === "turn.completed") { - const usage = event.usage as { input_tokens?: number; output_tokens?: number } | undefined; - if (usage) { - log("📊", `tokens: ${usage.input_tokens ?? 0}in / ${usage.output_tokens ?? 0}out`); + break; } + case "turn.completed": + log("📊", `tokens: ${event.usage.input_tokens}in / ${event.usage.output_tokens}out (${event.usage.cached_input_tokens} cached)`); + break; + case "turn.failed": + log("❌", `Turn failed: ${event.error.message}`); + break; + case "error": + log("❌", `Error: ${event.message}`); + break; } } diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index f7c20b3652f2..8e9f78e665ba 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -23,12 +23,12 @@ export async function runGhostStories( const reportPath = join(resultsDir, "ghost-stories-report.json"); await exec( "npx", - ["vitest", "run", "--project=storybook", "--reporter=json", `--outputFile=${reportPath}`, "--testTimeout=10000"], + ["vitest", "run", "--project=storybook", "--reporter=json", `--outputFile=${reportPath}`, "--testTimeout=10000", ...candidates], { cwd: projectPath, timeout: 120_000, throwOnError: false, - env: { ...process.env, STORYBOOK_COMPONENT_PATHS: candidates.join(",") }, + env: 
{ ...process.env, STORYBOOK_COMPONENT_PATHS: candidates.join(";") }, }, ); From 47e64e3ea1a66a27e439e528330fdf1c13ed1ae0 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:57:05 +0700 Subject: [PATCH 18/63] =?UTF-8?q?Decouple=20agent=20and=20model=20?= =?UTF-8?q?=E2=80=94=20choose=20agent=20then=20model=20independently?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/eval/eval-parallel.ts | 69 +++++++++++++++++------------------ scripts/eval/eval.ts | 33 ++++++++++++----- scripts/eval/types.ts | 34 +++++++---------- 3 files changed, 71 insertions(+), 65 deletions(-) diff --git a/scripts/eval/eval-parallel.ts b/scripts/eval/eval-parallel.ts index 1af29cd62555..9583302884c8 100644 --- a/scripts/eval/eval-parallel.ts +++ b/scripts/eval/eval-parallel.ts @@ -4,16 +4,17 @@ import { spawn } from "node:child_process"; import { createInterface } from "node:readline"; import { Command } from "commander"; import pc from "picocolors"; -import { MODELS, effortForModel } from "./types.ts"; +import { AGENTS } from "./types.ts"; +import type { AgentName, TrialResult } from "./types.ts"; import { PROJECTS } from "./config.ts"; import { listPrompts } from "./lib/generate-prompt.ts"; import { formatDuration, formatCost } from "./lib/utils.ts"; -import type { TrialResult } from "./types.ts"; const program = new Command() .name("eval-parallel") - .description("Run all 4 models × 2 prompts = 8 evals in parallel (separate processes)") + .description("Run all agent×model×prompt combos in parallel for one project") .option("-p, --project ", "project to evaluate") + .option("-e, --effort ", "effort: low, medium, high, max", "high") .option("-u, --upload-id ", "upload ID for Google Sheets"); program.parse(); @@ -26,36 +27,39 @@ if (!project) { } const prompts = listPrompts(); +const effort = opts.effort as string; const runId = randomUUID().slice(0, 8); const uploadId = (opts.uploadId as string) || `eval-${runId}`; 
const evalScript = resolve(import.meta.dirname, "eval.ts"); -const runs: Array<{ model: string; prompt: string; label: string }> = []; -for (const m of MODELS) { - for (const prompt of prompts) { - runs.push({ model: m.id, prompt, label: `${m.label}+${prompt}` }); +// Build all combos: every agent × model × prompt +const runs: Array<{ agent: string; model: string; prompt: string; label: string }> = []; +for (const [agent, { models }] of Object.entries(AGENTS)) { + for (const model of models) { + for (const prompt of prompts) { + runs.push({ agent, model, prompt, label: `${model}+${prompt}` }); + } } } console.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); -console.log(`${runs.length} parallel processes: ${MODELS.map((m) => m.label).join(", ")} × ${prompts.join(", ")}`); +console.log(`${runs.length} parallel processes | Effort: ${effort}`); +for (const [agent, { models }] of Object.entries(AGENTS)) { + console.log(` ${agent}: ${models.join(", ")}`); +} +console.log(` prompts: ${prompts.join(", ")}`); console.log(`Run: ${runId}\n`); -/** Spawn a child, stream its output line-by-line with a prefix, return the TrialResult. */ -function spawnRun(model: string, prompt: string, label: string): Promise { - return new Promise((resolve) => { - const effort = effortForModel(model, "high"); +function spawnRun(agent: string, model: string, prompt: string, label: string): Promise { + return new Promise((res) => { const tag = pc.dim(`[${label}]`); - - const child = spawn("node", [evalScript, "-p", project!.name, "-m", model, "-e", effort, "--prompt", prompt, "-u", uploadId], { - stdio: ["ignore", "pipe", "pipe"], - }); + const child = spawn("node", [ + evalScript, "-p", project!.name, "-a", agent, "-m", model, "-e", effort, "--prompt", prompt, "-u", uploadId, + ], { stdio: ["ignore", "pipe", "pipe"] }); let result: TrialResult | null = null; - // Live-stream stdout with prefix - const outRL = createInterface({ input: child.stdout! 
}); - outRL.on("line", (line) => { + createInterface({ input: child.stdout! }).on("line", (line) => { if (line.startsWith("__RESULT__")) { try { result = JSON.parse(line.slice("__RESULT__".length)); } catch { /* skip */ } } else { @@ -63,45 +67,40 @@ function spawnRun(model: string, prompt: string, label: string): Promise { + createInterface({ input: child.stderr! }).on("line", (line) => { console.log(`${tag} ${pc.dim(line)}`); }); child.on("close", (code) => { - if (code !== 0 && !result) { - console.log(pc.red(`${tag} exited with code ${code}`)); - } - resolve(result); + if (code !== 0 && !result) console.log(pc.red(`${tag} exited with code ${code}`)); + res(result); }); }); } -// Run all in parallel -const results = (await Promise.all(runs.map((r) => spawnRun(r.model, r.prompt, r.label)))).filter( +const results = (await Promise.all(runs.map((r) => spawnRun(r.agent, r.model, r.prompt, r.label)))).filter( (r): r is TrialResult => r != null, ); -// Summary table sorted by ghost stories rate if (results.length > 0) { results.sort((a, b) => (b.grading.ghostStories?.successRate ?? -1) - (a.grading.ghostStories?.successRate ?? -1)); console.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); - console.log("=".repeat(110)); + console.log("=".repeat(120)); console.log( - ["Model", "Prompt", "Build", "Ghost", "TS Err", "Cost", "Time", "Turns"] - .map((h, i) => h.padEnd(i === 0 ? 22 : i === 1 ? 12 : 10)) + ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Cost", "Time", "Turns"] + .map((h, i) => h.padEnd(i <= 1 ? 14 : i === 2 ? 12 : 10)) .join(" | "), ); - console.log("-".repeat(110)); + console.log("-".repeat(120)); for (const r of results) { const ghost = r.grading.ghostStories; const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; console.log( [ - r.model.padEnd(22), + r.agent.padEnd(14), + r.model.padEnd(14), r.prompt.padEnd(12), (r.grading.buildSuccess ? 
pc.green("PASS") : pc.red("FAIL")).padEnd(10 + 10), ghostStr.padEnd(10), @@ -113,7 +112,7 @@ if (results.length > 0) { ); } - console.log("-".repeat(110)); + console.log("-".repeat(120)); const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r): r is number => r != null); const avgGhost = ghostRates.length > 0 ? ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index e0bb164fed53..07f0634966d7 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -1,8 +1,8 @@ import { randomUUID } from "node:crypto"; import { Command } from "commander"; import pc from "picocolors"; -import type { TrialConfig, Effort } from "./types.ts"; -import { MODELS, agentForModel, effortForModel } from "./types.ts"; +import type { TrialConfig, AgentName, Effort } from "./types.ts"; +import { AGENTS } from "./types.ts"; import { PROJECTS } from "./config.ts"; import { runTask } from "./lib/run-task.ts"; import { listPrompts } from "./lib/generate-prompt.ts"; @@ -12,7 +12,8 @@ const program = new Command() .name("eval") .description("Run a single Storybook setup eval") .option("-p, --project ", "project to evaluate") - .option("-m, --model ", "model to use", "claude-sonnet-4-6") + .option("-a, --agent ", "agent: claude-code, codex", "claude-code") + .option("-m, --model ", "model (default: per agent)") .option("-e, --effort ", "effort: low, medium, high, max", "high") .option("--prompt ", "prompt name", "setup") .option("-v, --verbose", "verbose output") @@ -29,7 +30,10 @@ if (opts.listProjects) { process.exit(0); } if (opts.listModels) { - for (const m of MODELS) log(` ${pc.bold(m.id)} (${m.agent}) — ${m.label}`); + for (const [agent, { models }] of Object.entries(AGENTS)) { + log(`\n ${pc.bold(agent)}`); + for (const m of models) log(` ${m}`); + } process.exit(0); } if (opts.listPrompts) { @@ -43,14 +47,26 @@ if 
(!project) { process.exit(1); } -const model = opts.model as string; -const effort = effortForModel(model, opts.effort as Effort); +const agent = opts.agent as AgentName; +const agentConfig = AGENTS[agent]; +if (!agentConfig) { + log(pc.red(`Unknown agent: ${agent}. Options: ${Object.keys(AGENTS).join(", ")}`)); + process.exit(1); +} + +const model = (opts.model ?? agentConfig.defaultModel) as string; +if (!agentConfig.models.includes(model)) { + log(pc.red(`Model ${model} not available for ${agent}. Options: ${agentConfig.models.join(", ")}`)); + process.exit(1); +} + +const effort = opts.effort as Effort; const runId = randomUUID().slice(0, 8); const uploadId = (opts.uploadId as string) || `eval-${runId}`; const config: TrialConfig = { project, - agent: agentForModel(model), + agent, model, effort, prompt: opts.prompt as string, @@ -58,7 +74,7 @@ const config: TrialConfig = { }; log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); -log(`Model: ${model} | Effort: ${effort} | Prompt: ${config.prompt}`); +log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${config.prompt}`); log(`Run: ${runId}\n`); try { @@ -74,7 +90,6 @@ try { log(` Time: ${formatDuration(result.execution.duration)}`); log(` Turns: ${result.execution.turns}`); - // Machine-readable output for eval-parallel to parse console.log(`__RESULT__${JSON.stringify(result)}`); } catch (error) { log(pc.red(`\nFailed: ${error instanceof Error ? error.message : error}`)); diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index f7d3237e4970..14455824989e 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -1,8 +1,7 @@ /** * Core types for the Storybook setup eval system. * - * Three independent axes: model × effort × prompt - * Agent is derived from the model. 
+ * Four independent axes: agent × model × effort × prompt */ // --- Agent, Model, Effort --- @@ -10,25 +9,18 @@ export type AgentName = "claude-code" | "codex"; export type Effort = "low" | "medium" | "high" | "max"; -export const MODELS = [ - { id: "claude-sonnet-4-6", agent: "claude-code" as AgentName, label: "Sonnet 4.6" }, - { id: "claude-opus-4-6", agent: "claude-code" as AgentName, label: "Opus 4.6" }, - { id: "gpt-5.4-medium", agent: "codex" as AgentName, label: "GPT 5.4 Medium", effort: "medium" as Effort }, - { id: "gpt-5.4-high", agent: "codex" as AgentName, label: "GPT 5.4 High", effort: "high" as Effort }, -] as const; - -export type SupportedModel = (typeof MODELS)[number]["id"]; - -export function agentForModel(model: string): AgentName { - const entry = MODELS.find((m) => m.id === model); - if (!entry) throw new Error(`Unknown model: ${model}`); - return entry.agent; -} - -export function effortForModel(model: string, defaultEffort: Effort): Effort { - const entry = MODELS.find((m) => m.id === model); - return (entry as { effort?: Effort })?.effort ?? 
defaultEffort; -} +export const AGENTS: Record = { + "claude-code": { + models: ["claude-sonnet-4-6", "claude-opus-4-6", "claude-haiku-4-5"], + defaultModel: "claude-sonnet-4-6", + }, + codex: { + models: ["gpt-5.4"], + defaultModel: "gpt-5.4", + }, +}; + +export type SupportedModel = string; // --- Project Types --- From f6671a134e148ed269075f7b0cba995e5635b809 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 22:59:29 +0700 Subject: [PATCH 19/63] =?UTF-8?q?Clean=20up=20names:=20claude-code?= =?UTF-8?q?=E2=86=92claude,=20claude-sonnet-4-6=E2=86=92sonnet-4.6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/eval/config.ts | 4 ++-- scripts/eval/eval.ts | 2 +- scripts/eval/lib/agents/claude-code.ts | 15 +++++++++++---- scripts/eval/types.ts | 8 ++++---- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index fe9a1059719d..31e7b4a0a395 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -1,6 +1,6 @@ import type { AgentName, Agent } from "./types.ts"; import type { Project } from "./types.ts"; -import { claudeCodeAgent } from "./lib/agents/claude-code.ts"; +import { claudeAgent } from "./lib/agents/claude-code.ts"; import { codexAgent } from "./lib/agents/codex.ts"; export const PROJECTS: Project[] = [ @@ -45,6 +45,6 @@ export const PROJECTS: Project[] = [ ]; export const agents: Record = { - "claude-code": claudeCodeAgent, + claude: claudeAgent, codex: codexAgent, }; diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 07f0634966d7..ecfdbe9aaf1f 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -12,7 +12,7 @@ const program = new Command() .name("eval") .description("Run a single Storybook setup eval") .option("-p, --project ", "project to evaluate") - .option("-a, --agent ", "agent: claude-code, codex", "claude-code") + .option("-a, --agent ", "agent: claude, codex", "claude") .option("-m, --model 
", "model (default: per agent)") .option("-e, --effort ", "effort: low, medium, high, max", "high") .option("--prompt ", "prompt name", "setup") diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index b1208fc8e9df..976be34b07f6 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -69,8 +69,15 @@ function logMessage(message: SDKMessage) { } } -export const claudeCodeAgent: Agent = { - name: "claude-code", +/** Map clean model names to Claude SDK model IDs */ +const CLAUDE_MODEL_MAP: Record = { + "sonnet-4.6": "claude-sonnet-4-6", + "opus-4.6": "claude-opus-4-6", + "haiku-4.5": "claude-haiku-4-5", +}; + +export const claudeAgent: Agent = { + name: "claude", async execute( prompt: string, @@ -89,7 +96,7 @@ export const claudeCodeAgent: Agent = { for await (const message of query({ prompt, options: { - model, + model: CLAUDE_MODEL_MAP[model] ?? model, cwd: projectPath, allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"], maxTurns: 50, @@ -116,7 +123,7 @@ export const claudeCodeAgent: Agent = { } return { - agent: "claude-code", + agent: "claude", model, effort, cost, diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index 14455824989e..8cbbbc611c02 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -6,13 +6,13 @@ // --- Agent, Model, Effort --- -export type AgentName = "claude-code" | "codex"; +export type AgentName = "claude" | "codex"; export type Effort = "low" | "medium" | "high" | "max"; export const AGENTS: Record = { - "claude-code": { - models: ["claude-sonnet-4-6", "claude-opus-4-6", "claude-haiku-4-5"], - defaultModel: "claude-sonnet-4-6", + claude: { + models: ["sonnet-4.6", "opus-4.6", "haiku-4.5"], + defaultModel: "sonnet-4.6", }, codex: { models: ["gpt-5.4"], From 5701e8dcd4db810d864949b97c031ddfd2532bdf Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 23:01:19 +0700 Subject: [PATCH 20/63] 
=?UTF-8?q?Infer=20agent=20from=20model=20=E2=80=94?= =?UTF-8?q?=20node=20eval.ts=20-m=20gpt-5.4=20auto-selects=20codex?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/eval/eval.ts | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index ecfdbe9aaf1f..da4bd67ceea8 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -47,17 +47,28 @@ if (!project) { process.exit(1); } -const agent = opts.agent as AgentName; -const agentConfig = AGENTS[agent]; -if (!agentConfig) { - log(pc.red(`Unknown agent: ${agent}. Options: ${Object.keys(AGENTS).join(", ")}`)); - process.exit(1); -} +// Infer agent from model if model is specified, otherwise use --agent flag +let agent: AgentName; +let model: string; -const model = (opts.model ?? agentConfig.defaultModel) as string; -if (!agentConfig.models.includes(model)) { - log(pc.red(`Model ${model} not available for ${agent}. Options: ${agentConfig.models.join(", ")}`)); - process.exit(1); +if (opts.model) { + // Find which agent owns this model + const match = Object.entries(AGENTS).find(([, cfg]) => cfg.models.includes(opts.model as string)); + if (!match) { + const all = Object.values(AGENTS).flatMap((cfg) => cfg.models); + log(pc.red(`Unknown model: ${opts.model}. Available: ${all.join(", ")}`)); + process.exit(1); + } + agent = match[0] as AgentName; + model = opts.model as string; +} else { + agent = opts.agent as AgentName; + const agentConfig = AGENTS[agent]; + if (!agentConfig) { + log(pc.red(`Unknown agent: ${agent}. 
Options: ${Object.keys(AGENTS).join(", ")}`)); + process.exit(1); + } + model = agentConfig.defaultModel; } const effort = opts.effort as Effort; From bdbae36fdff6b9fcf3ba772b3dd2bd6de0390b48 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 23:03:57 +0700 Subject: [PATCH 21/63] Fix parallel race condition: add prompt + random suffix to trial IDs --- scripts/eval/lib/run-task.ts | 2 +- scripts/eval/lib/utils.ts | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index e612d384b7ab..4b8b09287a5a 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -20,7 +20,7 @@ export async function runTask( ): Promise { const { project, agent: agentName, model, effort, prompt: promptName, verbose } = config; const { log, logSuccess } = logger ?? createLogger(); - const trialId = generateTrialId(project.name, agentName, model); + const trialId = generateTrialId(project.name, agentName, model, promptName || "setup"); const timestamp = new Date().toISOString(); log(`Preparing ${project.name}...`); diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index fa97ba01e29c..87b4eede40ef 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -34,8 +34,10 @@ export const formatDuration = (s: number) => export const formatCost = (cost?: number) => (cost == null ? 
"-" : `$${cost.toFixed(2)}`); -export function generateTrialId(project: string, agent: string, model: string) { - return `${new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19)}-${project}-${agent}-${model}`; +export function generateTrialId(project: string, agent: string, model: string, prompt: string) { + const ts = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); + const rand = Math.random().toString(36).slice(2, 6); + return `${ts}-${project}-${model}-${prompt}-${rand}`; } // --- Exec --- From 8819ae25b07dce71543d90a91e1c7875b0e3c54d Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 23:06:07 +0700 Subject: [PATCH 22/63] Use crypto.randomUUID for unique trial IDs --- scripts/eval/lib/utils.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index 87b4eede40ef..c02fadfe88f3 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -36,8 +36,7 @@ export const formatCost = (cost?: number) => (cost == null ? 
"-" : `$${cost.toFi export function generateTrialId(project: string, agent: string, model: string, prompt: string) { const ts = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); - const rand = Math.random().toString(36).slice(2, 6); - return `${ts}-${project}-${model}-${prompt}-${rand}`; + return `${ts}-${project}-${model}-${prompt}-${crypto.randomUUID().slice(0, 8)}`; } // --- Exec --- From 3caafdad6e5e34d4e344cdfe3e98def0f1106397 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Fri, 27 Mar 2026 23:10:20 +0700 Subject: [PATCH 23/63] Fix ghost stories to match core implementation: pass paths as args + env, 1s timeout, no --project --- scripts/eval/lib/ghost-stories.ts | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index 8e9f78e665ba..eda035717b77 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -6,6 +6,10 @@ import { logStep, logSuccess, logError, exec } from "./utils.ts"; /** * Run ghost stories: discover candidate components, auto-generate stories * via the addon-vitest componentTransform, and measure rendering success. 
+ * + * Mirrors the approach in core-server/utils/ghost-stories/run-story-tests.ts: + * - Pass component paths as both CLI args (so vitest runs them) and + * STORYBOOK_COMPONENT_PATHS env var (so the transform plugin activates) */ export async function runGhostStories( projectPath: string, @@ -23,7 +27,13 @@ export async function runGhostStories( const reportPath = join(resultsDir, "ghost-stories-report.json"); await exec( "npx", - ["vitest", "run", "--project=storybook", "--reporter=json", `--outputFile=${reportPath}`, "--testTimeout=10000", ...candidates], + [ + "vitest", "run", + "--reporter=json", + `--outputFile=${reportPath}`, + "--testTimeout=1000", + ...candidates, + ], { cwd: projectPath, timeout: 120_000, From 4e04c665cc4fdb7b18567323a965dec4106ebbd4 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sat, 28 Mar 2026 07:21:06 +0700 Subject: [PATCH 24/63] Add tests, import ghost stories utilities from core, switch to parseArgs --- scripts/eval/config.test.ts | 31 +++ scripts/eval/eval-parallel.ts | 22 +- scripts/eval/eval.ts | 42 ++-- scripts/eval/lib/generate-prompt.test.ts | 46 ++++ scripts/eval/lib/ghost-stories.test.ts | 105 ++++++++++ scripts/eval/lib/ghost-stories.ts | 34 +-- scripts/eval/lib/grade.test.ts | 138 ++++++++++++ scripts/eval/lib/grade.ts | 57 +++-- scripts/eval/lib/grading-pipeline.test.ts | 193 +++++++++++++++++ scripts/eval/lib/run-task.test.ts | 243 ++++++++++++++++++++++ scripts/eval/lib/setup-patterns.test.ts | 129 ++++++++++++ scripts/eval/lib/utils.test.ts | 56 +++++ scripts/eval/types.test.ts | 27 +++ 13 files changed, 1054 insertions(+), 69 deletions(-) create mode 100644 scripts/eval/config.test.ts create mode 100644 scripts/eval/lib/generate-prompt.test.ts create mode 100644 scripts/eval/lib/ghost-stories.test.ts create mode 100644 scripts/eval/lib/grade.test.ts create mode 100644 scripts/eval/lib/grading-pipeline.test.ts create mode 100644 scripts/eval/lib/run-task.test.ts create mode 100644 
scripts/eval/lib/setup-patterns.test.ts create mode 100644 scripts/eval/lib/utils.test.ts create mode 100644 scripts/eval/types.test.ts diff --git a/scripts/eval/config.test.ts b/scripts/eval/config.test.ts new file mode 100644 index 000000000000..f6d5aba687e8 --- /dev/null +++ b/scripts/eval/config.test.ts @@ -0,0 +1,31 @@ +import { describe, expect, it } from 'vitest'; + +import { PROJECTS, agents } from './config'; + +describe('PROJECTS', () => { + it('has at least one project', () => { + expect(PROJECTS.length).toBeGreaterThan(0); + }); + + it('each project has name, repo URL, and branch', () => { + for (const project of PROJECTS) { + expect(project.name).toBeTruthy(); + expect(project.repo).toMatch(/^https:\/\/github\.com\//); + expect(project.branch).toBeTruthy(); + } + }); + + it('project names are unique', () => { + const names = PROJECTS.map((p) => p.name); + expect(new Set(names).size).toBe(names.length); + }); +}); + +describe('agents', () => { + it('each agent has a name and execute method', () => { + for (const agent of Object.values(agents)) { + expect(agent.name).toBeTruthy(); + expect(typeof agent.execute).toBe('function'); + } + }); +}); diff --git a/scripts/eval/eval-parallel.ts b/scripts/eval/eval-parallel.ts index 9583302884c8..d0032e9f78c6 100644 --- a/scripts/eval/eval-parallel.ts +++ b/scripts/eval/eval-parallel.ts @@ -2,23 +2,21 @@ import { randomUUID } from "node:crypto"; import { resolve } from "node:path"; import { spawn } from "node:child_process"; import { createInterface } from "node:readline"; -import { Command } from "commander"; +import { parseArgs } from "node:util"; import pc from "picocolors"; import { AGENTS } from "./types.ts"; -import type { AgentName, TrialResult } from "./types.ts"; +import type { TrialResult } from "./types.ts"; import { PROJECTS } from "./config.ts"; import { listPrompts } from "./lib/generate-prompt.ts"; import { formatDuration, formatCost } from "./lib/utils.ts"; -const program = new Command() - 
.name("eval-parallel") - .description("Run all agent×model×prompt combos in parallel for one project") - .option("-p, --project ", "project to evaluate") - .option("-e, --effort ", "effort: low, medium, high, max", "high") - .option("-u, --upload-id ", "upload ID for Google Sheets"); - -program.parse(); -const opts = program.opts(); +const { values: opts } = parseArgs({ + options: { + project: { type: "string", short: "p" }, + effort: { type: "string", short: "e", default: "high" }, + "upload-id": { type: "string", short: "u" }, + }, +}); const project = PROJECTS.find((p) => p.name === opts.project); if (!project) { @@ -29,7 +27,7 @@ if (!project) { const prompts = listPrompts(); const effort = opts.effort as string; const runId = randomUUID().slice(0, 8); -const uploadId = (opts.uploadId as string) || `eval-${runId}`; +const uploadId = opts["upload-id"] || `eval-${runId}`; const evalScript = resolve(import.meta.dirname, "eval.ts"); // Build all combos: every agent × model × prompt diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index da4bd67ceea8..6797b04d8e70 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -1,5 +1,5 @@ import { randomUUID } from "node:crypto"; -import { Command } from "commander"; +import { parseArgs } from "node:util"; import pc from "picocolors"; import type { TrialConfig, AgentName, Effort } from "./types.ts"; import { AGENTS } from "./types.ts"; @@ -8,35 +8,33 @@ import { runTask } from "./lib/run-task.ts"; import { listPrompts } from "./lib/generate-prompt.ts"; import { log, formatDuration, formatCost } from "./lib/utils.ts"; -const program = new Command() - .name("eval") - .description("Run a single Storybook setup eval") - .option("-p, --project ", "project to evaluate") - .option("-a, --agent ", "agent: claude, codex", "claude") - .option("-m, --model ", "model (default: per agent)") - .option("-e, --effort ", "effort: low, medium, high, max", "high") - .option("--prompt ", "prompt name", "setup") - .option("-v, 
--verbose", "verbose output") - .option("-u, --upload-id ", "upload ID for Google Sheets") - .option("--list-projects", "list projects") - .option("--list-models", "list models") - .option("--list-prompts", "list prompts"); +const { values: opts } = parseArgs({ + options: { + project: { type: "string", short: "p" }, + agent: { type: "string", short: "a", default: "claude" }, + model: { type: "string", short: "m" }, + effort: { type: "string", short: "e", default: "high" }, + prompt: { type: "string", default: "setup" }, + verbose: { type: "boolean", short: "v", default: false }, + "upload-id": { type: "string", short: "u" }, + "list-projects": { type: "boolean", default: false }, + "list-models": { type: "boolean", default: false }, + "list-prompts": { type: "boolean", default: false }, + }, +}); -program.parse(); -const opts = program.opts(); - -if (opts.listProjects) { +if (opts["list-projects"]) { for (const p of PROJECTS) log(` ${pc.bold(p.name)} — ${p.description}`); process.exit(0); } -if (opts.listModels) { +if (opts["list-models"]) { for (const [agent, { models }] of Object.entries(AGENTS)) { log(`\n ${pc.bold(agent)}`); for (const m of models) log(` ${m}`); } process.exit(0); } -if (opts.listPrompts) { +if (opts["list-prompts"]) { for (const name of listPrompts()) log(` ${pc.bold(name)}`); process.exit(0); } @@ -73,7 +71,7 @@ if (opts.model) { const effort = opts.effort as Effort; const runId = randomUUID().slice(0, 8); -const uploadId = (opts.uploadId as string) || `eval-${runId}`; +const uploadId = opts["upload-id"] || `eval-${runId}`; const config: TrialConfig = { project, @@ -81,7 +79,7 @@ const config: TrialConfig = { model, effort, prompt: opts.prompt as string, - verbose: opts.verbose as boolean | undefined, + verbose: opts.verbose, }; log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); diff --git a/scripts/eval/lib/generate-prompt.test.ts b/scripts/eval/lib/generate-prompt.test.ts new file mode 100644 index 000000000000..070fd972bfae --- 
/dev/null +++ b/scripts/eval/lib/generate-prompt.test.ts @@ -0,0 +1,46 @@ +import { describe, expect, it } from 'vitest'; + +import { generatePrompt, listPrompts } from './generate-prompt'; + +describe('listPrompts', () => { + it('lists available prompt names', () => { + const prompts = listPrompts(); + expect(prompts).toContain('setup'); + expect(prompts).toContain('self-heal'); + }); + + it('returns only names without .md extension', () => { + const prompts = listPrompts(); + for (const name of prompts) { + expect(name).not.toContain('.md'); + } + }); +}); + +describe('generatePrompt', () => { + it('loads setup prompt by default', () => { + const prompt = generatePrompt(); + expect(prompt).toContain('Storybook'); + expect(prompt.length).toBeGreaterThan(0); + }); + + it('loads setup prompt by name', () => { + const prompt = generatePrompt('setup'); + expect(prompt).toContain('Storybook setup'); + }); + + it('loads self-heal prompt', () => { + const prompt = generatePrompt('self-heal'); + expect(prompt).toContain('Self-healing'); + expect(prompt).toContain('vitest'); + }); + + it('throws for unknown prompt', () => { + expect(() => generatePrompt('nonexistent-prompt-xyz')).toThrow('Prompt not found'); + }); + + it('returns trimmed content', () => { + const prompt = generatePrompt('setup'); + expect(prompt).toBe(prompt.trim()); + }); +}); diff --git a/scripts/eval/lib/ghost-stories.test.ts b/scripts/eval/lib/ghost-stories.test.ts new file mode 100644 index 000000000000..03437851f10f --- /dev/null +++ b/scripts/eval/lib/ghost-stories.test.ts @@ -0,0 +1,105 @@ +import { mkdirSync, writeFileSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; + +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { findCandidates } from './ghost-stories'; + +let TMP: string; + +beforeEach(() => { + TMP = join(tmpdir(), `eval-ghost-stories-${Date.now()}`); + mkdirSync(join(TMP, 'src'), { recursive: true }); +}); 
+ +afterEach(() => { + rmSync(TMP, { recursive: true, force: true }); +}); + +function writeFile(relativePath: string, content: string) { + const fullPath = join(TMP, relativePath); + mkdirSync(join(fullPath, '..'), { recursive: true }); + writeFileSync(fullPath, content); +} + +/** A realistic component file with an export and JSX via return(). */ +function simpleComponent(name: string) { + return [ + `import React from 'react';`, + `export function ${name}() {`, + ` return
${name}
;`, + `}`, + ].join('\n'); +} + +describe('findCandidates', () => { + it('finds exported components with JSX', () => { + writeFile('src/Button.tsx', simpleComponent('Button')); + expect(findCandidates(TMP)).toEqual(['src/Button.tsx']); + }); + + it('skips files without exports', () => { + writeFile( + 'src/Internal.tsx', + `function Internal() { return
hi
; }` + ); + expect(findCandidates(TMP)).toEqual([]); + }); + + it('skips files without JSX', () => { + writeFile('src/utils.tsx', `export const add = (a: number, b: number) => a + b;`); + expect(findCandidates(TMP)).toEqual([]); + }); + + it('skips test, spec, and story files', () => { + writeFile('src/Button.test.tsx', simpleComponent('X')); + writeFile('src/Button.spec.tsx', simpleComponent('X')); + writeFile('src/Button.stories.tsx', simpleComponent('X')); + writeFile('src/Button.story.tsx', simpleComponent('X')); + expect(findCandidates(TMP)).toEqual([]); + }); + + it('skips config files', () => { + writeFile('src/app.config.tsx', simpleComponent('X')); + expect(findCandidates(TMP)).toEqual([]); + }); + + it('sorts by complexity (simpler first)', () => { + writeFile('src/Simple.tsx', simpleComponent('Simple')); + const lines = [ + `import React from 'react';`, + `import { useState } from 'react';`, + `import { useEffect } from 'react';`, + `import { useCallback } from 'react';`, + `import { useMemo } from 'react';`, + ...Array.from({ length: 40 }, (_, i) => `const line${i} = ${i};`), + `export function Complex() { return
{line0}
; }`, + ]; + writeFile('src/Complex.tsx', lines.join('\n')); + + const candidates = findCandidates(TMP); + expect(candidates.indexOf('src/Simple.tsx')).toBeLessThan( + candidates.indexOf('src/Complex.tsx') + ); + }); + + it('limits to 20 candidates', () => { + for (let i = 0; i < 25; i++) { + writeFile(`src/Comp${i}.tsx`, simpleComponent(`Comp${i}`)); + } + expect(findCandidates(TMP)).toHaveLength(20); + }); + + it('returns empty for empty project', () => { + expect(findCandidates(TMP)).toEqual([]); + }); + + it('finds components using uppercase JSX tags', () => { + writeFile( + 'src/Wrapper.tsx', + `import { Container } from './ui';\nexport const Wrapper = () => hi;` + ); + expect(findCandidates(TMP)).toEqual(['src/Wrapper.tsx']); + }); +}); diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index eda035717b77..f7f6ecd7871d 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -3,13 +3,18 @@ import { join } from "node:path"; import type { GhostStoriesResult } from "../types.ts"; import { logStep, logSuccess, logError, exec } from "./utils.ts"; +// Reuse core ghost-stories utilities via relative imports +import { getComponentComplexity } from "../../../code/core/src/core-server/utils/ghost-stories/component-analyzer.ts"; +import { parseVitestResults } from "../../../code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts"; + /** * Run ghost stories: discover candidate components, auto-generate stories * via the addon-vitest componentTransform, and measure rendering success. * - * Mirrors the approach in core-server/utils/ghost-stories/run-story-tests.ts: - * - Pass component paths as both CLI args (so vitest runs them) and - * STORYBOOK_COMPONENT_PATHS env var (so the transform plugin activates) + * Reuses parseVitestResults and getComponentComplexity from core. 
+ * Candidate discovery uses a lightweight regex approach here because the + * core's getComponentCandidates depends on storybook/internal/babel which + * isn't resolvable from the scripts/ workspace. */ export async function runGhostStories( projectPath: string, @@ -49,15 +54,12 @@ export async function runGhostStories( try { const report = JSON.parse(readFileSync(reportPath, "utf-8")); - let total = 0; - let passed = 0; - for (const suite of report.testResults ?? []) { - for (const test of suite.assertionResults ?? []) { - total++; - if (test.status === "passed") passed++; - } + const { summary } = parseVitestResults(report); + if (!summary) { + logError("Ghost stories: no test results in Vitest report"); + return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; } - const successRate = total > 0 ? Math.round((passed / total) * 100) / 100 : 0; + const { total, passed, successRate } = summary; if (total > 0) logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); return { candidateCount: candidates.length, total, passed, successRate }; } catch { @@ -66,8 +68,11 @@ export async function runGhostStories( } } -/** Find .tsx/.jsx files that look like React components, sorted by simplicity. */ -function findCandidates(projectPath: string): string[] { +/** + * Find .tsx/.jsx files that look like React components, sorted by complexity. + * Uses getComponentComplexity from core for consistent scoring. 
+ */ +export function findCandidates(projectPath: string): string[] { const SKIP = new Set(["node_modules", ".storybook", "dist", "build", ".git"]); const files = globSync("**/*.{tsx,jsx}", { cwd: projectPath, @@ -81,8 +86,7 @@ function findCandidates(projectPath: string): string[] { const content = readFileSync(join(projectPath, f), "utf-8"); if (!/export\s/.test(content)) return null; if (!/<[A-Z]/.test(content) && !/return\s*\(?\s* l.trim()).length; - return { path: f, complexity: Math.min(1, lines / 100) }; + return { path: f, complexity: getComponentComplexity(content) }; } catch { return null; } diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts new file mode 100644 index 000000000000..1576cf287793 --- /dev/null +++ b/scripts/eval/lib/grade.test.ts @@ -0,0 +1,138 @@ +import { describe, expect, it } from 'vitest'; + +import { + filterStorybookFiles, + computeQualityScore, + countTypeCheckErrors, + parseChangedFiles, +} from './grade'; +import type { ChangedFile } from '../types'; + +describe('filterStorybookFiles', () => { + it('matches files in .storybook/ directory', () => { + const files: ChangedFile[] = [ + { path: '.storybook/main.ts', status: 'M' }, + { path: '.storybook/preview.tsx', status: 'A' }, + { path: 'src/App.tsx', status: 'M' }, + ]; + const result = filterStorybookFiles(files); + expect(result).toHaveLength(2); + expect(result.map((f) => f.path)).toEqual(['.storybook/main.ts', '.storybook/preview.tsx']); + }); + + it('matches story files with various extensions', () => { + const files: ChangedFile[] = [ + { path: 'src/Button.stories.tsx', status: 'A' }, + { path: 'src/Header.stories.ts', status: 'A' }, + { path: 'src/Page.story.jsx', status: 'A' }, + { path: 'src/utils.stories.js', status: 'A' }, + { path: 'src/Button.tsx', status: 'M' }, + { path: 'src/Button.test.tsx', status: 'M' }, + ]; + const result = filterStorybookFiles(files); + expect(result).toHaveLength(4); + }); + + it('returns empty for no storybook 
files', () => { + const files: ChangedFile[] = [ + { path: 'src/App.tsx', status: 'M' }, + { path: 'package.json', status: 'M' }, + ]; + expect(filterStorybookFiles(files)).toHaveLength(0); + }); + + it('handles empty input', () => { + expect(filterStorybookFiles([])).toHaveLength(0); + }); +}); + +describe('computeQualityScore', () => { + it('returns 1.0 for passing build and zero TS errors', () => { + const result = computeQualityScore(true, 0); + expect(result.score).toBe(1); + expect(result.breakdown.build).toBe(1); + expect(result.breakdown.typecheck).toBe(1); + }); + + it('returns 0.7 for passing build with many TS errors', () => { + const result = computeQualityScore(true, 100); + expect(result.score).toBe(0.7); + expect(result.breakdown.build).toBe(1); + expect(result.breakdown.typecheck).toBe(0); + }); + + it('returns 0.3 for failing build with zero TS errors', () => { + const result = computeQualityScore(false, 0); + expect(result.score).toBe(0.3); + expect(result.breakdown.build).toBe(0); + expect(result.breakdown.typecheck).toBe(1); + }); + + it('returns 0 for failing build with many TS errors', () => { + const result = computeQualityScore(false, 20); + expect(result.score).toBe(0); + expect(result.breakdown.build).toBe(0); + expect(result.breakdown.typecheck).toBe(0); + }); + + it('scales typecheck score linearly', () => { + // 10 errors -> tcScore = 1 - 10/20 = 0.5 + const result = computeQualityScore(true, 10); + expect(result.score).toBe(0.85); // 0.7 + 0.5*0.3 + expect(result.breakdown.typecheck).toBe(0.5); + }); + + it('clamps typecheck score at 0 for >= 20 errors', () => { + expect(computeQualityScore(true, 20).breakdown.typecheck).toBe(0); + expect(computeQualityScore(true, 50).breakdown.typecheck).toBe(0); + }); +}); + +describe('countTypeCheckErrors', () => { + it('counts zero for clean output', () => { + expect(countTypeCheckErrors('')).toBe(0); + expect(countTypeCheckErrors('All good\nNo issues')).toBe(0); + }); + + it('counts TypeScript 
error codes', () => { + const output = [ + "src/App.tsx(3,1): error TS2304: Cannot find name 'foo'.", + "src/App.tsx(5,1): error TS2322: Type 'string' is not assignable.", + 'Found 2 errors.', + ].join('\n'); + expect(countTypeCheckErrors(output)).toBe(2); + }); + + it('counts multiple errors on the same line', () => { + const output = 'error TS1234 and error TS5678 on same line'; + expect(countTypeCheckErrors(output)).toBe(2); + }); + + it('does not count non-error TS references', () => { + expect(countTypeCheckErrors('TS2304 without error prefix')).toBe(0); + expect(countTypeCheckErrors('warning TS1234')).toBe(0); + }); +}); + +describe('parseChangedFiles', () => { + it('parses added, modified, deleted, and renamed files', () => { + const output = 'A\tsrc/new-file.ts\nM\tsrc/existing.ts\nD\tsrc/removed.ts\nR100\told.ts\tnew.ts'; + const result = parseChangedFiles(output); + expect(result).toEqual([ + { path: 'src/new-file.ts', status: 'A' }, + { path: 'src/existing.ts', status: 'M' }, + { path: 'src/removed.ts', status: 'D' }, + { path: 'old.ts\tnew.ts', status: 'R' }, + ]); + }); + + it('handles empty output', () => { + expect(parseChangedFiles('')).toEqual([]); + expect(parseChangedFiles('\n')).toEqual([]); + }); + + it('handles single file', () => { + const result = parseChangedFiles('M\tpackage.json'); + expect(result).toEqual([{ path: 'package.json', status: 'M' }]); + }); +}); diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 9bfd6e5ab986..00ac19c1ccd0 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -5,15 +5,45 @@ import { logStep, logSuccess, logError, exec } from "./utils.ts"; import { detectSetupPatterns } from "./setup-patterns.ts"; import { runGhostStories } from "./ghost-stories.ts"; +/** Filter changed files to only storybook-related ones. 
*/ +export function filterStorybookFiles(changedFiles: ChangedFile[]): ChangedFile[] { + return changedFiles.filter( + (f) => f.path.includes(".storybook/") || /\.(stories|story)\.[tj]sx?$/.test(f.path), + ); +} + +/** Compute quality score: 70% build + 30% typecheck. */ +export function computeQualityScore(buildSuccess: boolean, typeCheckErrors: number): QualityResult { + const buildScore = buildSuccess ? 1 : 0; + const tcScore = Math.max(0, 1 - typeCheckErrors / 20); + const score = Math.round((buildScore * 0.7 + tcScore * 0.3) * 100) / 100; + return { score, breakdown: { build: buildScore, typecheck: Math.round(tcScore * 100) / 100 } }; +} + +/** Count TypeScript errors from tsc output. */ +export function countTypeCheckErrors(tscOutput: string): number { + return (tscOutput.match(/error TS\d+/g) || []).length; +} + +/** Parse git diff --name-status output into ChangedFile objects. */ +export function parseChangedFiles(gitOutput: string): ChangedFile[] { + return gitOutput + .trim() + .split("\n") + .filter(Boolean) + .map((line) => { + const [status, ...parts] = line.split("\t"); + return { path: parts.join("\t"), status: (status?.charAt(0) || "M") as ChangedFile["status"] }; + }); +} + export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult; quality: QualityResult }> { const { repoRoot, projectPath, resultsDir, baselineCommit } = paths; // Changed files logStep("Collecting agent changes..."); const changedFiles = await getChangedFiles(repoRoot, baselineCommit); - const storybookFiles = changedFiles.filter( - (f) => f.path.includes(".storybook/") || /\.(stories|story)\.[tj]sx?$/.test(f.path), - ); + const storybookFiles = filterStorybookFiles(changedFiles); logSuccess(`${changedFiles.length} files changed (${storybookFiles.length} storybook-related)`); // Setup patterns @@ -41,7 +71,7 @@ export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult const tsc = await exec("npx", ["tsc", "--noEmit"], { cwd: projectPath, 
timeout: 120_000, throwOnError: false }); const tscOutput = tsc.stdout + "\n" + tsc.stderr; writeFileSync(join(resultsDir, "typecheck-output.txt"), tscOutput); - const typeCheckErrors = (tscOutput.match(/error TS\d+/g) || []).length; + const typeCheckErrors = countTypeCheckErrors(tscOutput); if (typeCheckErrors === 0) { logSuccess("No TypeScript errors"); } else { @@ -62,15 +92,9 @@ export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult ghostStories, }; - // Quality = 70% build + 30% typecheck - const buildScore = buildSuccess ? 1 : 0; - const tcScore = Math.max(0, 1 - typeCheckErrors / 20); - const score = Math.round((buildScore * 0.7 + tcScore * 0.3) * 100) / 100; + const quality = computeQualityScore(buildSuccess, typeCheckErrors); - return { - grading, - quality: { score, breakdown: { build: buildScore, typecheck: Math.round(tcScore * 100) / 100 } }, - }; + return { grading, quality }; } async function getChangedFiles(repoRoot: string, baseline: string): Promise { @@ -79,12 +103,5 @@ async function getChangedFiles(repoRoot: string, baseline: string): Promise { - const [status, ...parts] = line.split("\t"); - return { path: parts.join("\t"), status: (status?.charAt(0) || "M") as ChangedFile["status"] }; - }); + return parseChangedFiles(stdout); } diff --git a/scripts/eval/lib/grading-pipeline.test.ts b/scripts/eval/lib/grading-pipeline.test.ts new file mode 100644 index 000000000000..259ab8cdee18 --- /dev/null +++ b/scripts/eval/lib/grading-pipeline.test.ts @@ -0,0 +1,193 @@ +import { mkdirSync, writeFileSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; + +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { findCandidates } from './ghost-stories'; +import { + computeQualityScore, + countTypeCheckErrors, + filterStorybookFiles, + parseChangedFiles, +} from './grade'; +import { detectSetupPatterns } from './setup-patterns'; + +/** + * High-level test: 
simulate the grading pipeline on a fake project directory. + * Data flows from one step to the next — candidate count feeds into the + * quality assessment, patterns inform what we expect from the changed files, etc. + */ + +let TMP: string; + +beforeEach(() => { + TMP = join(tmpdir(), `eval-grading-pipeline-${Date.now()}`); + mkdirSync(join(TMP, 'src', 'components'), { recursive: true }); + mkdirSync(join(TMP, '.storybook'), { recursive: true }); +}); + +afterEach(() => { + rmSync(TMP, { recursive: true, force: true }); +}); + +function writeFile(relativePath: string, content: string) { + const fullPath = join(TMP, relativePath); + mkdirSync(join(fullPath, '..'), { recursive: true }); + writeFileSync(fullPath, content); +} + +describe('grading pipeline', () => { + it('grades a well-configured project: candidates found, patterns detected, high quality', () => { + // Set up a realistic project with components and storybook config + writeFile( + 'src/components/Button.tsx', + [ + `import React from 'react';`, + `export function Button({ label }: { label: string }) {`, + ` return (`, + ` `, + ` );`, + `}`, + ].join('\n') + ); + writeFile( + 'src/components/Card.tsx', + [ + `import React from 'react';`, + `export function Card({ title }: { title: string }) {`, + ` return (`, + `
{title}
`, + ` );`, + `}`, + ].join('\n') + ); + writeFile( + '.storybook/preview.tsx', + [ + `import '../src/styles/globals.css';`, + `import { ThemeProvider } from '@emotion/react';`, + ].join('\n') + ); + writeFile( + '.storybook/main.ts', + `export default { staticDirs: ['../public'], stories: ['../src/**/*.stories.tsx'] };` + ); + + // Step 1: Find candidates — both components should be discovered + const candidates = findCandidates(TMP); + expect(candidates).toHaveLength(2); + + // Step 2: Detect patterns — config references CSS, theme, staticDirs + const patterns = detectSetupPatterns(TMP); + const patternIds = patterns.map((p) => p.id); + expect(patternIds).toContain('global-css'); + expect(patternIds).toContain('theme-provider'); + expect(patternIds).toContain('static-dirs'); + + // Step 3: Simulate git output where the agent added storybook config + one + // story per discovered candidate, plus modified package.json + const gitLines = [ + 'A\t.storybook/preview.tsx', + 'A\t.storybook/main.ts', + ...candidates.map((c) => `A\t${c.replace(/\.tsx$/, '.stories.tsx')}`), + 'M\tpackage.json', + ]; + const changedFiles = parseChangedFiles(gitLines.join('\n')); + const storybookFiles = filterStorybookFiles(changedFiles); + + // 2 config files + 1 story per candidate = storybook-related + expect(storybookFiles).toHaveLength(2 + candidates.length); + // Total includes package.json + expect(changedFiles).toHaveLength(storybookFiles.length + 1); + + // Step 4: Build passed, no TS errors → perfect score + const quality = computeQualityScore(true, 0); + expect(quality.score).toBe(1); + }); + + it('grades a broken project: candidates found but build fails, low quality', () => { + writeFile( + 'src/components/Widget.tsx', + [ + `import React from 'react';`, + `export function Widget() {`, + ` return
hello
;`, + `}`, + ].join('\n') + ); + + // Candidates still discoverable even when storybook setup is broken + const candidates = findCandidates(TMP); + expect(candidates).toHaveLength(1); + + // Agent didn't create any .storybook config + rmSync(join(TMP, '.storybook'), { recursive: true }); + expect(detectSetupPatterns(TMP)).toEqual([]); + + // Simulate tsc output with errors proportional to candidate count + const tscLines = candidates.map( + (c, i) => `${c}(${i + 1},1): error TS2304: Cannot find name 'React'.` + ); + tscLines.push("src/App.tsx(10,5): error TS2345: Argument not assignable."); + const errorCount = countTypeCheckErrors(tscLines.join('\n')); + expect(errorCount).toBe(candidates.length + 1); + + // Build failed + errors → low quality + const quality = computeQualityScore(false, errorCount); + expect(quality.score).toBeLessThan(0.3); + expect(quality.breakdown.build).toBe(0); + }); + + it('more candidates with setup patterns yields higher confidence in the grade', () => { + // Rich project: many simple components + for (let i = 0; i < 5; i++) { + writeFile( + `src/components/Comp${i}.tsx`, + [ + `import React from 'react';`, + `export function Comp${i}() {`, + ` return
Component ${i}
;`, + `}`, + ].join('\n') + ); + } + writeFile('.storybook/preview.tsx', `import { MemoryRouter } from 'react-router-dom';`); + + const candidates = findCandidates(TMP); + expect(candidates).toHaveLength(5); + + const patterns = detectSetupPatterns(TMP); + expect(patterns.map((p) => p.id)).toContain('router-provider'); + + // Agent wrote one story per candidate — all storybook-related + const gitOutput = candidates + .map((c) => `A\t${c.replace(/\.tsx$/, '.stories.tsx')}`) + .join('\n'); + const storybookFiles = filterStorybookFiles(parseChangedFiles(gitOutput)); + expect(storybookFiles).toHaveLength(candidates.length); + + // Clean build → perfect + expect(computeQualityScore(true, 0).score).toBe(1); + }); +}); + +describe('setup-patterns only scans .storybook/', () => { + it('does not detect patterns in component source files', () => { + // Router usage in a component should NOT be detected as a setup pattern + writeFile( + 'src/App.tsx', + [ + `import React from 'react';`, + `import { BrowserRouter } from 'react-router-dom';`, + `export function App() {`, + ` return
;`, + `}`, + ].join('\n') + ); + // Empty .storybook config with no patterns + writeFile('.storybook/main.ts', `export default { stories: ['../src/**/*.stories.tsx'] };`); + + expect(detectSetupPatterns(TMP).map((p) => p.id)).not.toContain('router-provider'); + }); +}); diff --git a/scripts/eval/lib/run-task.test.ts b/scripts/eval/lib/run-task.test.ts new file mode 100644 index 000000000000..d30868c8d3e7 --- /dev/null +++ b/scripts/eval/lib/run-task.test.ts @@ -0,0 +1,243 @@ +import { existsSync, mkdirSync, readFileSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; + +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +import type { TrialConfig, TrialResult } from '../types'; + +// Mock external dependencies to avoid real git/storybook/vitest calls +vi.mock('./prepare-trial', () => ({ + prepareTrial: vi.fn(), +})); +vi.mock('./grade', () => ({ + grade: vi.fn(), +})); +vi.mock('./save', () => ({ + captureEnvironment: vi.fn().mockResolvedValue({ + nodeVersion: 'v22.21.1', + gitBranch: 'test-branch', + gitCommit: 'abc123', + }), + saveToGoogleSheets: vi.fn().mockResolvedValue(undefined), +})); +vi.mock('../config', () => ({ + agents: { + claude: { + name: 'claude', + execute: vi.fn(), + }, + codex: { + name: 'codex', + execute: vi.fn(), + }, + }, +})); + +import { agents } from '../config'; +import { grade } from './grade'; +import { prepareTrial } from './prepare-trial'; +import { runTask } from './run-task'; +import { captureEnvironment, saveToGoogleSheets } from './save'; + +let TMP: string; + +beforeEach(() => { + vi.clearAllMocks(); + TMP = join(tmpdir(), `eval-run-task-${Date.now()}`); + mkdirSync(join(TMP, 'results'), { recursive: true }); +}); + +afterEach(() => { + rmSync(TMP, { recursive: true, force: true }); +}); + +function setupMocks(overrides?: { + buildSuccess?: boolean; + typeCheckErrors?: number; + cost?: number; +}) { + const { buildSuccess = true, typeCheckErrors = 0, cost = 
0.42 } = overrides ?? {}; + + vi.mocked(prepareTrial).mockResolvedValue({ + trialDir: TMP, + repoRoot: TMP, + projectPath: TMP, + resultsDir: join(TMP, 'results'), + baselineCommit: 'deadbeef', + }); + + vi.mocked(agents.claude.execute).mockResolvedValue({ + agent: 'claude', + model: 'sonnet-4.6', + effort: 'high', + cost, + duration: 45.2, + turns: 12, + }); + + vi.mocked(grade).mockResolvedValue({ + grading: { + buildSuccess, + typeCheckErrors, + changedFiles: [ + { path: '.storybook/preview.tsx', status: 'A' }, + { path: 'src/Button.stories.tsx', status: 'A' }, + ], + storybookFiles: [ + { path: '.storybook/preview.tsx', status: 'A' }, + { path: 'src/Button.stories.tsx', status: 'A' }, + ], + setupPatterns: [{ id: 'tailwind', label: 'Tailwind CSS', sourceFiles: ['.storybook/preview.ts'] }], + }, + quality: { score: buildSuccess ? 1 : 0.3, breakdown: { build: buildSuccess ? 1 : 0, typecheck: 1 } }, + }); +} + +const baseConfig: TrialConfig = { + project: { name: 'test-project', repo: 'https://github.com/test/repo', branch: 'main' }, + agent: 'claude', + model: 'sonnet-4.6', + effort: 'high', + prompt: 'setup', +}; + +describe('runTask pipeline', () => { + it('assembles a complete TrialResult from pipeline steps', async () => { + setupMocks(); + + const result = await runTask(baseConfig, 'run-123', 'upload-456'); + + // Config fields mapped correctly + expect(result.schemaVersion).toBe(1); + expect(result.project).toBe('test-project'); + expect(result.agent).toBe('claude'); + expect(result.model).toBe('sonnet-4.6'); + expect(result.effort).toBe('high'); + expect(result.prompt).toBe('setup'); + expect(result.timestamp).toMatch(/^\d{4}-\d{2}-\d{2}T/); + + // prepareTrial output flows into result + expect(result.baselineCommit).toBe('deadbeef'); + + // Agent execution output flows into result + expect(result.execution).toEqual({ + agent: 'claude', + model: 'sonnet-4.6', + effort: 'high', + cost: 0.42, + duration: 45.2, + turns: 12, + }); + + // Grade output flows 
into result + expect(result.grading.buildSuccess).toBe(true); + expect(result.quality.score).toBe(1); + }); + + it('calls pipeline steps with correct arguments', async () => { + setupMocks(); + + const config: TrialConfig = { + ...baseConfig, + project: { name: 'mealdrop', repo: 'https://github.com/test/mealdrop', branch: 'eval-baseline' }, + }; + + await runTask(config, 'run-1', 'upload-1'); + + // prepareTrial receives the project + expect(vi.mocked(prepareTrial).mock.calls[0][0].name).toBe('mealdrop'); + + // captureEnvironment receives the results dir + expect(vi.mocked(captureEnvironment).mock.calls[0][0]).toBe(join(TMP, 'results')); + + // Agent receives real prompt content, the project path, model, and options + const [prompt, projectPath, model, options] = vi.mocked(agents.claude.execute).mock.calls[0]; + expect(prompt).toContain('Storybook setup'); + expect(projectPath).toBe(TMP); + expect(model).toBe('sonnet-4.6'); + expect(options?.effort).toBe('high'); + + // grade receives the trial paths + const gradePaths = vi.mocked(grade).mock.calls[0][0]; + expect(gradePaths.baselineCommit).toBe('deadbeef'); + expect(gradePaths.projectPath).toBe(TMP); + + // saveToGoogleSheets receives the assembled result + env + IDs + const [savedResult, savedEnv, savedRunId, savedUploadId] = + vi.mocked(saveToGoogleSheets).mock.calls[0]; + expect(savedResult.project).toBe('mealdrop'); + expect(savedEnv.gitBranch).toBe('test-branch'); + expect(savedRunId).toBe('run-1'); + expect(savedUploadId).toBe('upload-1'); + }); + + it('writes summary.json and prompt.md to results dir', async () => { + setupMocks(); + + await runTask(baseConfig, 'run-1', 'upload-1'); + + const resultsDir = join(TMP, 'results'); + + // summary.json is parseable and matches the returned result + const summary: TrialResult = JSON.parse(readFileSync(join(resultsDir, 'summary.json'), 'utf-8')); + expect(summary.schemaVersion).toBe(1); + expect(summary.execution.cost).toBe(0.42); + 
expect(summary.grading.buildSuccess).toBe(true); + + // prompt.md contains the real setup prompt + const promptContent = readFileSync(join(resultsDir, 'prompt.md'), 'utf-8'); + expect(promptContent).toContain('Storybook setup'); + }); + + it('propagates failed build into result', async () => { + setupMocks({ buildSuccess: false, typeCheckErrors: 5 }); + + const result = await runTask(baseConfig, 'run-1', 'upload-1'); + expect(result.grading.buildSuccess).toBe(false); + expect(result.quality.score).toBe(0.3); + }); + + it('does not call grade before agent finishes', async () => { + // Use execution order tracking to verify sequencing + const callOrder: string[] = []; + + vi.mocked(prepareTrial).mockImplementation(async () => { + callOrder.push('prepare'); + return { + trialDir: TMP, + repoRoot: TMP, + projectPath: TMP, + resultsDir: join(TMP, 'results'), + baselineCommit: 'deadbeef', + }; + }); + + vi.mocked(agents.claude.execute).mockImplementation(async () => { + callOrder.push('agent'); + return { agent: 'claude', model: 'sonnet-4.6', effort: 'high', cost: 0.1, duration: 10, turns: 3 }; + }); + + vi.mocked(grade).mockImplementation(async () => { + callOrder.push('grade'); + return { + grading: { + buildSuccess: true, + typeCheckErrors: 0, + changedFiles: [], + storybookFiles: [], + setupPatterns: [], + }, + quality: { score: 1, breakdown: { build: 1, typecheck: 1 } }, + }; + }); + + vi.mocked(saveToGoogleSheets).mockImplementation(async () => { + callOrder.push('save'); + }); + + await runTask(baseConfig, 'run-1', 'upload-1'); + + expect(callOrder).toEqual(['prepare', 'agent', 'grade', 'save']); + }); +}); diff --git a/scripts/eval/lib/setup-patterns.test.ts b/scripts/eval/lib/setup-patterns.test.ts new file mode 100644 index 000000000000..75318fd01531 --- /dev/null +++ b/scripts/eval/lib/setup-patterns.test.ts @@ -0,0 +1,129 @@ +import { mkdirSync, writeFileSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; + 
+import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { detectSetupPatterns } from './setup-patterns'; + +let TMP: string; + +beforeEach(() => { + TMP = join(tmpdir(), `eval-setup-patterns-${Date.now()}`); + mkdirSync(join(TMP, '.storybook'), { recursive: true }); +}); + +afterEach(() => { + rmSync(TMP, { recursive: true, force: true }); +}); + +function writeConfig(name: string, content: string) { + writeFileSync(join(TMP, '.storybook', name), content); +} + +describe('detectSetupPatterns', () => { + it('returns empty when no .storybook dir', () => { + rmSync(join(TMP, '.storybook'), { recursive: true }); + expect(detectSetupPatterns(TMP)).toEqual([]); + }); + + it('returns empty when .storybook has no matching patterns', () => { + writeConfig('main.ts', 'export default { stories: ["../src/**/*.stories.@(ts|tsx)"] };'); + expect(detectSetupPatterns(TMP)).toEqual([]); + }); + + it('detects Tailwind CSS', () => { + writeConfig('preview.ts', `import 'tailwindcss/tailwind.css';`); + expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('tailwind'); + }); + + it('detects global CSS imports', () => { + writeConfig('preview.ts', `import '../src/styles/globals.css';`); + expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('global-css'); + }); + + it('detects styled-components', () => { + writeConfig('preview.tsx', `import { createGlobalStyle } from 'styled-components';`); + expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('styled-components'); + }); + + it('detects React Router', () => { + writeConfig('preview.tsx', `import { MemoryRouter } from 'react-router-dom';`); + expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('router-provider'); + }); + + it('detects Redux provider', () => { + writeConfig( + 'preview.tsx', + `import { Provider } from 'react-redux';\n` + ); + expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('redux-provider'); + }); + + it('detects Zustand', () => { + 
writeConfig('preview.ts', `import { create } from 'zustand';`); + expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('zustand'); + }); + + it('detects GraphQL/Apollo', () => { + writeConfig('preview.tsx', `import { MockedProvider } from '@apollo/client/testing';`); + expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('graphql'); + }); + + it('detects theme providers', () => { + writeConfig('preview.tsx', `import { ThemeProvider } from '@emotion/react';`); + expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('theme-provider'); + }); + + it('detects staticDirs', () => { + writeConfig('main.ts', `export default { staticDirs: ['../public'] };`); + expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('static-dirs'); + }); + + it('detects vite alias config', () => { + writeConfig( + 'main.ts', + `export default { viteFinal: (config) => ({ ...config, resolve: { alias: { '@': './src' } } }) };` + ); + expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('vite-alias'); + }); + + it('detects multiple patterns in the same file', () => { + writeConfig( + 'preview.tsx', + [ + `import '../src/index.css';`, + `import { MemoryRouter } from 'react-router-dom';`, + `import { ThemeProvider } from '@emotion/react';`, + ].join('\n') + ); + const ids = detectSetupPatterns(TMP).map((p) => p.id); + expect(ids).toContain('global-css'); + expect(ids).toContain('router-provider'); + expect(ids).toContain('theme-provider'); + }); + + it('includes sourceFiles relative to project path', () => { + writeConfig('preview.ts', `import 'tailwindcss';`); + const tailwind = detectSetupPatterns(TMP).find((p) => p.id === 'tailwind'); + expect(tailwind?.sourceFiles).toEqual(['.storybook/preview.ts']); + }); + + it('does not false-positive on unrelated React hooks', () => { + writeConfig('preview.ts', `import { useState, useEffect } from 'react';`); + expect(detectSetupPatterns(TMP)).toEqual([]); + }); + + it('does not detect patterns in files outside .storybook/', 
() => { + // Write a router import in a source file, not in .storybook/ + mkdirSync(join(TMP, 'src'), { recursive: true }); + writeFileSync( + join(TMP, 'src', 'App.tsx'), + `import { BrowserRouter } from 'react-router-dom';` + ); + // .storybook/ has no patterns + writeConfig('main.ts', `export default { stories: ['../src/**/*.stories.tsx'] };`); + + expect(detectSetupPatterns(TMP).map((p) => p.id)).not.toContain('router-provider'); + }); +}); diff --git a/scripts/eval/lib/utils.test.ts b/scripts/eval/lib/utils.test.ts new file mode 100644 index 000000000000..2e64599d8aa6 --- /dev/null +++ b/scripts/eval/lib/utils.test.ts @@ -0,0 +1,56 @@ +import { describe, expect, it } from 'vitest'; + +import { formatDuration, formatCost, generateTrialId } from './utils'; + +describe('formatDuration', () => { + it('formats seconds under a minute', () => { + expect(formatDuration(0)).toBe('0s'); + expect(formatDuration(1)).toBe('1s'); + expect(formatDuration(45)).toBe('45s'); + }); + + it('rounds fractional seconds', () => { + expect(formatDuration(2.7)).toBe('3s'); + expect(formatDuration(59.4)).toBe('59s'); + }); + + it('formats minutes and seconds', () => { + expect(formatDuration(60)).toBe('1m0s'); + expect(formatDuration(61)).toBe('1m1s'); + expect(formatDuration(90)).toBe('1m30s'); + expect(formatDuration(125)).toBe('2m5s'); + expect(formatDuration(3661)).toBe('61m1s'); + }); +}); + +describe('formatCost', () => { + it('returns dash for undefined', () => { + expect(formatCost(undefined)).toBe('-'); + expect(formatCost()).toBe('-'); + }); + + it('formats dollar amounts', () => { + expect(formatCost(0)).toBe('$0.00'); + expect(formatCost(1.5)).toBe('$1.50'); + }); +}); + +describe('generateTrialId', () => { + it('contains project, model, and prompt', () => { + const id = generateTrialId('mealdrop', 'claude', 'sonnet-4.6', 'setup'); + expect(id).toContain('mealdrop'); + expect(id).toContain('sonnet-4.6'); + expect(id).toContain('setup'); + }); + + it('starts with an ISO-like 
timestamp', () => { + const id = generateTrialId('proj', 'agent', 'model', 'prompt'); + expect(id).toMatch(/^\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}/); + }); + + it('generates unique IDs', () => { + const a = generateTrialId('p', 'a', 'm', 'pr'); + const b = generateTrialId('p', 'a', 'm', 'pr'); + expect(a).not.toBe(b); + }); +}); diff --git a/scripts/eval/types.test.ts b/scripts/eval/types.test.ts new file mode 100644 index 000000000000..0ba18bf625ea --- /dev/null +++ b/scripts/eval/types.test.ts @@ -0,0 +1,27 @@ +import { describe, expect, it } from 'vitest'; + +import { AGENTS } from './types'; + +describe('AGENTS config', () => { + it('has claude and codex agents', () => { + expect(AGENTS).toHaveProperty('claude'); + expect(AGENTS).toHaveProperty('codex'); + }); + + it('each agent has a non-empty models list', () => { + for (const config of Object.values(AGENTS)) { + expect(config.models.length).toBeGreaterThan(0); + } + }); + + it('each agent defaultModel is in its models list', () => { + for (const config of Object.values(AGENTS)) { + expect(config.models).toContain(config.defaultModel); + } + }); + + it('no model is shared between agents', () => { + const allModels = Object.values(AGENTS).flatMap((a) => a.models); + expect(new Set(allModels).size).toBe(allModels.length); + }); +}); From 5051a6dd46e65b47d161d8a305b1ffaef7bb1104 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sat, 28 Mar 2026 11:05:37 +0700 Subject: [PATCH 25/63] Simplify eval harness: merge config, options objects, remove duplication MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Delete config.ts and generate-prompt.ts — merge PROJECTS into types.ts, prompts into utils.ts, inline agents map into run-task.ts - computeQualityScore takes options object instead of 4 positional params - Quality score now includes ghost stories (40%), build (25%), typecheck (25%), and performance (10%) - exec() uses tinyexec native timeout instead of manual 
AbortController - Codex agent tracks token usage and estimates cost from pricing table - Environment fields renamed to evalBranch/evalCommit for clarity - IPC sentinel shared as exported constant between eval.ts and eval-parallel.ts - Summary tables now show quality score column - setup-patterns uses object array instead of positional tuples - prepare-repos.ts uses shared exec(), static imports, consistent quotes - google-apps-script.js modernized to const/let + arrow functions - Remove SupportedModel type alias (was just string) - Fix .gitignore trailing newline, prompt no longer hardcodes React+Vite - MAX_TURNS extracted as named constant in claude agent --- .gitignore | 2 +- scripts/eval/config.test.ts | 31 ------- scripts/eval/config.ts | 50 ----------- scripts/eval/eval-parallel.ts | 20 ++--- scripts/eval/eval.ts | 13 +-- scripts/eval/google-apps-script.js | 43 +++++----- scripts/eval/lib/agents/claude-code.ts | 8 +- scripts/eval/lib/agents/codex.ts | 39 +++++++-- scripts/eval/lib/generate-prompt.test.ts | 46 ---------- scripts/eval/lib/generate-prompt.ts | 20 ----- scripts/eval/lib/grade.test.ts | 100 +++++++++++++++------- scripts/eval/lib/grade.ts | 53 +++++++++--- scripts/eval/lib/grading-pipeline.test.ts | 12 +-- scripts/eval/lib/run-task.test.ts | 34 +++----- scripts/eval/lib/run-task.ts | 17 ++-- scripts/eval/lib/save.ts | 20 +++-- scripts/eval/lib/setup-patterns.ts | 24 +++--- scripts/eval/lib/utils.test.ts | 45 +++++++++- scripts/eval/lib/utils.ts | 59 +++++++------ scripts/eval/prepare-repos.ts | 27 ++---- scripts/eval/prompts/setup.md | 2 +- scripts/eval/types.test.ts | 23 ++++- scripts/eval/types.ts | 53 ++++++++++-- 23 files changed, 397 insertions(+), 344 deletions(-) delete mode 100644 scripts/eval/config.test.ts delete mode 100644 scripts/eval/config.ts delete mode 100644 scripts/eval/lib/generate-prompt.test.ts delete mode 100644 scripts/eval/lib/generate-prompt.ts diff --git a/.gitignore b/.gitignore index 1afe80035d56..d2c36bd45454 100644 
--- a/.gitignore +++ b/.gitignore @@ -83,4 +83,4 @@ CLAUDE.local.md # Eval system scripts/eval/.cache -scripts/eval/results \ No newline at end of file +scripts/eval/results diff --git a/scripts/eval/config.test.ts b/scripts/eval/config.test.ts deleted file mode 100644 index f6d5aba687e8..000000000000 --- a/scripts/eval/config.test.ts +++ /dev/null @@ -1,31 +0,0 @@ -import { describe, expect, it } from 'vitest'; - -import { PROJECTS, agents } from './config'; - -describe('PROJECTS', () => { - it('has at least one project', () => { - expect(PROJECTS.length).toBeGreaterThan(0); - }); - - it('each project has name, repo URL, and branch', () => { - for (const project of PROJECTS) { - expect(project.name).toBeTruthy(); - expect(project.repo).toMatch(/^https:\/\/github\.com\//); - expect(project.branch).toBeTruthy(); - } - }); - - it('project names are unique', () => { - const names = PROJECTS.map((p) => p.name); - expect(new Set(names).size).toBe(names.length); - }); -}); - -describe('agents', () => { - it('each agent has a name and execute method', () => { - for (const agent of Object.values(agents)) { - expect(agent.name).toBeTruthy(); - expect(typeof agent.execute).toBe('function'); - } - }); -}); diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts deleted file mode 100644 index 31e7b4a0a395..000000000000 --- a/scripts/eval/config.ts +++ /dev/null @@ -1,50 +0,0 @@ -import type { AgentName, Agent } from "./types.ts"; -import type { Project } from "./types.ts"; -import { claudeAgent } from "./lib/agents/claude-code.ts"; -import { codexAgent } from "./lib/agents/codex.ts"; - -export const PROJECTS: Project[] = [ - { - name: "mealdrop", - repo: "https://github.com/kasperpeulen/mealdrop", - branch: "eval-baseline", - description: "Styled components, Redux, React Router", - }, - { - name: "edgy", - repo: "https://github.com/kasperpeulen/edgy", - branch: "eval-baseline", - description: "Tailwind, HeadlessUI, React Router", - }, - { - name: "wikitok", - repo: 
"https://github.com/kasperpeulen/wikitok", - branch: "eval-baseline", - projectDir: "frontend", - description: "Simple project with Tailwind", - }, - { - name: "baklava", - repo: "https://github.com/kasperpeulen/baklava", - branch: "eval-baseline", - description: "Component library with Zustand", - }, - { - name: "echarts", - repo: "https://github.com/kasperpeulen/echarts-react", - branch: "eval-baseline", - description: "ECharts React wrapper", - }, - { - name: "evergreen-ci", - repo: "https://github.com/kasperpeulen/ui", - branch: "eval-baseline", - projectDir: "packages/lib", - description: "GraphQL", - }, -]; - -export const agents: Record = { - claude: claudeAgent, - codex: codexAgent, -}; diff --git a/scripts/eval/eval-parallel.ts b/scripts/eval/eval-parallel.ts index d0032e9f78c6..a66a5d633edf 100644 --- a/scripts/eval/eval-parallel.ts +++ b/scripts/eval/eval-parallel.ts @@ -4,11 +4,10 @@ import { spawn } from "node:child_process"; import { createInterface } from "node:readline"; import { parseArgs } from "node:util"; import pc from "picocolors"; -import { AGENTS } from "./types.ts"; +import { AGENTS, PROJECTS } from "./types.ts"; import type { TrialResult } from "./types.ts"; -import { PROJECTS } from "./config.ts"; -import { listPrompts } from "./lib/generate-prompt.ts"; -import { formatDuration, formatCost } from "./lib/utils.ts"; +import { formatDuration, formatCost, listPrompts } from "./lib/utils.ts"; +import { RESULT_SENTINEL } from "./eval.ts"; const { values: opts } = parseArgs({ options: { @@ -58,8 +57,8 @@ function spawnRun(agent: string, model: string, prompt: string, label: string): let result: TrialResult | null = null; createInterface({ input: child.stdout! 
}).on("line", (line) => { - if (line.startsWith("__RESULT__")) { - try { result = JSON.parse(line.slice("__RESULT__".length)); } catch { /* skip */ } + if (line.startsWith(RESULT_SENTINEL)) { + try { result = JSON.parse(line.slice(RESULT_SENTINEL.length)); } catch { /* skip */ } } else { console.log(`${tag} ${line}`); } @@ -84,13 +83,13 @@ if (results.length > 0) { results.sort((a, b) => (b.grading.ghostStories?.successRate ?? -1) - (a.grading.ghostStories?.successRate ?? -1)); console.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); - console.log("=".repeat(120)); + console.log("=".repeat(130)); console.log( - ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Cost", "Time", "Turns"] + ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Score", "Cost", "Time", "Turns"] .map((h, i) => h.padEnd(i <= 1 ? 14 : i === 2 ? 12 : 10)) .join(" | "), ); - console.log("-".repeat(120)); + console.log("-".repeat(130)); for (const r of results) { const ghost = r.grading.ghostStories; @@ -103,6 +102,7 @@ if (results.length > 0) { (r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL")).padEnd(10 + 10), ghostStr.padEnd(10), String(r.grading.typeCheckErrors).padEnd(10), + String(r.quality.score).padEnd(10), formatCost(r.execution.cost).padEnd(10), formatDuration(r.execution.duration).padEnd(10), String(r.execution.turns).padEnd(10), @@ -110,7 +110,7 @@ if (results.length > 0) { ); } - console.log("-".repeat(120)); + console.log("-".repeat(130)); const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r): r is number => r != null); const avgGhost = ghostRates.length > 0 ? 
ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 6797b04d8e70..144f520705ad 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -2,11 +2,12 @@ import { randomUUID } from "node:crypto"; import { parseArgs } from "node:util"; import pc from "picocolors"; import type { TrialConfig, AgentName, Effort } from "./types.ts"; -import { AGENTS } from "./types.ts"; -import { PROJECTS } from "./config.ts"; +import { AGENTS, PROJECTS } from "./types.ts"; import { runTask } from "./lib/run-task.ts"; -import { listPrompts } from "./lib/generate-prompt.ts"; -import { log, formatDuration, formatCost } from "./lib/utils.ts"; +import { log, formatDuration, formatCost, listPrompts } from "./lib/utils.ts"; + +/** Sentinel for structured IPC with eval-parallel.ts. */ +export const RESULT_SENTINEL = "__EVAL_RESULT_d3f1a8b2__"; const { values: opts } = parseArgs({ options: { @@ -50,7 +51,6 @@ let agent: AgentName; let model: string; if (opts.model) { - // Find which agent owns this model const match = Object.entries(AGENTS).find(([, cfg]) => cfg.models.includes(opts.model as string)); if (!match) { const all = Object.values(AGENTS).flatMap((cfg) => cfg.models); @@ -95,11 +95,12 @@ try { log(` Build: ${result.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL")}`); log(` Ghost: ${ghostStr}`); log(` TS Err: ${result.grading.typeCheckErrors}`); + log(` Score: ${result.quality.score}`); log(` Cost: ${formatCost(result.execution.cost)}`); log(` Time: ${formatDuration(result.execution.duration)}`); log(` Turns: ${result.execution.turns}`); - console.log(`__RESULT__${JSON.stringify(result)}`); + console.log(`${RESULT_SENTINEL}${JSON.stringify(result)}`); } catch (error) { log(pc.red(`\nFailed: ${error instanceof Error ? 
error.message : error}`)); process.exit(1); diff --git a/scripts/eval/google-apps-script.js b/scripts/eval/google-apps-script.js index 164d8a059bdd..bbe56318fa22 100644 --- a/scripts/eval/google-apps-script.js +++ b/scripts/eval/google-apps-script.js @@ -17,55 +17,52 @@ * Click "Review Permissions" → Select account → "Advanced" → "Go to [project] (unsafe)" → "Allow" */ -function toTitleCase(key) { - return key +const toTitleCase = (key) => + key .replace(/([A-Z])/g, " $1") .replace(/^./, (str) => str.toUpperCase()) .trim(); -} -function ensureHeaders(sheet, keys) { - const firstHeaderCell = sheet.getRange(1, 1).getValue(); - if (firstHeaderCell === "") { +const ensureHeaders = (sheet, keys) => { + if (sheet.getRange(1, 1).getValue() === "") { const headers = keys.map(toTitleCase); sheet.getRange(1, 1, 1, headers.length).setValues([headers]); sheet.getRange(1, 1, 1, headers.length).setFontWeight("bold"); } -} +}; -function appendRow(sheet, keys, rowData) { - var lock = LockService.getScriptLock(); +const appendRow = (sheet, rowData) => { + const lock = LockService.getScriptLock(); try { lock.waitLock(120000); - var lastRow = sheet.getLastRow(); - var targetRow = lastRow < 1 ? 2 : lastRow + 1; + const lastRow = sheet.getLastRow(); + const targetRow = lastRow < 1 ? 2 : lastRow + 1; sheet.getRange(targetRow, 1, 1, rowData.length).setValues([rowData]); SpreadsheetApp.flush(); return targetRow; } finally { lock.releaseLock(); } -} +}; -function prepareRowData(keys, data) { - return keys.map(function (key) { - var value = data[key]; +const prepareRowData = (keys, data) => + keys.map((key) => { + const value = data[key]; if (typeof value === "boolean") return value ? 
"TRUE" : "FALSE"; if (value === null || value === undefined) return ""; return value; }); -} // eslint-disable-next-line @typescript-eslint/no-unused-vars function doPost(e) { try { - var data = JSON.parse(e.postData.contents); - var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet(); - var keys = Object.keys(data); - var rowData = prepareRowData(keys, data); + const data = JSON.parse(e.postData.contents); + const sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet(); + const keys = Object.keys(data); + const rowData = prepareRowData(keys, data); ensureHeaders(sheet, keys); - var targetRow = appendRow(sheet, keys, rowData); + const targetRow = appendRow(sheet, rowData); return ContentService.createTextOutput( JSON.stringify({ success: true, row: targetRow }), @@ -79,7 +76,7 @@ function doPost(e) { // eslint-disable-next-line @typescript-eslint/no-unused-vars function authorize() { - var spreadsheet = SpreadsheetApp.getActiveSpreadsheet(); - var file = DriveApp.getFileById(spreadsheet.getId()); + const spreadsheet = SpreadsheetApp.getActiveSpreadsheet(); + const file = DriveApp.getFileById(spreadsheet.getId()); console.log("Authorized! 
File:", file.getName()); } diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index 976be34b07f6..08fc66e216c7 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -2,7 +2,7 @@ import type { SDKMessage } from "@anthropic-ai/claude-agent-sdk"; import { query } from "@anthropic-ai/claude-agent-sdk"; import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { Agent, Effort, ExecutionResult, SupportedModel } from "../../types.ts"; +import type { Agent, Effort, ExecutionResult } from "../../types.ts"; function logMessage(message: SDKMessage) { const log = (prefix: string, text: string) => process.stderr.write(`${prefix} ${text}\n`); @@ -69,6 +69,8 @@ function logMessage(message: SDKMessage) { } } +const MAX_TURNS = 50; + /** Map clean model names to Claude SDK model IDs */ const CLAUDE_MODEL_MAP: Record = { "sonnet-4.6": "claude-sonnet-4-6", @@ -82,7 +84,7 @@ export const claudeAgent: Agent = { async execute( prompt: string, projectPath: string, - model: SupportedModel, + model: string, options?: { effort?: Effort; resultsDir?: string }, ): Promise { const { effort = "high", resultsDir } = options ?? {}; @@ -99,7 +101,7 @@ export const claudeAgent: Agent = { model: CLAUDE_MODEL_MAP[model] ?? 
model, cwd: projectPath, allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"], - maxTurns: 50, + maxTurns: MAX_TURNS, effort, debug: true, systemPrompt: { type: "preset", preset: "claude_code" }, diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index da7523839f28..6b8e096fc2b5 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -1,7 +1,28 @@ import { Codex } from "@openai/codex-sdk"; import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { Agent, Effort, ExecutionResult, SupportedModel } from "../../types.ts"; +import type { Agent, Effort, ExecutionResult } from "../../types.ts"; + +/** Per-million-token pricing for Codex/OpenAI models (USD). */ +const OPENAI_PRICING: Record = { + "gpt-5.4": { input: 2.50, cachedInput: 0.625, output: 10.00 }, +}; + +function estimateCost( + model: string, + inputTokens: number, + cachedInputTokens: number, + outputTokens: number, +): number | undefined { + const pricing = OPENAI_PRICING[model]; + if (!pricing) return undefined; + const freshInput = inputTokens - cachedInputTokens; + return ( + (freshInput / 1_000_000) * pricing.input + + (cachedInputTokens / 1_000_000) * pricing.cachedInput + + (outputTokens / 1_000_000) * pricing.output + ); +} const CODEX_EFFORT: Record = { low: "low", @@ -16,7 +37,7 @@ export const codexAgent: Agent = { async execute( prompt: string, projectPath: string, - model: SupportedModel, + model: string, options?: { effort?: Effort; verbose?: boolean; resultsDir?: string }, ): Promise { const { effort = "high", resultsDir } = options ?? 
{}; @@ -33,7 +54,10 @@ export const codexAgent: Agent = { const { events } = await thread.runStreamed(prompt); const items: unknown[] = []; - // Token tracking not yet exposed in result — logged per-turn for visibility + let totalInput = 0; + let totalCached = 0; + let totalOutput = 0; + let turns = 0; for await (const event of events) { switch (event.type) { @@ -63,6 +87,10 @@ export const codexAgent: Agent = { break; } case "turn.completed": + totalInput += event.usage.input_tokens; + totalCached += event.usage.cached_input_tokens; + totalOutput += event.usage.output_tokens; + turns++; log("📊", `tokens: ${event.usage.input_tokens}in / ${event.usage.output_tokens}out (${event.usage.cached_input_tokens} cached)`); break; case "turn.failed": @@ -75,12 +103,13 @@ export const codexAgent: Agent = { } const duration = (Date.now() - startTime) / 1000; - log("✅", `Done — ${items.length} items, ${Math.round(duration)}s`); + const cost = estimateCost(model, totalInput, totalCached, totalOutput); + log("✅", `Done — ${turns} turns, ${Math.round(duration)}s, ${totalInput}in/${totalOutput}out tokens${cost != null ? 
`, $${cost.toFixed(4)}` : ""}`); if (resultsDir) { writeFileSync(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); } - return { agent: "codex", model, effort, duration, turns: items.length }; + return { agent: "codex", model, effort, cost, duration, turns }; }, }; diff --git a/scripts/eval/lib/generate-prompt.test.ts b/scripts/eval/lib/generate-prompt.test.ts deleted file mode 100644 index 070fd972bfae..000000000000 --- a/scripts/eval/lib/generate-prompt.test.ts +++ /dev/null @@ -1,46 +0,0 @@ -import { describe, expect, it } from 'vitest'; - -import { generatePrompt, listPrompts } from './generate-prompt'; - -describe('listPrompts', () => { - it('lists available prompt names', () => { - const prompts = listPrompts(); - expect(prompts).toContain('setup'); - expect(prompts).toContain('self-heal'); - }); - - it('returns only names without .md extension', () => { - const prompts = listPrompts(); - for (const name of prompts) { - expect(name).not.toContain('.md'); - } - }); -}); - -describe('generatePrompt', () => { - it('loads setup prompt by default', () => { - const prompt = generatePrompt(); - expect(prompt).toContain('Storybook'); - expect(prompt.length).toBeGreaterThan(0); - }); - - it('loads setup prompt by name', () => { - const prompt = generatePrompt('setup'); - expect(prompt).toContain('Storybook setup'); - }); - - it('loads self-heal prompt', () => { - const prompt = generatePrompt('self-heal'); - expect(prompt).toContain('Self-healing'); - expect(prompt).toContain('vitest'); - }); - - it('throws for unknown prompt', () => { - expect(() => generatePrompt('nonexistent-prompt-xyz')).toThrow('Prompt not found'); - }); - - it('returns trimmed content', () => { - const prompt = generatePrompt('setup'); - expect(prompt).toBe(prompt.trim()); - }); -}); diff --git a/scripts/eval/lib/generate-prompt.ts b/scripts/eval/lib/generate-prompt.ts deleted file mode 100644 index 1debad6620ba..000000000000 --- a/scripts/eval/lib/generate-prompt.ts +++ 
/dev/null @@ -1,20 +0,0 @@ -import { readFileSync, existsSync, readdirSync } from "node:fs"; -import { resolve, basename } from "node:path"; -import { PROMPTS_DIR } from "./utils.ts"; - -/** Load a prompt by name from prompts/{name}.md. Defaults to "setup". */ -export function generatePrompt(name = "setup"): string { - const file = resolve(PROMPTS_DIR, `${name}.md`); - if (!existsSync(file)) { - throw new Error(`Prompt not found: ${file}\nAvailable: ${listPrompts().join(", ")}`); - } - return readFileSync(file, "utf-8").trim(); -} - -/** List available prompt names. */ -export function listPrompts(): string[] { - if (!existsSync(PROMPTS_DIR)) return []; - return readdirSync(PROMPTS_DIR) - .filter((f) => f.endsWith(".md")) - .map((f) => basename(f, ".md")); -} diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts index 1576cf287793..ee68248aea7d 100644 --- a/scripts/eval/lib/grade.test.ts +++ b/scripts/eval/lib/grade.test.ts @@ -29,8 +29,7 @@ describe('filterStorybookFiles', () => { { path: 'src/Button.tsx', status: 'M' }, { path: 'src/Button.test.tsx', status: 'M' }, ]; - const result = filterStorybookFiles(files); - expect(result).toHaveLength(4); + expect(filterStorybookFiles(files)).toHaveLength(4); }); it('returns empty for no storybook files', () => { @@ -47,44 +46,86 @@ describe('filterStorybookFiles', () => { }); describe('computeQualityScore', () => { - it('returns 1.0 for passing build and zero TS errors', () => { - const result = computeQualityScore(true, 0); + // Weights: 40% ghost, 25% build, 25% typecheck, 10% performance + + it('returns 1.0 when everything passes and agent is fast', () => { + const result = computeQualityScore({ + buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 60, + }); expect(result.score).toBe(1); - expect(result.breakdown.build).toBe(1); - expect(result.breakdown.typecheck).toBe(1); + expect(result.breakdown).toEqual({ build: 1, typecheck: 1, ghostStories: 1, performance: 1 
}); + }); + + it('ghost stories have 40% weight', () => { + const result = computeQualityScore({ + buildSuccess: false, typeCheckErrors: 20, ghostSuccessRate: 1.0, durationSeconds: 600, + }); + expect(result.score).toBe(0.4); }); - it('returns 0.7 for passing build with many TS errors', () => { - const result = computeQualityScore(true, 100); - expect(result.score).toBe(0.7); - expect(result.breakdown.build).toBe(1); - expect(result.breakdown.typecheck).toBe(0); + it('build has 25% weight', () => { + const result = computeQualityScore({ + buildSuccess: true, typeCheckErrors: 20, ghostSuccessRate: 0, durationSeconds: 600, + }); + expect(result.score).toBe(0.25); }); - it('returns 0.3 for failing build with zero TS errors', () => { - const result = computeQualityScore(false, 0); - expect(result.score).toBe(0.3); - expect(result.breakdown.build).toBe(0); - expect(result.breakdown.typecheck).toBe(1); + it('performance has 10% weight', () => { + const result = computeQualityScore({ + buildSuccess: false, typeCheckErrors: 20, ghostSuccessRate: 0, durationSeconds: 60, + }); + expect(result.score).toBe(0.1); }); - it('returns 0 for failing build with many TS errors', () => { - const result = computeQualityScore(false, 20); + it('returns 0 when everything fails', () => { + const result = computeQualityScore({ + buildSuccess: false, typeCheckErrors: 20, ghostSuccessRate: 0, durationSeconds: 600, + }); expect(result.score).toBe(0); - expect(result.breakdown.build).toBe(0); - expect(result.breakdown.typecheck).toBe(0); }); it('scales typecheck score linearly', () => { - // 10 errors -> tcScore = 1 - 10/20 = 0.5 - const result = computeQualityScore(true, 10); - expect(result.score).toBe(0.85); // 0.7 + 0.5*0.3 + const result = computeQualityScore({ + buildSuccess: true, typeCheckErrors: 10, ghostSuccessRate: 1.0, durationSeconds: 60, + }); expect(result.breakdown.typecheck).toBe(0.5); }); it('clamps typecheck score at 0 for >= 20 errors', () => { - 
expect(computeQualityScore(true, 20).breakdown.typecheck).toBe(0); - expect(computeQualityScore(true, 50).breakdown.typecheck).toBe(0); + const a = computeQualityScore({ buildSuccess: true, typeCheckErrors: 20, ghostSuccessRate: 1.0, durationSeconds: 60 }); + const b = computeQualityScore({ buildSuccess: true, typeCheckErrors: 50, ghostSuccessRate: 1.0, durationSeconds: 60 }); + expect(a.breakdown.typecheck).toBe(0); + expect(b.breakdown.typecheck).toBe(0); + }); + + it('treats undefined ghost stories as 0', () => { + const a = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 0, durationSeconds: 60 }); + const b = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, durationSeconds: 60 }); + expect(a.score).toBe(b.score); + }); + + it('performance: ≤120s scores 1.0', () => { + const a = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 0 }); + const b = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 120 }); + expect(a.breakdown.performance).toBe(1); + expect(b.breakdown.performance).toBe(1); + }); + + it('performance: 360s scores 0.5', () => { + const r = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 360 }); + expect(r.breakdown.performance).toBe(0.5); + }); + + it('performance: ≥600s scores 0', () => { + const a = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 600 }); + const b = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 1000 }); + expect(a.breakdown.performance).toBe(0); + expect(b.breakdown.performance).toBe(0); + }); + + it('performance: undefined duration scores 0', () => { + const r = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0 }); + expect(r.breakdown.performance).toBe(0); }); }); @@ -104,8 +145,7 @@ 
describe('countTypeCheckErrors', () => { }); it('counts multiple errors on the same line', () => { - const output = 'error TS1234 and error TS5678 on same line'; - expect(countTypeCheckErrors(output)).toBe(2); + expect(countTypeCheckErrors('error TS1234 and error TS5678 on same line')).toBe(2); }); it('does not count non-error TS references', () => { @@ -117,8 +157,7 @@ describe('countTypeCheckErrors', () => { describe('parseChangedFiles', () => { it('parses added, modified, deleted, and renamed files', () => { const output = 'A\tsrc/new-file.ts\nM\tsrc/existing.ts\nD\tsrc/removed.ts\nR100\told.ts\tnew.ts'; - const result = parseChangedFiles(output); - expect(result).toEqual([ + expect(parseChangedFiles(output)).toEqual([ { path: 'src/new-file.ts', status: 'A' }, { path: 'src/existing.ts', status: 'M' }, { path: 'src/removed.ts', status: 'D' }, @@ -132,7 +171,6 @@ describe('parseChangedFiles', () => { }); it('handles single file', () => { - const result = parseChangedFiles('M\tpackage.json'); - expect(result).toEqual([{ path: 'package.json', status: 'M' }]); + expect(parseChangedFiles('M\tpackage.json')).toEqual([{ path: 'package.json', status: 'M' }]); }); }); diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 00ac19c1ccd0..10d75eeefea1 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -12,12 +12,34 @@ export function filterStorybookFiles(changedFiles: ChangedFile[]): ChangedFile[] ); } -/** Compute quality score: 70% build + 30% typecheck. */ -export function computeQualityScore(buildSuccess: boolean, typeCheckErrors: number): QualityResult { - const buildScore = buildSuccess ? 1 : 0; - const tcScore = Math.max(0, 1 - typeCheckErrors / 20); - const score = Math.round((buildScore * 0.7 + tcScore * 0.3) * 100) / 100; - return { score, breakdown: { build: buildScore, typecheck: Math.round(tcScore * 100) / 100 } }; +/** + * Compute quality score. 
+ * + * Weights: 40% ghost stories, 25% build, 25% typecheck, 10% performance. + * + * Performance is scored on a curve: ≤120s → 1.0, 600s → 0, linear between. + */ +export function computeQualityScore(opts: { + buildSuccess: boolean; + typeCheckErrors: number; + ghostSuccessRate?: number; + durationSeconds?: number; +}): QualityResult { + const buildScore = opts.buildSuccess ? 1 : 0; + const tcScore = Math.max(0, 1 - opts.typeCheckErrors / 20); + const ghostScore = opts.ghostSuccessRate ?? 0; + const d = opts.durationSeconds; + const perfScore = d == null ? 0 : Math.max(0, Math.min(1, 1 - (d - 120) / 480)); + const score = Math.round((ghostScore * 0.4 + buildScore * 0.25 + tcScore * 0.25 + perfScore * 0.1) * 100) / 100; + return { + score, + breakdown: { + build: buildScore, + typecheck: Math.round(tcScore * 100) / 100, + ghostStories: Math.round(ghostScore * 100) / 100, + performance: Math.round(perfScore * 100) / 100, + }, + }; } /** Count TypeScript errors from tsc output. */ @@ -37,7 +59,10 @@ export function parseChangedFiles(gitOutput: string): ChangedFile[] { }); } -export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult; quality: QualityResult }> { +export async function grade( + paths: TrialPaths, + agentDuration?: number, +): Promise<{ grading: GradingResult; quality: QualityResult }> { const { repoRoot, projectPath, resultsDir, baselineCommit } = paths; // Changed files @@ -59,7 +84,8 @@ export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult env: { ...process.env, STORYBOOK_DISABLE_TELEMETRY: "1", NODE_OPTIONS: "--max_old_space_size=4096" }, }); const buildSuccess = build.exitCode === 0; - writeFileSync(join(resultsDir, "build-output.txt"), build.stdout + "\n" + build.stderr); + const buildOutput = build.stdout + "\n" + build.stderr; + writeFileSync(join(resultsDir, "build-output.txt"), buildOutput); if (buildSuccess) { logSuccess("Storybook build succeeded"); } else { @@ -83,7 +109,7 @@ export async 
function grade(paths: TrialPaths): Promise<{ grading: GradingResult const grading: GradingResult = { buildSuccess, - buildError: buildSuccess ? undefined : (build.stdout + "\n" + build.stderr).slice(-2000), + buildError: buildSuccess ? undefined : buildOutput.slice(-2000), typeCheckErrors, typeCheckOutput: typeCheckErrors > 0 ? tscOutput.slice(-2000) : undefined, changedFiles, @@ -92,12 +118,19 @@ export async function grade(paths: TrialPaths): Promise<{ grading: GradingResult ghostStories, }; - const quality = computeQualityScore(buildSuccess, typeCheckErrors); + const quality = computeQualityScore({ + buildSuccess, + typeCheckErrors, + ghostSuccessRate: ghostStories?.successRate, + durationSeconds: agentDuration, + }); return { grading, quality }; } async function getChangedFiles(repoRoot: string, baseline: string): Promise { + // Stage all files so `git diff --cached` picks up new files the agent created. + // Safe: this runs on an ephemeral trial copy, not the real repo. await exec("git", ["add", "-A"], { cwd: repoRoot }); const { stdout } = await exec("git", ["diff", "--cached", "--name-status", baseline], { cwd: repoRoot, diff --git a/scripts/eval/lib/grading-pipeline.test.ts b/scripts/eval/lib/grading-pipeline.test.ts index 259ab8cdee18..17906ceb8b6b 100644 --- a/scripts/eval/lib/grading-pipeline.test.ts +++ b/scripts/eval/lib/grading-pipeline.test.ts @@ -101,8 +101,8 @@ describe('grading pipeline', () => { // Total includes package.json expect(changedFiles).toHaveLength(storybookFiles.length + 1); - // Step 4: Build passed, no TS errors → perfect score - const quality = computeQualityScore(true, 0); + // Step 4: Build passed, no TS errors, 100% ghost stories, fast agent → perfect score + const quality = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 60 }); expect(quality.score).toBe(1); }); @@ -133,8 +133,8 @@ describe('grading pipeline', () => { const errorCount = 
countTypeCheckErrors(tscLines.join('\n')); expect(errorCount).toBe(candidates.length + 1); - // Build failed + errors → low quality - const quality = computeQualityScore(false, errorCount); + // Build failed, no ghost stories, errors, slow → low quality + const quality = computeQualityScore({ buildSuccess: false, typeCheckErrors: errorCount, ghostSuccessRate: 0, durationSeconds: 600 }); expect(quality.score).toBeLessThan(0.3); expect(quality.breakdown.build).toBe(0); }); @@ -167,8 +167,8 @@ describe('grading pipeline', () => { const storybookFiles = filterStorybookFiles(parseChangedFiles(gitOutput)); expect(storybookFiles).toHaveLength(candidates.length); - // Clean build → perfect - expect(computeQualityScore(true, 0).score).toBe(1); + // Clean build + 100% ghost stories + fast → perfect + expect(computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 60 }).score).toBe(1); }); }); diff --git a/scripts/eval/lib/run-task.test.ts b/scripts/eval/lib/run-task.test.ts index d30868c8d3e7..77a24ef78e45 100644 --- a/scripts/eval/lib/run-task.test.ts +++ b/scripts/eval/lib/run-task.test.ts @@ -16,25 +16,19 @@ vi.mock('./grade', () => ({ vi.mock('./save', () => ({ captureEnvironment: vi.fn().mockResolvedValue({ nodeVersion: 'v22.21.1', - gitBranch: 'test-branch', - gitCommit: 'abc123', + evalBranch: 'test-branch', + evalCommit: 'abc123', }), saveToGoogleSheets: vi.fn().mockResolvedValue(undefined), })); -vi.mock('../config', () => ({ - agents: { - claude: { - name: 'claude', - execute: vi.fn(), - }, - codex: { - name: 'codex', - execute: vi.fn(), - }, - }, +vi.mock('./agents/claude-code', () => ({ + claudeAgent: { name: 'claude', execute: vi.fn() }, +})); +vi.mock('./agents/codex', () => ({ + codexAgent: { name: 'codex', execute: vi.fn() }, })); -import { agents } from '../config'; +import { claudeAgent } from './agents/claude-code'; import { grade } from './grade'; import { prepareTrial } from './prepare-trial'; import { runTask 
} from './run-task'; @@ -67,7 +61,7 @@ function setupMocks(overrides?: { baselineCommit: 'deadbeef', }); - vi.mocked(agents.claude.execute).mockResolvedValue({ + vi.mocked(claudeAgent.execute).mockResolvedValue({ agent: 'claude', model: 'sonnet-4.6', effort: 'high', @@ -90,7 +84,7 @@ function setupMocks(overrides?: { ], setupPatterns: [{ id: 'tailwind', label: 'Tailwind CSS', sourceFiles: ['.storybook/preview.ts'] }], }, - quality: { score: buildSuccess ? 1 : 0.3, breakdown: { build: buildSuccess ? 1 : 0, typecheck: 1 } }, + quality: { score: buildSuccess ? 1 : 0.3, breakdown: { build: buildSuccess ? 1 : 0, typecheck: 1, ghostStories: 0, performance: 0 } }, }); } @@ -152,7 +146,7 @@ describe('runTask pipeline', () => { expect(vi.mocked(captureEnvironment).mock.calls[0][0]).toBe(join(TMP, 'results')); // Agent receives real prompt content, the project path, model, and options - const [prompt, projectPath, model, options] = vi.mocked(agents.claude.execute).mock.calls[0]; + const [prompt, projectPath, model, options] = vi.mocked(claudeAgent.execute).mock.calls[0]; expect(prompt).toContain('Storybook setup'); expect(projectPath).toBe(TMP); expect(model).toBe('sonnet-4.6'); @@ -167,7 +161,7 @@ describe('runTask pipeline', () => { const [savedResult, savedEnv, savedRunId, savedUploadId] = vi.mocked(saveToGoogleSheets).mock.calls[0]; expect(savedResult.project).toBe('mealdrop'); - expect(savedEnv.gitBranch).toBe('test-branch'); + expect(savedEnv.evalBranch).toBe('test-branch'); expect(savedRunId).toBe('run-1'); expect(savedUploadId).toBe('upload-1'); }); @@ -213,7 +207,7 @@ describe('runTask pipeline', () => { }; }); - vi.mocked(agents.claude.execute).mockImplementation(async () => { + vi.mocked(claudeAgent.execute).mockImplementation(async () => { callOrder.push('agent'); return { agent: 'claude', model: 'sonnet-4.6', effort: 'high', cost: 0.1, duration: 10, turns: 3 }; }); @@ -228,7 +222,7 @@ describe('runTask pipeline', () => { storybookFiles: [], setupPatterns: [], }, 
- quality: { score: 1, breakdown: { build: 1, typecheck: 1 } }, + quality: { score: 1, breakdown: { build: 1, typecheck: 1, ghostStories: 0, performance: 0 } }, }; }); diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index 4b8b09287a5a..b38de8801f63 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -1,14 +1,19 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { TrialConfig, TrialResult } from "../types.ts"; -import { agents } from "../config.ts"; +import type { AgentName, TrialConfig, TrialResult, Agent } from "../types.ts"; +import { claudeAgent } from "./agents/claude-code.ts"; +import { codexAgent } from "./agents/codex.ts"; import { prepareTrial } from "./prepare-trial.ts"; -import { generatePrompt } from "./generate-prompt.ts"; import { grade } from "./grade.ts"; import { captureEnvironment, saveToGoogleSheets } from "./save.ts"; -import { generateTrialId, createLogger } from "./utils.ts"; +import { generateTrialId, generatePrompt, createLogger } from "./utils.ts"; import type { Logger } from "./utils.ts"; +const agents: Record = { + claude: claudeAgent, + codex: codexAgent, +}; + /** * Run a full eval trial: prepare -> execute agent -> grade -> save. */ @@ -47,8 +52,8 @@ export async function runTask( `Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : "cost N/A"}, ${execution.turns} turns)`, ); - // 5. Grade the results - const { grading, quality } = await grade(paths); + // 5. Grade the results (pass agent duration for performance scoring) + const { grading, quality } = await grade(paths, execution.duration); // 6. 
Assemble final result const result: TrialResult = { diff --git a/scripts/eval/lib/save.ts b/scripts/eval/lib/save.ts index 266461cee4e6..250dff2b6b08 100644 --- a/scripts/eval/lib/save.ts +++ b/scripts/eval/lib/save.ts @@ -7,20 +7,22 @@ const GOOGLE_SHEETS_URL = process.env.EVAL_GOOGLE_SHEETS_URL; export interface Environment { nodeVersion: string; - gitBranch: string; - gitCommit: string; + /** Git branch of the eval harness (storybook monorepo), not the evaluated project. */ + evalBranch: string; + /** Git commit of the eval harness (storybook monorepo), not the evaluated project. */ + evalCommit: string; } export async function captureEnvironment(resultsDir: string): Promise { - let gitBranch = "unknown"; - let gitCommit = "unknown"; + let evalBranch = "unknown"; + let evalCommit = "unknown"; try { - gitBranch = (await exec("git", ["rev-parse", "--abbrev-ref", "HEAD"])).stdout.trim(); - gitCommit = (await exec("git", ["rev-parse", "HEAD"])).stdout.trim(); + evalBranch = (await exec("git", ["rev-parse", "--abbrev-ref", "HEAD"])).stdout.trim(); + evalCommit = (await exec("git", ["rev-parse", "HEAD"])).stdout.trim(); } catch { /* not in a git repo */ } - const env = { nodeVersion: process.version, gitBranch, gitCommit }; + const env: Environment = { nodeVersion: process.version, evalBranch, evalCommit }; writeFileSync(join(resultsDir, "environment.json"), JSON.stringify(env, null, 2)); return env; } @@ -59,8 +61,8 @@ export async function saveToGoogleSheets( cost: result.execution.cost ?? 
"unknown", duration: result.execution.duration, turns: result.execution.turns, - gitBranch: env.gitBranch, - gitCommit: env.gitCommit, + evalBranch: env.evalBranch, + evalCommit: env.evalCommit, }; try { diff --git a/scripts/eval/lib/setup-patterns.ts b/scripts/eval/lib/setup-patterns.ts index d7a022f891f7..64eac26bac2e 100644 --- a/scripts/eval/lib/setup-patterns.ts +++ b/scripts/eval/lib/setup-patterns.ts @@ -2,17 +2,17 @@ import { readFileSync, existsSync, globSync } from "node:fs"; import { join, relative } from "node:path"; import type { SetupPattern } from "../types.ts"; -const RULES: Array<[id: string, label: string, pattern: RegExp]> = [ - ["global-css", "Global CSS import", /import\s+['"][^'"]+\.(css|scss|sass|less)['"]|import\s+['"]tailwindcss/], - ["tailwind", "Tailwind CSS", /@tailwind|tailwindcss|tailwind\.css/], - ["styled-components", "Styled Components", /styled-components|createGlobalStyle/], - ["router-provider", "React Router", /MemoryRouter|BrowserRouter|RouterProvider/], - ["redux-provider", "Redux Provider", /react-redux.*Provider| join(dir, f)); const results: SetupPattern[] = []; - for (const [id, label, pattern] of RULES) { + for (const { id, label, pattern } of RULES) { const matches = files.filter((f) => { try { return pattern.test(readFileSync(f, "utf-8")); diff --git a/scripts/eval/lib/utils.test.ts b/scripts/eval/lib/utils.test.ts index 2e64599d8aa6..c213bf3b4298 100644 --- a/scripts/eval/lib/utils.test.ts +++ b/scripts/eval/lib/utils.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { formatDuration, formatCost, generateTrialId } from './utils'; +import { formatDuration, formatCost, generateTrialId, generatePrompt, listPrompts } from './utils'; describe('formatDuration', () => { it('formats seconds under a minute', () => { @@ -54,3 +54,46 @@ describe('generateTrialId', () => { expect(a).not.toBe(b); }); }); + +describe('listPrompts', () => { + it('lists available prompt names', () => { + const prompts = 
listPrompts(); + expect(prompts).toContain('setup'); + expect(prompts).toContain('self-heal'); + }); + + it('returns only names without .md extension', () => { + for (const name of listPrompts()) { + expect(name).not.toContain('.md'); + } + }); +}); + +describe('generatePrompt', () => { + it('loads setup prompt by default', () => { + const prompt = generatePrompt(); + expect(prompt).toContain('Storybook'); + expect(prompt.length).toBeGreaterThan(0); + }); + + it('loads setup prompt by name', () => { + const prompt = generatePrompt('setup'); + expect(prompt).toContain('Storybook setup'); + expect(prompt).not.toContain('React + Vite'); + }); + + it('loads self-heal prompt', () => { + const prompt = generatePrompt('self-heal'); + expect(prompt).toContain('Self-healing'); + expect(prompt).toContain('vitest'); + }); + + it('throws for unknown prompt', () => { + expect(() => generatePrompt('nonexistent-prompt-xyz')).toThrow('Prompt not found'); + }); + + it('returns trimmed content', () => { + const prompt = generatePrompt('setup'); + expect(prompt).toBe(prompt.trim()); + }); +}); diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index c02fadfe88f3..9f8aa00b79fb 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -1,4 +1,5 @@ -import { resolve } from "node:path"; +import { readFileSync, existsSync, readdirSync } from "node:fs"; +import { resolve, basename } from "node:path"; import pc from "picocolors"; import { x } from "tinyexec"; @@ -29,6 +30,8 @@ export const logStep = defaultLogger.logStep; export const logSuccess = defaultLogger.logSuccess; export const logError = defaultLogger.logError; +// --- Formatting --- + export const formatDuration = (s: number) => s < 60 ? 
`${Math.round(s)}s` : `${Math.floor(s / 60)}m${Math.round(s % 60)}s`; @@ -39,6 +42,25 @@ export function generateTrialId(project: string, agent: string, model: string, p return `${ts}-${project}-${model}-${prompt}-${crypto.randomUUID().slice(0, 8)}`; } +// --- Prompts --- + +/** Load a prompt by name from prompts/{name}.md. Defaults to "setup". */ +export function generatePrompt(name = "setup"): string { + const file = resolve(PROMPTS_DIR, `${name}.md`); + if (!existsSync(file)) { + throw new Error(`Prompt not found: ${file}\nAvailable: ${listPrompts().join(", ")}`); + } + return readFileSync(file, "utf-8").trim(); +} + +/** List available prompt names. */ +export function listPrompts(): string[] { + if (!existsSync(PROMPTS_DIR)) return []; + return readdirSync(PROMPTS_DIR) + .filter((f) => f.endsWith(".md")) + .map((f) => basename(f, ".md")); +} + // --- Exec --- export interface ExecResult { @@ -55,31 +77,18 @@ export async function exec( env?: Record; timeout?: number; throwOnError?: boolean; - stdin?: "ignore"; } = {}, ): Promise { - const { cwd, env, timeout, throwOnError = true, stdin } = options; - const controller = timeout ? new AbortController() : undefined; - const timer = timeout ? setTimeout(() => controller!.abort(), timeout) : undefined; - const stdio: ["ignore", "pipe", "pipe"] | undefined = - stdin === "ignore" ? ["ignore", "pipe", "pipe"] : undefined; - - try { - const result = await x(command, args, { - throwOnError: false, - nodeOptions: { - cwd, - env: env as NodeJS.ProcessEnv, - signal: controller?.signal, - ...(stdio ? 
{ stdio } : {}), - }, - }); - - if (throwOnError && result.exitCode !== 0) { - throw new Error(`Command failed: ${command} ${args.join(" ")}\n${result.stderr}`); - } - return { stdout: result.stdout, stderr: result.stderr, exitCode: result.exitCode }; - } finally { - if (timer) clearTimeout(timer); + const { cwd, env, timeout, throwOnError = true } = options; + + const result = await x(command, args, { + throwOnError: false, + timeout, + nodeOptions: { cwd, env: env as NodeJS.ProcessEnv }, + }); + + if (throwOnError && result.exitCode !== 0) { + throw new Error(`Command failed: ${command} ${args.join(" ")}\n${result.stderr}`); } + return { stdout: result.stdout, stderr: result.stderr, exitCode: result.exitCode }; } diff --git a/scripts/eval/prepare-repos.ts b/scripts/eval/prepare-repos.ts index 1372ed993a09..df0b488eb6e1 100644 --- a/scripts/eval/prepare-repos.ts +++ b/scripts/eval/prepare-repos.ts @@ -13,13 +13,14 @@ * Usage: npx jiti scripts/eval/prepare-repos.ts */ -import { existsSync, mkdirSync, readFileSync, writeFileSync, rmSync, readdirSync } from 'node:fs'; -import { join } from 'node:path'; -import pc from 'picocolors'; +import { existsSync, mkdirSync, readFileSync, writeFileSync, rmSync, readdirSync } from "node:fs"; +import { join } from "node:path"; +import pc from "picocolors"; +import { exec } from "./lib/utils.ts"; -const EVAL_ROOT = join(import.meta.dirname, '..', '..', '..', '..', 'storybook-eval'); -const PREP_DIR = join(EVAL_ROOT, 'prepared-repos'); -const BASELINE_BRANCH = 'eval-baseline'; +const EVAL_ROOT = join(import.meta.dirname, "..", "..", "..", "..", "storybook-eval"); +const PREP_DIR = join(EVAL_ROOT, "prepared-repos"); +const BASELINE_BRANCH = "eval-baseline"; /** Known storybook init starter files that are safe to remove. 
*/ const STARTER_FILES = new Set([ @@ -65,19 +66,7 @@ const GIT_ENV = { }; async function run(cmd: string, args: string[], opts: { cwd?: string; env?: Record; timeout?: number } = {}) { - const { x } = await import('tinyexec'); - const result = await x(cmd, args, { - throwOnError: false, - nodeOptions: { - cwd: opts.cwd, - env: (opts.env ?? process.env) as NodeJS.ProcessEnv, - timeout: opts.timeout, - }, - }); - if (result.exitCode !== 0) { - throw new Error(`${cmd} ${args.join(' ')} failed (${result.exitCode}):\n${result.stderr}`); - } - return result; + return exec(cmd, args, { cwd: opts.cwd, env: opts.env, timeout: opts.timeout }); } function stripStorybookDeps(pkgPath: string) { diff --git a/scripts/eval/prompts/setup.md b/scripts/eval/prompts/setup.md index 77958ce0219e..342ec4749339 100644 --- a/scripts/eval/prompts/setup.md +++ b/scripts/eval/prompts/setup.md @@ -1,4 +1,4 @@ -You are finishing Storybook setup for an existing React + Vite codebase. +You are finishing Storybook setup for an existing codebase. 
## Starting state diff --git a/scripts/eval/types.test.ts b/scripts/eval/types.test.ts index 0ba18bf625ea..beda28e8dd77 100644 --- a/scripts/eval/types.test.ts +++ b/scripts/eval/types.test.ts @@ -1,8 +1,8 @@ import { describe, expect, it } from 'vitest'; -import { AGENTS } from './types'; +import { AGENTS, PROJECTS } from './types'; -describe('AGENTS config', () => { +describe('AGENTS', () => { it('has claude and codex agents', () => { expect(AGENTS).toHaveProperty('claude'); expect(AGENTS).toHaveProperty('codex'); @@ -25,3 +25,22 @@ describe('AGENTS config', () => { expect(new Set(allModels).size).toBe(allModels.length); }); }); + +describe('PROJECTS', () => { + it('has at least one project', () => { + expect(PROJECTS.length).toBeGreaterThan(0); + }); + + it('each project has name, repo URL, and branch', () => { + for (const project of PROJECTS) { + expect(project.name).toBeTruthy(); + expect(project.repo).toMatch(/^https:\/\/github\.com\//); + expect(project.branch).toBeTruthy(); + } + }); + + it('project names are unique', () => { + const names = PROJECTS.map((p) => p.name); + expect(new Set(names).size).toBe(names.length); + }); +}); diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index 8cbbbc611c02..65b335cd83d4 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -1,5 +1,5 @@ /** - * Core types for the Storybook setup eval system. + * Core types and config for the Storybook setup eval system. * * Four independent axes: agent × model × effort × prompt */ @@ -20,9 +20,7 @@ export const AGENTS: Record; } From f39708553bfe6f75b0b15b675b6adcf38717e179 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sat, 28 Mar 2026 11:10:33 +0700 Subject: [PATCH 26/63] Fix: stop importing parse-vitest-report from core (extensionless imports) Core source files use extensionless import specifiers that fail under Node's native TypeScript loader. Read numPassedTests/numTotalTests directly from the vitest JSON report instead. 
--- scripts/eval/lib/ghost-stories.ts | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index f7f6ecd7871d..3b05a35332e9 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -3,18 +3,15 @@ import { join } from "node:path"; import type { GhostStoriesResult } from "../types.ts"; import { logStep, logSuccess, logError, exec } from "./utils.ts"; -// Reuse core ghost-stories utilities via relative imports +// component-analyzer has zero dependencies so it imports cleanly. +// parse-vitest-report can't be imported — its transitive imports use +// extensionless specifiers that fail under Node's native TS loader. +// We read numPassedTests/numTotalTests from the vitest JSON directly instead. import { getComponentComplexity } from "../../../code/core/src/core-server/utils/ghost-stories/component-analyzer.ts"; -import { parseVitestResults } from "../../../code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts"; /** * Run ghost stories: discover candidate components, auto-generate stories * via the addon-vitest componentTransform, and measure rendering success. - * - * Reuses parseVitestResults and getComponentComplexity from core. - * Candidate discovery uses a lightweight regex approach here because the - * core's getComponentCandidates depends on storybook/internal/babel which - * isn't resolvable from the scripts/ workspace. */ export async function runGhostStories( projectPath: string, @@ -54,13 +51,14 @@ export async function runGhostStories( try { const report = JSON.parse(readFileSync(reportPath, "utf-8")); - const { summary } = parseVitestResults(report); - if (!summary) { + const total: number = report.numTotalTests ?? 0; + const passed: number = report.numPassedTests ?? 
0; + if (total === 0) { logError("Ghost stories: no test results in Vitest report"); return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; } - const { total, passed, successRate } = summary; - if (total > 0) logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); + const successRate = parseFloat((passed / total).toFixed(2)); + logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); return { candidateCount: candidates.length, total, passed, successRate }; } catch { logError("Ghost stories: failed to parse Vitest report"); From 9fb35ca216206c07805b120f3852dada8dd417fe Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sat, 28 Mar 2026 11:13:18 +0700 Subject: [PATCH 27/63] Add .ts extensions to core imports used by eval harness Node's native TypeScript loader requires explicit .ts extensions. Add them to parse-vitest-report.ts and categorize-render-errors.ts so the eval can import parseVitestResults from core via relative path. 
--- .../utils/ghost-stories/parse-vitest-report.ts | 10 +++++++--- code/core/src/shared/utils/categorize-render-errors.ts | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts index 8c783abdccbe..e0bd41cc53a6 100644 --- a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts +++ b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts @@ -1,6 +1,10 @@ -import type { ErrorCategory } from '../../../shared/utils/categorize-render-errors'; -import { categorizeError } from '../../../shared/utils/categorize-render-errors'; -import { type ErrorCategorizationResult, type StoryTestResult, type TestRunSummary } from './types'; +import type { ErrorCategory } from '../../../shared/utils/categorize-render-errors.ts'; +import { categorizeError } from '../../../shared/utils/categorize-render-errors.ts'; +import { + type ErrorCategorizationResult, + type StoryTestResult, + type TestRunSummary, +} from './types.ts'; /** * For a given list of test results: diff --git a/code/core/src/shared/utils/categorize-render-errors.ts b/code/core/src/shared/utils/categorize-render-errors.ts index 68e9653139cf..2bf36b1086a3 100644 --- a/code/core/src/shared/utils/categorize-render-errors.ts +++ b/code/core/src/shared/utils/categorize-render-errors.ts @@ -3,7 +3,7 @@ import { isRouterPackage, isStateManagementPackage, isStylingPackage, -} from './ecosystem-identifier'; +} from './ecosystem-identifier.ts'; export const ERROR_CATEGORIES = { MISSING_PROVIDER: 'MISSING_PROVIDER', From 460fc5d1a98c984452007c6870e35ccf23d54d51 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sat, 28 Mar 2026 11:15:58 +0700 Subject: [PATCH 28/63] Fix ghost-stories comment to reflect inline vitest parsing approach --- scripts/eval/lib/ghost-stories.ts | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git 
a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index 3b05a35332e9..cdd2349ce76e 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -3,10 +3,9 @@ import { join } from "node:path"; import type { GhostStoriesResult } from "../types.ts"; import { logStep, logSuccess, logError, exec } from "./utils.ts"; -// component-analyzer has zero dependencies so it imports cleanly. -// parse-vitest-report can't be imported — its transitive imports use -// extensionless specifiers that fail under Node's native TS loader. -// We read numPassedTests/numTotalTests from the vitest JSON directly instead. +// Reuse core's complexity scorer (zero dependencies, imports cleanly). +// Vitest report parsing is done inline — simpler than importing the full +// parseVitestResults chain which pulls in error categorization we don't need. import { getComponentComplexity } from "../../../code/core/src/core-server/utils/ghost-stories/component-analyzer.ts"; /** From 3dd22462024a48a7832210f6f9910a373b702c78 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sat, 28 Mar 2026 20:49:57 +0700 Subject: [PATCH 29/63] Use parseVitestResults from core for ghost stories grading --- scripts/eval/lib/ghost-stories.ts | 32 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index cdd2349ce76e..23389e326514 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -1,12 +1,11 @@ import { readFileSync, existsSync, globSync } from "node:fs"; import { join } from "node:path"; -import type { GhostStoriesResult } from "../types.ts"; -import { logStep, logSuccess, logError, exec } from "./utils.ts"; +import type { GhostStoriesResult, Logger } from "../types.ts"; +import { exec } from "./utils.ts"; -// Reuse core's complexity scorer (zero dependencies, imports cleanly). 
-// Vitest report parsing is done inline — simpler than importing the full -// parseVitestResults chain which pulls in error categorization we don't need. +// Reuse core ghost-stories utilities via relative imports. import { getComponentComplexity } from "../../../code/core/src/core-server/utils/ghost-stories/component-analyzer.ts"; +import { parseVitestResults } from "../../../code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts"; /** * Run ghost stories: discover candidate components, auto-generate stories @@ -15,15 +14,16 @@ import { getComponentComplexity } from "../../../code/core/src/core-server/utils export async function runGhostStories( projectPath: string, resultsDir: string, + logger: Logger, ): Promise { - logStep("Running ghost stories..."); + logger.logStep("Running ghost stories..."); const candidates = findCandidates(projectPath); if (candidates.length === 0) { - logError("No candidate components found"); + logger.logError("No candidate components found"); return undefined; } - logStep(`Found ${candidates.length} candidate component(s)`); + logger.logStep(`Found ${candidates.length} candidate component(s)`); const reportPath = join(resultsDir, "ghost-stories-report.json"); await exec( @@ -44,23 +44,23 @@ export async function runGhostStories( ); if (!existsSync(reportPath)) { - logError("Ghost stories: no Vitest report generated"); + logger.logError("Ghost stories: no Vitest report generated"); return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; } try { const report = JSON.parse(readFileSync(reportPath, "utf-8")); - const total: number = report.numTotalTests ?? 0; - const passed: number = report.numPassedTests ?? 
0; - if (total === 0) { - logError("Ghost stories: no test results in Vitest report"); + const { summary } = parseVitestResults(report); + if (!summary) { + logger.logError("Ghost stories: no test results in Vitest report"); return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; } - const successRate = parseFloat((passed / total).toFixed(2)); - logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); + const { total, passed, successRate } = summary; + if (total > 0) + logger.logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); return { candidateCount: candidates.length, total, passed, successRate }; } catch { - logError("Ghost stories: failed to parse Vitest report"); + logger.logError("Ghost stories: failed to parse Vitest report"); return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; } } From 663b8e9015ca45c22f24c8f7ab27f7b36b917f5c Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sat, 28 Mar 2026 22:13:19 +0700 Subject: [PATCH 30/63] Refactor eval harness: injectable logger, Node IPC, parallel grading, tsconfig fixes - Separate types from runtime config (types.ts + config.ts) - Thread Logger through entire pipeline (fixes garbled parallel output) - Replace fragile stdout sentinel IPC with Node fork/process.send - Run storybook build + typecheck in parallel (saves ~60-120s/trial) - Tighten Agent interface to single params object - Add --agent/--model/--prompt filters to eval-parallel - Make quality score weights configurable - Add prompt template variable support - Enable allowImportingTsExtensions in root and scripts tsconfigs - Fix all pre-existing TS errors in eval files --- .../utils/ghost-stories/get-candidates.ts | 27 ++++-- code/tsconfig.json | 1 + scripts/eval/config.ts | 59 ++++++++++++ scripts/eval/eval-parallel.ts | 89 ++++++++++------- scripts/eval/eval.ts | 52 +++++----- scripts/eval/lib/agents/claude-code.ts | 49 
+++++----- scripts/eval/lib/agents/codex.ts | 44 ++++----- scripts/eval/lib/ghost-stories.test.ts | 60 ++++++------ scripts/eval/lib/ghost-stories.ts | 41 ++------ scripts/eval/lib/grade.ts | 75 +++++++++------ scripts/eval/lib/grading-pipeline.test.ts | 19 ++-- scripts/eval/lib/prepare-trial.ts | 24 ++--- scripts/eval/lib/run-task.test.ts | 26 +++-- scripts/eval/lib/run-task.ts | 36 ++++--- scripts/eval/lib/save.ts | 15 +-- scripts/eval/lib/utils.test.ts | 3 +- scripts/eval/lib/utils.ts | 26 +++-- scripts/eval/types.test.ts | 2 +- scripts/eval/types.ts | 95 +++++++------------ scripts/tsconfig.json | 3 +- 20 files changed, 403 insertions(+), 343 deletions(-) create mode 100644 scripts/eval/config.ts diff --git a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts index 8c7d7a113cb3..4c4a96054315 100644 --- a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts +++ b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts @@ -1,15 +1,14 @@ import { readFile } from 'node:fs/promises'; import { babelParse, traverse } from 'storybook/internal/babel'; -import { logger } from 'storybook/internal/node-logger'; // eslint-disable-next-line depend/ban-dependencies import { glob } from 'glob'; -import { getComponentComplexity } from './component-analyzer'; +import { getComponentComplexity } from './component-analyzer.ts'; -// A valid candidate includes React code and at least one export -function isValidCandidate(source: string): boolean { +/** Check whether source contains JSX and at least one export using AST. */ +export function isValidCandidate(source: string): boolean { const ast = babelParse(source); let hasJSX = false; @@ -60,10 +59,15 @@ function isValidCandidate(source: string): boolean { * Based on a list of files, analyze them to find potential candidates to generate story files for. 
* this is based on whether the file has JSX and exports and how many runtime LOC and imports it * has. + * + * @param isCandidate - Validation function to check if source is a valid component. + * Defaults to AST-based {@link isValidCandidate}. Callers outside the storybook + * workspace can supply a lighter regex-based check instead. */ export async function getCandidatesForStorybook( files: string[], - sampleCount: number + sampleCount: number, + isCandidate: (source: string) => boolean = isValidCandidate ): Promise<{ candidates: string[]; analyzedCount: number; @@ -77,7 +81,7 @@ export async function getCandidatesForStorybook( try { source = await readFile(file, 'utf-8'); // filter out non-React code or files without exports - if (!isValidCandidate(source)) { + if (!isCandidate(source)) { continue; } } catch { @@ -128,9 +132,15 @@ export async function getCandidatesForStorybook( export async function getComponentCandidates({ sampleSize = 20, globPattern = '**/*.{tsx,jsx}', + isCandidate = isValidCandidate, + cwd = process.cwd(), }: { sampleSize?: number; globPattern?: string; + /** Validation function. Defaults to AST-based check; supply a regex version for lightweight usage. */ + isCandidate?: (source: string) => boolean; + /** Working directory for glob. Defaults to process.cwd(). 
*/ + cwd?: string; } = {}): Promise<{ candidates: string[]; error?: string; @@ -145,7 +155,7 @@ export async function getComponentCandidates({ // Find files matching the glob pattern files = await glob(globPattern, { - cwd: process.cwd(), + cwd, absolute: true, ignore: [ '**/node_modules/**', @@ -176,7 +186,8 @@ export async function getComponentCandidates({ const { analyzedCount, avgComplexity, candidates } = await getCandidatesForStorybook( files, - sampleSize + sampleSize, + isCandidate ); return { diff --git a/code/tsconfig.json b/code/tsconfig.json index a0979540cdc8..940555d1805e 100644 --- a/code/tsconfig.json +++ b/code/tsconfig.json @@ -13,6 +13,7 @@ "lib": ["dom", "dom.iterable", "esnext"], "module": "Preserve", "moduleResolution": "bundler", + "allowImportingTsExtensions": true, "noImplicitAny": true, "noUnusedLocals": false, "skipLibCheck": true, diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts new file mode 100644 index 000000000000..a8be3570dbc4 --- /dev/null +++ b/scripts/eval/config.ts @@ -0,0 +1,59 @@ +/** + * Runtime configuration for the Storybook eval system. + * + * Types live in types.ts — this file holds the concrete values. 
+ */ + +import type { AgentName, Project } from "./types.ts"; + +export const AGENTS: Record = { + claude: { + models: ["sonnet-4.6", "opus-4.6", "haiku-4.5"], + defaultModel: "sonnet-4.6", + }, + codex: { + models: ["gpt-5.4"], + defaultModel: "gpt-5.4", + }, +}; + +export const PROJECTS: Project[] = [ + { + name: "mealdrop", + repo: "https://github.com/kasperpeulen/mealdrop", + branch: "eval-baseline", + description: "Styled components, Redux, React Router", + }, + { + name: "edgy", + repo: "https://github.com/kasperpeulen/edgy", + branch: "eval-baseline", + description: "Tailwind, HeadlessUI, React Router", + }, + { + name: "wikitok", + repo: "https://github.com/kasperpeulen/wikitok", + branch: "eval-baseline", + projectDir: "frontend", + description: "Simple project with Tailwind", + }, + { + name: "baklava", + repo: "https://github.com/kasperpeulen/baklava", + branch: "eval-baseline", + description: "Component library with Zustand", + }, + { + name: "echarts", + repo: "https://github.com/kasperpeulen/echarts-react", + branch: "eval-baseline", + description: "ECharts React wrapper", + }, + { + name: "evergreen-ci", + repo: "https://github.com/kasperpeulen/ui", + branch: "eval-baseline", + projectDir: "packages/lib", + description: "GraphQL", + }, +]; diff --git a/scripts/eval/eval-parallel.ts b/scripts/eval/eval-parallel.ts index a66a5d633edf..f7577214221a 100644 --- a/scripts/eval/eval-parallel.ts +++ b/scripts/eval/eval-parallel.ts @@ -1,17 +1,21 @@ import { randomUUID } from "node:crypto"; import { resolve } from "node:path"; -import { spawn } from "node:child_process"; +import { fork } from "node:child_process"; import { createInterface } from "node:readline"; import { parseArgs } from "node:util"; import pc from "picocolors"; -import { AGENTS, PROJECTS } from "./types.ts"; +import { AGENTS, PROJECTS } from "./config.ts"; import type { TrialResult } from "./types.ts"; -import { formatDuration, formatCost, listPrompts } from "./lib/utils.ts"; -import { 
RESULT_SENTINEL } from "./eval.ts"; +import { createLogger, formatDuration, formatCost, listPrompts } from "./lib/utils.ts"; + +const logger = createLogger(); const { values: opts } = parseArgs({ options: { project: { type: "string", short: "p" }, + agent: { type: "string", short: "a" }, + model: { type: "string", short: "m" }, + prompt: { type: "string" }, effort: { type: "string", short: "e", default: "high" }, "upload-id": { type: "string", short: "u" }, }, @@ -19,57 +23,74 @@ const { values: opts } = parseArgs({ const project = PROJECTS.find((p) => p.name === opts.project); if (!project) { - console.log(pc.red(`Specify a project with -p. Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); + logger.log(pc.red(`Specify a project with -p. Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); process.exit(1); } -const prompts = listPrompts(); +const prompts = opts.prompt ? opts.prompt.split(",") : listPrompts(); +const modelFilter = opts.model ? opts.model.split(",") : null; +const agentFilter = opts.agent ? 
opts.agent.split(",") : null; const effort = opts.effort as string; const runId = randomUUID().slice(0, 8); const uploadId = opts["upload-id"] || `eval-${runId}`; const evalScript = resolve(import.meta.dirname, "eval.ts"); -// Build all combos: every agent × model × prompt +// Build all combos: every agent x model x prompt (with optional filters) const runs: Array<{ agent: string; model: string; prompt: string; label: string }> = []; for (const [agent, { models }] of Object.entries(AGENTS)) { + if (agentFilter && !agentFilter.includes(agent)) continue; for (const model of models) { + if (modelFilter && !modelFilter.includes(model)) continue; for (const prompt of prompts) { runs.push({ agent, model, prompt, label: `${model}+${prompt}` }); } } } -console.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); -console.log(`${runs.length} parallel processes | Effort: ${effort}`); +if (runs.length === 0) { + logger.log(pc.red("No matching agent/model/prompt combinations found.")); + process.exit(1); +} + +logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); +logger.log(`${runs.length} parallel processes | Effort: ${effort}`); for (const [agent, { models }] of Object.entries(AGENTS)) { - console.log(` ${agent}: ${models.join(", ")}`); + const filteredModels = models.filter((m) => runs.some((r) => r.model === m)); + if (filteredModels.length > 0) { + logger.log(` ${agent}: ${filteredModels.join(", ")}`); + } } -console.log(` prompts: ${prompts.join(", ")}`); -console.log(`Run: ${runId}\n`); +logger.log(` prompts: ${[...new Set(runs.map((r) => r.prompt))].join(", ")}`); +logger.log(`Run: ${runId}\n`); function spawnRun(agent: string, model: string, prompt: string, label: string): Promise { return new Promise((res) => { const tag = pc.dim(`[${label}]`); - const child = spawn("node", [ - evalScript, "-p", project!.name, "-a", agent, "-m", model, "-e", effort, "--prompt", prompt, "-u", uploadId, - ], { stdio: ["ignore", "pipe", "pipe"] }); + const child = 
fork(evalScript, [ + "-p", project!.name, "-a", agent, "-m", model, "-e", effort, "--prompt", prompt, "-u", uploadId, + ], { stdio: ["ignore", "pipe", "pipe", "ipc"] }); let result: TrialResult | null = null; - createInterface({ input: child.stdout! }).on("line", (line) => { - if (line.startsWith(RESULT_SENTINEL)) { - try { result = JSON.parse(line.slice(RESULT_SENTINEL.length)); } catch { /* skip */ } - } else { - console.log(`${tag} ${line}`); - } + // Receive structured result via IPC + child.on("message", (msg: TrialResult) => { + result = msg; }); - createInterface({ input: child.stderr! }).on("line", (line) => { - console.log(`${tag} ${pc.dim(line)}`); - }); + // Stream stdout/stderr with prefix for readability + if (child.stdout) { + createInterface({ input: child.stdout }).on("line", (line) => { + logger.log(`${tag} ${line}`); + }); + } + if (child.stderr) { + createInterface({ input: child.stderr }).on("line", (line) => { + logger.log(`${tag} ${pc.dim(line)}`); + }); + } child.on("close", (code) => { - if (code !== 0 && !result) console.log(pc.red(`${tag} exited with code ${code}`)); + if (code !== 0 && !result) logger.logError(`${tag} exited with code ${code}`); res(result); }); }); @@ -82,19 +103,19 @@ const results = (await Promise.all(runs.map((r) => spawnRun(r.agent, r.model, r. if (results.length > 0) { results.sort((a, b) => (b.grading.ghostStories?.successRate ?? -1) - (a.grading.ghostStories?.successRate ?? -1)); - console.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); - console.log("=".repeat(130)); - console.log( + logger.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); + logger.log("=".repeat(130)); + logger.log( ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Score", "Cost", "Time", "Turns"] .map((h, i) => h.padEnd(i <= 1 ? 14 : i === 2 ? 
12 : 10)) .join(" | "), ); - console.log("-".repeat(130)); + logger.log("-".repeat(130)); for (const r of results) { const ghost = r.grading.ghostStories; const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; - console.log( + logger.log( [ r.agent.padEnd(14), r.model.padEnd(14), @@ -110,13 +131,13 @@ if (results.length > 0) { ); } - console.log("-".repeat(130)); + logger.log("-".repeat(130)); const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r): r is number => r != null); const avgGhost = ghostRates.length > 0 ? ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; - console.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); - console.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); + logger.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); + logger.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); } -console.log("\nDone."); +logger.log("\nDone."); diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 144f520705ad..83c7096935c9 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -2,12 +2,11 @@ import { randomUUID } from "node:crypto"; import { parseArgs } from "node:util"; import pc from "picocolors"; import type { TrialConfig, AgentName, Effort } from "./types.ts"; -import { AGENTS, PROJECTS } from "./types.ts"; +import { AGENTS, PROJECTS } from "./config.ts"; import { runTask } from "./lib/run-task.ts"; -import { log, formatDuration, formatCost, listPrompts } from "./lib/utils.ts"; +import { createLogger, formatDuration, formatCost, listPrompts } from "./lib/utils.ts"; -/** Sentinel for structured IPC with eval-parallel.ts. 
*/ -export const RESULT_SENTINEL = "__EVAL_RESULT_d3f1a8b2__"; +const logger = createLogger(); const { values: opts } = parseArgs({ options: { @@ -25,24 +24,24 @@ const { values: opts } = parseArgs({ }); if (opts["list-projects"]) { - for (const p of PROJECTS) log(` ${pc.bold(p.name)} — ${p.description}`); + for (const p of PROJECTS) logger.log(` ${pc.bold(p.name)} — ${p.description}`); process.exit(0); } if (opts["list-models"]) { for (const [agent, { models }] of Object.entries(AGENTS)) { - log(`\n ${pc.bold(agent)}`); - for (const m of models) log(` ${m}`); + logger.log(`\n ${pc.bold(agent)}`); + for (const m of models) logger.log(` ${m}`); } process.exit(0); } if (opts["list-prompts"]) { - for (const name of listPrompts()) log(` ${pc.bold(name)}`); + for (const name of listPrompts()) logger.log(` ${pc.bold(name)}`); process.exit(0); } const project = PROJECTS.find((p) => p.name === opts.project); if (!project) { - log(pc.red(`Specify a project with -p. Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); + logger.log(pc.red(`Specify a project with -p. Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); process.exit(1); } @@ -54,7 +53,7 @@ if (opts.model) { const match = Object.entries(AGENTS).find(([, cfg]) => cfg.models.includes(opts.model as string)); if (!match) { const all = Object.values(AGENTS).flatMap((cfg) => cfg.models); - log(pc.red(`Unknown model: ${opts.model}. Available: ${all.join(", ")}`)); + logger.log(pc.red(`Unknown model: ${opts.model}. Available: ${all.join(", ")}`)); process.exit(1); } agent = match[0] as AgentName; @@ -63,7 +62,7 @@ if (opts.model) { agent = opts.agent as AgentName; const agentConfig = AGENTS[agent]; if (!agentConfig) { - log(pc.red(`Unknown agent: ${agent}. Options: ${Object.keys(AGENTS).join(", ")}`)); + logger.log(pc.red(`Unknown agent: ${agent}. 
Options: ${Object.keys(AGENTS).join(", ")}`)); process.exit(1); } model = agentConfig.defaultModel; @@ -82,26 +81,29 @@ const config: TrialConfig = { verbose: opts.verbose, }; -log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); -log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${config.prompt}`); -log(`Run: ${runId}\n`); +logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); +logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${config.prompt}`); +logger.log(`Run: ${runId}\n`); try { - const result = await runTask(config, runId, uploadId); + const result = await runTask(config, runId, uploadId, logger); const ghost = result.grading.ghostStories; const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; - log(pc.bold("\nResult")); - log(` Build: ${result.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL")}`); - log(` Ghost: ${ghostStr}`); - log(` TS Err: ${result.grading.typeCheckErrors}`); - log(` Score: ${result.quality.score}`); - log(` Cost: ${formatCost(result.execution.cost)}`); - log(` Time: ${formatDuration(result.execution.duration)}`); - log(` Turns: ${result.execution.turns}`); + logger.log(pc.bold("\nResult")); + logger.log(` Build: ${result.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL")}`); + logger.log(` Ghost: ${ghostStr}`); + logger.log(` TS Err: ${result.grading.typeCheckErrors}`); + logger.log(` Score: ${result.quality.score}`); + logger.log(` Cost: ${formatCost(result.execution.cost)}`); + logger.log(` Time: ${formatDuration(result.execution.duration)}`); + logger.log(` Turns: ${result.execution.turns}`); - console.log(`${RESULT_SENTINEL}${JSON.stringify(result)}`); + // Send result via IPC when forked by eval-parallel, otherwise no-op + if (process.send) { + process.send(result); + } } catch (error) { - log(pc.red(`\nFailed: ${error instanceof Error ? 
error.message : error}`)); + logger.log(pc.red(`\nFailed: ${error instanceof Error ? error.message : error}`)); process.exit(1); } diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index 08fc66e216c7..0e769038f92f 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -2,22 +2,20 @@ import type { SDKMessage } from "@anthropic-ai/claude-agent-sdk"; import { query } from "@anthropic-ai/claude-agent-sdk"; import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { Agent, Effort, ExecutionResult } from "../../types.ts"; - -function logMessage(message: SDKMessage) { - const log = (prefix: string, text: string) => process.stderr.write(`${prefix} ${text}\n`); +import type { Agent, ExecutionResult, Logger } from "../../types.ts"; +function logMessage(message: SDKMessage, logger: Logger) { switch (message.type) { case "assistant": { for (const block of message.message.content) { if (block.type === "text") { - log("💬", block.text); + logger.log(`💬 ${block.text}`); } else if (block.type === "tool_use") { - log("🔧", `${block.name}(${JSON.stringify(block.input).slice(0, 200)})`); + logger.log(`🔧 ${block.name}(${JSON.stringify(block.input).slice(0, 200)})`); } } if (message.error) { - log("❌", `Assistant error: ${message.error}`); + logger.logError(`Assistant error: ${message.error}`); } break; } @@ -37,32 +35,32 @@ function logMessage(message: SDKMessage) { .join("") .slice(0, 200) : "[no content]"; - log("📎", `tool_result(${block.tool_use_id?.slice(-8)}): ${text}`); + logger.log(`📎 tool_result(${block.tool_use_id?.slice(-8)}): ${text}`); } } break; } case "result": if (message.subtype === "success") { - log("✅", `Done — ${message.num_turns} turns, $${message.total_cost_usd?.toFixed(4)}`); + logger.logSuccess(`Done — ${message.num_turns} turns, $${message.total_cost_usd?.toFixed(4)}`); } else { - log("❌", `Error (${message.subtype}): ${message.errors?.join(", 
")}`); + logger.logError(`Error (${message.subtype}): ${message.errors?.join(", ")}`); } break; case "system": if (message.subtype === "init") { - log("🚀", `Session started — model: ${message.model}`); + logger.log(`🚀 Session started — model: ${message.model}`); } else if (message.subtype === "api_retry") { - log("🔄", `API retry: attempt ${message.attempt}/${message.max_retries}`); + logger.log(`🔄 API retry: attempt ${message.attempt}/${message.max_retries}`); } else if (message.subtype === "status") { - log("📊", `status: ${message.status ?? "unknown"}`); + logger.log(`📊 status: ${message.status ?? "unknown"}`); } break; case "tool_use_summary": - log("📋", message.summary.slice(0, 200)); + logger.log(`📋 ${message.summary.slice(0, 200)}`); break; case "rate_limit_event": - log("⏳", `Rate limited — status: ${message.rate_limit_info?.status}, resets at: ${message.rate_limit_info?.resetsAt}`); + logger.log(`⏳ Rate limited — status: ${message.rate_limit_info?.status}, resets at: ${message.rate_limit_info?.resetsAt}`); break; default: break; @@ -81,13 +79,14 @@ const CLAUDE_MODEL_MAP: Record = { export const claudeAgent: Agent = { name: "claude", - async execute( - prompt: string, - projectPath: string, - model: string, - options?: { effort?: Effort; resultsDir?: string }, - ): Promise { - const { effort = "high", resultsDir } = options ?? 
{}; + async execute({ + prompt, + projectPath, + model, + effort = "high", + resultsDir, + logger, + }): Promise { const startTime = Date.now(); let cost: number | undefined; @@ -107,7 +106,7 @@ export const claudeAgent: Agent = { systemPrompt: { type: "preset", preset: "claude_code" }, }, })) { - logMessage(message); + logMessage(message, logger); messages.push(message); if (message.type === "result" && message.subtype === "success") { @@ -120,9 +119,7 @@ export const claudeAgent: Agent = { const duration = (Date.now() - startTime) / 1000; - if (resultsDir) { - writeFileSync(join(resultsDir, "transcript.json"), JSON.stringify(messages, null, 2)); - } + writeFileSync(join(resultsDir, "transcript.json"), JSON.stringify(messages, null, 2)); return { agent: "claude", diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index 6b8e096fc2b5..9be632b7d900 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -1,4 +1,4 @@ -import { Codex } from "@openai/codex-sdk"; +import { Codex, type ModelReasoningEffort } from "@openai/codex-sdk"; import { writeFileSync } from "node:fs"; import { join } from "node:path"; import type { Agent, Effort, ExecutionResult } from "../../types.ts"; @@ -24,7 +24,7 @@ function estimateCost( ); } -const CODEX_EFFORT: Record = { +const CODEX_EFFORT: Record = { low: "low", medium: "medium", high: "high", @@ -34,15 +34,15 @@ const CODEX_EFFORT: Record = { export const codexAgent: Agent = { name: "codex", - async execute( - prompt: string, - projectPath: string, - model: string, - options?: { effort?: Effort; verbose?: boolean; resultsDir?: string }, - ): Promise { - const { effort = "high", resultsDir } = options ?? 
{}; + async execute({ + prompt, + projectPath, + model, + effort = "high", + resultsDir, + logger, + }): Promise { const startTime = Date.now(); - const log = (prefix: string, text: string) => process.stderr.write(`${prefix} ${text}\n`); const codex = new Codex(); const thread = codex.startThread({ @@ -66,22 +66,22 @@ export const codexAgent: Agent = { items.push(item); switch (item.type) { case "agent_message": - log("💬", item.text.slice(0, 300)); + logger.log(`💬 ${item.text.slice(0, 300)}`); break; case "command_execution": - log("🔧", `$ ${item.command} → exit ${item.exit_code ?? "?"}`); + logger.log(`🔧 $ ${item.command} → exit ${item.exit_code ?? "?"}`); if (item.exit_code !== 0 && item.aggregated_output) { - log(" ", item.aggregated_output.slice(-200)); + logger.log(` ${item.aggregated_output.slice(-200)}`); } break; case "file_change": - for (const c of item.changes) log("📝", `${c.kind} ${c.path}`); + for (const c of item.changes) logger.log(`📝 ${c.kind} ${c.path}`); break; case "reasoning": - log("🧠", item.text.slice(0, 200)); + logger.log(`🧠 ${item.text.slice(0, 200)}`); break; case "error": - log("❌", item.message); + logger.logError(item.message); break; } break; @@ -91,24 +91,22 @@ export const codexAgent: Agent = { totalCached += event.usage.cached_input_tokens; totalOutput += event.usage.output_tokens; turns++; - log("📊", `tokens: ${event.usage.input_tokens}in / ${event.usage.output_tokens}out (${event.usage.cached_input_tokens} cached)`); + logger.log(`📊 tokens: ${event.usage.input_tokens}in / ${event.usage.output_tokens}out (${event.usage.cached_input_tokens} cached)`); break; case "turn.failed": - log("❌", `Turn failed: ${event.error.message}`); + logger.logError(`Turn failed: ${event.error.message}`); break; case "error": - log("❌", `Error: ${event.message}`); + logger.logError(`Error: ${event.message}`); break; } } const duration = (Date.now() - startTime) / 1000; const cost = estimateCost(model, totalInput, totalCached, totalOutput); - log("✅", 
`Done — ${turns} turns, ${Math.round(duration)}s, ${totalInput}in/${totalOutput}out tokens${cost != null ? `, $${cost.toFixed(4)}` : ""}`); + logger.logSuccess(`Done — ${turns} turns, ${Math.round(duration)}s, ${totalInput}in/${totalOutput}out tokens${cost != null ? `, $${cost.toFixed(4)}` : ""}`); - if (resultsDir) { - writeFileSync(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); - } + writeFileSync(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); return { agent: "codex", model, effort, cost, duration, turns }; }, diff --git a/scripts/eval/lib/ghost-stories.test.ts b/scripts/eval/lib/ghost-stories.test.ts index 03437851f10f..ff61259cdd43 100644 --- a/scripts/eval/lib/ghost-stories.test.ts +++ b/scripts/eval/lib/ghost-stories.test.ts @@ -1,10 +1,11 @@ import { mkdirSync, writeFileSync, rmSync } from 'node:fs'; -import { join } from 'node:path'; import { tmpdir } from 'node:os'; +import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { findCandidates } from './ghost-stories'; +// Core function — tests verify it works from the eval context (requires compile). +import { getComponentCandidates } from '../../../code/core/src/core-server/utils/ghost-stories/get-candidates'; let TMP: string; @@ -23,7 +24,6 @@ function writeFile(relativePath: string, content: string) { writeFileSync(fullPath, content); } -/** A realistic component file with an export and JSX via return(). 
*/ function simpleComponent(name: string) { return [ `import React from 'react';`, @@ -33,39 +33,41 @@ function simpleComponent(name: string) { ].join('\n'); } -describe('findCandidates', () => { - it('finds exported components with JSX', () => { +async function findCandidates(cwd: string) { + const { candidates } = await getComponentCandidates({ cwd, sampleSize: 20 }); + // Return relative paths for easier assertions + return candidates.map((c) => c.replace(cwd + '/', '')); +} + +describe('getComponentCandidates from core', () => { + it('finds exported components with JSX', async () => { writeFile('src/Button.tsx', simpleComponent('Button')); - expect(findCandidates(TMP)).toEqual(['src/Button.tsx']); + expect(await findCandidates(TMP)).toEqual(['src/Button.tsx']); }); - it('skips files without exports', () => { - writeFile( - 'src/Internal.tsx', - `function Internal() { return
hi
; }` - ); - expect(findCandidates(TMP)).toEqual([]); + it('skips files without exports', async () => { + writeFile('src/Internal.tsx', `function Internal() { return
hi
; }`); + expect(await findCandidates(TMP)).toEqual([]); }); - it('skips files without JSX', () => { + it('skips files without JSX', async () => { writeFile('src/utils.tsx', `export const add = (a: number, b: number) => a + b;`); - expect(findCandidates(TMP)).toEqual([]); + expect(await findCandidates(TMP)).toEqual([]); }); - it('skips test, spec, and story files', () => { + it('skips test, spec, and story files', async () => { writeFile('src/Button.test.tsx', simpleComponent('X')); writeFile('src/Button.spec.tsx', simpleComponent('X')); writeFile('src/Button.stories.tsx', simpleComponent('X')); - writeFile('src/Button.story.tsx', simpleComponent('X')); - expect(findCandidates(TMP)).toEqual([]); + expect(await findCandidates(TMP)).toEqual([]); }); - it('skips config files', () => { + it('skips config files', async () => { writeFile('src/app.config.tsx', simpleComponent('X')); - expect(findCandidates(TMP)).toEqual([]); + expect(await findCandidates(TMP)).toEqual([]); }); - it('sorts by complexity (simpler first)', () => { + it('sorts by complexity (simpler first)', async () => { writeFile('src/Simple.tsx', simpleComponent('Simple')); const lines = [ `import React from 'react';`, @@ -78,28 +80,20 @@ describe('findCandidates', () => { ]; writeFile('src/Complex.tsx', lines.join('\n')); - const candidates = findCandidates(TMP); + const candidates = await findCandidates(TMP); expect(candidates.indexOf('src/Simple.tsx')).toBeLessThan( candidates.indexOf('src/Complex.tsx') ); }); - it('limits to 20 candidates', () => { + it('limits to sampleSize candidates', async () => { for (let i = 0; i < 25; i++) { writeFile(`src/Comp${i}.tsx`, simpleComponent(`Comp${i}`)); } - expect(findCandidates(TMP)).toHaveLength(20); - }); - - it('returns empty for empty project', () => { - expect(findCandidates(TMP)).toEqual([]); + expect(await findCandidates(TMP)).toHaveLength(20); }); - it('finds components using uppercase JSX tags', () => { - writeFile( - 'src/Wrapper.tsx', - `import { 
Container } from './ui';\nexport const Wrapper = () => hi;` - ); - expect(findCandidates(TMP)).toEqual(['src/Wrapper.tsx']); + it('returns empty for empty project', async () => { + expect(await findCandidates(TMP)).toEqual([]); }); }); diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index 23389e326514..a79ccc8210ed 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -1,10 +1,10 @@ -import { readFileSync, existsSync, globSync } from "node:fs"; +import { readFileSync, existsSync } from "node:fs"; import { join } from "node:path"; import type { GhostStoriesResult, Logger } from "../types.ts"; import { exec } from "./utils.ts"; -// Reuse core ghost-stories utilities via relative imports. -import { getComponentComplexity } from "../../../code/core/src/core-server/utils/ghost-stories/component-analyzer.ts"; +// Core ghost-stories utilities — requires `yarn nx run-many -t compile` first. +import { getComponentCandidates } from "../../../code/core/src/core-server/utils/ghost-stories/get-candidates.ts"; import { parseVitestResults } from "../../../code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts"; /** @@ -18,9 +18,9 @@ export async function runGhostStories( ): Promise { logger.logStep("Running ghost stories..."); - const candidates = findCandidates(projectPath); - if (candidates.length === 0) { - logger.logError("No candidate components found"); + const { candidates, error } = await getComponentCandidates({ sampleSize: 20, cwd: projectPath }); + if (error || candidates.length === 0) { + logger.logError(error ?? "No candidate components found"); return undefined; } logger.logStep(`Found ${candidates.length} candidate component(s)`); @@ -64,32 +64,3 @@ export async function runGhostStories( return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; } } - -/** - * Find .tsx/.jsx files that look like React components, sorted by complexity. 
- * Uses getComponentComplexity from core for consistent scoring. - */ -export function findCandidates(projectPath: string): string[] { - const SKIP = new Set(["node_modules", ".storybook", "dist", "build", ".git"]); - const files = globSync("**/*.{tsx,jsx}", { - cwd: projectPath, - exclude: (f) => SKIP.has(f.name), - }); - - return files - .filter((f) => !/\.(test|spec|stories|story)\./.test(f) && !/config\./.test(f)) - .map((f) => { - try { - const content = readFileSync(join(projectPath, f), "utf-8"); - if (!/export\s/.test(content)) return null; - if (!/<[A-Z]/.test(content) && !/return\s*\(?\s* a!.complexity - b!.complexity) - .slice(0, 20) - .map((c) => c!.path); -} diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 10d75eeefea1..47d8996c90a6 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,7 +1,8 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { GradingResult, QualityResult, TrialPaths, ChangedFile } from "../types.ts"; -import { logStep, logSuccess, logError, exec } from "./utils.ts"; +import type { GradingResult, QualityResult, QualityWeights, TrialPaths, ChangedFile, Logger } from "../types.ts"; +import { DEFAULT_QUALITY_WEIGHTS } from "../types.ts"; +import { exec } from "./utils.ts"; import { detectSetupPatterns } from "./setup-patterns.ts"; import { runGhostStories } from "./ghost-stories.ts"; @@ -13,24 +14,34 @@ export function filterStorybookFiles(changedFiles: ChangedFile[]): ChangedFile[] } /** - * Compute quality score. + * Compute quality score with configurable weights. * - * Weights: 40% ghost stories, 25% build, 25% typecheck, 10% performance. + * Default weights: 40% ghost stories, 25% build, 25% typecheck, 10% performance. * - * Performance is scored on a curve: ≤120s → 1.0, 600s → 0, linear between. + * Performance is scored on a curve: <=120s -> 1.0, 600s -> 0, linear between. 
*/ -export function computeQualityScore(opts: { - buildSuccess: boolean; - typeCheckErrors: number; - ghostSuccessRate?: number; - durationSeconds?: number; -}): QualityResult { +export function computeQualityScore( + opts: { + buildSuccess: boolean; + typeCheckErrors: number; + ghostSuccessRate?: number; + durationSeconds?: number; + }, + weights: QualityWeights = DEFAULT_QUALITY_WEIGHTS, +): QualityResult { const buildScore = opts.buildSuccess ? 1 : 0; const tcScore = Math.max(0, 1 - opts.typeCheckErrors / 20); const ghostScore = opts.ghostSuccessRate ?? 0; const d = opts.durationSeconds; const perfScore = d == null ? 0 : Math.max(0, Math.min(1, 1 - (d - 120) / 480)); - const score = Math.round((ghostScore * 0.4 + buildScore * 0.25 + tcScore * 0.25 + perfScore * 0.1) * 100) / 100; + const score = + Math.round( + (ghostScore * weights.ghostStories + + buildScore * weights.build + + tcScore * weights.typecheck + + perfScore * weights.performance) * + 100, + ) / 100; return { score, breakdown: { @@ -61,51 +72,53 @@ export function parseChangedFiles(gitOutput: string): ChangedFile[] { export async function grade( paths: TrialPaths, + logger: Logger, agentDuration?: number, ): Promise<{ grading: GradingResult; quality: QualityResult }> { const { repoRoot, projectPath, resultsDir, baselineCommit } = paths; // Changed files - logStep("Collecting agent changes..."); + logger.logStep("Collecting agent changes..."); const changedFiles = await getChangedFiles(repoRoot, baselineCommit); const storybookFiles = filterStorybookFiles(changedFiles); - logSuccess(`${changedFiles.length} files changed (${storybookFiles.length} storybook-related)`); + logger.logSuccess(`${changedFiles.length} files changed (${storybookFiles.length} storybook-related)`); // Setup patterns const setupPatterns = detectSetupPatterns(projectPath); - if (setupPatterns.length > 0) logSuccess(`Detected patterns: ${setupPatterns.map((p) => p.label).join(", ")}`); + if (setupPatterns.length > 0) 
logger.logSuccess(`Detected patterns: ${setupPatterns.map((p) => p.label).join(", ")}`); + + // Storybook build + TypeScript check in parallel + logger.logStep("Running storybook build + typecheck..."); + const [build, tsc] = await Promise.all([ + exec("npx", ["storybook", "build", "--quiet"], { + cwd: projectPath, + timeout: 300_000, + throwOnError: false, + env: { ...process.env, STORYBOOK_DISABLE_TELEMETRY: "1", NODE_OPTIONS: "--max_old_space_size=4096" }, + }), + exec("npx", ["tsc", "--noEmit"], { cwd: projectPath, timeout: 120_000, throwOnError: false }), + ]); - // Storybook build - logStep("Running storybook build..."); - const build = await exec("npx", ["storybook", "build", "--quiet"], { - cwd: projectPath, - timeout: 300_000, - throwOnError: false, - env: { ...process.env, STORYBOOK_DISABLE_TELEMETRY: "1", NODE_OPTIONS: "--max_old_space_size=4096" }, - }); const buildSuccess = build.exitCode === 0; const buildOutput = build.stdout + "\n" + build.stderr; writeFileSync(join(resultsDir, "build-output.txt"), buildOutput); if (buildSuccess) { - logSuccess("Storybook build succeeded"); + logger.logSuccess("Storybook build succeeded"); } else { - logError(`Storybook build failed (exit ${build.exitCode})`); + logger.logError(`Storybook build failed (exit ${build.exitCode})`); } - // TypeScript check - logStep("Running typecheck..."); - const tsc = await exec("npx", ["tsc", "--noEmit"], { cwd: projectPath, timeout: 120_000, throwOnError: false }); const tscOutput = tsc.stdout + "\n" + tsc.stderr; writeFileSync(join(resultsDir, "typecheck-output.txt"), tscOutput); const typeCheckErrors = countTypeCheckErrors(tscOutput); if (typeCheckErrors === 0) { - logSuccess("No TypeScript errors"); + logger.logSuccess("No TypeScript errors"); } else { - logError(`${typeCheckErrors} TypeScript error(s)`); + logger.logError(`${typeCheckErrors} TypeScript error(s)`); } // Ghost stories (only if build passed) - const ghostStories = buildSuccess ? 
await runGhostStories(projectPath, resultsDir) : undefined; + const ghostStories = buildSuccess ? await runGhostStories(projectPath, resultsDir, logger) : undefined; const grading: GradingResult = { buildSuccess, diff --git a/scripts/eval/lib/grading-pipeline.test.ts b/scripts/eval/lib/grading-pipeline.test.ts index 17906ceb8b6b..7b63a94737ae 100644 --- a/scripts/eval/lib/grading-pipeline.test.ts +++ b/scripts/eval/lib/grading-pipeline.test.ts @@ -4,7 +4,7 @@ import { tmpdir } from 'node:os'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { findCandidates } from './ghost-stories'; +import { getComponentCandidates } from '../../../code/core/src/core-server/utils/ghost-stories/get-candidates'; import { computeQualityScore, countTypeCheckErrors, @@ -37,8 +37,13 @@ function writeFile(relativePath: string, content: string) { writeFileSync(fullPath, content); } +async function findCandidates(cwd: string) { + const { candidates } = await getComponentCandidates({ cwd, sampleSize: 20 }); + return candidates.map((c) => c.replace(cwd + '/', '')); +} + describe('grading pipeline', () => { - it('grades a well-configured project: candidates found, patterns detected, high quality', () => { + it('grades a well-configured project: candidates found, patterns detected, high quality', async () => { // Set up a realistic project with components and storybook config writeFile( 'src/components/Button.tsx', @@ -75,7 +80,7 @@ describe('grading pipeline', () => { ); // Step 1: Find candidates — both components should be discovered - const candidates = findCandidates(TMP); + const candidates = await findCandidates(TMP); expect(candidates).toHaveLength(2); // Step 2: Detect patterns — config references CSS, theme, staticDirs @@ -106,7 +111,7 @@ describe('grading pipeline', () => { expect(quality.score).toBe(1); }); - it('grades a broken project: candidates found but build fails, low quality', () => { + it('grades a broken project: candidates found but build 
fails, low quality', async () => { writeFile( 'src/components/Widget.tsx', [ @@ -118,7 +123,7 @@ describe('grading pipeline', () => { ); // Candidates still discoverable even when storybook setup is broken - const candidates = findCandidates(TMP); + const candidates = await findCandidates(TMP); expect(candidates).toHaveLength(1); // Agent didn't create any .storybook config @@ -139,7 +144,7 @@ describe('grading pipeline', () => { expect(quality.breakdown.build).toBe(0); }); - it('more candidates with setup patterns yields higher confidence in the grade', () => { + it('more candidates with setup patterns yields higher confidence in the grade', async () => { // Rich project: many simple components for (let i = 0; i < 5; i++) { writeFile( @@ -154,7 +159,7 @@ describe('grading pipeline', () => { } writeFile('.storybook/preview.tsx', `import { MemoryRouter } from 'react-router-dom';`); - const candidates = findCandidates(TMP); + const candidates = await findCandidates(TMP); expect(candidates).toHaveLength(5); const patterns = detectSetupPatterns(TMP); diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index 53ac3dd0b0ab..a013dbcc235e 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -1,9 +1,9 @@ import { existsSync, mkdirSync, cpSync } from "node:fs"; import { join } from "node:path"; -import type { Project, TrialPaths } from "../types.ts"; -import { CACHE_DIR, TRIALS_DIR, logStep, logSuccess, exec } from "./utils.ts"; +import type { Project, TrialPaths, Logger } from "../types.ts"; +import { CACHE_DIR, TRIALS_DIR, exec } from "./utils.ts"; -async function installDeps(dir: string) { +async function installDeps(dir: string, logger: Logger) { const has = (f: string) => existsSync(join(dir, f)); const [cmd, args]: [string, string[]] = has("pnpm-lock.yaml") || has("pnpm-workspace.yaml") ? ["pnpm", ["install", "--no-frozen-lockfile"]] @@ -13,33 +13,33 @@ async function installDeps(dir: string) { ? 
["bun", ["install"]] : ["npm", ["install", "--ignore-scripts"]]; - logStep(`Installing with ${cmd}...`); + logger.logStep(`Installing with ${cmd}...`); await exec(cmd, args, { cwd: dir, timeout: 300_000 }); } /** - * First run: clone eval-baseline → install deps → cache it. + * First run: clone eval-baseline -> install deps -> cache it. * Subsequent runs: copy from cache. Agent starts immediately. */ -export async function prepareTrial(project: Project, trialId: string): Promise { +export async function prepareTrial(project: Project, trialId: string, logger: Logger): Promise { const cacheDir = join(CACHE_DIR, project.name); const trialDir = join(TRIALS_DIR, trialId); const repoRoot = join(trialDir, "project"); mkdirSync(trialDir, { recursive: true }); if (existsSync(join(cacheDir, ".git"))) { - logStep("Copying from cache..."); + logger.logStep("Copying from cache..."); cpSync(cacheDir, repoRoot, { recursive: true }); } else { - logStep(`Cloning ${project.repo}#${project.branch}...`); + logger.logStep(`Cloning ${project.repo}#${project.branch}...`); mkdirSync(CACHE_DIR, { recursive: true }); await exec("git", ["clone", "--depth", "1", "--branch", project.branch!, project.repo, repoRoot], { timeout: 120_000, }); const projectPath = project.projectDir ? 
join(repoRoot, project.projectDir) : repoRoot; - await installDeps(projectPath); - logSuccess("Dependencies installed"); - logStep("Caching for future runs..."); + await installDeps(projectPath, logger); + logger.logSuccess("Dependencies installed"); + logger.logStep("Caching for future runs..."); cpSync(repoRoot, cacheDir, { recursive: true }); } @@ -48,6 +48,6 @@ export async function prepareTrial(project: Project, trialId: string): Promise { await runTask(config, 'run-1', 'upload-1'); - // prepareTrial receives the project + // prepareTrial receives the project and a logger expect(vi.mocked(prepareTrial).mock.calls[0][0].name).toBe('mealdrop'); + // Third arg is the logger + expect(vi.mocked(prepareTrial).mock.calls[0][2]).toBeDefined(); // captureEnvironment receives the results dir expect(vi.mocked(captureEnvironment).mock.calls[0][0]).toBe(join(TMP, 'results')); - // Agent receives real prompt content, the project path, model, and options - const [prompt, projectPath, model, options] = vi.mocked(claudeAgent.execute).mock.calls[0]; - expect(prompt).toContain('Storybook setup'); - expect(projectPath).toBe(TMP); - expect(model).toBe('sonnet-4.6'); - expect(options?.effort).toBe('high'); + // Agent receives a params object with prompt, projectPath, model, effort, resultsDir, logger + const params = vi.mocked(claudeAgent.execute).mock.calls[0][0] as Record; + expect(params.prompt).toContain('Storybook setup'); + expect(params.projectPath).toBe(TMP); + expect(params.model).toBe('sonnet-4.6'); + expect(params.effort).toBe('high'); + expect(params.resultsDir).toBe(join(TMP, 'results')); + expect(params.logger).toBeDefined(); - // grade receives the trial paths + // grade receives the trial paths and a logger const gradePaths = vi.mocked(grade).mock.calls[0][0]; expect(gradePaths.baselineCommit).toBe('deadbeef'); expect(gradePaths.projectPath).toBe(TMP); + // Second arg is the logger + expect(vi.mocked(grade).mock.calls[0][1]).toBeDefined(); - // saveToGoogleSheets 
receives the assembled result + env + IDs + // saveToGoogleSheets receives the assembled result + env + IDs + logger const [savedResult, savedEnv, savedRunId, savedUploadId] = vi.mocked(saveToGoogleSheets).mock.calls[0]; expect(savedResult.project).toBe('mealdrop'); diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index b38de8801f63..b3c7982d5bf0 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -1,13 +1,12 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { AgentName, TrialConfig, TrialResult, Agent } from "../types.ts"; +import type { AgentName, Logger, TrialConfig, TrialResult, Agent } from "../types.ts"; import { claudeAgent } from "./agents/claude-code.ts"; import { codexAgent } from "./agents/codex.ts"; import { prepareTrial } from "./prepare-trial.ts"; import { grade } from "./grade.ts"; import { captureEnvironment, saveToGoogleSheets } from "./save.ts"; import { generateTrialId, generatePrompt, createLogger } from "./utils.ts"; -import type { Logger } from "./utils.ts"; const agents: Record = { claude: claudeAgent, @@ -23,37 +22,44 @@ export async function runTask( uploadId: string, logger?: Logger, ): Promise { - const { project, agent: agentName, model, effort, prompt: promptName, verbose } = config; - const { log, logSuccess } = logger ?? createLogger(); + const { project, agent: agentName, model, effort, prompt: promptName } = config; + const log = logger ?? createLogger(); const trialId = generateTrialId(project.name, agentName, model, promptName || "setup"); const timestamp = new Date().toISOString(); - log(`Preparing ${project.name}...`); + log.log(`Preparing ${project.name}...`); // 1. Prepare the trial - const paths = await prepareTrial(project, trialId); + const paths = await prepareTrial(project, trialId, log); // 2. Capture environment const environment = await captureEnvironment(paths.resultsDir); - // 3. 
Generate the prompt - const prompt = generatePrompt(promptName); + // 3. Generate the prompt (with project-specific template variables) + const prompt = generatePrompt(promptName, { + projectName: project.name, + description: project.description ?? "", + projectDir: project.projectDir ?? ".", + }); writeFileSync(join(paths.resultsDir, "prompt.md"), prompt); // 4. Execute the agent - log(` Running ${agentName} (${model}, effort=${effort})...`); + log.log(` Running ${agentName} (${model}, effort=${effort})...`); const agent = agents[agentName]; - const execution = await agent.execute(prompt, paths.projectPath, model, { + const execution = await agent.execute({ + prompt, + projectPath: paths.projectPath, + model, effort, - verbose, resultsDir: paths.resultsDir, + logger: log, }); - logSuccess( + log.logSuccess( `Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : "cost N/A"}, ${execution.turns} turns)`, ); // 5. Grade the results (pass agent duration for performance scoring) - const { grading, quality } = await grade(paths, execution.duration); + const { grading, quality } = await grade(paths, log, execution.duration); // 6. Assemble final result const result: TrialResult = { @@ -71,10 +77,10 @@ export async function runTask( }; writeFileSync(join(paths.resultsDir, "summary.json"), JSON.stringify(result, null, 2)); - logSuccess(`Results saved to ${paths.resultsDir}`); + log.logSuccess(`Results saved to ${paths.resultsDir}`); // 7. 
Upload to Google Sheets - await saveToGoogleSheets(result, environment, runId, uploadId); + await saveToGoogleSheets(result, environment, runId, uploadId, log); return result; } diff --git a/scripts/eval/lib/save.ts b/scripts/eval/lib/save.ts index 250dff2b6b08..eb65750f3ffe 100644 --- a/scripts/eval/lib/save.ts +++ b/scripts/eval/lib/save.ts @@ -1,7 +1,7 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { TrialResult } from "../types.ts"; -import { logStep, logSuccess, logError, exec } from "./utils.ts"; +import type { TrialResult, Logger } from "../types.ts"; +import { exec } from "./utils.ts"; const GOOGLE_SHEETS_URL = process.env.EVAL_GOOGLE_SHEETS_URL; @@ -32,12 +32,13 @@ export async function saveToGoogleSheets( env: Environment, runId: string, uploadId: string, + logger: Logger, ): Promise { if (!GOOGLE_SHEETS_URL) { - logStep("Skipping Google Sheets (set EVAL_GOOGLE_SHEETS_URL to enable)"); + logger.logStep("Skipping Google Sheets (set EVAL_GOOGLE_SHEETS_URL to enable)"); return; } - logStep("Uploading to Google Sheets..."); + logger.logStep("Uploading to Google Sheets..."); const ghost = result.grading.ghostStories; const data = { @@ -76,12 +77,12 @@ export async function saveToGoogleSheets( if (contentType?.includes("application/json")) { const body = (await response.json()) as { success: boolean; error?: string }; if (!body.success) { - logError(`Google Sheets error: ${body.error}`); + logger.logError(`Google Sheets error: ${body.error}`); return; } } - logSuccess("Uploaded to Google Sheets"); + logger.logSuccess("Uploaded to Google Sheets"); } catch (error) { - logError(`Google Sheets upload failed: ${error instanceof Error ? error.message : error}`); + logger.logError(`Google Sheets upload failed: ${error instanceof Error ? 
error.message : error}`); } } diff --git a/scripts/eval/lib/utils.test.ts b/scripts/eval/lib/utils.test.ts index c213bf3b4298..32abdf4cb9b7 100644 --- a/scripts/eval/lib/utils.test.ts +++ b/scripts/eval/lib/utils.test.ts @@ -36,9 +36,10 @@ describe('formatCost', () => { }); describe('generateTrialId', () => { - it('contains project, model, and prompt', () => { + it('contains project, agent, model, and prompt', () => { const id = generateTrialId('mealdrop', 'claude', 'sonnet-4.6', 'setup'); expect(id).toContain('mealdrop'); + expect(id).toContain('claude'); expect(id).toContain('sonnet-4.6'); expect(id).toContain('setup'); }); diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index 9f8aa00b79fb..a12e59099e5b 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -2,6 +2,7 @@ import { readFileSync, existsSync, readdirSync } from "node:fs"; import { resolve, basename } from "node:path"; import pc from "picocolors"; import { x } from "tinyexec"; +import type { Logger } from "../types.ts"; export const REPO_ROOT = resolve(import.meta.dirname, "..", "..", ".."); export const EVAL_ROOT = resolve(REPO_ROOT, "..", "storybook-eval"); @@ -11,7 +12,7 @@ export const PROMPTS_DIR = resolve(import.meta.dirname, "..", "prompts"); // --- Logging --- -export function createLogger(prefix?: string) { +export function createLogger(prefix?: string): Logger { const p = prefix ? 
pc.dim(`[${prefix}]`) + " " : ""; return { log: (msg: string) => console.log(`${p}${msg}`), @@ -21,15 +22,6 @@ export function createLogger(prefix?: string) { }; } -export type Logger = ReturnType; - -// Default logger (no prefix) for single-run mode -const defaultLogger = createLogger(); -export const log = defaultLogger.log; -export const logStep = defaultLogger.logStep; -export const logSuccess = defaultLogger.logSuccess; -export const logError = defaultLogger.logError; - // --- Formatting --- export const formatDuration = (s: number) => @@ -39,18 +31,24 @@ export const formatCost = (cost?: number) => (cost == null ? "-" : `$${cost.toFi export function generateTrialId(project: string, agent: string, model: string, prompt: string) { const ts = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); - return `${ts}-${project}-${model}-${prompt}-${crypto.randomUUID().slice(0, 8)}`; + return `${ts}-${project}-${agent}-${model}-${prompt}-${crypto.randomUUID().slice(0, 8)}`; } // --- Prompts --- -/** Load a prompt by name from prompts/{name}.md. Defaults to "setup". */ -export function generatePrompt(name = "setup"): string { +/** Load a prompt by name from prompts/{name}.md, with optional template variables. */ +export function generatePrompt(name = "setup", vars?: Record): string { const file = resolve(PROMPTS_DIR, `${name}.md`); if (!existsSync(file)) { throw new Error(`Prompt not found: ${file}\nAvailable: ${listPrompts().join(", ")}`); } - return readFileSync(file, "utf-8").trim(); + let content = readFileSync(file, "utf-8").trim(); + if (vars) { + for (const [key, value] of Object.entries(vars)) { + content = content.replaceAll(`{{${key}}}`, value); + } + } + return content; } /** List available prompt names. 
*/ diff --git a/scripts/eval/types.test.ts b/scripts/eval/types.test.ts index beda28e8dd77..50c1fb670b78 100644 --- a/scripts/eval/types.test.ts +++ b/scripts/eval/types.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { AGENTS, PROJECTS } from './types'; +import { AGENTS, PROJECTS } from './config'; describe('AGENTS', () => { it('has claude and codex agents', () => { diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index 65b335cd83d4..7065f5b78115 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -1,25 +1,25 @@ /** - * Core types and config for the Storybook setup eval system. + * Core types for the Storybook setup eval system. * - * Four independent axes: agent × model × effort × prompt + * Four independent axes: agent x model x effort x prompt + * + * Runtime configuration (AGENTS, PROJECTS) lives in config.ts. */ +// --- Logger --- + +export interface Logger { + log: (msg: string) => void; + logStep: (msg: string) => void; + logSuccess: (msg: string) => void; + logError: (msg: string) => void; +} + // --- Agent, Model, Effort --- export type AgentName = "claude" | "codex"; export type Effort = "low" | "medium" | "high" | "max"; -export const AGENTS: Record = { - claude: { - models: ["sonnet-4.6", "opus-4.6", "haiku-4.5"], - defaultModel: "sonnet-4.6", - }, - codex: { - models: ["gpt-5.4"], - defaultModel: "gpt-5.4", - }, -}; - // --- Projects --- export interface Project { @@ -30,47 +30,6 @@ export interface Project { description?: string; } -export const PROJECTS: Project[] = [ - { - name: "mealdrop", - repo: "https://github.com/kasperpeulen/mealdrop", - branch: "eval-baseline", - description: "Styled components, Redux, React Router", - }, - { - name: "edgy", - repo: "https://github.com/kasperpeulen/edgy", - branch: "eval-baseline", - description: "Tailwind, HeadlessUI, React Router", - }, - { - name: "wikitok", - repo: "https://github.com/kasperpeulen/wikitok", - branch: "eval-baseline", - projectDir: 
"frontend", - description: "Simple project with Tailwind", - }, - { - name: "baklava", - repo: "https://github.com/kasperpeulen/baklava", - branch: "eval-baseline", - description: "Component library with Zustand", - }, - { - name: "echarts", - repo: "https://github.com/kasperpeulen/echarts-react", - branch: "eval-baseline", - description: "ECharts React wrapper", - }, - { - name: "evergreen-ci", - repo: "https://github.com/kasperpeulen/ui", - branch: "eval-baseline", - projectDir: "packages/lib", - description: "GraphQL", - }, -]; - // --- Trial Types --- export interface TrialConfig { @@ -139,6 +98,20 @@ export interface GhostStoriesResult { // --- Quality Score --- +export interface QualityWeights { + ghostStories: number; + build: number; + typecheck: number; + performance: number; +} + +export const DEFAULT_QUALITY_WEIGHTS: QualityWeights = { + ghostStories: 0.4, + build: 0.25, + typecheck: 0.25, + performance: 0.1, +}; + export interface QualityResult { score: number; breakdown: { build: number; typecheck: number; ghostStories: number; performance: number }; @@ -164,10 +137,12 @@ export interface TrialResult { export interface Agent { name: AgentName; - execute( - prompt: string, - projectPath: string, - model: string, - options?: { effort?: Effort; verbose?: boolean; resultsDir?: string }, - ): Promise; + execute(params: { + prompt: string; + projectPath: string; + model: string; + effort: Effort; + resultsDir: string; + logger: Logger; + }): Promise; } diff --git a/scripts/tsconfig.json b/scripts/tsconfig.json index c8082acb3897..a00e31d3f6d5 100644 --- a/scripts/tsconfig.json +++ b/scripts/tsconfig.json @@ -11,6 +11,7 @@ "moduleResolution": "bundler", "target": "ESNext", "module": "Preserve", + "allowImportingTsExtensions": true, "skipLibCheck": true, "allowSyntheticDefaultImports": true, "esModuleInterop": true, @@ -27,6 +28,6 @@ "noFallthroughCasesInSwitch": true, "resolveJsonModule": true }, - "exclude": ["dist", "**/dist", "node_modules", 
"**/node_modules"], + "exclude": ["dist", "**/dist", "node_modules", "**/node_modules", "eval/google-apps-script.js"], "include": ["./**/*", "./.eslintrc.cjs"] } From 5452a10681fa3065ce3c494dc9931e6e6b9df56f Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sat, 28 Mar 2026 23:20:18 +0700 Subject: [PATCH 31/63] =?UTF-8?q?Refactor=20ghost=20stories:=20rename=20ru?= =?UTF-8?q?nStoryTests=E2=86=92runGhostStories,=20export=20from=20core-ser?= =?UTF-8?q?ver,=20inline=20into=20grade.ts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename runStoryTests to runGhostStories in core (clearer name) - Add cwd parameter to runGhostStories and getComponentCandidates - Export getComponentCandidates, runGhostStories, TestRunSummary from core-server index - Remove eval ghost-stories.ts wrapper — inline logic into grade.ts - Remove eval ghost-stories.test.ts — core already has its own tests - Revert speculative isCandidate/isValidCandidate export (unused) - Remove unused logger import from get-candidates.ts --- code/core/src/core-server/index.ts | 4 + .../server-channel/ghost-stories-channel.ts | 4 +- .../utils/ghost-stories/get-candidates.ts | 19 +--- .../utils/ghost-stories/run-story-tests.ts | 14 ++- scripts/eval/lib/ghost-stories.test.ts | 99 ------------------- scripts/eval/lib/ghost-stories.ts | 66 ------------- scripts/eval/lib/grade.ts | 28 +++++- scripts/eval/lib/grading-pipeline.test.ts | 2 +- 8 files changed, 50 insertions(+), 186 deletions(-) delete mode 100644 scripts/eval/lib/ghost-stories.test.ts delete mode 100644 scripts/eval/lib/ghost-stories.ts diff --git a/code/core/src/core-server/index.ts b/code/core/src/core-server/index.ts index f475fa6166ca..5699b1ad14af 100644 --- a/code/core/src/core-server/index.ts +++ b/code/core/src/core-server/index.ts @@ -32,3 +32,7 @@ export { } from './stores/test-provider'; export { getServerPort } from './utils/server-address'; + +export { getComponentCandidates } from 
'./utils/ghost-stories/get-candidates'; +export { runGhostStories } from './utils/ghost-stories/run-story-tests'; +export type { TestRunSummary } from './utils/ghost-stories/types'; diff --git a/code/core/src/core-server/server-channel/ghost-stories-channel.ts b/code/core/src/core-server/server-channel/ghost-stories-channel.ts index 2b334865556e..3076d7b293d9 100644 --- a/code/core/src/core-server/server-channel/ghost-stories-channel.ts +++ b/code/core/src/core-server/server-channel/ghost-stories-channel.ts @@ -9,7 +9,7 @@ import { import type { CoreConfig, Options } from 'storybook/internal/types'; import { getComponentCandidates } from '../utils/ghost-stories/get-candidates'; -import { runStoryTests } from '../utils/ghost-stories/run-story-tests'; +import { runGhostStories } from '../utils/ghost-stories/run-story-tests'; export function initGhostStoriesChannel( channel: Channel, @@ -91,7 +91,7 @@ export function initGhostStoriesChannel( // Phase 2: Run tests on those candidates Vitest. The components will be transformed directly to tests // If they pass, it means that creating a story file for them would succeed. - const testRunResult = await runStoryTests(candidatesResult.candidates); + const testRunResult = await runGhostStories(candidatesResult.candidates); stats.totalRunDuration = Date.now() - ghostRunStart; stats.testRunDuration = testRunResult.duration; if (testRunResult.runError) { diff --git a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts index 4c4a96054315..661196a3ebea 100644 --- a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts +++ b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts @@ -7,8 +7,8 @@ import { glob } from 'glob'; import { getComponentComplexity } from './component-analyzer.ts'; -/** Check whether source contains JSX and at least one export using AST. 
*/ -export function isValidCandidate(source: string): boolean { +// A valid candidate includes React code and at least one export +function isValidCandidate(source: string): boolean { const ast = babelParse(source); let hasJSX = false; @@ -59,15 +59,10 @@ export function isValidCandidate(source: string): boolean { * Based on a list of files, analyze them to find potential candidates to generate story files for. * this is based on whether the file has JSX and exports and how many runtime LOC and imports it * has. - * - * @param isCandidate - Validation function to check if source is a valid component. - * Defaults to AST-based {@link isValidCandidate}. Callers outside the storybook - * workspace can supply a lighter regex-based check instead. */ export async function getCandidatesForStorybook( files: string[], - sampleCount: number, - isCandidate: (source: string) => boolean = isValidCandidate + sampleCount: number ): Promise<{ candidates: string[]; analyzedCount: number; @@ -81,7 +76,7 @@ export async function getCandidatesForStorybook( try { source = await readFile(file, 'utf-8'); // filter out non-React code or files without exports - if (!isCandidate(source)) { + if (!isValidCandidate(source)) { continue; } } catch { @@ -132,13 +127,10 @@ export async function getCandidatesForStorybook( export async function getComponentCandidates({ sampleSize = 20, globPattern = '**/*.{tsx,jsx}', - isCandidate = isValidCandidate, cwd = process.cwd(), }: { sampleSize?: number; globPattern?: string; - /** Validation function. Defaults to AST-based check; supply a regex version for lightweight usage. */ - isCandidate?: (source: string) => boolean; /** Working directory for glob. Defaults to process.cwd(). 
*/ cwd?: string; } = {}): Promise<{ @@ -186,8 +178,7 @@ export async function getComponentCandidates({ const { analyzedCount, avgComplexity, candidates } = await getCandidatesForStorybook( files, - sampleSize, - isCandidate + sampleSize ); return { diff --git a/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts b/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts index 42ab270ee58a..e4c3f1f2ebdd 100644 --- a/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts +++ b/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts @@ -8,7 +8,18 @@ import { join } from 'pathe'; import { parseVitestResults } from './parse-vitest-report'; import type { TestRunSummary } from './types'; -export async function runStoryTests(componentFilePaths: string[]): Promise { +/** + * Run ghost stories: execute vitest on component file paths to auto-generate + * and test stories that don't exist on disk. + * + * @param componentFilePaths - Absolute paths to component files to test. + * @param options.cwd - Working directory for vitest. Defaults to process.cwd(). + */ +export async function runGhostStories( + componentFilePaths: string[], + options?: { cwd?: string } +): Promise { + const cwd = options?.cwd; try { // Create the cache directory for story discovery tests const cacheDir = resolvePathInStorybookCache('ghost-stories-tests'); @@ -34,6 +45,7 @@ export async function runStoryTests(componentFilePaths: string[]): Promise { - TMP = join(tmpdir(), `eval-ghost-stories-${Date.now()}`); - mkdirSync(join(TMP, 'src'), { recursive: true }); -}); - -afterEach(() => { - rmSync(TMP, { recursive: true, force: true }); -}); - -function writeFile(relativePath: string, content: string) { - const fullPath = join(TMP, relativePath); - mkdirSync(join(fullPath, '..'), { recursive: true }); - writeFileSync(fullPath, content); -} - -function simpleComponent(name: string) { - return [ - `import React from 'react';`, - `export function ${name}() {`, - ` return
${name}
;`, - `}`, - ].join('\n'); -} - -async function findCandidates(cwd: string) { - const { candidates } = await getComponentCandidates({ cwd, sampleSize: 20 }); - // Return relative paths for easier assertions - return candidates.map((c) => c.replace(cwd + '/', '')); -} - -describe('getComponentCandidates from core', () => { - it('finds exported components with JSX', async () => { - writeFile('src/Button.tsx', simpleComponent('Button')); - expect(await findCandidates(TMP)).toEqual(['src/Button.tsx']); - }); - - it('skips files without exports', async () => { - writeFile('src/Internal.tsx', `function Internal() { return
hi
; }`); - expect(await findCandidates(TMP)).toEqual([]); - }); - - it('skips files without JSX', async () => { - writeFile('src/utils.tsx', `export const add = (a: number, b: number) => a + b;`); - expect(await findCandidates(TMP)).toEqual([]); - }); - - it('skips test, spec, and story files', async () => { - writeFile('src/Button.test.tsx', simpleComponent('X')); - writeFile('src/Button.spec.tsx', simpleComponent('X')); - writeFile('src/Button.stories.tsx', simpleComponent('X')); - expect(await findCandidates(TMP)).toEqual([]); - }); - - it('skips config files', async () => { - writeFile('src/app.config.tsx', simpleComponent('X')); - expect(await findCandidates(TMP)).toEqual([]); - }); - - it('sorts by complexity (simpler first)', async () => { - writeFile('src/Simple.tsx', simpleComponent('Simple')); - const lines = [ - `import React from 'react';`, - `import { useState } from 'react';`, - `import { useEffect } from 'react';`, - `import { useCallback } from 'react';`, - `import { useMemo } from 'react';`, - ...Array.from({ length: 40 }, (_, i) => `const line${i} = ${i};`), - `export function Complex() { return
{line0}
; }`, - ]; - writeFile('src/Complex.tsx', lines.join('\n')); - - const candidates = await findCandidates(TMP); - expect(candidates.indexOf('src/Simple.tsx')).toBeLessThan( - candidates.indexOf('src/Complex.tsx') - ); - }); - - it('limits to sampleSize candidates', async () => { - for (let i = 0; i < 25; i++) { - writeFile(`src/Comp${i}.tsx`, simpleComponent(`Comp${i}`)); - } - expect(await findCandidates(TMP)).toHaveLength(20); - }); - - it('returns empty for empty project', async () => { - expect(await findCandidates(TMP)).toEqual([]); - }); -}); diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts deleted file mode 100644 index a79ccc8210ed..000000000000 --- a/scripts/eval/lib/ghost-stories.ts +++ /dev/null @@ -1,66 +0,0 @@ -import { readFileSync, existsSync } from "node:fs"; -import { join } from "node:path"; -import type { GhostStoriesResult, Logger } from "../types.ts"; -import { exec } from "./utils.ts"; - -// Core ghost-stories utilities — requires `yarn nx run-many -t compile` first. -import { getComponentCandidates } from "../../../code/core/src/core-server/utils/ghost-stories/get-candidates.ts"; -import { parseVitestResults } from "../../../code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts"; - -/** - * Run ghost stories: discover candidate components, auto-generate stories - * via the addon-vitest componentTransform, and measure rendering success. - */ -export async function runGhostStories( - projectPath: string, - resultsDir: string, - logger: Logger, -): Promise { - logger.logStep("Running ghost stories..."); - - const { candidates, error } = await getComponentCandidates({ sampleSize: 20, cwd: projectPath }); - if (error || candidates.length === 0) { - logger.logError(error ?? 
"No candidate components found"); - return undefined; - } - logger.logStep(`Found ${candidates.length} candidate component(s)`); - - const reportPath = join(resultsDir, "ghost-stories-report.json"); - await exec( - "npx", - [ - "vitest", "run", - "--reporter=json", - `--outputFile=${reportPath}`, - "--testTimeout=1000", - ...candidates, - ], - { - cwd: projectPath, - timeout: 120_000, - throwOnError: false, - env: { ...process.env, STORYBOOK_COMPONENT_PATHS: candidates.join(";") }, - }, - ); - - if (!existsSync(reportPath)) { - logger.logError("Ghost stories: no Vitest report generated"); - return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; - } - - try { - const report = JSON.parse(readFileSync(reportPath, "utf-8")); - const { summary } = parseVitestResults(report); - if (!summary) { - logger.logError("Ghost stories: no test results in Vitest report"); - return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; - } - const { total, passed, successRate } = summary; - if (total > 0) - logger.logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); - return { candidateCount: candidates.length, total, passed, successRate }; - } catch { - logger.logError("Ghost stories: failed to parse Vitest report"); - return { candidateCount: candidates.length, total: 0, passed: 0, successRate: 0 }; - } -} diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 47d8996c90a6..8e9959812bb8 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,10 +1,10 @@ import { writeFileSync } from "node:fs"; import { join } from "node:path"; -import type { GradingResult, QualityResult, QualityWeights, TrialPaths, ChangedFile, Logger } from "../types.ts"; +import type { GradingResult, GhostStoriesResult, QualityResult, QualityWeights, TrialPaths, ChangedFile, Logger } from "../types.ts"; import { DEFAULT_QUALITY_WEIGHTS } from "../types.ts"; import { exec } from 
"./utils.ts"; import { detectSetupPatterns } from "./setup-patterns.ts"; -import { runGhostStories } from "./ghost-stories.ts"; +import { getComponentCandidates, runGhostStories } from "../../../code/core/src/core-server/index.ts"; /** Filter changed files to only storybook-related ones. */ export function filterStorybookFiles(changedFiles: ChangedFile[]): ChangedFile[] { @@ -118,7 +118,7 @@ export async function grade( } // Ghost stories (only if build passed) - const ghostStories = buildSuccess ? await runGhostStories(projectPath, resultsDir, logger) : undefined; + const ghostStories = buildSuccess ? await gradeGhostStories(projectPath, logger) : undefined; const grading: GradingResult = { buildSuccess, @@ -151,3 +151,25 @@ async function getChangedFiles(repoRoot: string, baseline: string): Promise { + logger.logStep("Running ghost stories..."); + + const { candidates, error } = await getComponentCandidates({ sampleSize: 20, cwd: projectPath }); + if (error || candidates.length === 0) { + logger.logError(error ?? "No candidate components found"); + return undefined; + } + logger.logStep(`Found ${candidates.length} candidate component(s)`); + + const result = await runGhostStories(candidates, { cwd: projectPath }); + const { total, passed, successRate } = result.summary ?? 
{ total: 0, passed: 0, successRate: 0 }; + + if (result.runError) { + logger.logError(`Ghost stories: ${result.runError}`); + } else if (total > 0) { + logger.logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); + } + + return { candidateCount: candidates.length, total, passed, successRate }; +} diff --git a/scripts/eval/lib/grading-pipeline.test.ts b/scripts/eval/lib/grading-pipeline.test.ts index 7b63a94737ae..5a6e06fbec75 100644 --- a/scripts/eval/lib/grading-pipeline.test.ts +++ b/scripts/eval/lib/grading-pipeline.test.ts @@ -4,7 +4,7 @@ import { tmpdir } from 'node:os'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { getComponentCandidates } from '../../../code/core/src/core-server/utils/ghost-stories/get-candidates'; +import { getComponentCandidates } from '../../../code/core/src/core-server/index'; import { computeQualityScore, countTypeCheckErrors, From 45acc9b82c7763c6b3d7058f0b97458cf747276a Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sat, 28 Mar 2026 23:25:24 +0700 Subject: [PATCH 32/63] Fix native Node TS execution: use direct file imports with .ts extensions The core-server barrel index re-exports modules (build-static, etc.) that fail under native Node TS. Import ghost-stories utilities directly from their source files instead, and add .ts extensions to internal imports in the import chain. 
--- .../src/core-server/utils/ghost-stories/run-story-tests.ts | 4 ++-- scripts/eval/lib/grade.ts | 3 ++- scripts/eval/lib/grading-pipeline.test.ts | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts b/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts index e4c3f1f2ebdd..c934eb385ecd 100644 --- a/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts +++ b/code/core/src/core-server/utils/ghost-stories/run-story-tests.ts @@ -5,8 +5,8 @@ import { executeCommand, resolvePathInStorybookCache } from 'storybook/internal/ import { join } from 'pathe'; -import { parseVitestResults } from './parse-vitest-report'; -import type { TestRunSummary } from './types'; +import { parseVitestResults } from './parse-vitest-report.ts'; +import type { TestRunSummary } from './types.ts'; /** * Run ghost stories: execute vitest on component file paths to auto-generate diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 8e9959812bb8..ce0218c766b4 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -4,7 +4,8 @@ import type { GradingResult, GhostStoriesResult, QualityResult, QualityWeights, import { DEFAULT_QUALITY_WEIGHTS } from "../types.ts"; import { exec } from "./utils.ts"; import { detectSetupPatterns } from "./setup-patterns.ts"; -import { getComponentCandidates, runGhostStories } from "../../../code/core/src/core-server/index.ts"; +import { getComponentCandidates } from "../../../code/core/src/core-server/utils/ghost-stories/get-candidates.ts"; +import { runGhostStories } from "../../../code/core/src/core-server/utils/ghost-stories/run-story-tests.ts"; /** Filter changed files to only storybook-related ones. 
*/ export function filterStorybookFiles(changedFiles: ChangedFile[]): ChangedFile[] { diff --git a/scripts/eval/lib/grading-pipeline.test.ts b/scripts/eval/lib/grading-pipeline.test.ts index 5a6e06fbec75..7b63a94737ae 100644 --- a/scripts/eval/lib/grading-pipeline.test.ts +++ b/scripts/eval/lib/grading-pipeline.test.ts @@ -4,7 +4,7 @@ import { tmpdir } from 'node:os'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { getComponentCandidates } from '../../../code/core/src/core-server/index'; +import { getComponentCandidates } from '../../../code/core/src/core-server/utils/ghost-stories/get-candidates'; import { computeQualityScore, countTypeCheckErrors, From bf5855b5a707c02c9158f67f479f8c941002b114 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 00:13:19 +0700 Subject: [PATCH 33/63] Simplify eval harness: Promise.all, async fs, remove Google Sheets, drop exec wrapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace fork/IPC parallel execution with direct Promise.allSettled + prefixed loggers - Make blocking fs calls async (cpSync→cp, writeFileSync→writeFile, mkdirSync→mkdir) - Remove Google Sheets upload, google-apps-script.js, and upload-id/run-id plumbing - Drop custom exec wrapper — use tinyexec's x() directly at call sites - Remove runId/uploadId from runTask signature and both CLI entry points --- scripts/eval/config.ts | 13 +++- scripts/eval/eval-parallel.ts | 68 ++++++++------------- scripts/eval/eval.ts | 31 ++++++---- scripts/eval/google-apps-script.js | 82 -------------------------- scripts/eval/lib/agents/claude-code.ts | 4 +- scripts/eval/lib/agents/codex.ts | 15 ++--- scripts/eval/lib/grade.ts | 26 ++++---- scripts/eval/lib/prepare-trial.ts | 22 +++---- scripts/eval/lib/run-task.test.ts | 27 +++------ scripts/eval/lib/run-task.ts | 15 ++--- scripts/eval/lib/save.ts | 73 ++--------------------- scripts/eval/lib/utils.ts | 32 ---------- 
scripts/eval/prepare-repos.ts | 4 +- scripts/eval/types.test.ts | 12 ++++ scripts/eval/types.ts | 5 +- 15 files changed, 121 insertions(+), 308 deletions(-) delete mode 100644 scripts/eval/google-apps-script.js diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index a8be3570dbc4..7a0e50eb0ba3 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -6,14 +6,25 @@ import type { AgentName, Project } from "./types.ts"; -export const AGENTS: Record = { +export interface AgentConfig { + models: string[]; + defaultModel: string; + efforts: string[]; + defaultEffort: string; +} + +export const AGENTS: Record = { claude: { models: ["sonnet-4.6", "opus-4.6", "haiku-4.5"], defaultModel: "sonnet-4.6", + efforts: ["low", "medium", "high", "max"], + defaultEffort: "high", }, codex: { models: ["gpt-5.4"], defaultModel: "gpt-5.4", + efforts: ["low", "medium", "high", "xhigh"], + defaultEffort: "high", }, }; diff --git a/scripts/eval/eval-parallel.ts b/scripts/eval/eval-parallel.ts index f7577214221a..b79d833b05a2 100644 --- a/scripts/eval/eval-parallel.ts +++ b/scripts/eval/eval-parallel.ts @@ -1,11 +1,9 @@ import { randomUUID } from "node:crypto"; -import { resolve } from "node:path"; -import { fork } from "node:child_process"; -import { createInterface } from "node:readline"; import { parseArgs } from "node:util"; import pc from "picocolors"; import { AGENTS, PROJECTS } from "./config.ts"; -import type { TrialResult } from "./types.ts"; +import type { AgentName, TrialConfig, TrialResult } from "./types.ts"; +import { runTask } from "./lib/run-task.ts"; import { createLogger, formatDuration, formatCost, listPrompts } from "./lib/utils.ts"; const logger = createLogger(); @@ -16,8 +14,7 @@ const { values: opts } = parseArgs({ agent: { type: "string", short: "a" }, model: { type: "string", short: "m" }, prompt: { type: "string" }, - effort: { type: "string", short: "e", default: "high" }, - "upload-id": { type: "string", short: "u" }, + effort: { type: 
"string", short: "e" }, }, }); @@ -30,10 +27,8 @@ if (!project) { const prompts = opts.prompt ? opts.prompt.split(",") : listPrompts(); const modelFilter = opts.model ? opts.model.split(",") : null; const agentFilter = opts.agent ? opts.agent.split(",") : null; -const effort = opts.effort as string; +const effortOverride = opts.effort; const runId = randomUUID().slice(0, 8); -const uploadId = opts["upload-id"] || `eval-${runId}`; -const evalScript = resolve(import.meta.dirname, "eval.ts"); // Build all combos: every agent x model x prompt (with optional filters) const runs: Array<{ agent: string; model: string; prompt: string; label: string }> = []; @@ -53,7 +48,7 @@ if (runs.length === 0) { } logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); -logger.log(`${runs.length} parallel processes | Effort: ${effort}`); +logger.log(`${runs.length} parallel runs${effortOverride ? ` | Effort: ${effortOverride}` : ""}`); for (const [agent, { models }] of Object.entries(AGENTS)) { const filteredModels = models.filter((m) => runs.some((r) => r.model === m)); if (filteredModels.length > 0) { @@ -63,43 +58,28 @@ for (const [agent, { models }] of Object.entries(AGENTS)) { logger.log(` prompts: ${[...new Set(runs.map((r) => r.prompt))].join(", ")}`); logger.log(`Run: ${runId}\n`); -function spawnRun(agent: string, model: string, prompt: string, label: string): Promise { - return new Promise((res) => { - const tag = pc.dim(`[${label}]`); - const child = fork(evalScript, [ - "-p", project!.name, "-a", agent, "-m", model, "-e", effort, "--prompt", prompt, "-u", uploadId, - ], { stdio: ["ignore", "pipe", "pipe", "ipc"] }); - - let result: TrialResult | null = null; - - // Receive structured result via IPC - child.on("message", (msg: TrialResult) => { - result = msg; - }); - - // Stream stdout/stderr with prefix for readability - if (child.stdout) { - createInterface({ input: child.stdout }).on("line", (line) => { - logger.log(`${tag} ${line}`); - }); - } - if 
(child.stderr) { - createInterface({ input: child.stderr }).on("line", (line) => { - logger.log(`${tag} ${pc.dim(line)}`); - }); - } +const settled = await Promise.allSettled( + runs.map((run) => { + const config: TrialConfig = { + project, + agent: run.agent as AgentName, + model: run.model, + effort: effortOverride ?? AGENTS[run.agent as AgentName].defaultEffort, + prompt: run.prompt, + }; + return runTask(config, createLogger(run.label)); + }), +); - child.on("close", (code) => { - if (code !== 0 && !result) logger.logError(`${tag} exited with code ${code}`); - res(result); - }); - }); +const results: TrialResult[] = []; +for (const [i, s] of settled.entries()) { + if (s.status === "fulfilled") { + results.push(s.value); + } else { + logger.logError(`${runs[i].label}: ${s.reason instanceof Error ? s.reason.message : s.reason}`); + } } -const results = (await Promise.all(runs.map((r) => spawnRun(r.agent, r.model, r.prompt, r.label)))).filter( - (r): r is TrialResult => r != null, -); - if (results.length > 0) { results.sort((a, b) => (b.grading.ghostStories?.successRate ?? -1) - (a.grading.ghostStories?.successRate ?? -1)); diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 83c7096935c9..b49bf2f33c4a 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -1,7 +1,18 @@ +/** + * Eval harness entry point — single trial run. + * + * Runs with `node ./eval/eval.ts` (no jiti). Node 22+ supports .ts natively + * via type stripping, so no loader or transpiler is needed. The .npmrc in the + * monorepo root enables this. New scripts should follow this pattern instead + * of using jiti — we are migrating away from jiti toward native Node TS. + * + * Import specifiers use explicit .ts extensions, which is required by Node's + * native TS support and enabled by `allowImportingTsExtensions` in tsconfig. 
+ */ import { randomUUID } from "node:crypto"; import { parseArgs } from "node:util"; import pc from "picocolors"; -import type { TrialConfig, AgentName, Effort } from "./types.ts"; +import type { TrialConfig, AgentName } from "./types.ts"; import { AGENTS, PROJECTS } from "./config.ts"; import { runTask } from "./lib/run-task.ts"; import { createLogger, formatDuration, formatCost, listPrompts } from "./lib/utils.ts"; @@ -13,10 +24,9 @@ const { values: opts } = parseArgs({ project: { type: "string", short: "p" }, agent: { type: "string", short: "a", default: "claude" }, model: { type: "string", short: "m" }, - effort: { type: "string", short: "e", default: "high" }, + effort: { type: "string", short: "e" }, prompt: { type: "string", default: "setup" }, verbose: { type: "boolean", short: "v", default: false }, - "upload-id": { type: "string", short: "u" }, "list-projects": { type: "boolean", default: false }, "list-models": { type: "boolean", default: false }, "list-prompts": { type: "boolean", default: false }, @@ -68,9 +78,13 @@ if (opts.model) { model = agentConfig.defaultModel; } -const effort = opts.effort as Effort; +const agentCfg = AGENTS[agent]; +const effort = opts.effort ?? agentCfg.defaultEffort; +if (!agentCfg.efforts.includes(effort)) { + logger.log(pc.red(`Unknown effort "${effort}" for ${agent}. Available: ${agentCfg.efforts.join(", ")}`)); + process.exit(1); +} const runId = randomUUID().slice(0, 8); -const uploadId = opts["upload-id"] || `eval-${runId}`; const config: TrialConfig = { project, @@ -86,7 +100,7 @@ logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${co logger.log(`Run: ${runId}\n`); try { - const result = await runTask(config, runId, uploadId, logger); + const result = await runTask(config, logger); const ghost = result.grading.ghostStories; const ghostStr = ghost ? 
`${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; @@ -98,11 +112,6 @@ try { logger.log(` Cost: ${formatCost(result.execution.cost)}`); logger.log(` Time: ${formatDuration(result.execution.duration)}`); logger.log(` Turns: ${result.execution.turns}`); - - // Send result via IPC when forked by eval-parallel, otherwise no-op - if (process.send) { - process.send(result); - } } catch (error) { logger.log(pc.red(`\nFailed: ${error instanceof Error ? error.message : error}`)); process.exit(1); diff --git a/scripts/eval/google-apps-script.js b/scripts/eval/google-apps-script.js deleted file mode 100644 index bbe56318fa22..000000000000 --- a/scripts/eval/google-apps-script.js +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Google Apps Script for Storybook Setup Evaluations - * - * Instructions: - * 1. Create a new Google Sheet for eval results - * 2. Go to Extensions > Apps Script - * 3. Replace the contents with this code - * 4. Click "Deploy" > "New deployment" - * 5. Select type: "Web app" - * 6. Execute as: "Me" - * 7. Who has access: "Anyone" - * 8. Click "Deploy" and copy the web app URL - * 9. Set EVAL_GOOGLE_SHEETS_URL= in your environment - * - * Authorization: - * Run authorize() from the editor to trigger the authorization prompt. - * Click "Review Permissions" → Select account → "Advanced" → "Go to [project] (unsafe)" → "Allow" - */ - -const toTitleCase = (key) => - key - .replace(/([A-Z])/g, " $1") - .replace(/^./, (str) => str.toUpperCase()) - .trim(); - -const ensureHeaders = (sheet, keys) => { - if (sheet.getRange(1, 1).getValue() === "") { - const headers = keys.map(toTitleCase); - sheet.getRange(1, 1, 1, headers.length).setValues([headers]); - sheet.getRange(1, 1, 1, headers.length).setFontWeight("bold"); - } -}; - -const appendRow = (sheet, rowData) => { - const lock = LockService.getScriptLock(); - try { - lock.waitLock(120000); - const lastRow = sheet.getLastRow(); - const targetRow = lastRow < 1 ? 
2 : lastRow + 1; - sheet.getRange(targetRow, 1, 1, rowData.length).setValues([rowData]); - SpreadsheetApp.flush(); - return targetRow; - } finally { - lock.releaseLock(); - } -}; - -const prepareRowData = (keys, data) => - keys.map((key) => { - const value = data[key]; - if (typeof value === "boolean") return value ? "TRUE" : "FALSE"; - if (value === null || value === undefined) return ""; - return value; - }); - -// eslint-disable-next-line @typescript-eslint/no-unused-vars -function doPost(e) { - try { - const data = JSON.parse(e.postData.contents); - const sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet(); - const keys = Object.keys(data); - const rowData = prepareRowData(keys, data); - - ensureHeaders(sheet, keys); - const targetRow = appendRow(sheet, rowData); - - return ContentService.createTextOutput( - JSON.stringify({ success: true, row: targetRow }), - ).setMimeType(ContentService.MimeType.JSON); - } catch (error) { - return ContentService.createTextOutput( - JSON.stringify({ success: false, error: error.toString() }), - ).setMimeType(ContentService.MimeType.JSON); - } -} - -// eslint-disable-next-line @typescript-eslint/no-unused-vars -function authorize() { - const spreadsheet = SpreadsheetApp.getActiveSpreadsheet(); - const file = DriveApp.getFileById(spreadsheet.getId()); - console.log("Authorized! 
File:", file.getName()); -} diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index 0e769038f92f..f0e5bcbf498c 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -1,6 +1,6 @@ import type { SDKMessage } from "@anthropic-ai/claude-agent-sdk"; import { query } from "@anthropic-ai/claude-agent-sdk"; -import { writeFileSync } from "node:fs"; +import { writeFile } from "node:fs/promises"; import { join } from "node:path"; import type { Agent, ExecutionResult, Logger } from "../../types.ts"; @@ -119,7 +119,7 @@ export const claudeAgent: Agent = { const duration = (Date.now() - startTime) / 1000; - writeFileSync(join(resultsDir, "transcript.json"), JSON.stringify(messages, null, 2)); + await writeFile(join(resultsDir, "transcript.json"), JSON.stringify(messages, null, 2)); return { agent: "claude", diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index 9be632b7d900..9962b074b037 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -1,7 +1,7 @@ import { Codex, type ModelReasoningEffort } from "@openai/codex-sdk"; -import { writeFileSync } from "node:fs"; +import { writeFile } from "node:fs/promises"; import { join } from "node:path"; -import type { Agent, Effort, ExecutionResult } from "../../types.ts"; +import type { Agent, ExecutionResult } from "../../types.ts"; /** Per-million-token pricing for Codex/OpenAI models (USD). 
*/ const OPENAI_PRICING: Record = { @@ -24,13 +24,6 @@ function estimateCost( ); } -const CODEX_EFFORT: Record = { - low: "low", - medium: "medium", - high: "high", - max: "xhigh", -}; - export const codexAgent: Agent = { name: "codex", @@ -47,7 +40,7 @@ export const codexAgent: Agent = { const codex = new Codex(); const thread = codex.startThread({ model, - modelReasoningEffort: CODEX_EFFORT[effort], + modelReasoningEffort: effort as ModelReasoningEffort, workingDirectory: projectPath, approvalPolicy: "never", }); @@ -106,7 +99,7 @@ export const codexAgent: Agent = { const cost = estimateCost(model, totalInput, totalCached, totalOutput); logger.logSuccess(`Done — ${turns} turns, ${Math.round(duration)}s, ${totalInput}in/${totalOutput}out tokens${cost != null ? `, $${cost.toFixed(4)}` : ""}`); - writeFileSync(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); + await writeFile(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); return { agent: "codex", model, effort, cost, duration, turns }; }, diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index ce0218c766b4..41b9665ede43 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,8 +1,8 @@ -import { writeFileSync } from "node:fs"; +import { writeFile } from "node:fs/promises"; import { join } from "node:path"; import type { GradingResult, GhostStoriesResult, QualityResult, QualityWeights, TrialPaths, ChangedFile, Logger } from "../types.ts"; import { DEFAULT_QUALITY_WEIGHTS } from "../types.ts"; -import { exec } from "./utils.ts"; +import { x } from "tinyexec"; import { detectSetupPatterns } from "./setup-patterns.ts"; import { getComponentCandidates } from "../../../code/core/src/core-server/utils/ghost-stories/get-candidates.ts"; import { runGhostStories } from "../../../code/core/src/core-server/utils/ghost-stories/run-story-tests.ts"; @@ -91,18 +91,20 @@ export async function grade( // Storybook build + TypeScript check in parallel 
logger.logStep("Running storybook build + typecheck..."); const [build, tsc] = await Promise.all([ - exec("npx", ["storybook", "build", "--quiet"], { - cwd: projectPath, - timeout: 300_000, + x("npx", ["storybook", "build", "--quiet"], { throwOnError: false, - env: { ...process.env, STORYBOOK_DISABLE_TELEMETRY: "1", NODE_OPTIONS: "--max_old_space_size=4096" }, + timeout: 300_000, + nodeOptions: { + cwd: projectPath, + env: { ...process.env, STORYBOOK_DISABLE_TELEMETRY: "1", NODE_OPTIONS: "--max_old_space_size=4096" }, + }, }), - exec("npx", ["tsc", "--noEmit"], { cwd: projectPath, timeout: 120_000, throwOnError: false }), + x("npx", ["tsc", "--noEmit"], { throwOnError: false, timeout: 120_000, nodeOptions: { cwd: projectPath } }), ]); const buildSuccess = build.exitCode === 0; const buildOutput = build.stdout + "\n" + build.stderr; - writeFileSync(join(resultsDir, "build-output.txt"), buildOutput); + await writeFile(join(resultsDir, "build-output.txt"), buildOutput); if (buildSuccess) { logger.logSuccess("Storybook build succeeded"); } else { @@ -110,7 +112,7 @@ export async function grade( } const tscOutput = tsc.stdout + "\n" + tsc.stderr; - writeFileSync(join(resultsDir, "typecheck-output.txt"), tscOutput); + await writeFile(join(resultsDir, "typecheck-output.txt"), tscOutput); const typeCheckErrors = countTypeCheckErrors(tscOutput); if (typeCheckErrors === 0) { logger.logSuccess("No TypeScript errors"); @@ -145,10 +147,10 @@ export async function grade( async function getChangedFiles(repoRoot: string, baseline: string): Promise { // Stage all files so `git diff --cached` picks up new files the agent created. // Safe: this runs on an ephemeral trial copy, not the real repo. 
- await exec("git", ["add", "-A"], { cwd: repoRoot }); - const { stdout } = await exec("git", ["diff", "--cached", "--name-status", baseline], { - cwd: repoRoot, + await x("git", ["add", "-A"], { nodeOptions: { cwd: repoRoot } }); + const { stdout } = await x("git", ["diff", "--cached", "--name-status", baseline], { throwOnError: false, + nodeOptions: { cwd: repoRoot }, }); return parseChangedFiles(stdout); } diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index a013dbcc235e..8baa88c04b31 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -1,7 +1,9 @@ -import { existsSync, mkdirSync, cpSync } from "node:fs"; +import { existsSync } from "node:fs"; +import { cp, mkdir } from "node:fs/promises"; import { join } from "node:path"; import type { Project, TrialPaths, Logger } from "../types.ts"; -import { CACHE_DIR, TRIALS_DIR, exec } from "./utils.ts"; +import { x } from "tinyexec"; +import { CACHE_DIR, TRIALS_DIR } from "./utils.ts"; async function installDeps(dir: string, logger: Logger) { const has = (f: string) => existsSync(join(dir, f)); @@ -14,7 +16,7 @@ async function installDeps(dir: string, logger: Logger) { : ["npm", ["install", "--ignore-scripts"]]; logger.logStep(`Installing with ${cmd}...`); - await exec(cmd, args, { cwd: dir, timeout: 300_000 }); + await x(cmd, args, { timeout: 300_000, nodeOptions: { cwd: dir } }); } /** @@ -25,28 +27,28 @@ export async function prepareTrial(project: Project, trialId: string, logger: Lo const cacheDir = join(CACHE_DIR, project.name); const trialDir = join(TRIALS_DIR, trialId); const repoRoot = join(trialDir, "project"); - mkdirSync(trialDir, { recursive: true }); + await mkdir(trialDir, { recursive: true }); if (existsSync(join(cacheDir, ".git"))) { logger.logStep("Copying from cache..."); - cpSync(cacheDir, repoRoot, { recursive: true }); + await cp(cacheDir, repoRoot, { recursive: true }); } else { logger.logStep(`Cloning 
${project.repo}#${project.branch}...`); - mkdirSync(CACHE_DIR, { recursive: true }); - await exec("git", ["clone", "--depth", "1", "--branch", project.branch!, project.repo, repoRoot], { + await mkdir(CACHE_DIR, { recursive: true }); + await x("git", ["clone", "--depth", "1", "--branch", project.branch!, project.repo, repoRoot], { timeout: 120_000, }); const projectPath = project.projectDir ? join(repoRoot, project.projectDir) : repoRoot; await installDeps(projectPath, logger); logger.logSuccess("Dependencies installed"); logger.logStep("Caching for future runs..."); - cpSync(repoRoot, cacheDir, { recursive: true }); + await cp(repoRoot, cacheDir, { recursive: true }); } - const baselineCommit = (await exec("git", ["rev-parse", "HEAD"], { cwd: repoRoot })).stdout.trim(); + const baselineCommit = (await x("git", ["rev-parse", "HEAD"], { nodeOptions: { cwd: repoRoot } })).stdout.trim(); const projectPath = project.projectDir ? join(repoRoot, project.projectDir) : repoRoot; const resultsDir = join(trialDir, "results"); - mkdirSync(resultsDir, { recursive: true }); + await mkdir(resultsDir, { recursive: true }); logger.logSuccess("Trial ready"); return { trialDir, repoRoot, projectPath, resultsDir, baselineCommit }; diff --git a/scripts/eval/lib/run-task.test.ts b/scripts/eval/lib/run-task.test.ts index 6b56224c477d..ccd7aa562333 100644 --- a/scripts/eval/lib/run-task.test.ts +++ b/scripts/eval/lib/run-task.test.ts @@ -19,7 +19,6 @@ vi.mock('./save', () => ({ evalBranch: 'test-branch', evalCommit: 'abc123', }), - saveToGoogleSheets: vi.fn().mockResolvedValue(undefined), })); vi.mock('./agents/claude-code', () => ({ claudeAgent: { name: 'claude', execute: vi.fn() }, @@ -32,7 +31,7 @@ import { claudeAgent } from './agents/claude-code'; import { grade } from './grade'; import { prepareTrial } from './prepare-trial'; import { runTask } from './run-task'; -import { captureEnvironment, saveToGoogleSheets } from './save'; +import { captureEnvironment } from './save'; let TMP: 
string; @@ -100,7 +99,7 @@ describe('runTask pipeline', () => { it('assembles a complete TrialResult from pipeline steps', async () => { setupMocks(); - const result = await runTask(baseConfig, 'run-123', 'upload-456'); + const result = await runTask(baseConfig); // Config fields mapped correctly expect(result.schemaVersion).toBe(1); @@ -137,7 +136,7 @@ describe('runTask pipeline', () => { project: { name: 'mealdrop', repo: 'https://github.com/test/mealdrop', branch: 'eval-baseline' }, }; - await runTask(config, 'run-1', 'upload-1'); + await runTask(config); // prepareTrial receives the project and a logger expect(vi.mocked(prepareTrial).mock.calls[0][0].name).toBe('mealdrop'); @@ -162,20 +161,12 @@ describe('runTask pipeline', () => { expect(gradePaths.projectPath).toBe(TMP); // Second arg is the logger expect(vi.mocked(grade).mock.calls[0][1]).toBeDefined(); - - // saveToGoogleSheets receives the assembled result + env + IDs + logger - const [savedResult, savedEnv, savedRunId, savedUploadId] = - vi.mocked(saveToGoogleSheets).mock.calls[0]; - expect(savedResult.project).toBe('mealdrop'); - expect(savedEnv.evalBranch).toBe('test-branch'); - expect(savedRunId).toBe('run-1'); - expect(savedUploadId).toBe('upload-1'); }); it('writes summary.json and prompt.md to results dir', async () => { setupMocks(); - await runTask(baseConfig, 'run-1', 'upload-1'); + await runTask(baseConfig); const resultsDir = join(TMP, 'results'); @@ -193,7 +184,7 @@ describe('runTask pipeline', () => { it('propagates failed build into result', async () => { setupMocks({ buildSuccess: false, typeCheckErrors: 5 }); - const result = await runTask(baseConfig, 'run-1', 'upload-1'); + const result = await runTask(baseConfig); expect(result.grading.buildSuccess).toBe(false); expect(result.quality.score).toBe(0.3); }); @@ -232,12 +223,8 @@ describe('runTask pipeline', () => { }; }); - vi.mocked(saveToGoogleSheets).mockImplementation(async () => { - callOrder.push('save'); - }); - - await 
runTask(baseConfig, 'run-1', 'upload-1'); + await runTask(baseConfig); - expect(callOrder).toEqual(['prepare', 'agent', 'grade', 'save']); + expect(callOrder).toEqual(['prepare', 'agent', 'grade']); }); }); diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index b3c7982d5bf0..bb9532e45915 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -1,11 +1,11 @@ -import { writeFileSync } from "node:fs"; +import { writeFile } from "node:fs/promises"; import { join } from "node:path"; import type { AgentName, Logger, TrialConfig, TrialResult, Agent } from "../types.ts"; import { claudeAgent } from "./agents/claude-code.ts"; import { codexAgent } from "./agents/codex.ts"; import { prepareTrial } from "./prepare-trial.ts"; import { grade } from "./grade.ts"; -import { captureEnvironment, saveToGoogleSheets } from "./save.ts"; +import { captureEnvironment } from "./save.ts"; import { generateTrialId, generatePrompt, createLogger } from "./utils.ts"; const agents: Record = { @@ -18,8 +18,6 @@ const agents: Record = { */ export async function runTask( config: TrialConfig, - runId: string, - uploadId: string, logger?: Logger, ): Promise { const { project, agent: agentName, model, effort, prompt: promptName } = config; @@ -33,7 +31,7 @@ export async function runTask( const paths = await prepareTrial(project, trialId, log); // 2. Capture environment - const environment = await captureEnvironment(paths.resultsDir); + await captureEnvironment(paths.resultsDir); // 3. Generate the prompt (with project-specific template variables) const prompt = generatePrompt(promptName, { @@ -41,7 +39,7 @@ export async function runTask( description: project.description ?? "", projectDir: project.projectDir ?? ".", }); - writeFileSync(join(paths.resultsDir, "prompt.md"), prompt); + await writeFile(join(paths.resultsDir, "prompt.md"), prompt); // 4. 
Execute the agent log.log(` Running ${agentName} (${model}, effort=${effort})...`); @@ -76,11 +74,8 @@ export async function runTask( quality, }; - writeFileSync(join(paths.resultsDir, "summary.json"), JSON.stringify(result, null, 2)); + await writeFile(join(paths.resultsDir, "summary.json"), JSON.stringify(result, null, 2)); log.logSuccess(`Results saved to ${paths.resultsDir}`); - // 7. Upload to Google Sheets - await saveToGoogleSheets(result, environment, runId, uploadId, log); - return result; } diff --git a/scripts/eval/lib/save.ts b/scripts/eval/lib/save.ts index eb65750f3ffe..363ac29abe00 100644 --- a/scripts/eval/lib/save.ts +++ b/scripts/eval/lib/save.ts @@ -1,9 +1,6 @@ -import { writeFileSync } from "node:fs"; +import { writeFile } from "node:fs/promises"; import { join } from "node:path"; -import type { TrialResult, Logger } from "../types.ts"; -import { exec } from "./utils.ts"; - -const GOOGLE_SHEETS_URL = process.env.EVAL_GOOGLE_SHEETS_URL; +import { x } from "tinyexec"; export interface Environment { nodeVersion: string; @@ -17,72 +14,12 @@ export async function captureEnvironment(resultsDir: string): Promise { - if (!GOOGLE_SHEETS_URL) { - logger.logStep("Skipping Google Sheets (set EVAL_GOOGLE_SHEETS_URL to enable)"); - return; - } - logger.logStep("Uploading to Google Sheets..."); - - const ghost = result.grading.ghostStories; - const data = { - uploadId, - runId, - timestamp: result.timestamp, - project: result.project, - agent: result.agent, - model: result.model, - effort: result.effort, - prompt: result.prompt, - buildSuccess: result.grading.buildSuccess, - typeCheckErrors: result.grading.typeCheckErrors, - ghostStoriesPassed: ghost?.passed ?? null, - ghostStoriesTotal: ghost?.total ?? null, - ghostStoriesRate: ghost?.successRate ?? 
null, - setupPatterns: result.grading.setupPatterns.map((p) => p.id).join(", "), - changedFiles: result.grading.changedFiles.length, - storybookFiles: result.grading.storybookFiles.length, - qualityScore: result.quality.score, - cost: result.execution.cost ?? "unknown", - duration: result.execution.duration, - turns: result.execution.turns, - evalBranch: env.evalBranch, - evalCommit: env.evalCommit, - }; - - try { - const response = await fetch(GOOGLE_SHEETS_URL, { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify(data), - redirect: "manual", - }); - const contentType = response.headers.get("content-type"); - if (contentType?.includes("application/json")) { - const body = (await response.json()) as { success: boolean; error?: string }; - if (!body.success) { - logger.logError(`Google Sheets error: ${body.error}`); - return; - } - } - logger.logSuccess("Uploaded to Google Sheets"); - } catch (error) { - logger.logError(`Google Sheets upload failed: ${error instanceof Error ? 
error.message : error}`); - } -} diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index a12e59099e5b..6bb36e8dd04a 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -1,7 +1,6 @@ import { readFileSync, existsSync, readdirSync } from "node:fs"; import { resolve, basename } from "node:path"; import pc from "picocolors"; -import { x } from "tinyexec"; import type { Logger } from "../types.ts"; export const REPO_ROOT = resolve(import.meta.dirname, "..", "..", ".."); @@ -59,34 +58,3 @@ export function listPrompts(): string[] { .map((f) => basename(f, ".md")); } -// --- Exec --- - -export interface ExecResult { - stdout: string; - stderr: string; - exitCode: number | null; -} - -export async function exec( - command: string, - args: string[], - options: { - cwd?: string; - env?: Record; - timeout?: number; - throwOnError?: boolean; - } = {}, -): Promise { - const { cwd, env, timeout, throwOnError = true } = options; - - const result = await x(command, args, { - throwOnError: false, - timeout, - nodeOptions: { cwd, env: env as NodeJS.ProcessEnv }, - }); - - if (throwOnError && result.exitCode !== 0) { - throw new Error(`Command failed: ${command} ${args.join(" ")}\n${result.stderr}`); - } - return { stdout: result.stdout, stderr: result.stderr, exitCode: result.exitCode }; -} diff --git a/scripts/eval/prepare-repos.ts b/scripts/eval/prepare-repos.ts index df0b488eb6e1..17b959f6f6fa 100644 --- a/scripts/eval/prepare-repos.ts +++ b/scripts/eval/prepare-repos.ts @@ -16,7 +16,7 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync, rmSync, readdirSync } from "node:fs"; import { join } from "node:path"; import pc from "picocolors"; -import { exec } from "./lib/utils.ts"; +import { x } from "tinyexec"; const EVAL_ROOT = join(import.meta.dirname, "..", "..", "..", "..", "storybook-eval"); const PREP_DIR = join(EVAL_ROOT, "prepared-repos"); @@ -66,7 +66,7 @@ const GIT_ENV = { }; async function run(cmd: string, args: string[], 
opts: { cwd?: string; env?: Record; timeout?: number } = {}) { - return exec(cmd, args, { cwd: opts.cwd, env: opts.env, timeout: opts.timeout }); + return x(cmd, args, { timeout: opts.timeout, nodeOptions: { cwd: opts.cwd, env: opts.env as NodeJS.ProcessEnv } }); } function stripStorybookDeps(pkgPath: string) { diff --git a/scripts/eval/types.test.ts b/scripts/eval/types.test.ts index 50c1fb670b78..9550f3e9527e 100644 --- a/scripts/eval/types.test.ts +++ b/scripts/eval/types.test.ts @@ -20,6 +20,18 @@ describe('AGENTS', () => { } }); + it('each agent has a non-empty efforts list', () => { + for (const config of Object.values(AGENTS)) { + expect(config.efforts.length).toBeGreaterThan(0); + } + }); + + it('each agent defaultEffort is in its efforts list', () => { + for (const config of Object.values(AGENTS)) { + expect(config.efforts).toContain(config.defaultEffort); + } + }); + it('no model is shared between agents', () => { const allModels = Object.values(AGENTS).flatMap((a) => a.models); expect(new Set(allModels).size).toBe(allModels.length); diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index 7065f5b78115..f196abf3cb25 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -18,7 +18,6 @@ export interface Logger { // --- Agent, Model, Effort --- export type AgentName = "claude" | "codex"; -export type Effort = "low" | "medium" | "high" | "max"; // --- Projects --- @@ -36,7 +35,7 @@ export interface TrialConfig { project: Project; agent: AgentName; model: string; - effort: Effort; + effort: string; prompt: string; verbose?: boolean; } @@ -141,7 +140,7 @@ export interface Agent { prompt: string; projectPath: string; model: string; - effort: Effort; + effort: string; resultsDir: string; logger: Logger; }): Promise; From da1f96e3806a300274c3d5ccad4176397660f91e Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 00:43:24 +0700 Subject: [PATCH 34/63] Refactor eval system: Zod schemas, unified CLI, shared utilities - Replace plain 
interfaces with Zod schemas for runtime validation (types.ts) - Merge eval.ts + eval-parallel.ts into a single CLI with comma-separated args - Fix deep core imports to use barrel export (core-server/index.ts) - Extract shared package-manager detection and install (lib/package-manager.ts) - Move pricing tables and model ID mappings into config.ts - Make setup-patterns.ts fully async with fs/promises - Add formatTable utility with ANSI-aware column alignment - Integrate prepare-repos.ts with shared logger and PM utilities --- scripts/eval/config.ts | 58 +++++- scripts/eval/eval-parallel.ts | 123 ----------- scripts/eval/eval.ts | 189 ++++++++++++----- scripts/eval/lib/agents/claude-code.ts | 11 +- scripts/eval/lib/agents/codex.ts | 24 +-- scripts/eval/lib/grade.ts | 5 +- scripts/eval/lib/grading-pipeline.test.ts | 12 +- scripts/eval/lib/package-manager.ts | 47 +++++ scripts/eval/lib/prepare-trial.ts | 15 +- scripts/eval/lib/pricing.ts | 27 +++ scripts/eval/lib/setup-patterns.test.ts | 64 +++--- scripts/eval/lib/setup-patterns.ts | 30 ++- scripts/eval/lib/utils.test.ts | 49 ++++- scripts/eval/lib/utils.ts | 25 ++- scripts/eval/prepare-repos.ts | 65 +++--- scripts/eval/types.test.ts | 55 ++--- scripts/eval/types.ts | 243 ++++++++++++---------- 17 files changed, 576 insertions(+), 466 deletions(-) delete mode 100644 scripts/eval/eval-parallel.ts create mode 100644 scripts/eval/lib/package-manager.ts create mode 100644 scripts/eval/lib/pricing.ts diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index 7a0e50eb0ba3..9206d9c31012 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -1,31 +1,67 @@ /** * Runtime configuration for the Storybook eval system. * - * Types live in types.ts — this file holds the concrete values. + * Types live in types.ts — this file holds the concrete agent configs, + * model mappings, pricing, and benchmark project definitions. + * + * Agent configs are validated with Zod at import time — invalid config + * (e.g. 
defaultModel not in models list) throws immediately. */ +import { z } from "zod"; import type { AgentName, Project } from "./types.ts"; -export interface AgentConfig { - models: string[]; - defaultModel: string; - efforts: string[]; - defaultEffort: string; -} +// --- Pricing --- + +export const Pricing = z.object({ + input: z.number(), + cachedInput: z.number(), + output: z.number(), +}); +export type Pricing = z.infer; + +// --- Agent Config --- + +export const AgentConfig = z + .object({ + models: z.array(z.string()).min(1), + defaultModel: z.string(), + /** Map friendly model names to SDK-specific model IDs (e.g. "sonnet-4.6" → "claude-sonnet-4-6"). */ + sdkModelIds: z.record(z.string(), z.string()).default({}), + /** Per-million-token pricing for manual cost estimation (agents that don't report cost natively). */ + pricing: z.record(z.string(), Pricing).default({}), + efforts: z.array(z.string()).min(1), + defaultEffort: z.string(), + }) + .refine((cfg) => cfg.models.includes(cfg.defaultModel), { + message: "defaultModel must be in models list", + }) + .refine((cfg) => cfg.efforts.includes(cfg.defaultEffort), { + message: "defaultEffort must be in efforts list", + }); +export type AgentConfig = z.infer; export const AGENTS: Record = { - claude: { + claude: AgentConfig.parse({ models: ["sonnet-4.6", "opus-4.6", "haiku-4.5"], defaultModel: "sonnet-4.6", + sdkModelIds: { + "sonnet-4.6": "claude-sonnet-4-6", + "opus-4.6": "claude-opus-4-6", + "haiku-4.5": "claude-haiku-4-5", + }, efforts: ["low", "medium", "high", "max"], defaultEffort: "high", - }, - codex: { + }), + codex: AgentConfig.parse({ models: ["gpt-5.4"], defaultModel: "gpt-5.4", + pricing: { + "gpt-5.4": { input: 2.5, cachedInput: 0.625, output: 10.0 }, + }, efforts: ["low", "medium", "high", "xhigh"], defaultEffort: "high", - }, + }), }; export const PROJECTS: Project[] = [ diff --git a/scripts/eval/eval-parallel.ts b/scripts/eval/eval-parallel.ts deleted file mode 100644 index 
b79d833b05a2..000000000000 --- a/scripts/eval/eval-parallel.ts +++ /dev/null @@ -1,123 +0,0 @@ -import { randomUUID } from "node:crypto"; -import { parseArgs } from "node:util"; -import pc from "picocolors"; -import { AGENTS, PROJECTS } from "./config.ts"; -import type { AgentName, TrialConfig, TrialResult } from "./types.ts"; -import { runTask } from "./lib/run-task.ts"; -import { createLogger, formatDuration, formatCost, listPrompts } from "./lib/utils.ts"; - -const logger = createLogger(); - -const { values: opts } = parseArgs({ - options: { - project: { type: "string", short: "p" }, - agent: { type: "string", short: "a" }, - model: { type: "string", short: "m" }, - prompt: { type: "string" }, - effort: { type: "string", short: "e" }, - }, -}); - -const project = PROJECTS.find((p) => p.name === opts.project); -if (!project) { - logger.log(pc.red(`Specify a project with -p. Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); - process.exit(1); -} - -const prompts = opts.prompt ? opts.prompt.split(",") : listPrompts(); -const modelFilter = opts.model ? opts.model.split(",") : null; -const agentFilter = opts.agent ? opts.agent.split(",") : null; -const effortOverride = opts.effort; -const runId = randomUUID().slice(0, 8); - -// Build all combos: every agent x model x prompt (with optional filters) -const runs: Array<{ agent: string; model: string; prompt: string; label: string }> = []; -for (const [agent, { models }] of Object.entries(AGENTS)) { - if (agentFilter && !agentFilter.includes(agent)) continue; - for (const model of models) { - if (modelFilter && !modelFilter.includes(model)) continue; - for (const prompt of prompts) { - runs.push({ agent, model, prompt, label: `${model}+${prompt}` }); - } - } -} - -if (runs.length === 0) { - logger.log(pc.red("No matching agent/model/prompt combinations found.")); - process.exit(1); -} - -logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); -logger.log(`${runs.length} parallel runs${effortOverride ? 
` | Effort: ${effortOverride}` : ""}`); -for (const [agent, { models }] of Object.entries(AGENTS)) { - const filteredModels = models.filter((m) => runs.some((r) => r.model === m)); - if (filteredModels.length > 0) { - logger.log(` ${agent}: ${filteredModels.join(", ")}`); - } -} -logger.log(` prompts: ${[...new Set(runs.map((r) => r.prompt))].join(", ")}`); -logger.log(`Run: ${runId}\n`); - -const settled = await Promise.allSettled( - runs.map((run) => { - const config: TrialConfig = { - project, - agent: run.agent as AgentName, - model: run.model, - effort: effortOverride ?? AGENTS[run.agent as AgentName].defaultEffort, - prompt: run.prompt, - }; - return runTask(config, createLogger(run.label)); - }), -); - -const results: TrialResult[] = []; -for (const [i, s] of settled.entries()) { - if (s.status === "fulfilled") { - results.push(s.value); - } else { - logger.logError(`${runs[i].label}: ${s.reason instanceof Error ? s.reason.message : s.reason}`); - } -} - -if (results.length > 0) { - results.sort((a, b) => (b.grading.ghostStories?.successRate ?? -1) - (a.grading.ghostStories?.successRate ?? -1)); - - logger.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); - logger.log("=".repeat(130)); - logger.log( - ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Score", "Cost", "Time", "Turns"] - .map((h, i) => h.padEnd(i <= 1 ? 14 : i === 2 ? 12 : 10)) - .join(" | "), - ); - logger.log("-".repeat(130)); - - for (const r of results) { - const ghost = r.grading.ghostStories; - const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; - logger.log( - [ - r.agent.padEnd(14), - r.model.padEnd(14), - r.prompt.padEnd(12), - (r.grading.buildSuccess ? 
pc.green("PASS") : pc.red("FAIL")).padEnd(10 + 10), - ghostStr.padEnd(10), - String(r.grading.typeCheckErrors).padEnd(10), - String(r.quality.score).padEnd(10), - formatCost(r.execution.cost).padEnd(10), - formatDuration(r.execution.duration).padEnd(10), - String(r.execution.turns).padEnd(10), - ].join(" | "), - ); - } - - logger.log("-".repeat(130)); - const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); - const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r): r is number => r != null); - const avgGhost = ghostRates.length > 0 ? ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; - - logger.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); - logger.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); -} - -logger.log("\nDone."); diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index b49bf2f33c4a..4095a42676e0 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -1,31 +1,32 @@ /** - * Eval harness entry point — single trial run. + * Eval harness entry point — single or parallel trial runs. * * Runs with `node ./eval/eval.ts` (no jiti). Node 22+ supports .ts natively - * via type stripping, so no loader or transpiler is needed. The .npmrc in the - * monorepo root enables this. New scripts should follow this pattern instead - * of using jiti — we are migrating away from jiti toward native Node TS. + * via type stripping. Import specifiers use explicit .ts extensions. * - * Import specifiers use explicit .ts extensions, which is required by Node's - * native TS support and enabled by `allowImportingTsExtensions` in tsconfig. 
+ * Usage: + * node eval/eval.ts -p mealdrop # single run (claude, default model) + * node eval/eval.ts -p mealdrop -m gpt-5.4 # single run (agent inferred from model) + * node eval/eval.ts -p mealdrop -m sonnet-4.6,gpt-5.4 # parallel runs + * node eval/eval.ts -p mealdrop -a claude,codex # parallel runs (default model each) */ import { randomUUID } from "node:crypto"; import { parseArgs } from "node:util"; import pc from "picocolors"; -import type { TrialConfig, AgentName } from "./types.ts"; +import type { AgentName, TrialConfig, TrialResult } from "./types.ts"; import { AGENTS, PROJECTS } from "./config.ts"; import { runTask } from "./lib/run-task.ts"; -import { createLogger, formatDuration, formatCost, listPrompts } from "./lib/utils.ts"; +import { createLogger, formatDuration, formatCost, formatTable, listPrompts } from "./lib/utils.ts"; const logger = createLogger(); const { values: opts } = parseArgs({ options: { project: { type: "string", short: "p" }, - agent: { type: "string", short: "a", default: "claude" }, + agent: { type: "string", short: "a" }, model: { type: "string", short: "m" }, effort: { type: "string", short: "e" }, - prompt: { type: "string", default: "setup" }, + prompt: { type: "string" }, verbose: { type: "boolean", short: "v", default: false }, "list-projects": { type: "boolean", default: false }, "list-models": { type: "boolean", default: false }, @@ -33,6 +34,8 @@ const { values: opts } = parseArgs({ }, }); +// --- List commands --- + if (opts["list-projects"]) { for (const p of PROJECTS) logger.log(` ${pc.bold(p.name)} — ${p.description}`); process.exit(0); @@ -49,70 +52,152 @@ if (opts["list-prompts"]) { process.exit(0); } +// --- Validate project --- + const project = PROJECTS.find((p) => p.name === opts.project); if (!project) { logger.log(pc.red(`Specify a project with -p. 
Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); process.exit(1); } -// Infer agent from model if model is specified, otherwise use --agent flag -let agent: AgentName; -let model: string; +// --- Build configs (supports comma-separated values for parallel runs) --- + +const promptNames = opts.prompt?.split(",") ?? ["setup"]; +const allModels = Object.values(AGENTS).flatMap((cfg) => cfg.models); + +// Determine agent → model pairs +let agentModels: Array<{ agent: AgentName; model: string }>; if (opts.model) { - const match = Object.entries(AGENTS).find(([, cfg]) => cfg.models.includes(opts.model as string)); - if (!match) { - const all = Object.values(AGENTS).flatMap((cfg) => cfg.models); - logger.log(pc.red(`Unknown model: ${opts.model}. Available: ${all.join(", ")}`)); - process.exit(1); + // Models specified — infer agent per model + agentModels = opts.model.split(",").map((model) => { + const entry = Object.entries(AGENTS).find(([, cfg]) => cfg.models.includes(model)); + if (!entry) { + logger.log(pc.red(`Unknown model: ${model}. Available: ${allModels.join(", ")}`)); + process.exit(1); + } + return { agent: entry[0] as AgentName, model }; + }); + // If --agent is also specified, filter to matching agents + if (opts.agent) { + const filter = opts.agent.split(","); + agentModels = agentModels.filter((am) => filter.includes(am.agent)); } - agent = match[0] as AgentName; - model = opts.model as string; +} else if (opts.agent) { + // Agents specified — use default model per agent + agentModels = opts.agent.split(",").map((name) => { + const cfg = AGENTS[name as AgentName]; + if (!cfg) { + logger.log(pc.red(`Unknown agent: ${name}. Options: ${Object.keys(AGENTS).join(", ")}`)); + process.exit(1); + } + return { agent: name as AgentName, model: cfg.defaultModel }; + }); } else { - agent = opts.agent as AgentName; - const agentConfig = AGENTS[agent]; - if (!agentConfig) { - logger.log(pc.red(`Unknown agent: ${agent}. 
Options: ${Object.keys(AGENTS).join(", ")}`)); + // Default: single claude run + agentModels = [{ agent: "claude", model: AGENTS.claude.defaultModel }]; +} + +// Expand to full configs: agent×model × prompt +const configs = agentModels.flatMap(({ agent, model }) => { + const cfg = AGENTS[agent]; + const effort = opts.effort ?? cfg.defaultEffort; + if (!cfg.efforts.includes(effort)) { + logger.log(pc.red(`Unknown effort "${effort}" for ${agent}. Available: ${cfg.efforts.join(", ")}`)); process.exit(1); } - model = agentConfig.defaultModel; -} + return promptNames.map((prompt) => ({ + config: { project, agent, model, effort, prompt, verbose: opts.verbose } as TrialConfig, + label: `${model}+${prompt}`, + })); +}); -const agentCfg = AGENTS[agent]; -const effort = opts.effort ?? agentCfg.defaultEffort; -if (!agentCfg.efforts.includes(effort)) { - logger.log(pc.red(`Unknown effort "${effort}" for ${agent}. Available: ${agentCfg.efforts.join(", ")}`)); +if (configs.length === 0) { + logger.log(pc.red("No matching agent/model/prompt combinations found.")); process.exit(1); } -const runId = randomUUID().slice(0, 8); -const config: TrialConfig = { - project, - agent, - model, - effort, - prompt: opts.prompt as string, - verbose: opts.verbose, -}; +// --- Print header --- +const runId = randomUUID().slice(0, 8); logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); -logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${config.prompt}`); +if (configs.length === 1) { + const { agent, model, effort, prompt } = configs[0].config; + logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${prompt}`); +} else { + logger.log(`${configs.length} parallel runs`); + for (const [agent, { models }] of Object.entries(AGENTS)) { + const active = models.filter((m) => configs.some((c) => c.config.model === m)); + if (active.length > 0) logger.log(` ${agent}: ${active.join(", ")}`); + } + logger.log(` prompts: ${[...new 
Set(promptNames)].join(", ")}`); +} logger.log(`Run: ${runId}\n`); -try { - const result = await runTask(config, logger); - const ghost = result.grading.ghostStories; +// --- Execute (always use allSettled — works for 1 or N runs) --- + +const settled = await Promise.allSettled( + configs.map((c) => runTask(c.config, createLogger(configs.length > 1 ? c.label : undefined))), +); + +const results: TrialResult[] = []; +for (const [i, s] of settled.entries()) { + if (s.status === "fulfilled") { + results.push(s.value); + } else { + logger.logError(`${configs[i].label}: ${s.reason instanceof Error ? s.reason.message : s.reason}`); + } +} + +if (results.length === 0) { + process.exit(1); +} + +// --- Print results --- + +if (results.length === 1) { + const r = results[0]; + const ghost = r.grading.ghostStories; const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; logger.log(pc.bold("\nResult")); - logger.log(` Build: ${result.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL")}`); + logger.log(` Build: ${r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL")}`); logger.log(` Ghost: ${ghostStr}`); - logger.log(` TS Err: ${result.grading.typeCheckErrors}`); - logger.log(` Score: ${result.quality.score}`); - logger.log(` Cost: ${formatCost(result.execution.cost)}`); - logger.log(` Time: ${formatDuration(result.execution.duration)}`); - logger.log(` Turns: ${result.execution.turns}`); -} catch (error) { - logger.log(pc.red(`\nFailed: ${error instanceof Error ? error.message : error}`)); - process.exit(1); + logger.log(` TS Err: ${r.grading.typeCheckErrors}`); + logger.log(` Score: ${r.quality.score}`); + logger.log(` Cost: ${formatCost(r.execution.cost)}`); + logger.log(` Time: ${formatDuration(r.execution.duration)}`); + logger.log(` Turns: ${r.execution.turns}`); +} else { + results.sort((a, b) => (b.grading.ghostStories?.successRate ?? -1) - (a.grading.ghostStories?.successRate ?? 
-1)); + + const headers = ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Score", "Cost", "Time", "Turns"]; + const rows = results.map((r) => { + const ghost = r.grading.ghostStories; + const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; + return [ + r.agent, + r.model, + r.prompt, + r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL"), + ghostStr, + String(r.grading.typeCheckErrors), + String(r.quality.score), + formatCost(r.execution.cost), + formatDuration(r.execution.duration), + String(r.execution.turns), + ]; + }); + + logger.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); + logger.log(formatTable(headers, rows)); + + const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); + const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r): r is number => r != null); + const avgGhost = ghostRates.length > 0 ? ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; + + logger.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); + logger.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); } + +logger.log("\nDone."); diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index f0e5bcbf498c..774d356c9797 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -2,6 +2,7 @@ import type { SDKMessage } from "@anthropic-ai/claude-agent-sdk"; import { query } from "@anthropic-ai/claude-agent-sdk"; import { writeFile } from "node:fs/promises"; import { join } from "node:path"; +import { AGENTS } from "../../config.ts"; import type { Agent, ExecutionResult, Logger } from "../../types.ts"; function logMessage(message: SDKMessage, logger: Logger) { @@ -69,13 +70,6 @@ function logMessage(message: SDKMessage, logger: Logger) { const MAX_TURNS = 50; -/** Map clean model names to Claude SDK model IDs */ -const CLAUDE_MODEL_MAP: Record = { - 
"sonnet-4.6": "claude-sonnet-4-6", - "opus-4.6": "claude-opus-4-6", - "haiku-4.5": "claude-haiku-4-5", -}; - export const claudeAgent: Agent = { name: "claude", @@ -88,6 +82,7 @@ export const claudeAgent: Agent = { logger, }): Promise { const startTime = Date.now(); + const sdkModel = AGENTS.claude.sdkModelIds[model] ?? model; let cost: number | undefined; let turns = 0; @@ -97,7 +92,7 @@ export const claudeAgent: Agent = { for await (const message of query({ prompt, options: { - model: CLAUDE_MODEL_MAP[model] ?? model, + model: sdkModel, cwd: projectPath, allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"], maxTurns: MAX_TURNS, diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index 9962b074b037..d4495a8ce471 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -2,27 +2,7 @@ import { Codex, type ModelReasoningEffort } from "@openai/codex-sdk"; import { writeFile } from "node:fs/promises"; import { join } from "node:path"; import type { Agent, ExecutionResult } from "../../types.ts"; - -/** Per-million-token pricing for Codex/OpenAI models (USD). 
*/ -const OPENAI_PRICING: Record = { - "gpt-5.4": { input: 2.50, cachedInput: 0.625, output: 10.00 }, -}; - -function estimateCost( - model: string, - inputTokens: number, - cachedInputTokens: number, - outputTokens: number, -): number | undefined { - const pricing = OPENAI_PRICING[model]; - if (!pricing) return undefined; - const freshInput = inputTokens - cachedInputTokens; - return ( - (freshInput / 1_000_000) * pricing.input + - (cachedInputTokens / 1_000_000) * pricing.cachedInput + - (outputTokens / 1_000_000) * pricing.output - ); -} +import { estimateCost } from "../pricing.ts"; export const codexAgent: Agent = { name: "codex", @@ -96,7 +76,7 @@ export const codexAgent: Agent = { } const duration = (Date.now() - startTime) / 1000; - const cost = estimateCost(model, totalInput, totalCached, totalOutput); + const cost = estimateCost("codex", model, { inputTokens: totalInput, cachedInputTokens: totalCached, outputTokens: totalOutput }); logger.logSuccess(`Done — ${turns} turns, ${Math.round(duration)}s, ${totalInput}in/${totalOutput}out tokens${cost != null ? `, $${cost.toFixed(4)}` : ""}`); await writeFile(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 41b9665ede43..d04f3af1252c 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -4,8 +4,7 @@ import type { GradingResult, GhostStoriesResult, QualityResult, QualityWeights, import { DEFAULT_QUALITY_WEIGHTS } from "../types.ts"; import { x } from "tinyexec"; import { detectSetupPatterns } from "./setup-patterns.ts"; -import { getComponentCandidates } from "../../../code/core/src/core-server/utils/ghost-stories/get-candidates.ts"; -import { runGhostStories } from "../../../code/core/src/core-server/utils/ghost-stories/run-story-tests.ts"; +import { getComponentCandidates, runGhostStories } from "../../../code/core/src/core-server/index.ts"; /** Filter changed files to only storybook-related ones. 
*/ export function filterStorybookFiles(changedFiles: ChangedFile[]): ChangedFile[] { @@ -85,7 +84,7 @@ export async function grade( logger.logSuccess(`${changedFiles.length} files changed (${storybookFiles.length} storybook-related)`); // Setup patterns - const setupPatterns = detectSetupPatterns(projectPath); + const setupPatterns = await detectSetupPatterns(projectPath); if (setupPatterns.length > 0) logger.logSuccess(`Detected patterns: ${setupPatterns.map((p) => p.label).join(", ")}`); // Storybook build + TypeScript check in parallel diff --git a/scripts/eval/lib/grading-pipeline.test.ts b/scripts/eval/lib/grading-pipeline.test.ts index 7b63a94737ae..5ffcab12170f 100644 --- a/scripts/eval/lib/grading-pipeline.test.ts +++ b/scripts/eval/lib/grading-pipeline.test.ts @@ -4,7 +4,7 @@ import { tmpdir } from 'node:os'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { getComponentCandidates } from '../../../code/core/src/core-server/utils/ghost-stories/get-candidates'; +import { getComponentCandidates } from '../../../code/core/src/core-server/index'; import { computeQualityScore, countTypeCheckErrors, @@ -84,7 +84,7 @@ describe('grading pipeline', () => { expect(candidates).toHaveLength(2); // Step 2: Detect patterns — config references CSS, theme, staticDirs - const patterns = detectSetupPatterns(TMP); + const patterns = await detectSetupPatterns(TMP); const patternIds = patterns.map((p) => p.id); expect(patternIds).toContain('global-css'); expect(patternIds).toContain('theme-provider'); @@ -128,7 +128,7 @@ describe('grading pipeline', () => { // Agent didn't create any .storybook config rmSync(join(TMP, '.storybook'), { recursive: true }); - expect(detectSetupPatterns(TMP)).toEqual([]); + expect(await detectSetupPatterns(TMP)).toEqual([]); // Simulate tsc output with errors proportional to candidate count const tscLines = candidates.map( @@ -162,7 +162,7 @@ describe('grading pipeline', () => { const candidates = await 
findCandidates(TMP); expect(candidates).toHaveLength(5); - const patterns = detectSetupPatterns(TMP); + const patterns = await detectSetupPatterns(TMP); expect(patterns.map((p) => p.id)).toContain('router-provider'); // Agent wrote one story per candidate — all storybook-related @@ -178,7 +178,7 @@ describe('grading pipeline', () => { }); describe('setup-patterns only scans .storybook/', () => { - it('does not detect patterns in component source files', () => { + it('does not detect patterns in component source files', async () => { // Router usage in a component should NOT be detected as a setup pattern writeFile( 'src/App.tsx', @@ -193,6 +193,6 @@ describe('setup-patterns only scans .storybook/', () => { // Empty .storybook config with no patterns writeFile('.storybook/main.ts', `export default { stories: ['../src/**/*.stories.tsx'] };`); - expect(detectSetupPatterns(TMP).map((p) => p.id)).not.toContain('router-provider'); + expect((await detectSetupPatterns(TMP)).map((p) => p.id)).not.toContain('router-provider'); }); }); diff --git a/scripts/eval/lib/package-manager.ts b/scripts/eval/lib/package-manager.ts new file mode 100644 index 000000000000..ab11af3a26a1 --- /dev/null +++ b/scripts/eval/lib/package-manager.ts @@ -0,0 +1,47 @@ +/** + * Shared package manager detection and dependency installation. + * + * Used by both the trial preparation (prepare-trial.ts) and the + * one-time repo preparation script (prepare-repos.ts). + */ + +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { x } from "tinyexec"; +import type { Logger } from "../types.ts"; + +/** Detect the package manager from lock files in a directory. 
*/ +export function detectPackageManager(dir: string): string { + if (existsSync(join(dir, "pnpm-lock.yaml")) || existsSync(join(dir, "pnpm-workspace.yaml"))) return "pnpm"; + if (existsSync(join(dir, "yarn.lock"))) return "yarn"; + if (existsSync(join(dir, "bun.lockb")) || existsSync(join(dir, "bun.lock"))) return "bun"; + return "npm"; +} + +function getInstallArgs(pm: string, dir: string): [string, string[]] { + switch (pm) { + case "pnpm": + return ["pnpm", ["install", "--no-frozen-lockfile"]]; + case "yarn": + return ["yarn", existsSync(join(dir, ".yarnrc.yml")) ? ["install", "--no-immutable"] : ["install"]]; + case "bun": + return ["bun", ["install"]]; + default: + return ["npm", ["install", "--ignore-scripts"]]; + } +} + +/** Install dependencies using the detected package manager. */ +export async function installDeps( + dir: string, + logger: Logger, + env?: Record, +): Promise { + const pm = detectPackageManager(dir); + const [cmd, args] = getInstallArgs(pm, dir); + logger.logStep(`Installing with ${pm}...`); + await x(cmd, args, { + timeout: 300_000, + nodeOptions: { cwd: dir, ...(env && { env: env as NodeJS.ProcessEnv }) }, + }); +} diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index 8baa88c04b31..301e06f29e0e 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -3,22 +3,9 @@ import { cp, mkdir } from "node:fs/promises"; import { join } from "node:path"; import type { Project, TrialPaths, Logger } from "../types.ts"; import { x } from "tinyexec"; +import { installDeps } from "./package-manager.ts"; import { CACHE_DIR, TRIALS_DIR } from "./utils.ts"; -async function installDeps(dir: string, logger: Logger) { - const has = (f: string) => existsSync(join(dir, f)); - const [cmd, args]: [string, string[]] = has("pnpm-lock.yaml") || has("pnpm-workspace.yaml") - ? ["pnpm", ["install", "--no-frozen-lockfile"]] - : has("yarn.lock") - ? ["yarn", has(".yarnrc.yml") ? 
["install", "--no-immutable"] : ["install"]] - : has("bun.lockb") || has("bun.lock") - ? ["bun", ["install"]] - : ["npm", ["install", "--ignore-scripts"]]; - - logger.logStep(`Installing with ${cmd}...`); - await x(cmd, args, { timeout: 300_000, nodeOptions: { cwd: dir } }); -} - /** * First run: clone eval-baseline -> install deps -> cache it. * Subsequent runs: copy from cache. Agent starts immediately. diff --git a/scripts/eval/lib/pricing.ts b/scripts/eval/lib/pricing.ts new file mode 100644 index 000000000000..16fb7995211e --- /dev/null +++ b/scripts/eval/lib/pricing.ts @@ -0,0 +1,27 @@ +/** + * Shared cost estimation from token usage. + * + * Pricing tables live in config.ts alongside agent definitions. + * This module provides the math. + */ + +import { AGENTS } from "../config.ts"; +import type { AgentName } from "../types.ts"; + +export interface TokenUsage { + inputTokens: number; + cachedInputTokens: number; + outputTokens: number; +} + +/** Estimate cost from token usage using the pricing table in config. 
*/ +export function estimateCost(agent: AgentName, model: string, usage: TokenUsage): number | undefined { + const pricing = AGENTS[agent].pricing[model]; + if (!pricing) return undefined; + const freshInput = usage.inputTokens - usage.cachedInputTokens; + return ( + (freshInput / 1_000_000) * pricing.input + + (usage.cachedInputTokens / 1_000_000) * pricing.cachedInput + + (usage.outputTokens / 1_000_000) * pricing.output + ); +} diff --git a/scripts/eval/lib/setup-patterns.test.ts b/scripts/eval/lib/setup-patterns.test.ts index 75318fd01531..47ca1d9a3dcb 100644 --- a/scripts/eval/lib/setup-patterns.test.ts +++ b/scripts/eval/lib/setup-patterns.test.ts @@ -22,73 +22,73 @@ function writeConfig(name: string, content: string) { } describe('detectSetupPatterns', () => { - it('returns empty when no .storybook dir', () => { + it('returns empty when no .storybook dir', async () => { rmSync(join(TMP, '.storybook'), { recursive: true }); - expect(detectSetupPatterns(TMP)).toEqual([]); + expect(await detectSetupPatterns(TMP)).toEqual([]); }); - it('returns empty when .storybook has no matching patterns', () => { + it('returns empty when .storybook has no matching patterns', async () => { writeConfig('main.ts', 'export default { stories: ["../src/**/*.stories.@(ts|tsx)"] };'); - expect(detectSetupPatterns(TMP)).toEqual([]); + expect(await detectSetupPatterns(TMP)).toEqual([]); }); - it('detects Tailwind CSS', () => { + it('detects Tailwind CSS', async () => { writeConfig('preview.ts', `import 'tailwindcss/tailwind.css';`); - expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('tailwind'); + expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('tailwind'); }); - it('detects global CSS imports', () => { + it('detects global CSS imports', async () => { writeConfig('preview.ts', `import '../src/styles/globals.css';`); - expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('global-css'); + expect((await detectSetupPatterns(TMP)).map((p) => 
p.id)).toContain('global-css'); }); - it('detects styled-components', () => { + it('detects styled-components', async () => { writeConfig('preview.tsx', `import { createGlobalStyle } from 'styled-components';`); - expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('styled-components'); + expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('styled-components'); }); - it('detects React Router', () => { + it('detects React Router', async () => { writeConfig('preview.tsx', `import { MemoryRouter } from 'react-router-dom';`); - expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('router-provider'); + expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('router-provider'); }); - it('detects Redux provider', () => { + it('detects Redux provider', async () => { writeConfig( 'preview.tsx', `import { Provider } from 'react-redux';\n` ); - expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('redux-provider'); + expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('redux-provider'); }); - it('detects Zustand', () => { + it('detects Zustand', async () => { writeConfig('preview.ts', `import { create } from 'zustand';`); - expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('zustand'); + expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('zustand'); }); - it('detects GraphQL/Apollo', () => { + it('detects GraphQL/Apollo', async () => { writeConfig('preview.tsx', `import { MockedProvider } from '@apollo/client/testing';`); - expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('graphql'); + expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('graphql'); }); - it('detects theme providers', () => { + it('detects theme providers', async () => { writeConfig('preview.tsx', `import { ThemeProvider } from '@emotion/react';`); - expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('theme-provider'); + expect((await detectSetupPatterns(TMP)).map((p) => 
p.id)).toContain('theme-provider'); }); - it('detects staticDirs', () => { + it('detects staticDirs', async () => { writeConfig('main.ts', `export default { staticDirs: ['../public'] };`); - expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('static-dirs'); + expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('static-dirs'); }); - it('detects vite alias config', () => { + it('detects vite alias config', async () => { writeConfig( 'main.ts', `export default { viteFinal: (config) => ({ ...config, resolve: { alias: { '@': './src' } } }) };` ); - expect(detectSetupPatterns(TMP).map((p) => p.id)).toContain('vite-alias'); + expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('vite-alias'); }); - it('detects multiple patterns in the same file', () => { + it('detects multiple patterns in the same file', async () => { writeConfig( 'preview.tsx', [ @@ -97,24 +97,24 @@ describe('detectSetupPatterns', () => { `import { ThemeProvider } from '@emotion/react';`, ].join('\n') ); - const ids = detectSetupPatterns(TMP).map((p) => p.id); + const ids = (await detectSetupPatterns(TMP)).map((p) => p.id); expect(ids).toContain('global-css'); expect(ids).toContain('router-provider'); expect(ids).toContain('theme-provider'); }); - it('includes sourceFiles relative to project path', () => { + it('includes sourceFiles relative to project path', async () => { writeConfig('preview.ts', `import 'tailwindcss';`); - const tailwind = detectSetupPatterns(TMP).find((p) => p.id === 'tailwind'); + const tailwind = (await detectSetupPatterns(TMP)).find((p) => p.id === 'tailwind'); expect(tailwind?.sourceFiles).toEqual(['.storybook/preview.ts']); }); - it('does not false-positive on unrelated React hooks', () => { + it('does not false-positive on unrelated React hooks', async () => { writeConfig('preview.ts', `import { useState, useEffect } from 'react';`); - expect(detectSetupPatterns(TMP)).toEqual([]); + expect(await detectSetupPatterns(TMP)).toEqual([]); }); 
- it('does not detect patterns in files outside .storybook/', () => { + it('does not detect patterns in files outside .storybook/', async () => { // Write a router import in a source file, not in .storybook/ mkdirSync(join(TMP, 'src'), { recursive: true }); writeFileSync( @@ -124,6 +124,6 @@ describe('detectSetupPatterns', () => { // .storybook/ has no patterns writeConfig('main.ts', `export default { stories: ['../src/**/*.stories.tsx'] };`); - expect(detectSetupPatterns(TMP).map((p) => p.id)).not.toContain('router-provider'); + expect((await detectSetupPatterns(TMP)).map((p) => p.id)).not.toContain('router-provider'); }); }); diff --git a/scripts/eval/lib/setup-patterns.ts b/scripts/eval/lib/setup-patterns.ts index 64eac26bac2e..f6ade6a64a8e 100644 --- a/scripts/eval/lib/setup-patterns.ts +++ b/scripts/eval/lib/setup-patterns.ts @@ -1,4 +1,5 @@ -import { readFileSync, existsSync, globSync } from "node:fs"; +import { readFile, readdir } from "node:fs/promises"; +import { existsSync } from "node:fs"; import { join, relative } from "node:path"; import type { SetupPattern } from "../types.ts"; @@ -16,23 +17,30 @@ const RULES = [ ]; /** Scan .storybook/ config files for known setup patterns. 
*/ -export function detectSetupPatterns(projectPath: string): SetupPattern[] { +export async function detectSetupPatterns(projectPath: string): Promise { const dir = join(projectPath, ".storybook"); if (!existsSync(dir)) return []; - const files = globSync("**/*", { cwd: dir }).map((f) => join(dir, f)); - const results: SetupPattern[] = []; - - for (const { id, label, pattern } of RULES) { - const matches = files.filter((f) => { + // Read all entries recursively, then attempt to read each as a file + const entries = await readdir(dir, { recursive: true }); + const fileContents = await Promise.all( + entries.map(async (entry) => { + const fullPath = join(dir, entry); try { - return pattern.test(readFileSync(f, "utf-8")); + return { path: fullPath, content: await readFile(fullPath, "utf-8") }; } catch { - return false; + return null; // directories or unreadable files } - }); + }), + ); + + const files = fileContents.filter((f): f is { path: string; content: string } => f !== null); + + const results: SetupPattern[] = []; + for (const { id, label, pattern } of RULES) { + const matches = files.filter((f) => pattern.test(f.content)); if (matches.length > 0) { - results.push({ id, label, sourceFiles: matches.map((f) => relative(projectPath, f)) }); + results.push({ id, label, sourceFiles: matches.map((f) => relative(projectPath, f.path)) }); } } diff --git a/scripts/eval/lib/utils.test.ts b/scripts/eval/lib/utils.test.ts index 32abdf4cb9b7..7d18d74625f2 100644 --- a/scripts/eval/lib/utils.test.ts +++ b/scripts/eval/lib/utils.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { formatDuration, formatCost, generateTrialId, generatePrompt, listPrompts } from './utils'; +import { formatDuration, formatCost, generateTrialId, generatePrompt, listPrompts, formatTable } from './utils'; describe('formatDuration', () => { it('formats seconds under a minute', () => { @@ -98,3 +98,50 @@ describe('generatePrompt', () => { 
expect(prompt).toBe(prompt.trim()); }); }); + +describe('formatTable', () => { + it('formats a simple table with aligned columns', () => { + const result = formatTable( + ['Name', 'Score'], + [['Alice', '100'], ['Bob', '95']], + ); + const lines = result.split('\n'); + expect(lines).toHaveLength(4); // header + divider + 2 rows + expect(lines[0]).toContain('Name'); + expect(lines[0]).toContain('Score'); + expect(lines[1]).toMatch(/^-+\+-+$/); + expect(lines[2]).toContain('Alice'); + expect(lines[3]).toContain('Bob'); + }); + + it('auto-sizes columns to fit content', () => { + const result = formatTable( + ['X', 'Y'], + [['short', 'a-much-longer-value']], + ); + const lines = result.split('\n'); + // Header column for Y should be padded to match the data width + const headerCols = lines[0].split(' | '); + const dataCols = lines[2].split(' | '); + expect(headerCols[1].trim().length).toBeLessThanOrEqual(dataCols[1].trim().length); + }); + + it('handles ANSI escape codes in cells', () => { + const green = '\x1b[32mPASS\x1b[39m'; + const result = formatTable( + ['Status'], + [[green], ['FAIL']], + ); + const lines = result.split('\n'); + // Both rows should be the same visible width + // The ANSI row has extra invisible chars but should still align + expect(lines[2]).toContain('PASS'); + expect(lines[3]).toContain('FAIL'); + }); + + it('handles empty rows', () => { + const result = formatTable(['A', 'B'], []); + const lines = result.split('\n'); + expect(lines).toHaveLength(2); // header + divider only + }); +}); diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index 6bb36e8dd04a..019a7765906e 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -33,6 +33,30 @@ export function generateTrialId(project: string, agent: string, model: string, p return `${ts}-${project}-${agent}-${model}-${prompt}-${crypto.randomUUID().slice(0, 8)}`; } +// --- Table formatting --- + +/** Strip ANSI escape codes for accurate width calculation. 
*/ +const stripAnsi = (str: string) => str.replace(/\x1b\[[0-9;]*m/g, ""); + +/** Format data as an aligned table with automatic column widths. */ +export function formatTable(headers: string[], rows: string[][]): string { + const widths = headers.map((h, i) => + Math.max(h.length, ...rows.map((r) => stripAnsi(r[i] ?? "").length)), + ); + + const pad = (str: string, width: number) => { + const visible = stripAnsi(str).length; + return str + " ".repeat(Math.max(0, width - visible)); + }; + + const sep = " | "; + return [ + headers.map((h, i) => pad(h, widths[i])).join(sep), + widths.map((w) => "-".repeat(w)).join("-+-"), + ...rows.map((row) => row.map((cell, i) => pad(cell, widths[i])).join(sep)), + ].join("\n"); +} + // --- Prompts --- /** Load a prompt by name from prompts/{name}.md, with optional template variables. */ @@ -57,4 +81,3 @@ export function listPrompts(): string[] { .filter((f) => f.endsWith(".md")) .map((f) => basename(f, ".md")); } - diff --git a/scripts/eval/prepare-repos.ts b/scripts/eval/prepare-repos.ts index 17b959f6f6fa..dc0913b4b0f5 100644 --- a/scripts/eval/prepare-repos.ts +++ b/scripts/eval/prepare-repos.ts @@ -10,13 +10,21 @@ * After this, each eval trial just does a fast shallow clone of the * prepared branch — no more storybook init during trials. * - * Usage: npx jiti scripts/eval/prepare-repos.ts + * Usage: node scripts/eval/prepare-repos.ts + * + * NOTE: The REPOS list below contains the *original* upstream repos + * (e.g. "yannbf/mealdrop"), which is distinct from the *fork* URLs in + * config.ts PROJECTS (e.g. "kasperpeulen/mealdrop"). This script forks + * and pushes eval-baseline branches to those forks. 
*/ import { existsSync, mkdirSync, readFileSync, writeFileSync, rmSync, readdirSync } from "node:fs"; import { join } from "node:path"; -import pc from "picocolors"; import { x } from "tinyexec"; +import { createLogger } from "./lib/utils.ts"; +import { installDeps } from "./lib/package-manager.ts"; + +const logger = createLogger(); const EVAL_ROOT = join(import.meta.dirname, "..", "..", "..", "..", "storybook-eval"); const PREP_DIR = join(EVAL_ROOT, "prepared-repos"); @@ -118,43 +126,26 @@ function isStarterDirectory(dir: string): boolean { } } -function detectPM(dir: string): string { - if (existsSync(join(dir, 'pnpm-lock.yaml'))) return 'pnpm'; - if (existsSync(join(dir, 'yarn.lock'))) return 'yarn'; - if (existsSync(join(dir, 'bun.lockb')) || existsSync(join(dir, 'bun.lock'))) return 'bun'; - return 'npm'; -} - -async function installDeps(dir: string) { - const env = cleanNpmEnv(); - const pm = detectPM(dir); - console.log(` > Installing with ${pm}...`); - const args = pm === 'pnpm' ? ['install', '--no-frozen-lockfile'] - : pm === 'yarn' && existsSync(join(dir, '.yarnrc.yml')) ? ['install', '--no-immutable'] - : ['install']; - await run(pm, args, { cwd: dir, env, timeout: 300_000 }); -} - async function prepareRepo(repo: BenchmarkRepo) { - console.log(pc.bold(`\n=== ${repo.name} ===`)); + logger.log(`\n=== ${repo.name} ===`); const repoDir = join(PREP_DIR, repo.name); // 1. Fork (idempotent — gh fork is a no-op if already forked) - console.log(` > Forking ${repo.repo}...`); + logger.logStep(`Forking ${repo.repo}...`); try { await run('gh', ['repo', 'fork', repo.repo, '--clone=false']); } catch { - console.log(` ! Fork may already exist, continuing...`); + logger.log(` ! 
Fork may already exist, continuing...`); } // Figure out the fork name (gh forks to authenticated user) const whoami = (await run('gh', ['api', 'user', '--jq', '.login'])).stdout.trim(); const forkSlug = `${whoami}/${repo.repo.split('/')[1]}`; - console.log(` > Fork: ${forkSlug}`); + logger.logStep(`Fork: ${forkSlug}`); // 2. Clone (or pull) the fork if (existsSync(repoDir)) { - console.log(` > Updating existing clone...`); + logger.logStep(`Updating existing clone...`); await run('git', ['fetch', 'origin'], { cwd: repoDir }); const branch = repo.branch || (await run('git', ['remote', 'show', 'origin'], { cwd: repoDir })) .stdout.match(/HEAD branch:\s*(\S+)/)?.[1] || 'main'; @@ -162,14 +153,14 @@ async function prepareRepo(repo: BenchmarkRepo) { await run('git', ['reset', '--hard', `origin/${branch}`], { cwd: repoDir }); await run('git', ['clean', '-fdx', '-e', 'node_modules'], { cwd: repoDir }); } else { - console.log(` > Cloning ${forkSlug}...`); + logger.logStep(`Cloning ${forkSlug}...`); const cloneArgs = ['clone', `https://github.com/${forkSlug}.git`, repoDir]; if (repo.branch) cloneArgs.splice(1, 0, '--branch', repo.branch); await run('git', cloneArgs, { timeout: 120_000 }); } // 3. Create eval-baseline branch - console.log(` > Creating ${BASELINE_BRANCH} branch...`); + logger.logStep(`Creating ${BASELINE_BRANCH} branch...`); await run('git', ['checkout', '-B', BASELINE_BRANCH], { cwd: repoDir }); // 4. Clean storybook files @@ -177,10 +168,10 @@ async function prepareRepo(repo: BenchmarkRepo) { cleanStorybookFiles(projectDir); // 5. Install dependencies - await installDeps(projectDir); + await installDeps(projectDir, logger, cleanNpmEnv()); // 6. Run storybook init - console.log(` > Running storybook init...`); + logger.logStep(`Running storybook init...`); const env = cleanNpmEnv(); await run('npx', ['storybook@latest', 'init', '--yes', '--no-dev'], { cwd: projectDir, @@ -189,10 +180,10 @@ async function prepareRepo(repo: BenchmarkRepo) { }); // 7. 
Post-init install - await installDeps(projectDir); + await installDeps(projectDir, logger, cleanNpmEnv()); // 8. Commit everything - console.log(` > Committing baseline...`); + logger.logStep(`Committing baseline...`); await run('git', ['add', '-A'], { cwd: repoDir, env: { ...cleanNpmEnv(), ...GIT_ENV } }); await run('git', ['commit', '-m', 'eval baseline after storybook init', '--allow-empty'], { cwd: repoDir, @@ -200,18 +191,18 @@ async function prepareRepo(repo: BenchmarkRepo) { }); // 9. Force-push the baseline branch - console.log(` > Pushing ${BASELINE_BRANCH}...`); + logger.logStep(`Pushing ${BASELINE_BRANCH}...`); await run('git', ['push', '-f', 'origin', BASELINE_BRANCH], { cwd: repoDir }); - console.log(pc.green(` ✓ ${repo.name} ready at ${forkSlug}#${BASELINE_BRANCH}`)); + logger.logSuccess(`${repo.name} ready at ${forkSlug}#${BASELINE_BRANCH}`); return { name: repo.name, forkRepo: `https://github.com/${forkSlug}`, branch: BASELINE_BRANCH, projectDir: repo.projectDir }; } // --- Main --- mkdirSync(PREP_DIR, { recursive: true }); -console.log(pc.bold('Preparing eval baseline repos')); -console.log(`Output: ${PREP_DIR}\n`); +logger.log(`Preparing eval baseline repos`); +logger.log(`Output: ${PREP_DIR}\n`); const results = []; for (const repo of REPOS) { @@ -219,11 +210,11 @@ for (const repo of REPOS) { const result = await prepareRepo(repo); results.push(result); } catch (error) { - console.log(pc.red(` ✗ Failed: ${error instanceof Error ? error.message : error}`)); + logger.logError(`Failed: ${error instanceof Error ? error.message : error}`); } } -console.log(pc.bold('\n\nPrepared repos:')); +logger.log(`\n\nPrepared repos:`); for (const r of results) { - console.log(` ${r.name}: ${r.forkRepo}#${r.branch}${r.projectDir ? ` (${r.projectDir})` : ''}`); + logger.logSuccess(`${r.name}: ${r.forkRepo}#${r.branch}${r.projectDir ? 
` (${r.projectDir})` : ''}`); } diff --git a/scripts/eval/types.test.ts b/scripts/eval/types.test.ts index 9550f3e9527e..18fb167ab361 100644 --- a/scripts/eval/types.test.ts +++ b/scripts/eval/types.test.ts @@ -2,40 +2,39 @@ import { describe, expect, it } from 'vitest'; import { AGENTS, PROJECTS } from './config'; -describe('AGENTS', () => { - it('has claude and codex agents', () => { - expect(AGENTS).toHaveProperty('claude'); - expect(AGENTS).toHaveProperty('codex'); - }); +/** + * Basic shape validation (required fields, defaults, types) is handled by Zod + * schemas at import time — AgentConfig.parse() in config.ts throws on invalid + * config. These tests cover cross-cutting invariants that Zod cannot express. + */ - it('each agent has a non-empty models list', () => { - for (const config of Object.values(AGENTS)) { - expect(config.models.length).toBeGreaterThan(0); - } +describe('AGENTS', () => { + it('validates against schema (the import itself proves this)', () => { + // AgentConfig.parse() runs at import time. If this test file loads, + // the config is valid (models non-empty, defaultModel in list, etc.). 
+ expect(Object.keys(AGENTS)).toEqual(['claude', 'codex']); }); - it('each agent defaultModel is in its models list', () => { - for (const config of Object.values(AGENTS)) { - expect(config.models).toContain(config.defaultModel); - } + it('no model is shared between agents', () => { + const allModels = Object.values(AGENTS).flatMap((a) => a.models); + expect(new Set(allModels).size).toBe(allModels.length); }); - it('each agent has a non-empty efforts list', () => { - for (const config of Object.values(AGENTS)) { - expect(config.efforts.length).toBeGreaterThan(0); + it('sdkModelIds only reference known models', () => { + for (const [, cfg] of Object.entries(AGENTS)) { + for (const model of Object.keys(cfg.sdkModelIds)) { + expect(cfg.models).toContain(model); + } } }); - it('each agent defaultEffort is in its efforts list', () => { - for (const config of Object.values(AGENTS)) { - expect(config.efforts).toContain(config.defaultEffort); + it('pricing only references known models', () => { + for (const [, cfg] of Object.entries(AGENTS)) { + for (const model of Object.keys(cfg.pricing)) { + expect(cfg.models).toContain(model); + } } }); - - it('no model is shared between agents', () => { - const allModels = Object.values(AGENTS).flatMap((a) => a.models); - expect(new Set(allModels).size).toBe(allModels.length); - }); }); describe('PROJECTS', () => { @@ -43,14 +42,6 @@ describe('PROJECTS', () => { expect(PROJECTS.length).toBeGreaterThan(0); }); - it('each project has name, repo URL, and branch', () => { - for (const project of PROJECTS) { - expect(project.name).toBeTruthy(); - expect(project.repo).toMatch(/^https:\/\/github\.com\//); - expect(project.branch).toBeTruthy(); - } - }); - it('project names are unique', () => { const names = PROJECTS.map((p) => p.name); expect(new Set(names).size).toBe(names.length); diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index f196abf3cb25..bb4038f19808 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ 
-1,12 +1,13 @@ /** * Core types for the Storybook setup eval system. * - * Four independent axes: agent x model x effort x prompt - * - * Runtime configuration (AGENTS, PROJECTS) lives in config.ts. + * Data types use Zod schemas for runtime validation. + * Behavioral interfaces (Logger, Agent) stay as plain TypeScript. */ -// --- Logger --- +import { z } from "zod"; + +// --- Logger (behavioral interface — not validated at runtime) --- export interface Logger { log: (msg: string) => void; @@ -15,124 +16,140 @@ export interface Logger { logError: (msg: string) => void; } -// --- Agent, Model, Effort --- - -export type AgentName = "claude" | "codex"; - -// --- Projects --- - -export interface Project { - name: string; - repo: string; - branch?: string; - projectDir?: string; - description?: string; -} - -// --- Trial Types --- - -export interface TrialConfig { - project: Project; - agent: AgentName; - model: string; - effort: string; - prompt: string; - verbose?: boolean; -} - -export interface TrialPaths { - trialDir: string; - repoRoot: string; - projectPath: string; - resultsDir: string; - baselineCommit: string; -} - -// --- Execution Types --- - -export interface ExecutionResult { - agent: string; - model: string; - effort: string; - cost?: number; - duration: number; - durationApi?: number; - turns: number; -} +// --- Agent Name --- + +export const AgentName = z.enum(["claude", "codex"]); +export type AgentName = z.infer; + +// --- Project --- + +export const Project = z.object({ + name: z.string().min(1), + repo: z.string().url(), + branch: z.string().optional(), + projectDir: z.string().optional(), + description: z.string().optional(), +}); +export type Project = z.infer; + +// --- Trial Config --- + +export const TrialConfig = z.object({ + project: Project, + agent: AgentName, + model: z.string(), + effort: z.string(), + prompt: z.string(), + verbose: z.boolean().optional(), +}); +export type TrialConfig = z.infer; + +// --- Trial Paths --- + +export const 
TrialPaths = z.object({ + trialDir: z.string(), + repoRoot: z.string(), + projectPath: z.string(), + resultsDir: z.string(), + baselineCommit: z.string(), +}); +export type TrialPaths = z.infer; + +// --- Execution --- + +export const ExecutionResult = z.object({ + agent: z.string(), + model: z.string(), + effort: z.string(), + cost: z.number().optional(), + duration: z.number(), + durationApi: z.number().optional(), + turns: z.number(), +}); +export type ExecutionResult = z.infer; // --- Changed Files --- -export interface ChangedFile { - path: string; - status: "A" | "M" | "D" | "R"; -} +export const ChangedFile = z.object({ + path: z.string(), + status: z.enum(["A", "M", "D", "R"]), +}); +export type ChangedFile = z.infer; // --- Setup Patterns --- -export interface SetupPattern { - id: string; - label: string; - sourceFiles: string[]; -} - -// --- Grading Types --- - -export interface GradingResult { - buildSuccess: boolean; - buildError?: string; - typeCheckErrors: number; - typeCheckOutput?: string; - changedFiles: ChangedFile[]; - storybookFiles: ChangedFile[]; - setupPatterns: SetupPattern[]; - ghostStories?: GhostStoriesResult; -} - -export interface GhostStoriesResult { - candidateCount: number; - total: number; - passed: number; - successRate: number; -} +export const SetupPattern = z.object({ + id: z.string(), + label: z.string(), + sourceFiles: z.array(z.string()), +}); +export type SetupPattern = z.infer; + +// --- Ghost Stories --- + +export const GhostStoriesResult = z.object({ + candidateCount: z.number(), + total: z.number(), + passed: z.number(), + successRate: z.number(), +}); +export type GhostStoriesResult = z.infer; + +// --- Grading --- + +export const GradingResult = z.object({ + buildSuccess: z.boolean(), + buildError: z.string().optional(), + typeCheckErrors: z.number(), + typeCheckOutput: z.string().optional(), + changedFiles: z.array(ChangedFile), + storybookFiles: z.array(ChangedFile), + setupPatterns: z.array(SetupPattern), + 
ghostStories: GhostStoriesResult.optional(), +}); +export type GradingResult = z.infer; // --- Quality Score --- -export interface QualityWeights { - ghostStories: number; - build: number; - typecheck: number; - performance: number; -} - -export const DEFAULT_QUALITY_WEIGHTS: QualityWeights = { - ghostStories: 0.4, - build: 0.25, - typecheck: 0.25, - performance: 0.1, -}; - -export interface QualityResult { - score: number; - breakdown: { build: number; typecheck: number; ghostStories: number; performance: number }; -} - -// --- Final Result --- - -export interface TrialResult { - schemaVersion: 1; - project: string; - agent: string; - model: string; - effort: string; - prompt: string; - timestamp: string; - baselineCommit: string; - execution: ExecutionResult; - grading: GradingResult; - quality: QualityResult; -} - -// --- Agent Interface --- +export const QualityWeights = z.object({ + ghostStories: z.number().default(0.4), + build: z.number().default(0.25), + typecheck: z.number().default(0.25), + performance: z.number().default(0.1), +}); +export type QualityWeights = z.infer; + +export const DEFAULT_QUALITY_WEIGHTS: QualityWeights = QualityWeights.parse({}); + +export const QualityResult = z.object({ + score: z.number(), + breakdown: z.object({ + build: z.number(), + typecheck: z.number(), + ghostStories: z.number(), + performance: z.number(), + }), +}); +export type QualityResult = z.infer; + +// --- Trial Result --- + +export const TrialResult = z.object({ + schemaVersion: z.literal(1), + project: z.string(), + agent: z.string(), + model: z.string(), + effort: z.string(), + prompt: z.string(), + timestamp: z.string(), + baselineCommit: z.string(), + execution: ExecutionResult, + grading: GradingResult, + quality: QualityResult, +}); +export type TrialResult = z.infer; + +// --- Agent Interface (behavioral — not validated) --- export interface Agent { name: AgentName; From 9b6085b5dc3e00d23ee1fd854e0d446589234631 Mon Sep 17 00:00:00 2001 From: Kasper Peulen 
Date: Sun, 29 Mar 2026 00:44:13 +0700 Subject: [PATCH 35/63] Update AGENTS.md and tsconfig comments for native Node TS execution --- AGENTS.md | 3 ++- code/tsconfig.json | 1 + scripts/tsconfig.json | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 7c99c9041a9e..9854c7b6515b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -9,10 +9,11 @@ This file is the canonical instruction source for coding agents. Files like `CLA Storybook is a large TypeScript monorepo. The git root is the repo root, the main code lives in `code/`, and build tooling lives in `scripts/`. The default branch is `next`. - **Base branch**: `next` (all PRs should target `next`, not `main`) -- **Node.js**: `22.21.1` (see `.nvmrc`) +- **Node.js**: `22.22.1` (see `.nvmrc`) — supports `.ts` natively via type stripping (no loader needed) - **Package Manager**: Yarn Berry - **Task orchestration**: NX plus the custom `yarn task` runner - **CI environment**: Linux and Windows +- **TS execution**: Migrating from `jiti` to native `node` for running `.ts` files. New scripts should use `node ./path/file.ts` with explicit `.ts` import extensions (enabled by `allowImportingTsExtensions` in tsconfig). Legacy scripts still use `jiti` but should be migrated over time. 
## Repository Structure diff --git a/code/tsconfig.json b/code/tsconfig.json index 940555d1805e..870835c74b2a 100644 --- a/code/tsconfig.json +++ b/code/tsconfig.json @@ -13,6 +13,7 @@ "lib": ["dom", "dom.iterable", "esnext"], "module": "Preserve", "moduleResolution": "bundler", + // Required for explicit .ts import extensions — migrating toward native Node TS execution "allowImportingTsExtensions": true, "noImplicitAny": true, "noUnusedLocals": false, diff --git a/scripts/tsconfig.json b/scripts/tsconfig.json index a00e31d3f6d5..98673817b79d 100644 --- a/scripts/tsconfig.json +++ b/scripts/tsconfig.json @@ -11,6 +11,7 @@ "moduleResolution": "bundler", "target": "ESNext", "module": "Preserve", + // Required for native Node TS execution (node file.ts) — we are migrating from jiti to native node "allowImportingTsExtensions": true, "skipLibCheck": true, "allowSyntheticDefaultImports": true, From cabe15aa2503fe8b0351c280617896b977933fd7 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 08:47:09 +0700 Subject: [PATCH 36/63] WIP: checkpoint current eval harness changes --- .agents/skills/review-pr/SKILL.md | 526 ++++++++++++++++++++++ code/core/src/core-server/index.ts | 4 - scripts/eval/config.ts | 79 ++-- scripts/eval/eval.ts | 362 +++++++-------- scripts/eval/lib/agents/codex.ts | 2 +- scripts/eval/lib/ghost-stories.ts | 118 +++++ scripts/eval/lib/grade.ts | 11 +- scripts/eval/lib/grading-pipeline.test.ts | 4 +- scripts/eval/lib/pricing.ts | 27 -- scripts/eval/lib/run-task.test.ts | 20 +- scripts/eval/lib/run-task.ts | 11 +- scripts/eval/lib/save.ts | 25 - scripts/eval/lib/utils.test.ts | 14 +- scripts/eval/lib/utils.ts | 40 +- scripts/eval/types.test.ts | 10 +- scripts/eval/types.ts | 205 ++++----- scripts/package.json | 1 + yarn.lock | 8 + 18 files changed, 1044 insertions(+), 423 deletions(-) create mode 100644 .agents/skills/review-pr/SKILL.md create mode 100644 scripts/eval/lib/ghost-stories.ts delete mode 100644 scripts/eval/lib/pricing.ts 
delete mode 100644 scripts/eval/lib/save.ts diff --git a/.agents/skills/review-pr/SKILL.md b/.agents/skills/review-pr/SKILL.md new file mode 100644 index 000000000000..0b8a41c8fc95 --- /dev/null +++ b/.agents/skills/review-pr/SKILL.md @@ -0,0 +1,526 @@ +--- +name: review-pr +description: "Generate a Reveal.js slideshow to review a PR. Use when the user says 'review pr', 'review this PR', 'slideshow review', 'pr slideshow', or wants to review PR changes in a narrative presentation format." +allowed-tools: Bash, Read, Write, Edit, Agent, Grep, Glob +--- + +# PR Review Slideshow + +Generate a Reveal.js slideshow that walks through a PR as a narrative — starting from the main flow, then zooming into every detail. + +## Philosophy + +Two principles — both matter, they work on different axes: + +1. **Big picture first.** The horizontal flow goes broad → specific. Start with the high-level "what and why", then progressively zoom into each area of change. +2. **Tests first.** At each stop along the way, show the test before the implementation. The test explains *what* the behavior is. The implementation explains *how*. + +Together: you walk through the PR from the broadest overview to the smallest detail, and at every level you see the test before you see the code. + +Other principles: +- **Discuss before you fix, fix before you present.** Flag readability problems, get approval, then clean up. +- **If reading the tests doesn't make the change obvious, that's a smell.** Flag it. +- **Cover everything.** By the last slide, every changed file has been addressed. +- **Less is more.** Omit boilerplate, but always note what you left out. + +## Step 1 — Gather PR data + +Determine the PR to review. If the user provides a PR number, use that. 
Otherwise detect from the current branch: + +```bash +# Get PR number from current branch +gh pr view --json number,title,author,headRefName,baseRefName,body,additions,deletions,changedFiles + +# Get the list of changed files +gh pr diff --name-only + +# Get the full diff +gh pr diff +``` + +If a PR number or URL is given as an argument, pass it to `gh pr view <number>` and `gh pr diff <number>`. + +## Step 2 — Read and analyze changes + +For each changed file: + +1. Read the full diff (from `gh pr diff`) +2. Read the full file content for surrounding context (use `Read` tool) +3. Identify if it's a **test file**, **type definition**, **implementation**, **config**, or **docs** + +For each implementation file, look for a corresponding test file: +- `foo.ts` → look for `foo.test.ts`, `foo.spec.ts`, `foo.test.tsx`, `__tests__/foo.ts` +- Even if the test file wasn't changed, read it for context if the implementation was changed + +## Step 3 — Identify and discuss problems + +Before building the slideshow, scan the PR for readability issues. **Don't fix anything yet** — present findings and let the user decide. + +```bash +gh pr checkout <number> +``` + +Look for: +- Vague test names, massive test setup, missing assertions +- Changed code with no test coverage +- Unclear names, dead code, overly clever logic +- `any` types where a proper type is obvious + +Present a numbered list with concrete examples and suggested fixes. Then **wait** — the user decides what gets fixed (all, some, or none). + +After approval, fix in the working tree, lint, test, commit as a separate commit, and push. + +If the user says skip, go straight to Step 4 — unfixed issues will naturally show up in the slideshow as code that's hard to explain. + +## Step 4 — Plan the narrative + +The slideshow tells a story on two axes. + +### Horizontal axis: big picture → specific areas + +Group the changes into logical areas and order them broad-to-specific: + +1. **Big picture** — what this PR does and why, in plain English +2. 
**Core areas** — the main logical groups of change, ordered from most important to least. Each area becomes a horizontal slide. +3. **Supporting changes** — config, dependencies, docs that don't fit the core areas +4. **Summary** — key takeaways + +### Vertical axis: test first, then implementation + +Within each area, the vertical slides follow this order: + +1. **Overview** — what changed in this area, in plain English (the top slide) +2. **Test** — the test that explains the behavior. Show it fully. The reader should understand the *what* from this alone. +3. **Implementation** — the code that makes the test pass. Show enough context. +4. **More details** — types, helpers, surrounding context, additional tests + +If there's no test for an area, the overview slide flags that with a smell-box, and the implementation goes directly below it. + +### Check coverage — MANDATORY + +After planning, run through the list of changed files from `gh pr diff --name-only` and verify **every single file** appears somewhere in the slideshow — in an area, in a zoom-in, or in supporting changes. + +This is not optional. If a file is missing from the slideshow, the review is incomplete. Use `file-path` spans for every file so coverage can be verified by searching the HTML. + +For files with trivial changes (e.g. lockfiles, tsconfig one-liners), a bullet point in the Supporting Changes slide is enough. But they must appear. + +## Step 5 — Generate the slideshow + +Pick a short unique ID for this slideshow — use the PR number (e.g. `pr-34365`). The output directory is: + +``` +~/life/slideshows// +``` + +Write the slideshow to `~/life/slideshows//index.html`. + +### Narrative structure + +``` +[Title] → [Big Picture] → [Area 1] → [Area 2] → ... → [Supporting] → [Summary] + ↓ ↓ ↓ + [Test 1a] [Test 2a] [Config diffs] + ↓ ↓ + [Impl 1a] [Impl 2a] + ↓ + [Test 1b] + ↓ + [Impl 1b] +``` + +**Horizontal (← →)** = big picture → specific areas. 
Read left-to-right to understand the shape of the PR. +**Vertical (↓)** = test first, then implementation. Press down to see *what* the behavior is (test), then *how* it works (code). + +A reader who only goes right sees each area at a glance. A reader who also goes down gets the full test-then-implementation story for each area. + +### HTML template + +Use this exact template structure. Replace `{{SLIDES}}` with generated slide content: + +```html + + + + + + PR Review: {{TITLE}} + + + + + + + + + + + +
+
+ {{SLIDES}} +
+
+ + + + + + +``` + +### Slide guidelines + +**Title slide:** +```html +
+

PR #{{NUMBER}}: {{TITLE}}

+

by {{AUTHOR}} · {{BRANCH}} → {{BASE}}

+
+ {{FILES}} files + +{{ADDITIONS}} + -{{DELETIONS}} +
+
+``` + +**Big picture slide — sets up the story, previews the areas:** +```html +
+

What this PR does

+

{{2-3 sentence summary of the change and why it matters}}

+

Areas of change

+
    +
  1. {{Area 1}} — {{one-liner}}
  2. +
  3. {{Area 2}} — {{one-liner}}
  4. +
  5. {{Area 3}} — {{one-liner}}
  6. +
+

→ to walk through each area

+
+``` + +**Area slide — overview on top, test below, implementation below that:** +```html +
+ +
+

{{Area name}}

+

{{What changed in this area and why, in plain English}}

+ path/to/main-file.ts +

+// Show just the key change — the "headline" that orients the reader
+    
+
{{How this area connects to the rest of the PR}}
+

↓ tests, then implementation

+
+ + +
+

What the tests say

+ path/to/file.test.ts +

+// Show the full test — the reader should now understand the expected behavior
+    
+
{{Plain-English summary of what this test tells us}}
+
+ + +
+

Implementation

+ path/to/file.ts +

+// Show the implementation with enough surrounding context
+    
+
{{Why this approach was taken}}
+
+ + +
+``` + +**Area with no test (flag the smell, show implementation directly):** +```html +
+
+

{{Area name}}

+

{{What changed and why}}

+ path/to/file.ts +

+// Show the changed code
+    
+
🔍 No test covers this change — the behavior has to be inferred from the implementation.
+

↓ details

+
+ +
+``` + +**Supporting changes slide — for files that don't fit the main flow:** +```html +
+
+

Supporting changes

+

These files support the main flow but aren't part of it:

+
    +
  • package.json — added dependency X
  • +
  • tsconfig.json — enabled option Y
  • +
+

↓ details

+
+ +
+``` + +**When omitting code:** +```html +
⏭ 47 lines of error handling omitted — standard try/catch pattern
+``` + +**When a test exists but doesn't fully explain the code:** +```html +
🔍 The test only covers the happy path — the implementation handles 3 edge cases that aren't tested.
+``` + +**Diff highlights for before → after:** +```html +
- const oldWay = doThing(a, b);
+
+ const newWay = doThingBetter(a, b, options);
+``` + +**Summary slide:** +```html +
+

Summary

+
    +
  • {{Key takeaway 1}}
  • +
  • {{Key takeaway 2}}
  • +
+
{{Open questions or concerns, if any}}
+
+``` + +### Code display rules + +1. **Horizontal top slides = area overview** — the headline change, just enough to follow the big picture going left-to-right +2. **First zoom = test** — show complete test bodies, the reader now understands the behavior +3. **Second zoom = implementation** — the code that makes the test pass, with surrounding context +4. **Use `data-line-numbers="X-Y"` to highlight changed lines** within a larger code block +5. **One concept per slide** — split large changes across multiple vertical slides +6. **Max ~30 lines of code per slide** — if more, split or omit with an omitted-box +7. **HTML-escape all code content** — replace `<` with `&lt;`, `>` with `&gt;`, `&` with `&amp;` in all code blocks and diff divs +8. **Every changed file must appear somewhere** — this is the most important rule. Run `gh pr diff --name-only` and check every file off against the slideshow. Missing files = incomplete review. Use `<span class="file-path">` for each file so coverage is verifiable + +## Step 6 — Write the server and start it + +Write this live-reload server to `~/life/slideshows/<id>/server.mjs`: + +```javascript +import { createServer } from 'node:http'; +import { readFileSync, watch } from 'node:fs'; +import { join, extname } from 'node:path'; + +const dir = new URL('.', import.meta.url).pathname; +const port = 3000; +let clients = []; + +watch(dir, { recursive: true }, (event, filename) => { + if (filename === 'server.mjs') return; + clients.forEach(res => { + try { res.write('data: reload\n\n'); } catch {} + }); +}); + +createServer((req, res) => { + if (req.url === '/__sse') { + res.writeHead(200, { + 'Content-Type': 'text/event-stream', + 'Cache-Control': 'no-cache', + 'Connection': 'keep-alive', + }); + res.write('data: connected\n\n'); + clients.push(res); + req.on('close', () => { clients = clients.filter(c => c !== res); }); + return; + } + + try { + const filePath = join(dir, req.url === '/' ? 
'index.html' : req.url); + let content = readFileSync(filePath); + const ext = extname(filePath); + const types = { + '.html': 'text/html', '.js': 'text/javascript', + '.css': 'text/css', '.json': 'application/json', + '.mjs': 'text/javascript', + }; + res.writeHead(200, { 'Content-Type': types[ext] || 'application/octet-stream' }); + + if (ext === '.html') { + content = content.toString().replace('', + `\n`); + } + res.end(content); + } catch { + res.writeHead(404).end('Not found'); + } +}).listen(port, () => { + console.log(`\n PR Review: http://localhost:${port}\n`); + console.log(' Watching for changes...\n'); +}); +``` + +Then start it: + +```bash +mkdir -p ~/life/slideshows/<id> +# Write server.mjs and index.html first, then: +node ~/life/slideshows/<id>/server.mjs & +open http://localhost:3000 # macOS +``` + +Run the server in the background using Bash with `run_in_background: true`. + +## Step 7 — Iterate + +After generating the initial slideshow, tell the user: +- The slideshow is live at http://localhost:3000 +- They can ask you to update specific slides +- The browser will auto-reload when you write changes + +When the user asks for updates, just rewrite `~/life/slideshows/<id>/index.html` — the browser will auto-reload. + +## Important rules + +- **Discuss fixes first.** Scan for readability problems, present them, wait for approval before changing code. +- **Horizontal = big picture.** A reader pressing only → sees each area of change at a glance. +- **Vertical = test first, then implementation.** Press ↓ to see the test (what), then the code (how). +- **Cover everything.** Every changed file appears in the slideshow — in the flow, in a zoom-in, or in supporting changes. +- **Always HTML-escape code.** `<` → `&lt;`, `>` → `&gt;`, `&` → `&amp;`. +- **Kill any existing server on port 3000** before starting: `lsof -ti:3000 | xargs kill -9 2>/dev/null || true` +- **Note omissions.** If you skip code, always say what and roughly how much. 
+- **One concept per slide.** Use vertical slides to go deeper, not wider. +- **Separate fix commit.** Never mix review fixes with the author's commits. diff --git a/code/core/src/core-server/index.ts b/code/core/src/core-server/index.ts index 5699b1ad14af..f475fa6166ca 100644 --- a/code/core/src/core-server/index.ts +++ b/code/core/src/core-server/index.ts @@ -32,7 +32,3 @@ export { } from './stores/test-provider'; export { getServerPort } from './utils/server-address'; - -export { getComponentCandidates } from './utils/ghost-stories/get-candidates'; -export { runGhostStories } from './utils/ghost-stories/run-story-tests'; -export type { TestRunSummary } from './utils/ghost-stories/types'; diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index 9206d9c31012..8c47155e87ca 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -1,48 +1,41 @@ /** * Runtime configuration for the Storybook eval system. * - * Types live in types.ts — this file holds the concrete agent configs, - * model mappings, pricing, and benchmark project definitions. - * - * Agent configs are validated with Zod at import time — invalid config - * (e.g. defaultModel not in models list) throws immediately. + * Agent configs, model mappings, pricing, benchmark project definitions, + * and cost estimation utilities. */ -import { z } from "zod"; import type { AgentName, Project } from "./types.ts"; // --- Pricing --- -export const Pricing = z.object({ - input: z.number(), - cachedInput: z.number(), - output: z.number(), -}); -export type Pricing = z.infer; +export interface Pricing { + input: number; + cachedInput: number; + output: number; +} + +export interface TokenUsage { + inputTokens: number; + cachedInputTokens: number; + outputTokens: number; +} // --- Agent Config --- -export const AgentConfig = z - .object({ - models: z.array(z.string()).min(1), - defaultModel: z.string(), - /** Map friendly model names to SDK-specific model IDs (e.g. 
"sonnet-4.6" → "claude-sonnet-4-6"). */ - sdkModelIds: z.record(z.string(), z.string()).default({}), - /** Per-million-token pricing for manual cost estimation (agents that don't report cost natively). */ - pricing: z.record(z.string(), Pricing).default({}), - efforts: z.array(z.string()).min(1), - defaultEffort: z.string(), - }) - .refine((cfg) => cfg.models.includes(cfg.defaultModel), { - message: "defaultModel must be in models list", - }) - .refine((cfg) => cfg.efforts.includes(cfg.defaultEffort), { - message: "defaultEffort must be in efforts list", - }); -export type AgentConfig = z.infer; +export interface AgentConfig { + models: string[]; + defaultModel: string; + /** Map friendly model names to SDK-specific model IDs (e.g. "sonnet-4.6" → "claude-sonnet-4-6"). */ + sdkModelIds: Record; + /** Per-million-token pricing for manual cost estimation (agents that don't report cost natively). */ + pricing: Record; + efforts: string[]; + defaultEffort: string; +} export const AGENTS: Record = { - claude: AgentConfig.parse({ + claude: { models: ["sonnet-4.6", "opus-4.6", "haiku-4.5"], defaultModel: "sonnet-4.6", sdkModelIds: { @@ -50,20 +43,38 @@ export const AGENTS: Record = { "opus-4.6": "claude-opus-4-6", "haiku-4.5": "claude-haiku-4-5", }, + pricing: {}, efforts: ["low", "medium", "high", "max"], defaultEffort: "high", - }), - codex: AgentConfig.parse({ + }, + codex: { models: ["gpt-5.4"], defaultModel: "gpt-5.4", + sdkModelIds: {}, pricing: { "gpt-5.4": { input: 2.5, cachedInput: 0.625, output: 10.0 }, }, efforts: ["low", "medium", "high", "xhigh"], defaultEffort: "high", - }), + }, }; +// --- Cost Estimation --- + +/** Estimate cost from token usage using the pricing table. 
*/ +export function estimateCost(agent: AgentName, model: string, usage: TokenUsage): number | undefined { + const pricing = AGENTS[agent].pricing[model]; + if (!pricing) return undefined; + const freshInput = usage.inputTokens - usage.cachedInputTokens; + return ( + (freshInput / 1_000_000) * pricing.input + + (usage.cachedInputTokens / 1_000_000) * pricing.cachedInput + + (usage.outputTokens / 1_000_000) * pricing.output + ); +} + +// --- Projects --- + export const PROJECTS: Project[] = [ { name: "mealdrop", diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 4095a42676e0..38b723fc4f00 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -9,195 +9,205 @@ * node eval/eval.ts -p mealdrop -m gpt-5.4 # single run (agent inferred from model) * node eval/eval.ts -p mealdrop -m sonnet-4.6,gpt-5.4 # parallel runs * node eval/eval.ts -p mealdrop -a claude,codex # parallel runs (default model each) + * node eval/eval.ts --list-projects # list projects + * node eval/eval.ts --list-models # list models + * node eval/eval.ts --list-prompts # list prompts */ +import { defineCommand, runMain } from "citty"; import { randomUUID } from "node:crypto"; -import { parseArgs } from "node:util"; import pc from "picocolors"; import type { AgentName, TrialConfig, TrialResult } from "./types.ts"; import { AGENTS, PROJECTS } from "./config.ts"; import { runTask } from "./lib/run-task.ts"; import { createLogger, formatDuration, formatCost, formatTable, listPrompts } from "./lib/utils.ts"; -const logger = createLogger(); - -const { values: opts } = parseArgs({ - options: { - project: { type: "string", short: "p" }, - agent: { type: "string", short: "a" }, - model: { type: "string", short: "m" }, - effort: { type: "string", short: "e" }, - prompt: { type: "string" }, - verbose: { type: "boolean", short: "v", default: false }, - "list-projects": { type: "boolean", default: false }, - "list-models": { type: "boolean", default: false }, - "list-prompts": { type: "boolean", 
default: false }, +const main = defineCommand({ + meta: { + name: "eval", + description: "Storybook setup eval harness — measure AI agent quality on real-world projects", }, -}); + args: { + project: { type: "string", alias: "p", description: "Project name" }, + agent: { type: "string", alias: "a", description: "Agent(s), comma-separated" }, + model: { type: "string", alias: "m", description: "Model(s), comma-separated" }, + effort: { type: "string", alias: "e", description: "Effort level" }, + prompt: { type: "string", description: "Prompt name", default: "setup" }, + verbose: { type: "boolean", alias: "v", description: "Verbose output", default: false }, + listProjects: { type: "boolean", description: "List available projects", default: false }, + listModels: { type: "boolean", description: "List available models", default: false }, + listPrompts: { type: "boolean", description: "List available prompts", default: false }, + }, + async run({ args }) { + const logger = createLogger(); + + // --- List commands --- + + if (args.listProjects) { + for (const p of PROJECTS) logger.log(` ${pc.bold(p.name)} — ${p.description}`); + return; + } + if (args.listModels) { + for (const [agent, { models }] of Object.entries(AGENTS)) { + logger.log(`\n ${pc.bold(agent)}`); + for (const m of models) logger.log(` ${m}`); + } + return; + } + if (args.listPrompts) { + for (const name of listPrompts()) logger.log(` ${pc.bold(name)}`); + return; + } + + // --- Validate project --- + + const project = PROJECTS.find((p) => p.name === args.project); + if (!project) { + logger.log(pc.red(`Specify a project with -p. 
Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); + process.exit(1); + } + + // --- Build configs (supports comma-separated values for parallel runs) --- + + const promptNames = args.prompt!.split(","); + const allModels = Object.values(AGENTS).flatMap((cfg) => cfg.models); + + // Determine agent → model pairs + let agentModels: Array<{ agent: AgentName; model: string }>; + + if (args.model) { + // Models specified — infer agent per model + agentModels = args.model.split(",").map((model) => { + const entry = Object.entries(AGENTS).find(([, cfg]) => cfg.models.includes(model)); + if (!entry) { + logger.log(pc.red(`Unknown model: ${model}. Available: ${allModels.join(", ")}`)); + process.exit(1); + } + return { agent: entry[0] as AgentName, model }; + }); + // If --agent is also specified, filter to matching agents + if (args.agent) { + const filter = args.agent.split(","); + agentModels = agentModels.filter((am) => filter.includes(am.agent)); + } + } else if (args.agent) { + // Agents specified — use default model per agent + agentModels = args.agent.split(",").map((name) => { + const cfg = AGENTS[name as AgentName]; + if (!cfg) { + logger.log(pc.red(`Unknown agent: ${name}. 
Options: ${Object.keys(AGENTS).join(", ")}`)); + process.exit(1); + } + return { agent: name as AgentName, model: cfg.defaultModel }; + }); + } else { + // Default: single claude run + agentModels = [{ agent: "claude", model: AGENTS.claude.defaultModel }]; + } -// --- List commands --- - -if (opts["list-projects"]) { - for (const p of PROJECTS) logger.log(` ${pc.bold(p.name)} — ${p.description}`); - process.exit(0); -} -if (opts["list-models"]) { - for (const [agent, { models }] of Object.entries(AGENTS)) { - logger.log(`\n ${pc.bold(agent)}`); - for (const m of models) logger.log(` ${m}`); - } - process.exit(0); -} -if (opts["list-prompts"]) { - for (const name of listPrompts()) logger.log(` ${pc.bold(name)}`); - process.exit(0); -} - -// --- Validate project --- - -const project = PROJECTS.find((p) => p.name === opts.project); -if (!project) { - logger.log(pc.red(`Specify a project with -p. Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); - process.exit(1); -} - -// --- Build configs (supports comma-separated values for parallel runs) --- - -const promptNames = opts.prompt?.split(",") ?? ["setup"]; -const allModels = Object.values(AGENTS).flatMap((cfg) => cfg.models); - -// Determine agent → model pairs -let agentModels: Array<{ agent: AgentName; model: string }>; - -if (opts.model) { - // Models specified — infer agent per model - agentModels = opts.model.split(",").map((model) => { - const entry = Object.entries(AGENTS).find(([, cfg]) => cfg.models.includes(model)); - if (!entry) { - logger.log(pc.red(`Unknown model: ${model}. Available: ${allModels.join(", ")}`)); + // Expand to full configs: agent×model × prompt + const configs = agentModels.flatMap(({ agent, model }) => { + const cfg = AGENTS[agent]; + const effort = args.effort ?? cfg.defaultEffort; + if (!cfg.efforts.includes(effort)) { + logger.log(pc.red(`Unknown effort "${effort}" for ${agent}. 
Available: ${cfg.efforts.join(", ")}`)); + process.exit(1); + } + return promptNames.map((prompt) => ({ + config: { project, agent, model, effort, prompt, verbose: args.verbose } as TrialConfig, + label: `${model}+${prompt}`, + })); + }); + + if (configs.length === 0) { + logger.log(pc.red("No matching agent/model/prompt combinations found.")); process.exit(1); } - return { agent: entry[0] as AgentName, model }; - }); - // If --agent is also specified, filter to matching agents - if (opts.agent) { - const filter = opts.agent.split(","); - agentModels = agentModels.filter((am) => filter.includes(am.agent)); - } -} else if (opts.agent) { - // Agents specified — use default model per agent - agentModels = opts.agent.split(",").map((name) => { - const cfg = AGENTS[name as AgentName]; - if (!cfg) { - logger.log(pc.red(`Unknown agent: ${name}. Options: ${Object.keys(AGENTS).join(", ")}`)); + + // --- Print header --- + + const runId = randomUUID().slice(0, 8); + logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); + if (configs.length === 1) { + const { agent, model, effort, prompt } = configs[0].config; + logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${prompt}`); + } else { + logger.log(`${configs.length} parallel runs`); + for (const [agent, { models }] of Object.entries(AGENTS)) { + const active = models.filter((m) => configs.some((c) => c.config.model === m)); + if (active.length > 0) logger.log(` ${agent}: ${active.join(", ")}`); + } + logger.log(` prompts: ${[...new Set(promptNames)].join(", ")}`); + } + logger.log(`Run: ${runId}\n`); + + // --- Execute (always use allSettled — works for 1 or N runs) --- + + const settled = await Promise.allSettled( + configs.map((c) => runTask(c.config, createLogger(configs.length > 1 ? 
c.label : undefined))), + ); + + const results: TrialResult[] = []; + for (const [i, s] of settled.entries()) { + if (s.status === "fulfilled") { + results.push(s.value); + } else { + logger.logError(`${configs[i].label}: ${s.reason instanceof Error ? s.reason.message : s.reason}`); + } + } + + if (results.length === 0) { process.exit(1); } - return { agent: name as AgentName, model: cfg.defaultModel }; - }); -} else { - // Default: single claude run - agentModels = [{ agent: "claude", model: AGENTS.claude.defaultModel }]; -} - -// Expand to full configs: agent×model × prompt -const configs = agentModels.flatMap(({ agent, model }) => { - const cfg = AGENTS[agent]; - const effort = opts.effort ?? cfg.defaultEffort; - if (!cfg.efforts.includes(effort)) { - logger.log(pc.red(`Unknown effort "${effort}" for ${agent}. Available: ${cfg.efforts.join(", ")}`)); - process.exit(1); - } - return promptNames.map((prompt) => ({ - config: { project, agent, model, effort, prompt, verbose: opts.verbose } as TrialConfig, - label: `${model}+${prompt}`, - })); + + // --- Print results --- + + if (results.length === 1) { + const r = results[0]; + const ghost = r.grading.ghostStories; + const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; + + logger.log(pc.bold("\nResult")); + logger.log(` Build: ${r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL")}`); + logger.log(` Ghost: ${ghostStr}`); + logger.log(` TS Err: ${r.grading.typeCheckErrors}`); + logger.log(` Score: ${r.quality.score}`); + logger.log(` Cost: ${formatCost(r.execution.cost)}`); + logger.log(` Time: ${formatDuration(r.execution.duration)}`); + logger.log(` Turns: ${r.execution.turns}`); + } else { + results.sort((a, b) => (b.grading.ghostStories?.successRate ?? -1) - (a.grading.ghostStories?.successRate ?? 
-1)); + + const headers = ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Score", "Cost", "Time", "Turns"]; + const rows = results.map((r) => { + const ghost = r.grading.ghostStories; + const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; + return [ + r.agent, + r.model, + r.prompt, + r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL"), + ghostStr, + String(r.grading.typeCheckErrors), + String(r.quality.score), + formatCost(r.execution.cost), + formatDuration(r.execution.duration), + String(r.execution.turns), + ]; + }); + + logger.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); + logger.log(formatTable(headers, rows)); + + const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); + const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r): r is number => r != null); + const avgGhost = ghostRates.length > 0 ? ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; + + logger.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); + logger.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); + } + + logger.log("\nDone."); + }, }); -if (configs.length === 0) { - logger.log(pc.red("No matching agent/model/prompt combinations found.")); - process.exit(1); -} - -// --- Print header --- - -const runId = randomUUID().slice(0, 8); -logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); -if (configs.length === 1) { - const { agent, model, effort, prompt } = configs[0].config; - logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${prompt}`); -} else { - logger.log(`${configs.length} parallel runs`); - for (const [agent, { models }] of Object.entries(AGENTS)) { - const active = models.filter((m) => configs.some((c) => c.config.model === m)); - if (active.length > 0) logger.log(` ${agent}: ${active.join(", ")}`); - } - logger.log(` prompts: ${[...new Set(promptNames)].join(", ")}`); -} 
-logger.log(`Run: ${runId}\n`); - -// --- Execute (always use allSettled — works for 1 or N runs) --- - -const settled = await Promise.allSettled( - configs.map((c) => runTask(c.config, createLogger(configs.length > 1 ? c.label : undefined))), -); - -const results: TrialResult[] = []; -for (const [i, s] of settled.entries()) { - if (s.status === "fulfilled") { - results.push(s.value); - } else { - logger.logError(`${configs[i].label}: ${s.reason instanceof Error ? s.reason.message : s.reason}`); - } -} - -if (results.length === 0) { - process.exit(1); -} - -// --- Print results --- - -if (results.length === 1) { - const r = results[0]; - const ghost = r.grading.ghostStories; - const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; - - logger.log(pc.bold("\nResult")); - logger.log(` Build: ${r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL")}`); - logger.log(` Ghost: ${ghostStr}`); - logger.log(` TS Err: ${r.grading.typeCheckErrors}`); - logger.log(` Score: ${r.quality.score}`); - logger.log(` Cost: ${formatCost(r.execution.cost)}`); - logger.log(` Time: ${formatDuration(r.execution.duration)}`); - logger.log(` Turns: ${r.execution.turns}`); -} else { - results.sort((a, b) => (b.grading.ghostStories?.successRate ?? -1) - (a.grading.ghostStories?.successRate ?? -1)); - - const headers = ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Score", "Cost", "Time", "Turns"]; - const rows = results.map((r) => { - const ghost = r.grading.ghostStories; - const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; - return [ - r.agent, - r.model, - r.prompt, - r.grading.buildSuccess ? 
pc.green("PASS") : pc.red("FAIL"), - ghostStr, - String(r.grading.typeCheckErrors), - String(r.quality.score), - formatCost(r.execution.cost), - formatDuration(r.execution.duration), - String(r.execution.turns), - ]; - }); - - logger.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); - logger.log(formatTable(headers, rows)); - - const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); - const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r): r is number => r != null); - const avgGhost = ghostRates.length > 0 ? ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; - - logger.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); - logger.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); -} - -logger.log("\nDone."); +runMain(main); diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index d4495a8ce471..3e5e08fcc086 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -2,7 +2,7 @@ import { Codex, type ModelReasoningEffort } from "@openai/codex-sdk"; import { writeFile } from "node:fs/promises"; import { join } from "node:path"; import type { Agent, ExecutionResult } from "../../types.ts"; -import { estimateCost } from "../pricing.ts"; +import { estimateCost } from "../../config.ts"; export const codexAgent: Agent = { name: "codex", diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts new file mode 100644 index 000000000000..0ea91a0f69bd --- /dev/null +++ b/scripts/eval/lib/ghost-stories.ts @@ -0,0 +1,118 @@ +/** + * Ghost stories: discover component candidates and run vitest-based + * ghost story tests to measure how many components render successfully. + * + * Self-contained — does not import from code/core. Uses the same vitest + * + STORYBOOK_COMPONENT_PATHS approach that core-server uses internally, + * but decoupled so eval has no cross-package source imports. 
+ */ + +import { existsSync } from "node:fs"; +import { readFile } from "node:fs/promises"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { x } from "tinyexec"; +import { glob } from "glob"; + +const COMPONENT_GLOB = "**/*.{tsx,jsx}"; +const IGNORE_PATTERNS = [ + "**/node_modules/**", + "**/.git/**", + "**/dist/**", + "**/__mocks__/**", + "**/build/**", + "**/storybook-static/**", + "**/*.test.*", + "**/*.spec.*", + "**/*.stories.*", + "**/*.story.*", + "**/*.d.*", + "**/*.config.*", + "**/stories/{Button,Header,Page}.*", + "**/stories/{button,header,page}.*", +]; + +/** + * Find component files that are candidates for ghost story testing. + * Uses glob-based discovery — sufficient for eval grading purposes. + */ +export async function findComponentCandidates(opts: { + cwd: string; + sampleSize?: number; +}): Promise<{ candidates: string[]; error?: string }> { + const { cwd, sampleSize = 20 } = opts; + try { + const files = await glob(COMPONENT_GLOB, { + cwd, + absolute: true, + ignore: IGNORE_PATTERNS, + }); + return { candidates: files.slice(0, sampleSize) }; + } catch { + return { candidates: [], error: "Failed to find component candidates" }; + } +} + +export interface GhostStoryRunResult { + total: number; + passed: number; + successRate: number; + runError?: string; +} + +/** + * Run ghost stories by executing vitest with STORYBOOK_COMPONENT_PATHS. + * + * The storybook vitest plugin auto-generates and tests stories for the + * specified component files. Non-zero exit from vitest is expected when + * some stories fail — we parse the JSON report for actual results. 
+ */ +export async function runGhostStories( + candidates: string[], + opts: { cwd: string }, +): Promise { + const outputFile = join(tmpdir(), `ghost-stories-${Date.now()}.json`); + + const result = await x("npx", [ + "vitest", "run", + "--reporter=json", + "--testTimeout=1000", + `--outputFile=${outputFile}`, + ...candidates, + ], { + throwOnError: false, + timeout: 300_000, + nodeOptions: { + cwd: opts.cwd, + env: { + ...process.env, + STORYBOOK_COMPONENT_PATHS: candidates.join(";"), + }, + }, + }); + + const stderr = (result.stderr ?? "").toLowerCase(); + if (stderr.includes("browsertype.launch")) { + return { total: 0, passed: 0, successRate: 0, runError: "Playwright not installed" }; + } + if (stderr.includes("no tests found")) { + return { total: 0, passed: 0, successRate: 0, runError: "No tests found" }; + } + + if (!existsSync(outputFile)) { + return { total: 0, passed: 0, successRate: 0, runError: "JSON report not found" }; + } + + try { + const report = JSON.parse(await readFile(outputFile, "utf-8")); + if (!report.testResults?.length) { + return { total: 0, passed: 0, successRate: 0, runError: "No test results in report" }; + } + const total: number = report.numTotalTests ?? 0; + const passed: number = report.numPassedTests ?? 0; + const successRate = total > 0 ? 
parseFloat((passed / total).toFixed(2)) : 0; + return { total, passed, successRate }; + } catch { + return { total: 0, passed: 0, successRate: 0, runError: "Failed to parse vitest report" }; + } +} diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index d04f3af1252c..fe155c5b53a5 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -4,7 +4,7 @@ import type { GradingResult, GhostStoriesResult, QualityResult, QualityWeights, import { DEFAULT_QUALITY_WEIGHTS } from "../types.ts"; import { x } from "tinyexec"; import { detectSetupPatterns } from "./setup-patterns.ts"; -import { getComponentCandidates, runGhostStories } from "../../../code/core/src/core-server/index.ts"; +import { findComponentCandidates, runGhostStories } from "./ghost-stories.ts"; /** Filter changed files to only storybook-related ones. */ export function filterStorybookFiles(changedFiles: ChangedFile[]): ChangedFile[] { @@ -157,7 +157,7 @@ async function getChangedFiles(repoRoot: string, baseline: string): Promise { logger.logStep("Running ghost stories..."); - const { candidates, error } = await getComponentCandidates({ sampleSize: 20, cwd: projectPath }); + const { candidates, error } = await findComponentCandidates({ sampleSize: 20, cwd: projectPath }); if (error || candidates.length === 0) { logger.logError(error ?? 
"No candidate components found"); return undefined; @@ -165,13 +165,12 @@ async function gradeGhostStories(projectPath: string, logger: Logger): Promise 0) { - logger.logSuccess(`Ghost stories: ${passed}/${total} passed (${Math.round(successRate * 100)}%)`); + } else if (result.total > 0) { + logger.logSuccess(`Ghost stories: ${result.passed}/${result.total} passed (${Math.round(result.successRate * 100)}%)`); } - return { candidateCount: candidates.length, total, passed, successRate }; + return { candidateCount: candidates.length, total: result.total, passed: result.passed, successRate: result.successRate }; } diff --git a/scripts/eval/lib/grading-pipeline.test.ts b/scripts/eval/lib/grading-pipeline.test.ts index 5ffcab12170f..40d2d36fdccc 100644 --- a/scripts/eval/lib/grading-pipeline.test.ts +++ b/scripts/eval/lib/grading-pipeline.test.ts @@ -4,7 +4,7 @@ import { tmpdir } from 'node:os'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { getComponentCandidates } from '../../../code/core/src/core-server/index'; +import { findComponentCandidates } from './ghost-stories'; import { computeQualityScore, countTypeCheckErrors, @@ -38,7 +38,7 @@ function writeFile(relativePath: string, content: string) { } async function findCandidates(cwd: string) { - const { candidates } = await getComponentCandidates({ cwd, sampleSize: 20 }); + const { candidates } = await findComponentCandidates({ cwd, sampleSize: 20 }); return candidates.map((c) => c.replace(cwd + '/', '')); } diff --git a/scripts/eval/lib/pricing.ts b/scripts/eval/lib/pricing.ts deleted file mode 100644 index 16fb7995211e..000000000000 --- a/scripts/eval/lib/pricing.ts +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Shared cost estimation from token usage. - * - * Pricing tables live in config.ts alongside agent definitions. - * This module provides the math. 
- */ - -import { AGENTS } from "../config.ts"; -import type { AgentName } from "../types.ts"; - -export interface TokenUsage { - inputTokens: number; - cachedInputTokens: number; - outputTokens: number; -} - -/** Estimate cost from token usage using the pricing table in config. */ -export function estimateCost(agent: AgentName, model: string, usage: TokenUsage): number | undefined { - const pricing = AGENTS[agent].pricing[model]; - if (!pricing) return undefined; - const freshInput = usage.inputTokens - usage.cachedInputTokens; - return ( - (freshInput / 1_000_000) * pricing.input + - (usage.cachedInputTokens / 1_000_000) * pricing.cachedInput + - (usage.outputTokens / 1_000_000) * pricing.output - ); -} diff --git a/scripts/eval/lib/run-task.test.ts b/scripts/eval/lib/run-task.test.ts index ccd7aa562333..503046a4761c 100644 --- a/scripts/eval/lib/run-task.test.ts +++ b/scripts/eval/lib/run-task.test.ts @@ -13,13 +13,17 @@ vi.mock('./prepare-trial', () => ({ vi.mock('./grade', () => ({ grade: vi.fn(), })); -vi.mock('./save', () => ({ - captureEnvironment: vi.fn().mockResolvedValue({ - nodeVersion: 'v22.21.1', - evalBranch: 'test-branch', - evalCommit: 'abc123', - }), -})); +vi.mock('./utils', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + captureEnvironment: vi.fn().mockResolvedValue({ + nodeVersion: 'v22.21.1', + evalBranch: 'test-branch', + evalCommit: 'abc123', + }), + }; +}); vi.mock('./agents/claude-code', () => ({ claudeAgent: { name: 'claude', execute: vi.fn() }, })); @@ -31,7 +35,7 @@ import { claudeAgent } from './agents/claude-code'; import { grade } from './grade'; import { prepareTrial } from './prepare-trial'; import { runTask } from './run-task'; -import { captureEnvironment } from './save'; +import { captureEnvironment } from './utils'; let TMP: string; diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index bb9532e45915..73cdcc8fd72c 100644 --- a/scripts/eval/lib/run-task.ts +++ 
b/scripts/eval/lib/run-task.ts @@ -5,8 +5,7 @@ import { claudeAgent } from "./agents/claude-code.ts"; import { codexAgent } from "./agents/codex.ts"; import { prepareTrial } from "./prepare-trial.ts"; import { grade } from "./grade.ts"; -import { captureEnvironment } from "./save.ts"; -import { generateTrialId, generatePrompt, createLogger } from "./utils.ts"; +import { generateTrialId, loadPrompt, captureEnvironment, createLogger } from "./utils.ts"; const agents: Record = { claude: claudeAgent, @@ -33,12 +32,8 @@ export async function runTask( // 2. Capture environment await captureEnvironment(paths.resultsDir); - // 3. Generate the prompt (with project-specific template variables) - const prompt = generatePrompt(promptName, { - projectName: project.name, - description: project.description ?? "", - projectDir: project.projectDir ?? ".", - }); + // 3. Load the prompt + const prompt = loadPrompt(promptName); await writeFile(join(paths.resultsDir, "prompt.md"), prompt); // 4. Execute the agent diff --git a/scripts/eval/lib/save.ts b/scripts/eval/lib/save.ts deleted file mode 100644 index 363ac29abe00..000000000000 --- a/scripts/eval/lib/save.ts +++ /dev/null @@ -1,25 +0,0 @@ -import { writeFile } from "node:fs/promises"; -import { join } from "node:path"; -import { x } from "tinyexec"; - -export interface Environment { - nodeVersion: string; - /** Git branch of the eval harness (storybook monorepo), not the evaluated project. */ - evalBranch: string; - /** Git commit of the eval harness (storybook monorepo), not the evaluated project. 
*/ - evalCommit: string; -} - -export async function captureEnvironment(resultsDir: string): Promise { - let evalBranch = "unknown"; - let evalCommit = "unknown"; - try { - evalBranch = (await x("git", ["rev-parse", "--abbrev-ref", "HEAD"])).stdout.trim(); - evalCommit = (await x("git", ["rev-parse", "HEAD"])).stdout.trim(); - } catch { - /* not in a git repo */ - } - const env: Environment = { nodeVersion: process.version, evalBranch, evalCommit }; - await writeFile(join(resultsDir, "environment.json"), JSON.stringify(env, null, 2)); - return env; -} diff --git a/scripts/eval/lib/utils.test.ts b/scripts/eval/lib/utils.test.ts index 7d18d74625f2..a230b00740bb 100644 --- a/scripts/eval/lib/utils.test.ts +++ b/scripts/eval/lib/utils.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { formatDuration, formatCost, generateTrialId, generatePrompt, listPrompts, formatTable } from './utils'; +import { formatDuration, formatCost, generateTrialId, loadPrompt, listPrompts, formatTable } from './utils'; describe('formatDuration', () => { it('formats seconds under a minute', () => { @@ -70,31 +70,31 @@ describe('listPrompts', () => { }); }); -describe('generatePrompt', () => { +describe('loadPrompt', () => { it('loads setup prompt by default', () => { - const prompt = generatePrompt(); + const prompt = loadPrompt(); expect(prompt).toContain('Storybook'); expect(prompt.length).toBeGreaterThan(0); }); it('loads setup prompt by name', () => { - const prompt = generatePrompt('setup'); + const prompt = loadPrompt('setup'); expect(prompt).toContain('Storybook setup'); expect(prompt).not.toContain('React + Vite'); }); it('loads self-heal prompt', () => { - const prompt = generatePrompt('self-heal'); + const prompt = loadPrompt('self-heal'); expect(prompt).toContain('Self-healing'); expect(prompt).toContain('vitest'); }); it('throws for unknown prompt', () => { - expect(() => generatePrompt('nonexistent-prompt-xyz')).toThrow('Prompt not found'); + expect(() 
=> loadPrompt('nonexistent-prompt-xyz')).toThrow('Prompt not found'); }); it('returns trimmed content', () => { - const prompt = generatePrompt('setup'); + const prompt = loadPrompt('setup'); expect(prompt).toBe(prompt.trim()); }); }); diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index 019a7765906e..af6c0284808a 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -1,6 +1,8 @@ import { readFileSync, existsSync, readdirSync } from "node:fs"; -import { resolve, basename } from "node:path"; +import { writeFile } from "node:fs/promises"; +import { resolve, basename, join } from "node:path"; import pc from "picocolors"; +import { x } from "tinyexec"; import type { Logger } from "../types.ts"; export const REPO_ROOT = resolve(import.meta.dirname, "..", "..", ".."); @@ -59,19 +61,13 @@ export function formatTable(headers: string[], rows: string[][]): string { // --- Prompts --- -/** Load a prompt by name from prompts/{name}.md, with optional template variables. */ -export function generatePrompt(name = "setup", vars?: Record): string { +/** Load a prompt by name from prompts/{name}.md. */ +export function loadPrompt(name = "setup"): string { const file = resolve(PROMPTS_DIR, `${name}.md`); if (!existsSync(file)) { throw new Error(`Prompt not found: ${file}\nAvailable: ${listPrompts().join(", ")}`); } - let content = readFileSync(file, "utf-8").trim(); - if (vars) { - for (const [key, value] of Object.entries(vars)) { - content = content.replaceAll(`{{${key}}}`, value); - } - } - return content; + return readFileSync(file, "utf-8").trim(); } /** List available prompt names. */ @@ -81,3 +77,27 @@ export function listPrompts(): string[] { .filter((f) => f.endsWith(".md")) .map((f) => basename(f, ".md")); } + +// --- Environment capture --- + +export interface Environment { + nodeVersion: string; + /** Git branch of the eval harness (storybook monorepo), not the evaluated project. 
*/ + evalBranch: string; + /** Git commit of the eval harness (storybook monorepo), not the evaluated project. */ + evalCommit: string; +} + +export async function captureEnvironment(resultsDir: string): Promise { + let evalBranch = "unknown"; + let evalCommit = "unknown"; + try { + evalBranch = (await x("git", ["rev-parse", "--abbrev-ref", "HEAD"])).stdout.trim(); + evalCommit = (await x("git", ["rev-parse", "HEAD"])).stdout.trim(); + } catch { + /* not in a git repo */ + } + const env: Environment = { nodeVersion: process.version, evalBranch, evalCommit }; + await writeFile(join(resultsDir, "environment.json"), JSON.stringify(env, null, 2)); + return env; +} diff --git a/scripts/eval/types.test.ts b/scripts/eval/types.test.ts index 18fb167ab361..9883eddddb42 100644 --- a/scripts/eval/types.test.ts +++ b/scripts/eval/types.test.ts @@ -2,16 +2,10 @@ import { describe, expect, it } from 'vitest'; import { AGENTS, PROJECTS } from './config'; -/** - * Basic shape validation (required fields, defaults, types) is handled by Zod - * schemas at import time — AgentConfig.parse() in config.ts throws on invalid - * config. These tests cover cross-cutting invariants that Zod cannot express. - */ +/** Cross-cutting config invariants that TypeScript's type system cannot express. */ describe('AGENTS', () => { - it('validates against schema (the import itself proves this)', () => { - // AgentConfig.parse() runs at import time. If this test file loads, - // the config is valid (models non-empty, defaultModel in list, etc.). + it('has expected agents', () => { expect(Object.keys(AGENTS)).toEqual(['claude', 'codex']); }); diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index bb4038f19808..13bd7f7cad99 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -1,13 +1,11 @@ /** * Core types for the Storybook setup eval system. * - * Data types use Zod schemas for runtime validation. - * Behavioral interfaces (Logger, Agent) stay as plain TypeScript. 
+ * Plain TypeScript interfaces — no runtime validation library. + * Validation happens at the boundaries (CLI parsing via citty). */ -import { z } from "zod"; - -// --- Logger (behavioral interface — not validated at runtime) --- +// --- Logger --- export interface Logger { log: (msg: string) => void; @@ -18,138 +16,131 @@ export interface Logger { // --- Agent Name --- -export const AgentName = z.enum(["claude", "codex"]); -export type AgentName = z.infer; +export type AgentName = "claude" | "codex"; // --- Project --- -export const Project = z.object({ - name: z.string().min(1), - repo: z.string().url(), - branch: z.string().optional(), - projectDir: z.string().optional(), - description: z.string().optional(), -}); -export type Project = z.infer; +export interface Project { + name: string; + repo: string; + branch?: string; + projectDir?: string; + description?: string; +} // --- Trial Config --- -export const TrialConfig = z.object({ - project: Project, - agent: AgentName, - model: z.string(), - effort: z.string(), - prompt: z.string(), - verbose: z.boolean().optional(), -}); -export type TrialConfig = z.infer; +export interface TrialConfig { + project: Project; + agent: AgentName; + model: string; + effort: string; + prompt: string; + verbose?: boolean; +} // --- Trial Paths --- -export const TrialPaths = z.object({ - trialDir: z.string(), - repoRoot: z.string(), - projectPath: z.string(), - resultsDir: z.string(), - baselineCommit: z.string(), -}); -export type TrialPaths = z.infer; +export interface TrialPaths { + trialDir: string; + repoRoot: string; + projectPath: string; + resultsDir: string; + baselineCommit: string; +} // --- Execution --- -export const ExecutionResult = z.object({ - agent: z.string(), - model: z.string(), - effort: z.string(), - cost: z.number().optional(), - duration: z.number(), - durationApi: z.number().optional(), - turns: z.number(), -}); -export type ExecutionResult = z.infer; +export interface ExecutionResult { + agent: string; 
+ model: string; + effort: string; + cost?: number; + duration: number; + durationApi?: number; + turns: number; +} // --- Changed Files --- -export const ChangedFile = z.object({ - path: z.string(), - status: z.enum(["A", "M", "D", "R"]), -}); -export type ChangedFile = z.infer; +export interface ChangedFile { + path: string; + status: "A" | "M" | "D" | "R"; +} // --- Setup Patterns --- -export const SetupPattern = z.object({ - id: z.string(), - label: z.string(), - sourceFiles: z.array(z.string()), -}); -export type SetupPattern = z.infer; +export interface SetupPattern { + id: string; + label: string; + sourceFiles: string[]; +} // --- Ghost Stories --- -export const GhostStoriesResult = z.object({ - candidateCount: z.number(), - total: z.number(), - passed: z.number(), - successRate: z.number(), -}); -export type GhostStoriesResult = z.infer; +export interface GhostStoriesResult { + candidateCount: number; + total: number; + passed: number; + successRate: number; +} // --- Grading --- -export const GradingResult = z.object({ - buildSuccess: z.boolean(), - buildError: z.string().optional(), - typeCheckErrors: z.number(), - typeCheckOutput: z.string().optional(), - changedFiles: z.array(ChangedFile), - storybookFiles: z.array(ChangedFile), - setupPatterns: z.array(SetupPattern), - ghostStories: GhostStoriesResult.optional(), -}); -export type GradingResult = z.infer; +export interface GradingResult { + buildSuccess: boolean; + buildError?: string; + typeCheckErrors: number; + typeCheckOutput?: string; + changedFiles: ChangedFile[]; + storybookFiles: ChangedFile[]; + setupPatterns: SetupPattern[]; + ghostStories?: GhostStoriesResult; +} // --- Quality Score --- -export const QualityWeights = z.object({ - ghostStories: z.number().default(0.4), - build: z.number().default(0.25), - typecheck: z.number().default(0.25), - performance: z.number().default(0.1), -}); -export type QualityWeights = z.infer; - -export const DEFAULT_QUALITY_WEIGHTS: QualityWeights = 
QualityWeights.parse({}); - -export const QualityResult = z.object({ - score: z.number(), - breakdown: z.object({ - build: z.number(), - typecheck: z.number(), - ghostStories: z.number(), - performance: z.number(), - }), -}); -export type QualityResult = z.infer; +export interface QualityWeights { + ghostStories: number; + build: number; + typecheck: number; + performance: number; +} + +export const DEFAULT_QUALITY_WEIGHTS: QualityWeights = { + ghostStories: 0.4, + build: 0.25, + typecheck: 0.25, + performance: 0.1, +}; + +export interface QualityResult { + score: number; + breakdown: { + build: number; + typecheck: number; + ghostStories: number; + performance: number; + }; +} // --- Trial Result --- -export const TrialResult = z.object({ - schemaVersion: z.literal(1), - project: z.string(), - agent: z.string(), - model: z.string(), - effort: z.string(), - prompt: z.string(), - timestamp: z.string(), - baselineCommit: z.string(), - execution: ExecutionResult, - grading: GradingResult, - quality: QualityResult, -}); -export type TrialResult = z.infer; - -// --- Agent Interface (behavioral — not validated) --- +export interface TrialResult { + schemaVersion: 1; + project: string; + agent: string; + model: string; + effort: string; + prompt: string; + timestamp: string; + baselineCommit: string; + execution: ExecutionResult; + grading: GradingResult; + quality: QualityResult; +} + +// --- Agent Interface --- export interface Agent { name: AgentName; diff --git a/scripts/package.json b/scripts/package.json index abb5e6360401..48fbc54c8704 100644 --- a/scripts/package.json +++ b/scripts/package.json @@ -76,6 +76,7 @@ "@vitest/coverage-v8": "^4.1.0", "ansi-regex": "^6.0.1", "chromatic": "^13.3.4", + "citty": "^0.2.1", "codecov": "^3.8.1", "commander": "^14.0.2", "cross-env": "^7.0.3", diff --git a/yarn.lock b/yarn.lock index 719ae2d9c2e5..aec95012dc21 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8870,6 +8870,7 @@ __metadata: "@vitest/coverage-v8": "npm:^4.1.0" ansi-regex: 
"npm:^6.0.1" chromatic: "npm:^13.3.4" + citty: "npm:^0.2.1" codecov: "npm:^3.8.1" commander: "npm:^14.0.2" cross-env: "npm:^7.0.3" @@ -13830,6 +13831,13 @@ __metadata: languageName: node linkType: hard +"citty@npm:^0.2.1": + version: 0.2.1 + resolution: "citty@npm:0.2.1" + checksum: 10c0/504ac5aeb076f750bf5f25d40c730083e8ed6112eac2f00dbe341a223c46ad16893ce73dfdb55b2d0da505100b9678968ee0443637c45b21917db48daa5a6977 + languageName: node + linkType: hard + "cjs-module-lexer@npm:^1.2.3": version: 1.4.3 resolution: "cjs-module-lexer@npm:1.4.3" From 73d74150b74ce81519505e55772b452a1a13e4d2 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 08:55:40 +0700 Subject: [PATCH 37/63] Fix eval ghost-stories globbing lint --- scripts/eval/lib/ghost-stories.ts | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index 0ea91a0f69bd..cf65bb7b4a30 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -8,11 +8,10 @@ */ import { existsSync } from "node:fs"; -import { readFile } from "node:fs/promises"; -import { join } from "node:path"; +import { glob, readFile } from "node:fs/promises"; +import { join, resolve } from "node:path"; import { tmpdir } from "node:os"; import { x } from "tinyexec"; -import { glob } from "glob"; const COMPONENT_GLOB = "**/*.{tsx,jsx}"; const IGNORE_PATTERNS = [ @@ -42,12 +41,11 @@ export async function findComponentCandidates(opts: { }): Promise<{ candidates: string[]; error?: string }> { const { cwd, sampleSize = 20 } = opts; try { - const files = await glob(COMPONENT_GLOB, { + const files = await Array.fromAsync(glob(COMPONENT_GLOB, { cwd, - absolute: true, - ignore: IGNORE_PATTERNS, - }); - return { candidates: files.slice(0, sampleSize) }; + exclude: IGNORE_PATTERNS, + })); + return { candidates: files.map((file) => resolve(cwd, file)).slice(0, sampleSize) }; } catch { return { candidates: [], error: 
"Failed to find component candidates" }; } From 98a2f74d6ad8f5ddf467643b0329a3ea63db43df Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 08:59:49 +0700 Subject: [PATCH 38/63] Refine eval grading review fixes --- scripts/eval/lib/grade.test.ts | 23 +- scripts/eval/lib/grade.ts | 14 +- ...peline.test.ts => grading-helpers.test.ts} | 16 +- scripts/eval/lib/package-manager.ts | 5 +- scripts/eval/lib/run-task.test.ts | 89 +++---- scripts/eval/prepare-repos.ts | 220 ------------------ scripts/eval/types.test.ts | 73 ++++-- scripts/eval/types.ts | 2 + 8 files changed, 138 insertions(+), 304 deletions(-) rename scripts/eval/lib/{grading-pipeline.test.ts => grading-helpers.test.ts} (91%) delete mode 100644 scripts/eval/prepare-repos.ts diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts index ee68248aea7d..6925d8e3fa06 100644 --- a/scripts/eval/lib/grade.test.ts +++ b/scripts/eval/lib/grade.test.ts @@ -15,9 +15,10 @@ describe('filterStorybookFiles', () => { { path: '.storybook/preview.tsx', status: 'A' }, { path: 'src/App.tsx', status: 'M' }, ]; - const result = filterStorybookFiles(files); - expect(result).toHaveLength(2); - expect(result.map((f) => f.path)).toEqual(['.storybook/main.ts', '.storybook/preview.tsx']); + expect(filterStorybookFiles(files)).toMatchObject([ + { path: '.storybook/main.ts', status: 'M' }, + { path: '.storybook/preview.tsx', status: 'A' }, + ]); }); it('matches story files with various extensions', () => { @@ -29,7 +30,7 @@ describe('filterStorybookFiles', () => { { path: 'src/Button.tsx', status: 'M' }, { path: 'src/Button.test.tsx', status: 'M' }, ]; - expect(filterStorybookFiles(files)).toHaveLength(4); + expect(filterStorybookFiles(files)).toMatchObject(files.slice(0, 4)); }); it('returns empty for no storybook files', () => { @@ -43,6 +44,16 @@ describe('filterStorybookFiles', () => { it('handles empty input', () => { expect(filterStorybookFiles([])).toHaveLength(0); }); + + it('matches renamed 
files using either side of the rename', () => { + const files: ChangedFile[] = [ + { path: 'src/Button.tsx', previousPath: 'src/Button.stories.tsx', status: 'R' }, + { path: '.storybook/preview.tsx', previousPath: 'config/preview.tsx', status: 'R' }, + { path: 'src/App.tsx', previousPath: 'src/Main.tsx', status: 'R' }, + ]; + + expect(filterStorybookFiles(files)).toMatchObject(files.slice(0, 2)); + }); }); describe('computeQualityScore', () => { @@ -157,11 +168,11 @@ describe('countTypeCheckErrors', () => { describe('parseChangedFiles', () => { it('parses added, modified, deleted, and renamed files', () => { const output = 'A\tsrc/new-file.ts\nM\tsrc/existing.ts\nD\tsrc/removed.ts\nR100\told.ts\tnew.ts'; - expect(parseChangedFiles(output)).toEqual([ + expect(parseChangedFiles(output)).toMatchObject([ { path: 'src/new-file.ts', status: 'A' }, { path: 'src/existing.ts', status: 'M' }, { path: 'src/removed.ts', status: 'D' }, - { path: 'old.ts\tnew.ts', status: 'R' }, + { path: 'new.ts', previousPath: 'old.ts', status: 'R' }, ]); }); diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index fe155c5b53a5..e0237ac28566 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -8,8 +8,11 @@ import { findComponentCandidates, runGhostStories } from "./ghost-stories.ts"; /** Filter changed files to only storybook-related ones. 
*/ export function filterStorybookFiles(changedFiles: ChangedFile[]): ChangedFile[] { + const isStorybookPath = (path?: string) => + path != null && (path.includes(".storybook/") || /\.(stories|story)\.[tj]sx?$/.test(path)); + return changedFiles.filter( - (f) => f.path.includes(".storybook/") || /\.(stories|story)\.[tj]sx?$/.test(f.path), + (f) => isStorybookPath(f.path) || isStorybookPath(f.previousPath), ); } @@ -66,7 +69,14 @@ export function parseChangedFiles(gitOutput: string): ChangedFile[] { .filter(Boolean) .map((line) => { const [status, ...parts] = line.split("\t"); - return { path: parts.join("\t"), status: (status?.charAt(0) || "M") as ChangedFile["status"] }; + const normalizedStatus = (status?.charAt(0) || "M") as ChangedFile["status"]; + + if (normalizedStatus === "R" && parts.length >= 2) { + const [previousPath, path] = parts; + return { path, previousPath, status: normalizedStatus }; + } + + return { path: parts.join("\t"), status: normalizedStatus }; }); } diff --git a/scripts/eval/lib/grading-pipeline.test.ts b/scripts/eval/lib/grading-helpers.test.ts similarity index 91% rename from scripts/eval/lib/grading-pipeline.test.ts rename to scripts/eval/lib/grading-helpers.test.ts index 40d2d36fdccc..63d3921bcb42 100644 --- a/scripts/eval/lib/grading-pipeline.test.ts +++ b/scripts/eval/lib/grading-helpers.test.ts @@ -14,15 +14,15 @@ import { import { detectSetupPatterns } from './setup-patterns'; /** - * High-level test: simulate the grading pipeline on a fake project directory. - * Data flows from one step to the next — candidate count feeds into the - * quality assessment, patterns inform what we expect from the changed files, etc. + * Helper-level test: compose grading helpers on a fake project directory. + * This exercises candidate discovery, setup-pattern detection, git-output parsing, + * and quality-score calculation without pretending to cover the full grade() flow. 
*/ let TMP: string; beforeEach(() => { - TMP = join(tmpdir(), `eval-grading-pipeline-${Date.now()}`); + TMP = join(tmpdir(), `eval-grading-helpers-${Date.now()}`); mkdirSync(join(TMP, 'src', 'components'), { recursive: true }); mkdirSync(join(TMP, '.storybook'), { recursive: true }); }); @@ -42,8 +42,8 @@ async function findCandidates(cwd: string) { return candidates.map((c) => c.replace(cwd + '/', '')); } -describe('grading pipeline', () => { - it('grades a well-configured project: candidates found, patterns detected, high quality', async () => { +describe('grading helpers', () => { + it('composes helper signals for a well-configured project', async () => { // Set up a realistic project with components and storybook config writeFile( 'src/components/Button.tsx', @@ -111,7 +111,7 @@ describe('grading pipeline', () => { expect(quality.score).toBe(1); }); - it('grades a broken project: candidates found but build fails, low quality', async () => { + it('composes helper signals for a broken project', async () => { writeFile( 'src/components/Widget.tsx', [ @@ -144,7 +144,7 @@ describe('grading pipeline', () => { expect(quality.breakdown.build).toBe(0); }); - it('more candidates with setup patterns yields higher confidence in the grade', async () => { + it('keeps helper output stable as candidate count grows', async () => { // Rich project: many simple components for (let i = 0; i < 5; i++) { writeFile( diff --git a/scripts/eval/lib/package-manager.ts b/scripts/eval/lib/package-manager.ts index ab11af3a26a1..c2993f004f27 100644 --- a/scripts/eval/lib/package-manager.ts +++ b/scripts/eval/lib/package-manager.ts @@ -1,10 +1,9 @@ /** * Shared package manager detection and dependency installation. * - * Used by both the trial preparation (prepare-trial.ts) and the - * one-time repo preparation script (prepare-repos.ts). + * Used by trial preparation and any other eval flows that need a + * package-manager-aware install step. 
*/ - import { existsSync } from "node:fs"; import { join } from "node:path"; import { x } from "tinyexec"; diff --git a/scripts/eval/lib/run-task.test.ts b/scripts/eval/lib/run-task.test.ts index 503046a4761c..73f0c170c57e 100644 --- a/scripts/eval/lib/run-task.test.ts +++ b/scripts/eval/lib/run-task.test.ts @@ -105,31 +105,30 @@ describe('runTask pipeline', () => { const result = await runTask(baseConfig); - // Config fields mapped correctly - expect(result.schemaVersion).toBe(1); - expect(result.project).toBe('test-project'); - expect(result.agent).toBe('claude'); - expect(result.model).toBe('sonnet-4.6'); - expect(result.effort).toBe('high'); - expect(result.prompt).toBe('setup'); - expect(result.timestamp).toMatch(/^\d{4}-\d{2}-\d{2}T/); - - // prepareTrial output flows into result - expect(result.baselineCommit).toBe('deadbeef'); - - // Agent execution output flows into result - expect(result.execution).toEqual({ + expect(result).toMatchObject({ + schemaVersion: 1, + project: 'test-project', agent: 'claude', model: 'sonnet-4.6', effort: 'high', - cost: 0.42, - duration: 45.2, - turns: 12, + prompt: 'setup', + baselineCommit: 'deadbeef', + execution: { + agent: 'claude', + model: 'sonnet-4.6', + effort: 'high', + cost: 0.42, + duration: 45.2, + turns: 12, + }, + grading: { + buildSuccess: true, + }, + quality: { + score: 1, + }, }); - - // Grade output flows into result - expect(result.grading.buildSuccess).toBe(true); - expect(result.quality.score).toBe(1); + expect(result.timestamp).toMatch(/^\d{4}-\d{2}-\d{2}T/); }); it('calls pipeline steps with correct arguments', async () => { @@ -142,28 +141,31 @@ describe('runTask pipeline', () => { await runTask(config); - // prepareTrial receives the project and a logger - expect(vi.mocked(prepareTrial).mock.calls[0][0].name).toBe('mealdrop'); - // Third arg is the logger + expect(vi.mocked(prepareTrial).mock.calls[0][0]).toMatchObject({ + name: 'mealdrop', + repo: 'https://github.com/test/mealdrop', + branch: 
'eval-baseline', + }); expect(vi.mocked(prepareTrial).mock.calls[0][2]).toBeDefined(); - // captureEnvironment receives the results dir expect(vi.mocked(captureEnvironment).mock.calls[0][0]).toBe(join(TMP, 'results')); - // Agent receives a params object with prompt, projectPath, model, effort, resultsDir, logger const params = vi.mocked(claudeAgent.execute).mock.calls[0][0] as Record; - expect(params.prompt).toContain('Storybook setup'); - expect(params.projectPath).toBe(TMP); - expect(params.model).toBe('sonnet-4.6'); - expect(params.effort).toBe('high'); - expect(params.resultsDir).toBe(join(TMP, 'results')); + expect(params).toMatchObject({ + prompt: expect.stringContaining('Storybook setup'), + projectPath: TMP, + model: 'sonnet-4.6', + effort: 'high', + resultsDir: join(TMP, 'results'), + }); expect(params.logger).toBeDefined(); - // grade receives the trial paths and a logger const gradePaths = vi.mocked(grade).mock.calls[0][0]; - expect(gradePaths.baselineCommit).toBe('deadbeef'); - expect(gradePaths.projectPath).toBe(TMP); - // Second arg is the logger + expect(gradePaths).toMatchObject({ + baselineCommit: 'deadbeef', + projectPath: TMP, + resultsDir: join(TMP, 'results'), + }); expect(vi.mocked(grade).mock.calls[0][1]).toBeDefined(); }); @@ -174,13 +176,13 @@ describe('runTask pipeline', () => { const resultsDir = join(TMP, 'results'); - // summary.json is parseable and matches the returned result const summary: TrialResult = JSON.parse(readFileSync(join(resultsDir, 'summary.json'), 'utf-8')); - expect(summary.schemaVersion).toBe(1); - expect(summary.execution.cost).toBe(0.42); - expect(summary.grading.buildSuccess).toBe(true); + expect(summary).toMatchObject({ + schemaVersion: 1, + execution: { cost: 0.42 }, + grading: { buildSuccess: true }, + }); - // prompt.md contains the real setup prompt const promptContent = readFileSync(join(resultsDir, 'prompt.md'), 'utf-8'); expect(promptContent).toContain('Storybook setup'); }); @@ -188,9 +190,10 @@ 
describe('runTask pipeline', () => { it('propagates failed build into result', async () => { setupMocks({ buildSuccess: false, typeCheckErrors: 5 }); - const result = await runTask(baseConfig); - expect(result.grading.buildSuccess).toBe(false); - expect(result.quality.score).toBe(0.3); + await expect(runTask(baseConfig)).resolves.toMatchObject({ + grading: { buildSuccess: false, typeCheckErrors: 5 }, + quality: { score: 0.3 }, + }); }); it('does not call grade before agent finishes', async () => { diff --git a/scripts/eval/prepare-repos.ts b/scripts/eval/prepare-repos.ts deleted file mode 100644 index dc0913b4b0f5..000000000000 --- a/scripts/eval/prepare-repos.ts +++ /dev/null @@ -1,220 +0,0 @@ -/** - * One-time script to prepare eval baseline repos. - * - * For each benchmark project: - * 1. Fork the repo to your GitHub account - * 2. Clone the fork - * 3. Clean storybook files, install deps, run `storybook init` - * 4. Commit and push as `eval-baseline` branch - * - * After this, each eval trial just does a fast shallow clone of the - * prepared branch — no more storybook init during trials. - * - * Usage: node scripts/eval/prepare-repos.ts - * - * NOTE: The REPOS list below contains the *original* upstream repos - * (e.g. "yannbf/mealdrop"), which is distinct from the *fork* URLs in - * config.ts PROJECTS (e.g. "kasperpeulen/mealdrop"). This script forks - * and pushes eval-baseline branches to those forks. - */ - -import { existsSync, mkdirSync, readFileSync, writeFileSync, rmSync, readdirSync } from "node:fs"; -import { join } from "node:path"; -import { x } from "tinyexec"; -import { createLogger } from "./lib/utils.ts"; -import { installDeps } from "./lib/package-manager.ts"; - -const logger = createLogger(); - -const EVAL_ROOT = join(import.meta.dirname, "..", "..", "..", "..", "storybook-eval"); -const PREP_DIR = join(EVAL_ROOT, "prepared-repos"); -const BASELINE_BRANCH = "eval-baseline"; - -/** Known storybook init starter files that are safe to remove. 
*/ -const STARTER_FILES = new Set([ - 'button.stories.ts', 'button.stories.tsx', 'button.stories.js', 'button.stories.jsx', - 'header.stories.ts', 'header.stories.tsx', 'header.stories.js', 'header.stories.jsx', - 'page.stories.ts', 'page.stories.tsx', 'page.stories.js', 'page.stories.jsx', - 'button.tsx', 'button.jsx', 'button.ts', 'button.js', 'button.css', - 'header.tsx', 'header.jsx', 'header.ts', 'header.js', 'header.css', - 'page.tsx', 'page.jsx', 'page.ts', 'page.js', 'page.css', - 'configure-your-project.mdx', -]); - -interface BenchmarkRepo { - name: string; - repo: string; - branch?: string; - projectDir?: string; -} - -const REPOS: BenchmarkRepo[] = [ - { name: 'mealdrop', repo: 'yannbf/mealdrop', branch: 'without-storybook' }, - { name: 'edgy', repo: 'catherineisonline/edgy' }, - { name: 'wikitok', repo: 'IsaacGemal/wikitok', projectDir: 'frontend' }, - { name: 'baklava', repo: 'fortanix/baklava', branch: 'master' }, - { name: 'echarts', repo: 'tmkx/echarts-react' }, - { name: 'evergreen-ci', repo: 'evergreen-ci/ui', projectDir: 'packages/lib' }, -]; - -function cleanNpmEnv(): Record { - const env: Record = {}; - for (const [k, v] of Object.entries(process.env)) { - if (v != null && !k.startsWith('npm_config_')) env[k] = v; - } - env.npm_config_registry = 'https://registry.npmjs.org/'; - return env; -} - -const GIT_ENV = { - GIT_AUTHOR_NAME: 'eval', - GIT_AUTHOR_EMAIL: 'eval@storybook.js.org', - GIT_COMMITTER_NAME: 'eval', - GIT_COMMITTER_EMAIL: 'eval@storybook.js.org', -}; - -async function run(cmd: string, args: string[], opts: { cwd?: string; env?: Record; timeout?: number } = {}) { - return x(cmd, args, { timeout: opts.timeout, nodeOptions: { cwd: opts.cwd, env: opts.env as NodeJS.ProcessEnv } }); -} - -function stripStorybookDeps(pkgPath: string) { - if (!existsSync(pkgPath)) return; - const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8')); - let changed = false; - for (const field of ['dependencies', 'devDependencies', 'peerDependencies']) { - 
const deps = pkg[field]; - if (!deps) continue; - for (const key of Object.keys(deps)) { - if (key === 'storybook' || key.startsWith('@storybook/') || key === 'eslint-plugin-storybook') { - delete deps[key]; - changed = true; - } - } - } - if (pkg.scripts) { - for (const key of Object.keys(pkg.scripts)) { - if (key === 'storybook' || key === 'build-storybook') { - delete pkg.scripts[key]; - changed = true; - } - } - } - if (changed) writeFileSync(pkgPath, JSON.stringify(pkg, null, 2) + '\n'); -} - -function cleanStorybookFiles(dir: string) { - for (const name of ['.storybook', 'storybook-static']) { - const target = join(dir, name); - if (existsSync(target)) rmSync(target, { recursive: true }); - } - for (const storiesDir of ['stories', join('src', 'stories')]) { - const target = join(dir, storiesDir); - if (existsSync(target) && isStarterDirectory(target)) { - rmSync(target, { recursive: true }); - } - } - stripStorybookDeps(join(dir, 'package.json')); -} - -function isStarterDirectory(dir: string): boolean { - try { - return readdirSync(dir, { withFileTypes: true }).every( - (e) => !e.isDirectory() && STARTER_FILES.has(e.name.toLowerCase()) - ); - } catch { - return false; - } -} - -async function prepareRepo(repo: BenchmarkRepo) { - logger.log(`\n=== ${repo.name} ===`); - const repoDir = join(PREP_DIR, repo.name); - - // 1. Fork (idempotent — gh fork is a no-op if already forked) - logger.logStep(`Forking ${repo.repo}...`); - try { - await run('gh', ['repo', 'fork', repo.repo, '--clone=false']); - } catch { - logger.log(` ! Fork may already exist, continuing...`); - } - - // Figure out the fork name (gh forks to authenticated user) - const whoami = (await run('gh', ['api', 'user', '--jq', '.login'])).stdout.trim(); - const forkSlug = `${whoami}/${repo.repo.split('/')[1]}`; - logger.logStep(`Fork: ${forkSlug}`); - - // 2. 
Clone (or pull) the fork - if (existsSync(repoDir)) { - logger.logStep(`Updating existing clone...`); - await run('git', ['fetch', 'origin'], { cwd: repoDir }); - const branch = repo.branch || (await run('git', ['remote', 'show', 'origin'], { cwd: repoDir })) - .stdout.match(/HEAD branch:\s*(\S+)/)?.[1] || 'main'; - await run('git', ['checkout', branch], { cwd: repoDir }); - await run('git', ['reset', '--hard', `origin/${branch}`], { cwd: repoDir }); - await run('git', ['clean', '-fdx', '-e', 'node_modules'], { cwd: repoDir }); - } else { - logger.logStep(`Cloning ${forkSlug}...`); - const cloneArgs = ['clone', `https://github.com/${forkSlug}.git`, repoDir]; - if (repo.branch) cloneArgs.splice(1, 0, '--branch', repo.branch); - await run('git', cloneArgs, { timeout: 120_000 }); - } - - // 3. Create eval-baseline branch - logger.logStep(`Creating ${BASELINE_BRANCH} branch...`); - await run('git', ['checkout', '-B', BASELINE_BRANCH], { cwd: repoDir }); - - // 4. Clean storybook files - const projectDir = repo.projectDir ? join(repoDir, repo.projectDir) : repoDir; - cleanStorybookFiles(projectDir); - - // 5. Install dependencies - await installDeps(projectDir, logger, cleanNpmEnv()); - - // 6. Run storybook init - logger.logStep(`Running storybook init...`); - const env = cleanNpmEnv(); - await run('npx', ['storybook@latest', 'init', '--yes', '--no-dev'], { - cwd: projectDir, - env: { ...env, STORYBOOK_DISABLE_TELEMETRY: '1' }, - timeout: 300_000, - }); - - // 7. Post-init install - await installDeps(projectDir, logger, cleanNpmEnv()); - - // 8. Commit everything - logger.logStep(`Committing baseline...`); - await run('git', ['add', '-A'], { cwd: repoDir, env: { ...cleanNpmEnv(), ...GIT_ENV } }); - await run('git', ['commit', '-m', 'eval baseline after storybook init', '--allow-empty'], { - cwd: repoDir, - env: { ...cleanNpmEnv(), ...GIT_ENV }, - }); - - // 9. 
Force-push the baseline branch - logger.logStep(`Pushing ${BASELINE_BRANCH}...`); - await run('git', ['push', '-f', 'origin', BASELINE_BRANCH], { cwd: repoDir }); - - logger.logSuccess(`${repo.name} ready at ${forkSlug}#${BASELINE_BRANCH}`); - return { name: repo.name, forkRepo: `https://github.com/${forkSlug}`, branch: BASELINE_BRANCH, projectDir: repo.projectDir }; -} - -// --- Main --- -mkdirSync(PREP_DIR, { recursive: true }); - -logger.log(`Preparing eval baseline repos`); -logger.log(`Output: ${PREP_DIR}\n`); - -const results = []; -for (const repo of REPOS) { - try { - const result = await prepareRepo(repo); - results.push(result); - } catch (error) { - logger.logError(`Failed: ${error instanceof Error ? error.message : error}`); - } -} - -logger.log(`\n\nPrepared repos:`); -for (const r of results) { - logger.logSuccess(`${r.name}: ${r.forkRepo}#${r.branch}${r.projectDir ? ` (${r.projectDir})` : ''}`); -} diff --git a/scripts/eval/types.test.ts b/scripts/eval/types.test.ts index 9883eddddb42..eca9509fc7e6 100644 --- a/scripts/eval/types.test.ts +++ b/scripts/eval/types.test.ts @@ -2,42 +2,71 @@ import { describe, expect, it } from 'vitest'; import { AGENTS, PROJECTS } from './config'; -/** Cross-cutting config invariants that TypeScript's type system cannot express. 
*/ +const githubRepoUrl = /^https:\/\/github\.com\/[^/]+\/[^/]+$/; describe('AGENTS', () => { - it('has expected agents', () => { - expect(Object.keys(AGENTS)).toEqual(['claude', 'codex']); + it('keeps each agent default inside its supported model and effort lists', () => { + for (const config of Object.values(AGENTS)) { + expect(config).toMatchObject({ + defaultModel: expect.any(String), + defaultEffort: expect.any(String), + }); + expect(config.models).toContain(config.defaultModel); + expect(config.efforts).toContain(config.defaultEffort); + } }); - it('no model is shared between agents', () => { - const allModels = Object.values(AGENTS).flatMap((a) => a.models); - expect(new Set(allModels).size).toBe(allModels.length); + it('keeps Claude models fully remappable to SDK model ids', () => { + expect(AGENTS.claude).toMatchObject({ + defaultModel: 'sonnet-4.6', + defaultEffort: 'high', + sdkModelIds: Object.fromEntries( + AGENTS.claude.models.map((model) => [model, expect.any(String)]), + ), + }); }); - it('sdkModelIds only reference known models', () => { - for (const [, cfg] of Object.entries(AGENTS)) { - for (const model of Object.keys(cfg.sdkModelIds)) { - expect(cfg.models).toContain(model); - } - } - }); - - it('pricing only references known models', () => { - for (const [, cfg] of Object.entries(AGENTS)) { - for (const model of Object.keys(cfg.pricing)) { - expect(cfg.models).toContain(model); - } - } + it('keeps Codex models fully priceable from token usage', () => { + expect(AGENTS.codex).toMatchObject({ + defaultModel: 'gpt-5.4', + defaultEffort: 'high', + pricing: Object.fromEntries( + AGENTS.codex.models.map((model) => [ + model, + { + input: expect.any(Number), + cachedInput: expect.any(Number), + output: expect.any(Number), + }, + ]), + ), + }); }); }); describe('PROJECTS', () => { - it('has at least one project', () => { + it('pins every benchmark project to a pre-initialized eval-baseline repo', () => { expect(PROJECTS.length).toBeGreaterThan(0); + + 
for (const project of PROJECTS) { + expect(project).toMatchObject({ + branch: 'eval-baseline', + repo: expect.stringMatching(githubRepoUrl), + description: expect.any(String), + }); + } }); - it('project names are unique', () => { + it('keeps benchmark project metadata unambiguous', () => { const names = PROJECTS.map((p) => p.name); + const repos = PROJECTS.map((p) => p.repo); + expect(new Set(names).size).toBe(names.length); + expect(new Set(repos).size).toBe(repos.length); + + for (const project of PROJECTS) { + if (!project.projectDir) continue; + expect(project.projectDir).toMatch(/^(?!\/)(?!\.\.?(?:\/|$)).+/); + } }); }); diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index 13bd7f7cad99..2d8efd422b2f 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -66,6 +66,8 @@ export interface ExecutionResult { export interface ChangedFile { path: string; status: "A" | "M" | "D" | "R"; + /** For renames, the original path before the move. */ + previousPath?: string; } // --- Setup Patterns --- From 35d5699c55198bb810182f9a4bbb5587aae8826d Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 13:50:31 +0700 Subject: [PATCH 39/63] Rewrite review-pr skill: scrollable single-page instead of Reveal.js slides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace slideshow format with a scrollable HTML page using file cards - Show complete file contents for new files, diffs for modified files - Lexend + JetBrains Mono fonts, light/dark theme, mobile-responsive - Static server on port 3000 (no live-reload) - Issues shown inline as smell-boxes, never block page generation - Simplified to 5 steps: gather → read → generate → serve → iterate --- .agents/skills/review-pr/SKILL.md | 634 +++++++++++------------------- 1 file changed, 228 insertions(+), 406 deletions(-) diff --git a/.agents/skills/review-pr/SKILL.md b/.agents/skills/review-pr/SKILL.md index 0b8a41c8fc95..5acbc64c9282 100644 --- 
a/.agents/skills/review-pr/SKILL.md +++ b/.agents/skills/review-pr/SKILL.md @@ -1,141 +1,61 @@ --- name: review-pr -description: "Generate a Reveal.js slideshow to review a PR. Use when the user says 'review pr', 'review this PR', 'slideshow review', 'pr slideshow', or wants to review PR changes in a narrative presentation format." +description: "Generate a scrollable single-page PR review. Use when the user says 'review pr', 'review this PR', 'pr review', or wants to review PR changes in a narrative format." allowed-tools: Bash, Read, Write, Edit, Agent, Grep, Glob --- -# PR Review Slideshow +# PR Review — Scrollable Single-Page -Generate a Reveal.js slideshow that walks through a PR as a narrative — starting from the main flow, then zooming into every detail. +Generate a scrollable single-page HTML document that walks through a PR as a narrative — big picture first, then every file grouped by area, tests before implementation. -## Philosophy +**Always generate the page immediately.** Never block on cleanup or fix discussions. Include issues as inline smell-boxes. -Two principles — both matter, they work on different axes: +## Principles -1. **Big picture first.** The horizontal flow goes broad → specific. Start with the high-level "what and why", then progressively zoom into each area of change. -2. **Tests first.** At each stop along the way, show the test before the implementation. The test explains *what* the behavior is. The implementation explains *how*. - -Together: you walk through the PR from the broadest overview to the smallest detail, and at every level you see the test before you see the code. - -Other principles: -- **Discuss before you fix, fix before you present.** Flag readability problems, get approval, then clean up. -- **If reading the tests doesn't make the change obvious, that's a smell.** Flag it. -- **Cover everything.** By the last slide, every changed file has been addressed. 
-- **Less is more.** Omit boilerplate, but always note what you left out. +1. **Big picture first.** Summary of what and why, then logical areas from most to least important. +2. **Tests first.** Within each area, show tests before implementation. +3. **Show whole files.** New files: complete content including imports. Modified files: diff. Use `<details>
` for files over 100 lines. +4. **Cover everything.** Every changed file appears somewhere. +5. **Issues inline.** Flag problems as smell-boxes next to the relevant file. Never block page generation. ## Step 1 — Gather PR data -Determine the PR to review. If the user provides a PR number, use that. Otherwise detect from the current branch: - ```bash -# Get PR number from current branch +# Get PR metadata (use gh pr view if a PR number is given) gh pr view --json number,title,author,headRefName,baseRefName,body,additions,deletions,changedFiles - -# Get the list of changed files gh pr diff --name-only - -# Get the full diff gh pr diff ``` -If a PR number or URL is given as an argument, pass it to `gh pr view ` and `gh pr diff `. - -## Step 2 — Read and analyze changes - -For each changed file: - -1. Read the full diff (from `gh pr diff`) -2. Read the full file content for surrounding context (use `Read` tool) -3. Identify if it's a **test file**, **type definition**, **implementation**, **config**, or **docs** - -For each implementation file, look for a corresponding test file: -- `foo.ts` → look for `foo.test.ts`, `foo.spec.ts`, `foo.test.tsx`, `__tests__/foo.ts` -- Even if the test file wasn't changed, read it for context if the implementation was changed - -## Step 3 — Identify and discuss problems - -Before building the slideshow, scan the PR for readability issues. **Don't fix anything yet** — present findings and let the user decide. - -```bash -gh pr checkout -``` - -Look for: -- Vague test names, massive test setup, missing assertions -- Changed code with no test coverage -- Unclear names, dead code, overly clever logic -- `any` types where a proper type is obvious - -Present a numbered list with concrete examples and suggested fixes. Then **wait** — the user decides what gets fixed (all, some, or none). - -After approval, fix in the working tree, lint, test, commit as a separate commit, and push. 
- -If the user says skip, go straight to Step 4 — unfixed issues will naturally show up in the slideshow as code that's hard to explain. - -## Step 4 — Plan the narrative +## Step 2 — Read all changed files -The slideshow tells a story on two axes. +For each changed file, read the full file content with the `Read` tool. Also read the full diff from `gh pr diff`. Classify each file as test, implementation, config, or docs. -### Horizontal axis: big picture → specific areas +## Step 3 — Generate the page -Group the changes into logical areas and order them broad-to-specific: +Group changes into logical areas. Within each area: tests first, then implementation, then config. -1. **Big picture** — what this PR does and why, in plain English -2. **Core areas** — the main logical groups of change, ordered from most important to least. Each area becomes a horizontal slide. -3. **Supporting changes** — config, dependencies, docs that don't fit the core areas -4. **Summary** — key takeaways +Write to `~/life/slideshows/pr-/index.html`. -### Vertical axis: test first, then implementation +**Verify every file from `gh pr diff --name-only` appears in the page** — in an area, in supporting changes, or as a bullet point. -Within each area, the vertical slides follow this order: +### HTML structure -1. **Overview** — what changed in this area, in plain English (the top slide) -2. **Test** — the test that explains the behavior. Show it fully. The reader should understand the *what* from this alone. -3. **Implementation** — the code that makes the test pass. Show enough context. -4. **More details** — types, helpers, surrounding context, additional tests - -If there's no test for an area, the overview slide flags that with a smell-box, and the implementation goes directly below it. 
- -### Check coverage — MANDATORY - -After planning, run through the list of changed files from `gh pr diff --name-only` and verify **every single file** appears somewhere in the slideshow — in an area, in a zoom-in, or in supporting changes. - -This is not optional. If a file is missing from the slideshow, the review is incomplete. Use `file-path` spans for every file so coverage can be verified by searching the HTML. - -For files with trivial changes (e.g. lockfiles, tsconfig one-liners), a bullet point in the Supporting Changes slide is enough. But they must appear. - -## Step 5 — Generate the slideshow - -Pick a short unique ID for this slideshow — use the PR number (e.g. `pr-34365`). The output directory is: +The page has this structure: ``` -~/life/slideshows// +Sticky topbar (nav links to each area) +Header (title, author, branch, stats) +Big picture section +Area 1 (test files → impl files → config) +Area 2 +... +Supporting changes (config, lockfiles, docs) ``` -Write the slideshow to `~/life/slideshows//index.html`. - -### Narrative structure +### Complete HTML template -``` -[Title] → [Big Picture] → [Area 1] → [Area 2] → ... → [Supporting] → [Summary] - ↓ ↓ ↓ - [Test 1a] [Test 2a] [Config diffs] - ↓ ↓ - [Impl 1a] [Impl 2a] - ↓ - [Test 1b] - ↓ - [Impl 1b] -``` - -**Horizontal (← →)** = big picture → specific areas. Read left-to-right to understand the shape of the PR. -**Vertical (↓)** = test first, then implementation. Press down to see *what* the behavior is (test), then *how* it works (code). - -A reader who only goes right sees each area at a glance. A reader who also goes down gets the full test-then-implementation story for each area. - -### HTML template - -Use this exact template structure. Replace `{{SLIDES}}` with generated slide content: +Copy this template exactly. Replace `{{PLACEHOLDERS}}` with actual content. ```html @@ -143,384 +63,286 @@ Use this exact template structure. 
Replace `{{SLIDES}}` with generated slide con - PR Review: {{TITLE}} + PR #{{NUMBER}}: {{TITLE}} - - - -
-
- {{SLIDES}} -
-
- - - - - - -``` -### Slide guidelines +
+ #{{NUMBER}} + Overview + +
-**Title slide:** -```html -
-

PR #{{NUMBER}}: {{TITLE}}

-

by {{AUTHOR}} · {{BRANCH}} → {{BASE}}

+
+ +
+

{{TITLE}}

+
by {{AUTHOR}} · {{BRANCH}} → {{BASE}}
{{FILES}} files +{{ADDITIONS}} - -{{DELETIONS}} + −{{DELETIONS}}
-
+
+ + +
+

What this PR does

+

{{SUMMARY}}

+
+
+ + +
+

{{N}}. {{Area Name}}

+

{{What changed in this area}}

+ +
+
+ + + + + + + + + + + + + ``` -**Big picture slide — sets up the story, previews the areas:** -```html -
-

What this PR does

-

{{2-3 sentence summary of the change and why it matters}}

-

Areas of change

-
    -
  1. {{Area 1}} — {{one-liner}}
  2. -
  3. {{Area 2}} — {{one-liner}}
  4. -
  5. {{Area 3}} — {{one-liner}}
  6. -
-

→ to walk through each area

-
-``` +### Building blocks -**Area slide — overview on top, test below, implementation below that:** -```html -
- -
-

{{Area name}}

-

{{What changed in this area and why, in plain English}}

- path/to/main-file.ts -

-// Show just the key change — the "headline" that orients the reader
-    
-
{{How this area connects to the rest of the PR}}
-

↓ tests, then implementation

-
- - -
-

What the tests say

- path/to/file.test.ts -

-// Show the full test — the reader should now understand the expected behavior
-    
-
{{Plain-English summary of what this test tells us}}
-
- - -
-

Implementation

- path/to/file.ts -

-// Show the implementation with enough surrounding context
-    
-
{{Why this approach was taken}}
-
- - -
-``` +Copy-paste these patterns to build the page content. -**Area with no test (flag the smell, show implementation directly):** +**New file — shown directly (use for files under ~100 lines):** ```html -
-
-

{{Area name}}

-

{{What changed and why}}

- path/to/file.ts -

-// Show the changed code
-    
-
🔍 No test covers this change — the behavior has to be inferred from the implementation.
-

↓ details

-
- -
+
+
+ impl + new + path/to/file.ts +
+

What this file does.

+
{{FULL FILE CONTENT}}
+
``` -**Supporting changes slide — for files that don't fit the main flow:** +**New file — collapsed (use for files over ~100 lines):** ```html -
-
-

Supporting changes

-

These files support the main flow but aren't part of it:

-
    -
  • package.json — added dependency X
  • -
  • tsconfig.json — enabled option Y
  • -
-

↓ details

-
- -
+
+
+ test + new + path/to/file.test.ts +
+

What this test covers.

+
+ Full file ({{N}} lines) +
{{FULL FILE CONTENT}}
+
+
``` -**When omitting code:** +**Modified file — diff:** ```html -
⏭ 47 lines of error handling omitted — standard try/catch pattern
+
+
+ modified + path/to/file.ts +
+

What changed.

+
-old line
++new line
+
``` -**When a test exists but doesn't fully explain the code:** +**Supporting change — no code block needed:** ```html -
🔍 The test only covers the happy path — the implementation handles 3 edge cases that aren't tested.
+
+
+ config + modified + yarn.lock +
+

Lockfile updated for new dependencies.

+
``` -**Diff highlights for before → after:** +**Inline issue (place after a file card):** ```html -
- const oldWay = doThing(a, b);
-
+ const newWay = doThingBetter(a, b, options);
+
No unit tests for this file.
``` -**Summary slide:** +**Context note (place after a file card):** ```html -
-

Summary

-
    -
  • {{Key takeaway 1}}
  • -
  • {{Key takeaway 2}}
  • -
-
{{Open questions or concerns, if any}}
-
+
This is the only caller of the renamed function.
``` -### Code display rules +### Badge reference + +| Badge | Class | Use for | +|-------|-------|---------| +| `test` | `badge-test` | Test files | +| `impl` | `badge-impl` | Implementation files | +| `config` | `badge-config` | Config, docs, prompts, lockfiles | +| `new` | `badge-new` | New files (combine with test/impl/config) | +| `modified` | `badge-modified` | Modified files | -1. **Horizontal top slides = area overview** — the headline change, just enough to follow the big picture going left-to-right -2. **First zoom = test** — show complete test bodies, the reader now understands the behavior -3. **Second zoom = implementation** — the code that makes the test pass, with surrounding context -3. **Use `data-line-numbers="X-Y"` to highlight changed lines** within a larger code block -4. **One concept per slide** — split large changes across multiple vertical slides -5. **Max ~30 lines of code per slide** — if more, split or omit with an omitted-box -6. **HTML-escape all code content** — replace `<` with `<`, `>` with `>`, `&` with `&` in all code blocks and diff divs -7. **Every changed file must appear somewhere** — this is the most important rule. Run `gh pr diff --name-only` and check every file off against the slideshow. Missing files = incomplete review. 
Use `` for each file so coverage is verifiable +### Syntax highlighting languages -## Step 6 — Write the server and start it +| Language | Class | Use for | +|----------|-------|---------| +| TypeScript | `language-typescript` | `.ts`, `.tsx`, `.js`, `.jsx` files | +| Diff | `language-diff` | Modified file diffs (lines start with `+`/`-`) | +| JSON | `language-json` | `.json` files | +| Markdown | `language-markdown` | `.md` files | -Write this live-reload server to `~/life/slideshows//server.mjs`: +### HTML escaping + +All code content inside `` blocks must be HTML-escaped: +- `&` → `&` +- `<` → `<` +- `>` → `>` + +Tip: use a Node script to read files and generate escaped HTML when there are many new files. + +## Step 4 — Serve the page + +Kill any existing server on port 3000, write the server, start it: + +```bash +lsof -ti:3000 | xargs kill -9 2>/dev/null || true +``` + +Write this static server to `~/life/slideshows/pr-/server.mjs`: ```javascript import { createServer } from 'node:http'; -import { readFileSync, watch } from 'node:fs'; +import { readFileSync } from 'node:fs'; import { join, extname } from 'node:path'; const dir = new URL('.', import.meta.url).pathname; const port = 3000; -let clients = []; - -watch(dir, { recursive: true }, (event, filename) => { - if (filename === 'server.mjs') return; - clients.forEach(res => { - try { res.write('data: reload\n\n'); } catch {} - }); -}); createServer((req, res) => { - if (req.url === '/__sse') { - res.writeHead(200, { - 'Content-Type': 'text/event-stream', - 'Cache-Control': 'no-cache', - 'Connection': 'keep-alive', - }); - res.write('data: connected\n\n'); - clients.push(res); - req.on('close', () => { clients = clients.filter(c => c !== res); }); - return; - } - try { const filePath = join(dir, req.url === '/' ? 
'index.html' : req.url); - let content = readFileSync(filePath); + const content = readFileSync(filePath); const ext = extname(filePath); const types = { '.html': 'text/html', '.js': 'text/javascript', '.css': 'text/css', '.json': 'application/json', - '.mjs': 'text/javascript', }; res.writeHead(200, { 'Content-Type': types[ext] || 'application/octet-stream' }); - - if (ext === '.html') { - content = content.toString().replace('', - `\n`); - } res.end(content); } catch { res.writeHead(404).end('Not found'); } }).listen(port, () => { console.log(`\n PR Review: http://localhost:${port}\n`); - console.log(' Watching for changes...\n'); }); ``` -Then start it: +Then: ```bash -mkdir -p ~/life/slideshows/ -# Write server.mjs and index.html first, then: -node ~/life/slideshows//server.mjs & -open http://localhost:3000 # macOS +node ~/life/slideshows/pr-/server.mjs & # run_in_background: true +open http://localhost:3000 ``` -Run the server in the background using Bash with `run_in_background: true`. - -## Step 7 — Iterate - -After generating the initial slideshow, tell the user: -- The slideshow is live at http://localhost:3000 -- They can ask you to update specific slides -- The browser will auto-reload when you write changes - -When the user asks for updates, just rewrite `~/life/slideshows//index.html` — the browser will auto-reload. - -## Important rules +## Step 5 — Iterate -- **Discuss fixes first.** Scan for readability problems, present them, wait for approval before changing code. -- **Horizontal = big picture.** A reader pressing only → sees each area of change at a glance. -- **Vertical = test first, then implementation.** Press ↓ to see the test (what), then the code (how). -- **Cover everything.** Every changed file appears in the slideshow — in the flow, in a zoom-in, or in supporting changes. -- **Always HTML-escape code.** `<` → `<`, `>` → `>`, `&` → `&`. 
-- **Kill any existing server on port 3000** before starting: `lsof -ti:3000 | xargs kill -9 2>/dev/null || true` -- **Note omissions.** If you skip code, always say what and roughly how much. -- **One concept per slide.** Use vertical slides to go deeper, not wider. -- **Separate fix commit.** Never mix review fixes with the author's commits. +Tell the user: +- The page is live at http://localhost:3000 +- They can ask to update specific sections +- Refresh the browser after updates From 842ac2850b173bbabc177ebfd54759109330dbde Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 17:05:17 +0700 Subject: [PATCH 40/63] Update review-pr skill: two-layer format, TS-highlighted diffs, readability review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Two layers per area: curated walkthrough (API→Tests→Impl) + collapsed full files - Use language-typescript with data-diff attribute instead of language-diff - Post-processing script for line-level add/remove backgrounds on top of TS highlighting - Add readability review guidance: logical order, clear names, comments, test quality - Order areas high-level to low-level --- .agents/skills/review-pr/SKILL.md | 158 ++++++++++++++++++------------ 1 file changed, 94 insertions(+), 64 deletions(-) diff --git a/.agents/skills/review-pr/SKILL.md b/.agents/skills/review-pr/SKILL.md index 5acbc64c9282..ee72ae87f8fe 100644 --- a/.agents/skills/review-pr/SKILL.md +++ b/.agents/skills/review-pr/SKILL.md @@ -6,57 +6,70 @@ allowed-tools: Bash, Read, Write, Edit, Agent, Grep, Glob # PR Review — Scrollable Single-Page -Generate a scrollable single-page HTML document that walks through a PR as a narrative — big picture first, then every file grouped by area, tests before implementation. +Generate a scrollable single-page HTML document that reviews a PR as a readable narrative. -**Always generate the page immediately.** Never block on cleanup or fix discussions. 
Include issues as inline smell-boxes. +**Always generate the page immediately.** Never block on cleanup or fix discussions. ## Principles -1. **Big picture first.** Summary of what and why, then logical areas from most to least important. -2. **Tests first.** Within each area, show tests before implementation. -3. **Show whole files.** New files: complete content including imports. Modified files: diff. Use `<details>
` for files over 100 lines. -4. **Cover everything.** Every changed file appears somewhere. -5. **Issues inline.** Flag problems as smell-boxes next to the relevant file. Never block page generation. +1. **Two layers per area.** The top layer is a curated, readable walkthrough — API surface, key test assertions, and core implementation logic woven together with prose. Only the important parts. Below it, the full files are collapsed in `<details>
` for reference. +2. **High-level to low-level.** Order areas from entry points and orchestration down to utilities and types. The reader understands architecture before details. +3. **API → Tests → Implementation.** Within each area's readable section, show the API first (types, interfaces, exports), then the tests (what does it do?), then the implementation (how?). +4. **Review readability.** For each file, assess: logical order? Clear names? Comments where the *why* isn't obvious? Tests readable enough to serve as docs? Flag issues as smell-boxes. Call out well-written tests with note-boxes. +5. **Cover everything.** Every changed file appears somewhere. ## Step 1 — Gather PR data ```bash -# Get PR metadata (use gh pr view if a PR number is given) gh pr view --json number,title,author,headRefName,baseRefName,body,additions,deletions,changedFiles gh pr diff --name-only gh pr diff ``` +If a PR number or URL is given as an argument, pass it to `gh pr view ` and `gh pr diff `. + ## Step 2 — Read all changed files -For each changed file, read the full file content with the `Read` tool. Also read the full diff from `gh pr diff`. Classify each file as test, implementation, config, or docs. +Read the full file content of every changed file with the `Read` tool. Also read the full diff. Classify each file as test, implementation, config, or docs. ## Step 3 — Generate the page -Group changes into logical areas. Within each area: tests first, then implementation, then config. +For each area, write two layers: + +### Layer 1: Readable walkthrough (always visible) + +A curated narrative that mixes prose with **short code snippets** — only the important parts. Structure it as: + +1. **API** — key types, interfaces, function signatures, exports. The contract. +2. **Tests** — the most important test cases. What the behavior is. Cherry-pick the assertions that explain the module. +3. **Implementation** — the core logic. Skip boilerplate, show the interesting parts. 
+ +Use narrative `

` tags between snippets to explain what the reader is looking at and review readability. + +### Layer 2: Full files (always collapsed) + +Below the walkthrough, include every file in the area as a collapsed `

` block with the complete file content (or diff for modified files). The reader expands these for reference. Write to `~/life/slideshows/pr-/index.html`. -**Verify every file from `gh pr diff --name-only` appears in the page** — in an area, in supporting changes, or as a bullet point. +**Verify every file from `gh pr diff --name-only` appears in the page.** ### HTML structure -The page has this structure: - ``` -Sticky topbar (nav links to each area) -Header (title, author, branch, stats) +Sticky topbar (nav links) +Header (title, author, stats) Big picture section -Area 1 (test files → impl files → config) +Area 1 + Readable walkthrough (API → Tests → Implementation snippets) + Full files (collapsed) Area 2 -... -Supporting changes (config, lockfiles, docs) + ... +Supporting changes ``` ### Complete HTML template -Copy this template exactly. Replace `{{PLACEHOLDERS}}` with actual content. - ```html @@ -132,6 +145,8 @@ Copy this template exactly. Replace `{{PLACEHOLDERS}}` with actual content. details > summary { cursor: pointer; padding: 9px 14px; font-family: 'JetBrains Mono', monospace; font-size: 12px; color: var(--muted); background: var(--surface); border-top: 1px solid var(--card-border); user-select: none; } details > summary:hover { color: var(--fg); } details[open] > summary { border-bottom: 1px solid var(--card-border); } + .diff-line-add { display: block; background: var(--add-bg); margin: 0 -10px; padding: 0 10px; } + .diff-line-del { display: block; background: var(--del-bg); margin: 0 -10px; padding: 0 10px; } .area-divider { border: none; border-top: 2px solid var(--border); margin: 48px 0 40px; } @media (max-width: 768px), (max-height: 500px) { body { font-size: 14px; } @@ -181,72 +196,92 @@ Copy this template exactly. Replace `{{PLACEHOLDERS}}` with actual content.

{{N}}. {{Area Name}}

-

{{What changed in this area}}

- +

{{What this area does}}

+ + +

- - - - + ``` ### Building blocks -Copy-paste these patterns to build the page content. - -**New file — shown directly (use for files under ~100 lines):** +**Layer 1 — Readable walkthrough snippet** (curated excerpt with prose): ```html
-
- impl - new - path/to/file.ts +
+

API: The pipeline is typed as a single function returning TrialResult:

+
+
export async function runTask(config: TrialConfig): Promise<TrialResult>
+
+

Tests: The ordering test makes the sequential contract clear:

-

What this file does.

-
{{FULL FILE CONTENT}}
+
await runTask(baseConfig);
+expect(callOrder).toEqual(['prepare', 'agent', 'grade']);
+
+

Implementation: The pipeline is strictly sequential — grade needs the agent's file changes:

+
+
const paths = await prepareTrial(config.project, trialId, logger);
+const execution = await agent.execute({ prompt, projectPath, ... });
+const { grading, quality } = await grade(paths, logger, execution.duration);
``` -**New file — collapsed (use for files over ~100 lines):** +**Layer 2 — Full file (collapsed, for new files):** ```html
- test + impl new - path/to/file.test.ts + path/to/file.ts
-

What this test covers.

Full file ({{N}} lines) -
{{FULL FILE CONTENT}}
+
{{FULL FILE CONTENT, HTML-ESCAPED}}
``` -**Modified file — diff:** +**Layer 2 — Full file (collapsed, for modified files with diff):** + +Use `language-typescript data-diff` — this gives TypeScript syntax highlighting plus line-level add/remove backgrounds via the post-processing script. Lines starting with `+` get green background, `-` get red. + ```html
modified path/to/file.ts
-

What changed.

-
-old line
+  
+ Diff +
-old line
 +new line
+
``` -**Supporting change — no code block needed:** +**Supporting change — no code needed:** ```html
@@ -258,14 +293,14 @@ Copy-paste these patterns to build the page content.
``` -**Inline issue (place after a file card):** +**Inline issue:** ```html
No unit tests for this file.
``` -**Context note (place after a file card):** +**Positive note:** ```html -
This is the only caller of the renamed function.
+
These test names read like a specification — good documentation.
``` ### Badge reference @@ -278,33 +313,30 @@ Copy-paste these patterns to build the page content. | `new` | `badge-new` | New files (combine with test/impl/config) | | `modified` | `badge-modified` | Modified files | -### Syntax highlighting languages +### Syntax highlighting -| Language | Class | Use for | -|----------|-------|---------| -| TypeScript | `language-typescript` | `.ts`, `.tsx`, `.js`, `.jsx` files | -| Diff | `language-diff` | Modified file diffs (lines start with `+`/`-`) | -| JSON | `language-json` | `.json` files | -| Markdown | `language-markdown` | `.md` files | +| Class | Use for | +|-------|---------| +| `language-typescript` | `.ts`, `.tsx`, `.js`, `.jsx` (new files) | +| `language-typescript` + `data-diff` attribute | Modified file diffs — gets TS highlighting plus line-level add/remove backgrounds | +| `language-json` | `.json` files | +| `language-markdown` | `.md` files | -### HTML escaping +**Important:** Do NOT use `language-diff` — it only does `+`/`-` coloring without syntax highlighting. Instead use `language-typescript` with the `data-diff` attribute for diffs. The post-processing script handles line backgrounds. -All code content inside `<code>` blocks must be HTML-escaped: -- `&` → `&amp;` -- `<` → `&lt;` -- `>` → `&gt;` +### HTML escaping -Tip: use a Node script to read files and generate escaped HTML when there are many new files. +All code inside `<code>` blocks must be escaped: `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`. 
## Step 4 — Serve the page -Kill any existing server on port 3000, write the server, start it: +Kill any existing server, write a static server, start it: ```bash lsof -ti:3000 | xargs kill -9 2>/dev/null || true ``` -Write this static server to `~/life/slideshows/pr-/server.mjs`: +Write to `~/life/slideshows/pr-/server.mjs`: ```javascript import { createServer } from 'node:http'; @@ -333,8 +365,6 @@ createServer((req, res) => { }); ``` -Then: - ```bash node ~/life/slideshows/pr-/server.mjs & # run_in_background: true open http://localhost:3000 From 6e5fcf4a94ff87238d5cd1e9d4fd472fa947c5ef Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 17:09:38 +0700 Subject: [PATCH 41/63] Update review-pr skill: show full interface bodies in walkthrough Principle 3 now explicitly requires showing complete interface definitions where they're first relevant, not just type names. --- .agents/skills/review-pr/SKILL.md | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/.agents/skills/review-pr/SKILL.md b/.agents/skills/review-pr/SKILL.md index ee72ae87f8fe..5a655dbeafff 100644 --- a/.agents/skills/review-pr/SKILL.md +++ b/.agents/skills/review-pr/SKILL.md @@ -14,7 +14,7 @@ Generate a scrollable single-page HTML document that reviews a PR as a readable 1. **Two layers per area.** The top layer is a curated, readable walkthrough — API surface, key test assertions, and core implementation logic woven together with prose. Only the important parts. Below it, the full files are collapsed in `
<details>` for reference. 2. **High-level to low-level.** Order areas from entry points and orchestration down to utilities and types. The reader understands architecture before details. -3. **API → Tests → Implementation.** Within each area's readable section, show the API first (types, interfaces, exports), then the tests (what does it do?), then the implementation (how?). +3. **API → Tests → Implementation.** Within each area's readable section, show the API first (types, interfaces, exports), then the tests (what does it do?), then the implementation (how?). **Show full interface bodies** — not just names. The reader should see every field of `TrialResult`, `AgentConfig`, etc. in the walkthrough where they're first relevant. Don't defer to "see types.ts". 4. **Review readability.** For each file, assess: logical order? Clear names? Comments where the *why* isn't obvious? Tests readable enough to serve as docs? Flag issues as smell-boxes. Call out well-written tests with note-boxes. 5. **Cover everything.** Every changed file appears somewhere. @@ -228,12 +228,33 @@ document.querySelectorAll('code[data-diff]').forEach(block => { ### Building blocks **Layer 1 — Readable walkthrough snippet** (curated excerpt with prose): + +Show full interface bodies where they're first relevant — not just names: + ```html 
-

API: The pipeline is typed as a single function returning TrialResult:

+

API: The pipeline takes a config and returns a full result:

-
export async function runTask(config: TrialConfig): Promise<TrialResult>
+
export async function runTask(config: TrialConfig): Promise<TrialResult>
+
+export interface TrialConfig {
+  project: Project;
+  agent: AgentName;    // "claude" | "codex"
+  model: string;
+  effort: string;
+  prompt: string;
+}
+
+export interface TrialResult {
+  schemaVersion: 1;
+  project: string;
+  agent: string;
+  model: string;
+  execution: ExecutionResult;
+  grading: GradingResult;
+  quality: QualityResult;
+}

Tests: The ordering test makes the sequential contract clear:

From 87abae4ab6f0e3a3e5d1235940cc51884186613b Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 17:46:50 +0700 Subject: [PATCH 42/63] Refactor: use composition for AgentRunConfig instead of extends Extract AgentRunConfig { agent, model, effort } and compose it as a `run` field in TrialConfig, ExecutionResult, and TrialResult instead of spreading via extends/inheritance. --- scripts/eval/eval.ts | 10 ++--- scripts/eval/lib/agents/claude-code.ts | 4 +- scripts/eval/lib/agents/codex.ts | 2 +- scripts/eval/lib/run-task.test.ts | 18 +++------ scripts/eval/lib/run-task.ts | 7 ++-- scripts/eval/types.ts | 53 ++++++++++++++------------ 6 files changed, 44 insertions(+), 50 deletions(-) diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 38b723fc4f00..5bfa4339e6be 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -113,7 +113,7 @@ const main = defineCommand({ process.exit(1); } return promptNames.map((prompt) => ({ - config: { project, agent, model, effort, prompt, verbose: args.verbose } as TrialConfig, + config: { project, run: { agent, model, effort }, prompt, verbose: args.verbose } as TrialConfig, label: `${model}+${prompt}`, })); }); @@ -128,12 +128,12 @@ const main = defineCommand({ const runId = randomUUID().slice(0, 8); logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); if (configs.length === 1) { - const { agent, model, effort, prompt } = configs[0].config; + const { run: { agent, model, effort }, prompt } = configs[0].config; logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${prompt}`); } else { logger.log(`${configs.length} parallel runs`); for (const [agent, { models }] of Object.entries(AGENTS)) { - const active = models.filter((m) => configs.some((c) => c.config.model === m)); + const active = models.filter((m) => configs.some((c) => c.config.run.model === m)); if (active.length > 0) logger.log(` ${agent}: ${active.join(", ")}`); } logger.log(` prompts: ${[...new 
Set(promptNames)].join(", ")}`); @@ -182,8 +182,8 @@ const main = defineCommand({ const ghost = r.grading.ghostStories; const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; return [ - r.agent, - r.model, + r.run.agent, + r.run.model, r.prompt, r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL"), ghostStr, diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index 774d356c9797..10b0224db96f 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -117,9 +117,7 @@ export const claudeAgent: Agent = { await writeFile(join(resultsDir, "transcript.json"), JSON.stringify(messages, null, 2)); return { - agent: "claude", - model, - effort, + run: { agent: "claude", model, effort }, cost, duration, durationApi, diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index 3e5e08fcc086..464e8af54d45 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -81,6 +81,6 @@ export const codexAgent: Agent = { await writeFile(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); - return { agent: "codex", model, effort, cost, duration, turns }; + return { run: { agent: "codex", model, effort }, cost, duration, turns }; }, }; diff --git a/scripts/eval/lib/run-task.test.ts b/scripts/eval/lib/run-task.test.ts index 73f0c170c57e..db2d4ab1a8d3 100644 --- a/scripts/eval/lib/run-task.test.ts +++ b/scripts/eval/lib/run-task.test.ts @@ -65,9 +65,7 @@ function setupMocks(overrides?: { }); vi.mocked(claudeAgent.execute).mockResolvedValue({ - agent: 'claude', - model: 'sonnet-4.6', - effort: 'high', + run: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, cost, duration: 45.2, turns: 12, @@ -93,9 +91,7 @@ function setupMocks(overrides?: { const baseConfig: TrialConfig = { project: { name: 'test-project', repo: 'https://github.com/test/repo', branch: 'main' }, - agent: 
'claude', - model: 'sonnet-4.6', - effort: 'high', + run: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, prompt: 'setup', }; @@ -108,15 +104,11 @@ describe('runTask pipeline', () => { expect(result).toMatchObject({ schemaVersion: 1, project: 'test-project', - agent: 'claude', - model: 'sonnet-4.6', - effort: 'high', + run: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, prompt: 'setup', baselineCommit: 'deadbeef', execution: { - agent: 'claude', - model: 'sonnet-4.6', - effort: 'high', + run: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, cost: 0.42, duration: 45.2, turns: 12, @@ -213,7 +205,7 @@ describe('runTask pipeline', () => { vi.mocked(claudeAgent.execute).mockImplementation(async () => { callOrder.push('agent'); - return { agent: 'claude', model: 'sonnet-4.6', effort: 'high', cost: 0.1, duration: 10, turns: 3 }; + return { run: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, cost: 0.1, duration: 10, turns: 3 }; }); vi.mocked(grade).mockImplementation(async () => { diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index 73cdcc8fd72c..c475bd23bf7c 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -19,7 +19,8 @@ export async function runTask( config: TrialConfig, logger?: Logger, ): Promise { - const { project, agent: agentName, model, effort, prompt: promptName } = config; + const { project, run, prompt: promptName } = config; + const { agent: agentName, model, effort } = run; const log = logger ?? 
createLogger(); const trialId = generateTrialId(project.name, agentName, model, promptName || "setup"); const timestamp = new Date().toISOString(); @@ -58,9 +59,7 @@ export async function runTask( const result: TrialResult = { schemaVersion: 1, project: project.name, - agent: agentName, - model, - effort, + run, timestamp, prompt: promptName || "setup", baselineCommit: paths.baselineCommit, diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index 2d8efd422b2f..98e804f79458 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -14,10 +14,31 @@ export interface Logger { logError: (msg: string) => void; } -// --- Agent Name --- +// --- Agent --- export type AgentName = "claude" | "codex"; +/** Agent + model + effort — the three values that define how the agent runs. */ +export interface AgentRunConfig { + agent: AgentName; + /** Friendly model name (e.g. "sonnet-4.6", "gpt-5.4"). Must exist in `AGENTS[agent].models`. */ + model: string; + /** Reasoning effort level. Must exist in `AGENTS[agent].efforts`. */ + effort: string; +} + +export interface Agent { + name: AgentName; + execute(params: { + prompt: string; + projectPath: string; + model: string; + effort: string; + resultsDir: string; + logger: Logger; + }): Promise; +} + // --- Project --- export interface Project { @@ -31,11 +52,13 @@ export interface Project { // --- Trial Config --- export interface TrialConfig { + /** Which project to evaluate (cloned from its eval-baseline branch). */ project: Project; - agent: AgentName; - model: string; - effort: string; + /** Agent, model, and effort level. */ + run: AgentRunConfig; + /** Prompt name — maps to `prompts/{name}.md` (e.g. "setup", "self-heal"). */ prompt: string; + /** Log agent messages to stdout. 
*/ verbose?: boolean; } @@ -52,9 +75,7 @@ export interface TrialPaths { // --- Execution --- export interface ExecutionResult { - agent: string; - model: string; - effort: string; + run: AgentRunConfig; cost?: number; duration: number; durationApi?: number; @@ -131,9 +152,7 @@ export interface QualityResult { export interface TrialResult { schemaVersion: 1; project: string; - agent: string; - model: string; - effort: string; + run: AgentRunConfig; prompt: string; timestamp: string; baselineCommit: string; @@ -141,17 +160,3 @@ export interface TrialResult { grading: GradingResult; quality: QualityResult; } - -// --- Agent Interface --- - -export interface Agent { - name: AgentName; - execute(params: { - prompt: string; - projectPath: string; - model: string; - effort: string; - resultsDir: string; - logger: Logger; - }): Promise; -} From c0720ee5543c56a16ee94e45563ee70621462fb4 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 20:15:41 +0700 Subject: [PATCH 43/63] Rename eval data structures for clarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - AgentRunConfig → AgentVariant (it's the experimental variant, not a "run config") - Agent → AgentDriver, AgentConfig → AgentDefinition (disambiguate) - ExecutionResult → Execution, GradingResult → Grade, QualityResult → QualityScore - TrialResult → TrialReport, TrialPaths → TrialWorkspace - ChangedFile → FileChange, Pricing → TokenPricing, Environment → EvalEnvironment - GhostStoriesResult → GhostStoryGrade, GhostStoryRunResult → GhostStoryOutput - QualityWeights → ScoreWeights, DEFAULT_QUALITY_WEIGHTS → DEFAULT_SCORE_WEIGHTS - Field renames: run → variant, grading → grade, quality → score, changedFiles → fileChanges, storybookFiles → storybookChanges - Extract AgentExecuteParams with variant: AgentVariant (reuses the model) - Remove redundant run field from Execution (lives on TrialReport only) --- scripts/eval/config.ts | 14 +++--- scripts/eval/eval.ts | 42 
++++++++-------- scripts/eval/lib/agents/claude-code.ts | 11 ++--- scripts/eval/lib/agents/codex.ts | 12 ++--- scripts/eval/lib/ghost-stories.ts | 4 +- scripts/eval/lib/grade.test.ts | 10 ++-- scripts/eval/lib/grade.ts | 46 ++++++++--------- scripts/eval/lib/prepare-trial.ts | 4 +- scripts/eval/lib/run-task.test.ts | 47 +++++++++--------- scripts/eval/lib/run-task.ts | 49 +++++++++---------- scripts/eval/lib/utils.ts | 6 +-- scripts/eval/types.ts | 68 +++++++++++++------------- 12 files changed, 154 insertions(+), 159 deletions(-) diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index 8c47155e87ca..59cbd0af905c 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -5,11 +5,11 @@ * and cost estimation utilities. */ -import type { AgentName, Project } from "./types.ts"; +import type { AgentId, Project } from "./types.ts"; // --- Pricing --- -export interface Pricing { +export interface TokenPricing { input: number; cachedInput: number; output: number; @@ -21,20 +21,20 @@ export interface TokenUsage { outputTokens: number; } -// --- Agent Config --- +// --- Agent Definition --- -export interface AgentConfig { +export interface AgentDefinition { models: string[]; defaultModel: string; /** Map friendly model names to SDK-specific model IDs (e.g. "sonnet-4.6" → "claude-sonnet-4-6"). */ sdkModelIds: Record<string, string>; /** Per-million-token pricing for manual cost estimation (agents that don't report cost natively). */ - pricing: Record<string, Pricing>; + pricing: Record<string, TokenPricing>; efforts: string[]; defaultEffort: string; } -export const AGENTS: Record<AgentName, AgentConfig> = { +export const AGENTS: Record<AgentId, AgentDefinition> = { claude: { models: ["sonnet-4.6", "opus-4.6", "haiku-4.5"], defaultModel: "sonnet-4.6", @@ -62,7 +62,7 @@ export const AGENTS: Record<AgentName, AgentConfig> = { // --- Cost Estimation --- /** Estimate cost from token usage using the pricing table. 
*/ -export function estimateCost(agent: AgentName, model: string, usage: TokenUsage): number | undefined { +export function estimateCost(agent: AgentId, model: string, usage: TokenUsage): number | undefined { const pricing = AGENTS[agent].pricing[model]; if (!pricing) return undefined; const freshInput = usage.inputTokens - usage.cachedInputTokens; diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 5bfa4339e6be..44cd83d5ba98 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -16,7 +16,7 @@ import { defineCommand, runMain } from "citty"; import { randomUUID } from "node:crypto"; import pc from "picocolors"; -import type { AgentName, TrialConfig, TrialResult } from "./types.ts"; +import type { AgentId, TrialConfig, TrialReport } from "./types.ts"; import { AGENTS, PROJECTS } from "./config.ts"; import { runTask } from "./lib/run-task.ts"; import { createLogger, formatDuration, formatCost, formatTable, listPrompts } from "./lib/utils.ts"; @@ -72,7 +72,7 @@ const main = defineCommand({ const allModels = Object.values(AGENTS).flatMap((cfg) => cfg.models); // Determine agent → model pairs - let agentModels: Array<{ agent: AgentName; model: string }>; + let agentModels: Array<{ agent: AgentId; model: string }>; if (args.model) { // Models specified — infer agent per model @@ -82,7 +82,7 @@ const main = defineCommand({ logger.log(pc.red(`Unknown model: ${model}. Available: ${allModels.join(", ")}`)); process.exit(1); } - return { agent: entry[0] as AgentName, model }; + return { agent: entry[0] as AgentId, model }; }); // If --agent is also specified, filter to matching agents if (args.agent) { @@ -92,12 +92,12 @@ const main = defineCommand({ } else if (args.agent) { // Agents specified — use default model per agent agentModels = args.agent.split(",").map((name) => { - const cfg = AGENTS[name as AgentName]; + const cfg = AGENTS[name as AgentId]; if (!cfg) { logger.log(pc.red(`Unknown agent: ${name}. 
Options: ${Object.keys(AGENTS).join(", ")}`)); process.exit(1); } - return { agent: name as AgentName, model: cfg.defaultModel }; + return { agent: name as AgentId, model: cfg.defaultModel }; }); } else { // Default: single claude run @@ -113,7 +113,7 @@ const main = defineCommand({ process.exit(1); } return promptNames.map((prompt) => ({ - config: { project, run: { agent, model, effort }, prompt, verbose: args.verbose } as TrialConfig, + config: { project, variant: { agent, model, effort }, prompt, verbose: args.verbose } as TrialConfig, label: `${model}+${prompt}`, })); }); @@ -128,12 +128,12 @@ const main = defineCommand({ const runId = randomUUID().slice(0, 8); logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); if (configs.length === 1) { - const { run: { agent, model, effort }, prompt } = configs[0].config; + const { variant: { agent, model, effort }, prompt } = configs[0].config; logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${prompt}`); } else { logger.log(`${configs.length} parallel runs`); for (const [agent, { models }] of Object.entries(AGENTS)) { - const active = models.filter((m) => configs.some((c) => c.config.run.model === m)); + const active = models.filter((m) => configs.some((c) => c.config.variant.model === m)); if (active.length > 0) logger.log(` ${agent}: ${active.join(", ")}`); } logger.log(` prompts: ${[...new Set(promptNames)].join(", ")}`); @@ -146,7 +146,7 @@ const main = defineCommand({ configs.map((c) => runTask(c.config, createLogger(configs.length > 1 ? c.label : undefined))), ); - const results: TrialResult[] = []; + const results: TrialReport[] = []; for (const [i, s] of settled.entries()) { if (s.status === "fulfilled") { results.push(s.value); @@ -163,32 +163,32 @@ const main = defineCommand({ if (results.length === 1) { const r = results[0]; - const ghost = r.grading.ghostStories; + const ghost = r.grade.ghostStories; const ghostStr = ghost ? 
`${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; logger.log(pc.bold("\nResult")); - logger.log(` Build: ${r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL")}`); + logger.log(` Build: ${r.grade.buildSuccess ? pc.green("PASS") : pc.red("FAIL")}`); logger.log(` Ghost: ${ghostStr}`); - logger.log(` TS Err: ${r.grading.typeCheckErrors}`); - logger.log(` Score: ${r.quality.score}`); + logger.log(` TS Err: ${r.grade.typeCheckErrors}`); + logger.log(` Score: ${r.score.score}`); logger.log(` Cost: ${formatCost(r.execution.cost)}`); logger.log(` Time: ${formatDuration(r.execution.duration)}`); logger.log(` Turns: ${r.execution.turns}`); } else { - results.sort((a, b) => (b.grading.ghostStories?.successRate ?? -1) - (a.grading.ghostStories?.successRate ?? -1)); + results.sort((a, b) => (b.grade.ghostStories?.successRate ?? -1) - (a.grade.ghostStories?.successRate ?? -1)); const headers = ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Score", "Cost", "Time", "Turns"]; const rows = results.map((r) => { - const ghost = r.grading.ghostStories; + const ghost = r.grade.ghostStories; const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; return [ - r.run.agent, - r.run.model, + r.variant.agent, + r.variant.model, r.prompt, - r.grading.buildSuccess ? pc.green("PASS") : pc.red("FAIL"), + r.grade.buildSuccess ? 
pc.green("PASS") : pc.red("FAIL"), ghostStr, - String(r.grading.typeCheckErrors), - String(r.quality.score), + String(r.grade.typeCheckErrors), + String(r.score.score), formatCost(r.execution.cost), formatDuration(r.execution.duration), String(r.execution.turns), @@ -199,7 +199,7 @@ const main = defineCommand({ logger.log(formatTable(headers, rows)); const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); - const ghostRates = results.map((r) => r.grading.ghostStories?.successRate).filter((r): r is number => r != null); + const ghostRates = results.map((r) => r.grade.ghostStories?.successRate).filter((r): r is number => r != null); const avgGhost = ghostRates.length > 0 ? ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; logger.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index 10b0224db96f..2d440d224127 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -3,7 +3,7 @@ import { query } from "@anthropic-ai/claude-agent-sdk"; import { writeFile } from "node:fs/promises"; import { join } from "node:path"; import { AGENTS } from "../../config.ts"; -import type { Agent, ExecutionResult, Logger } from "../../types.ts"; +import type { AgentDriver, Execution, Logger } from "../../types.ts"; function logMessage(message: SDKMessage, logger: Logger) { switch (message.type) { @@ -70,18 +70,18 @@ function logMessage(message: SDKMessage, logger: Logger) { const MAX_TURNS = 50; -export const claudeAgent: Agent = { +export const claudeAgent: AgentDriver = { name: "claude", async execute({ prompt, projectPath, - model, - effort = "high", + variant, resultsDir, logger, - }): Promise { + }): Promise { const startTime = Date.now(); + const { model, effort } = variant; const sdkModel = AGENTS.claude.sdkModelIds[model] ?? 
model; let cost: number | undefined; @@ -117,7 +117,6 @@ export const claudeAgent: Agent = { await writeFile(join(resultsDir, "transcript.json"), JSON.stringify(messages, null, 2)); return { - run: { agent: "claude", model, effort }, cost, duration, durationApi, diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index 464e8af54d45..b74e2432c740 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -1,21 +1,21 @@ import { Codex, type ModelReasoningEffort } from "@openai/codex-sdk"; import { writeFile } from "node:fs/promises"; import { join } from "node:path"; -import type { Agent, ExecutionResult } from "../../types.ts"; +import type { AgentDriver, Execution } from "../../types.ts"; import { estimateCost } from "../../config.ts"; -export const codexAgent: Agent = { +export const codexAgent: AgentDriver = { name: "codex", async execute({ prompt, projectPath, - model, - effort = "high", + variant, resultsDir, logger, - }): Promise { + }): Promise { const startTime = Date.now(); + const { model, effort } = variant; const codex = new Codex(); const thread = codex.startThread({ @@ -81,6 +81,6 @@ export const codexAgent: Agent = { await writeFile(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); - return { run: { agent: "codex", model, effort }, cost, duration, turns }; + return { cost, duration, turns }; }, }; diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index cf65bb7b4a30..459cb22e280b 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -51,7 +51,7 @@ export async function findComponentCandidates(opts: { } } -export interface GhostStoryRunResult { +export interface GhostStoryOutput { total: number; passed: number; successRate: number; @@ -68,7 +68,7 @@ export interface GhostStoryRunResult { export async function runGhostStories( candidates: string[], opts: { cwd: string }, -): Promise { +): Promise { const 
outputFile = join(tmpdir(), `ghost-stories-${Date.now()}.json`); const result = await x("npx", [ diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts index 6925d8e3fa06..8145c9f5ca7e 100644 --- a/scripts/eval/lib/grade.test.ts +++ b/scripts/eval/lib/grade.test.ts @@ -6,11 +6,11 @@ import { countTypeCheckErrors, parseChangedFiles, } from './grade'; -import type { ChangedFile } from '../types'; +import type { FileChange } from '../types'; describe('filterStorybookFiles', () => { it('matches files in .storybook/ directory', () => { - const files: ChangedFile[] = [ + const files: FileChange[] = [ { path: '.storybook/main.ts', status: 'M' }, { path: '.storybook/preview.tsx', status: 'A' }, { path: 'src/App.tsx', status: 'M' }, @@ -22,7 +22,7 @@ describe('filterStorybookFiles', () => { }); it('matches story files with various extensions', () => { - const files: ChangedFile[] = [ + const files: FileChange[] = [ { path: 'src/Button.stories.tsx', status: 'A' }, { path: 'src/Header.stories.ts', status: 'A' }, { path: 'src/Page.story.jsx', status: 'A' }, @@ -34,7 +34,7 @@ describe('filterStorybookFiles', () => { }); it('returns empty for no storybook files', () => { - const files: ChangedFile[] = [ + const files: FileChange[] = [ { path: 'src/App.tsx', status: 'M' }, { path: 'package.json', status: 'M' }, ]; @@ -46,7 +46,7 @@ describe('filterStorybookFiles', () => { }); it('matches renamed files using either side of the rename', () => { - const files: ChangedFile[] = [ + const files: FileChange[] = [ { path: 'src/Button.tsx', previousPath: 'src/Button.stories.tsx', status: 'R' }, { path: '.storybook/preview.tsx', previousPath: 'config/preview.tsx', status: 'R' }, { path: 'src/App.tsx', previousPath: 'src/Main.tsx', status: 'R' }, diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index e0237ac28566..4811417615a3 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,17 +1,17 @@ import { writeFile } from 
"node:fs/promises"; import { join } from "node:path"; -import type { GradingResult, GhostStoriesResult, QualityResult, QualityWeights, TrialPaths, ChangedFile, Logger } from "../types.ts"; -import { DEFAULT_QUALITY_WEIGHTS } from "../types.ts"; +import type { Grade, GhostStoryGrade, QualityScore, ScoreWeights, TrialWorkspace, FileChange, Logger } from "../types.ts"; +import { DEFAULT_SCORE_WEIGHTS } from "../types.ts"; import { x } from "tinyexec"; import { detectSetupPatterns } from "./setup-patterns.ts"; import { findComponentCandidates, runGhostStories } from "./ghost-stories.ts"; -/** Filter changed files to only storybook-related ones. */ -export function filterStorybookFiles(changedFiles: ChangedFile[]): ChangedFile[] { +/** Filter file changes to only storybook-related ones. */ +export function filterStorybookFiles(fileChanges: FileChange[]): FileChange[] { const isStorybookPath = (path?: string) => path != null && (path.includes(".storybook/") || /\.(stories|story)\.[tj]sx?$/.test(path)); - return changedFiles.filter( + return fileChanges.filter( (f) => isStorybookPath(f.path) || isStorybookPath(f.previousPath), ); } @@ -30,8 +30,8 @@ export function computeQualityScore( ghostSuccessRate?: number; durationSeconds?: number; }, - weights: QualityWeights = DEFAULT_QUALITY_WEIGHTS, -): QualityResult { + weights: ScoreWeights = DEFAULT_SCORE_WEIGHTS, +): QualityScore { const buildScore = opts.buildSuccess ? 1 : 0; const tcScore = Math.max(0, 1 - opts.typeCheckErrors / 20); const ghostScore = opts.ghostSuccessRate ?? 0; @@ -61,15 +61,15 @@ export function countTypeCheckErrors(tscOutput: string): number { return (tscOutput.match(/error TS\d+/g) || []).length; } -/** Parse git diff --name-status output into ChangedFile objects. */ -export function parseChangedFiles(gitOutput: string): ChangedFile[] { +/** Parse git diff --name-status output into FileChange objects. 
*/ +export function parseChangedFiles(gitOutput: string): FileChange[] { return gitOutput .trim() .split("\n") .filter(Boolean) .map((line) => { const [status, ...parts] = line.split("\t"); - const normalizedStatus = (status?.charAt(0) || "M") as ChangedFile["status"]; + const normalizedStatus = (status?.charAt(0) || "M") as FileChange["status"]; if (normalizedStatus === "R" && parts.length >= 2) { const [previousPath, path] = parts; @@ -81,17 +81,17 @@ export function parseChangedFiles(gitOutput: string): ChangedFile[] { } export async function grade( - paths: TrialPaths, + workspace: TrialWorkspace, logger: Logger, agentDuration?: number, -): Promise<{ grading: GradingResult; quality: QualityResult }> { - const { repoRoot, projectPath, resultsDir, baselineCommit } = paths; +): Promise<{ grade: Grade; score: QualityScore }> { + const { repoRoot, projectPath, resultsDir, baselineCommit } = workspace; // Changed files logger.logStep("Collecting agent changes..."); - const changedFiles = await getChangedFiles(repoRoot, baselineCommit); - const storybookFiles = filterStorybookFiles(changedFiles); - logger.logSuccess(`${changedFiles.length} files changed (${storybookFiles.length} storybook-related)`); + const fileChanges = await getChangedFiles(repoRoot, baselineCommit); + const storybookChanges = filterStorybookFiles(fileChanges); + logger.logSuccess(`${fileChanges.length} files changed (${storybookChanges.length} storybook-related)`); // Setup patterns const setupPatterns = await detectSetupPatterns(projectPath); @@ -132,28 +132,28 @@ export async function grade( // Ghost stories (only if build passed) const ghostStories = buildSuccess ? await gradeGhostStories(projectPath, logger) : undefined; - const grading: GradingResult = { + const trialGrade: Grade = { buildSuccess, buildError: buildSuccess ? undefined : buildOutput.slice(-2000), typeCheckErrors, typeCheckOutput: typeCheckErrors > 0 ? 
tscOutput.slice(-2000) : undefined, - changedFiles, - storybookFiles, + fileChanges, + storybookChanges, setupPatterns, ghostStories, }; - const quality = computeQualityScore({ + const score = computeQualityScore({ buildSuccess, typeCheckErrors, ghostSuccessRate: ghostStories?.successRate, durationSeconds: agentDuration, }); - return { grading, quality }; + return { grade: trialGrade, score }; } -async function getChangedFiles(repoRoot: string, baseline: string): Promise { +async function getChangedFiles(repoRoot: string, baseline: string): Promise { // Stage all files so `git diff --cached` picks up new files the agent created. // Safe: this runs on an ephemeral trial copy, not the real repo. await x("git", ["add", "-A"], { nodeOptions: { cwd: repoRoot } }); @@ -164,7 +164,7 @@ async function getChangedFiles(repoRoot: string, baseline: string): Promise { +async function gradeGhostStories(projectPath: string, logger: Logger): Promise { logger.logStep("Running ghost stories..."); const { candidates, error } = await findComponentCandidates({ sampleSize: 20, cwd: projectPath }); diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index 301e06f29e0e..e11f5956b61e 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -1,7 +1,7 @@ import { existsSync } from "node:fs"; import { cp, mkdir } from "node:fs/promises"; import { join } from "node:path"; -import type { Project, TrialPaths, Logger } from "../types.ts"; +import type { Project, TrialWorkspace, Logger } from "../types.ts"; import { x } from "tinyexec"; import { installDeps } from "./package-manager.ts"; import { CACHE_DIR, TRIALS_DIR } from "./utils.ts"; @@ -10,7 +10,7 @@ import { CACHE_DIR, TRIALS_DIR } from "./utils.ts"; * First run: clone eval-baseline -> install deps -> cache it. * Subsequent runs: copy from cache. Agent starts immediately. 
*/ -export async function prepareTrial(project: Project, trialId: string, logger: Logger): Promise { +export async function prepareTrial(project: Project, trialId: string, logger: Logger): Promise { const cacheDir = join(CACHE_DIR, project.name); const trialDir = join(TRIALS_DIR, trialId); const repoRoot = join(trialDir, "project"); diff --git a/scripts/eval/lib/run-task.test.ts b/scripts/eval/lib/run-task.test.ts index db2d4ab1a8d3..72acd8ce92d8 100644 --- a/scripts/eval/lib/run-task.test.ts +++ b/scripts/eval/lib/run-task.test.ts @@ -4,7 +4,7 @@ import { tmpdir } from 'node:os'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; -import type { TrialConfig, TrialResult } from '../types'; +import type { TrialConfig, TrialReport } from '../types'; // Mock external dependencies to avoid real git/storybook/vitest calls vi.mock('./prepare-trial', () => ({ @@ -65,38 +65,37 @@ function setupMocks(overrides?: { }); vi.mocked(claudeAgent.execute).mockResolvedValue({ - run: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, cost, duration: 45.2, turns: 12, }); vi.mocked(grade).mockResolvedValue({ - grading: { + grade: { buildSuccess, typeCheckErrors, - changedFiles: [ + fileChanges: [ { path: '.storybook/preview.tsx', status: 'A' }, { path: 'src/Button.stories.tsx', status: 'A' }, ], - storybookFiles: [ + storybookChanges: [ { path: '.storybook/preview.tsx', status: 'A' }, { path: 'src/Button.stories.tsx', status: 'A' }, ], setupPatterns: [{ id: 'tailwind', label: 'Tailwind CSS', sourceFiles: ['.storybook/preview.ts'] }], }, - quality: { score: buildSuccess ? 1 : 0.3, breakdown: { build: buildSuccess ? 1 : 0, typecheck: 1, ghostStories: 0, performance: 0 } }, + score: { score: buildSuccess ? 1 : 0.3, breakdown: { build: buildSuccess ? 
1 : 0, typecheck: 1, ghostStories: 0, performance: 0 } }, }); } const baseConfig: TrialConfig = { project: { name: 'test-project', repo: 'https://github.com/test/repo', branch: 'main' }, - run: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, + variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, prompt: 'setup', }; describe('runTask pipeline', () => { - it('assembles a complete TrialResult from pipeline steps', async () => { + it('assembles a complete TrialReport from pipeline steps', async () => { setupMocks(); const result = await runTask(baseConfig); @@ -104,19 +103,18 @@ describe('runTask pipeline', () => { expect(result).toMatchObject({ schemaVersion: 1, project: 'test-project', - run: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, + variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, prompt: 'setup', baselineCommit: 'deadbeef', execution: { - run: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, cost: 0.42, duration: 45.2, turns: 12, }, - grading: { + grade: { buildSuccess: true, }, - quality: { + score: { score: 1, }, }); @@ -146,14 +144,13 @@ describe('runTask pipeline', () => { expect(params).toMatchObject({ prompt: expect.stringContaining('Storybook setup'), projectPath: TMP, - model: 'sonnet-4.6', - effort: 'high', + variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, resultsDir: join(TMP, 'results'), }); expect(params.logger).toBeDefined(); - const gradePaths = vi.mocked(grade).mock.calls[0][0]; - expect(gradePaths).toMatchObject({ + const gradeWorkspace = vi.mocked(grade).mock.calls[0][0]; + expect(gradeWorkspace).toMatchObject({ baselineCommit: 'deadbeef', projectPath: TMP, resultsDir: join(TMP, 'results'), @@ -168,11 +165,11 @@ describe('runTask pipeline', () => { const resultsDir = join(TMP, 'results'); - const summary: TrialResult = JSON.parse(readFileSync(join(resultsDir, 'summary.json'), 'utf-8')); + const summary: TrialReport = JSON.parse(readFileSync(join(resultsDir, 
'summary.json'), 'utf-8')); expect(summary).toMatchObject({ schemaVersion: 1, execution: { cost: 0.42 }, - grading: { buildSuccess: true }, + grade: { buildSuccess: true }, }); const promptContent = readFileSync(join(resultsDir, 'prompt.md'), 'utf-8'); @@ -183,8 +180,8 @@ describe('runTask pipeline', () => { setupMocks({ buildSuccess: false, typeCheckErrors: 5 }); await expect(runTask(baseConfig)).resolves.toMatchObject({ - grading: { buildSuccess: false, typeCheckErrors: 5 }, - quality: { score: 0.3 }, + grade: { buildSuccess: false, typeCheckErrors: 5 }, + score: { score: 0.3 }, }); }); @@ -205,20 +202,20 @@ describe('runTask pipeline', () => { vi.mocked(claudeAgent.execute).mockImplementation(async () => { callOrder.push('agent'); - return { run: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, cost: 0.1, duration: 10, turns: 3 }; + return { cost: 0.1, duration: 10, turns: 3 }; }); vi.mocked(grade).mockImplementation(async () => { callOrder.push('grade'); return { - grading: { + grade: { buildSuccess: true, typeCheckErrors: 0, - changedFiles: [], - storybookFiles: [], + fileChanges: [], + storybookChanges: [], setupPatterns: [], }, - quality: { score: 1, breakdown: { build: 1, typecheck: 1, ghostStories: 0, performance: 0 } }, + score: { score: 1, breakdown: { build: 1, typecheck: 1, ghostStories: 0, performance: 0 } }, }; }); diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-task.ts index c475bd23bf7c..b2864a3b57c6 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-task.ts @@ -1,13 +1,13 @@ import { writeFile } from "node:fs/promises"; import { join } from "node:path"; -import type { AgentName, Logger, TrialConfig, TrialResult, Agent } from "../types.ts"; +import type { AgentId, Logger, TrialConfig, TrialReport, AgentDriver } from "../types.ts"; import { claudeAgent } from "./agents/claude-code.ts"; import { codexAgent } from "./agents/codex.ts"; import { prepareTrial } from "./prepare-trial.ts"; import { grade } from 
"./grade.ts"; import { generateTrialId, loadPrompt, captureEnvironment, createLogger } from "./utils.ts"; -const agents: Record = { +const drivers: Record = { claude: claudeAgent, codex: codexAgent, }; @@ -18,9 +18,9 @@ const agents: Record = { export async function runTask( config: TrialConfig, logger?: Logger, -): Promise { - const { project, run, prompt: promptName } = config; - const { agent: agentName, model, effort } = run; +): Promise { + const { project, variant, prompt: promptName } = config; + const { agent: agentName, model } = variant; const log = logger ?? createLogger(); const trialId = generateTrialId(project.name, agentName, model, promptName || "setup"); const timestamp = new Date().toISOString(); @@ -28,24 +28,23 @@ export async function runTask( log.log(`Preparing ${project.name}...`); // 1. Prepare the trial - const paths = await prepareTrial(project, trialId, log); + const workspace = await prepareTrial(project, trialId, log); // 2. Capture environment - await captureEnvironment(paths.resultsDir); + await captureEnvironment(workspace.resultsDir); // 3. Load the prompt const prompt = loadPrompt(promptName); - await writeFile(join(paths.resultsDir, "prompt.md"), prompt); + await writeFile(join(workspace.resultsDir, "prompt.md"), prompt); // 4. Execute the agent - log.log(` Running ${agentName} (${model}, effort=${effort})...`); - const agent = agents[agentName]; - const execution = await agent.execute({ + log.log(` Running ${agentName} (${model}, effort=${variant.effort})...`); + const driver = drivers[agentName]; + const execution = await driver.execute({ prompt, - projectPath: paths.projectPath, - model, - effort, - resultsDir: paths.resultsDir, + projectPath: workspace.projectPath, + variant, + resultsDir: workspace.resultsDir, logger: log, }); log.logSuccess( @@ -53,23 +52,23 @@ export async function runTask( ); // 5. 
Grade the results (pass agent duration for performance scoring) - const { grading, quality } = await grade(paths, log, execution.duration); + const { grade: trialGrade, score } = await grade(workspace, log, execution.duration); - // 6. Assemble final result - const result: TrialResult = { + // 6. Assemble final report + const report: TrialReport = { schemaVersion: 1, project: project.name, - run, + variant, timestamp, prompt: promptName || "setup", - baselineCommit: paths.baselineCommit, + baselineCommit: workspace.baselineCommit, execution, - grading, - quality, + grade: trialGrade, + score, }; - await writeFile(join(paths.resultsDir, "summary.json"), JSON.stringify(result, null, 2)); - log.logSuccess(`Results saved to ${paths.resultsDir}`); + await writeFile(join(workspace.resultsDir, "summary.json"), JSON.stringify(report, null, 2)); + log.logSuccess(`Results saved to ${workspace.resultsDir}`); - return result; + return report; } diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index af6c0284808a..ca2ee217cfc6 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -80,7 +80,7 @@ export function listPrompts(): string[] { // --- Environment capture --- -export interface Environment { +export interface EvalEnvironment { nodeVersion: string; /** Git branch of the eval harness (storybook monorepo), not the evaluated project. 
*/ evalBranch: string; @@ -88,7 +88,7 @@ export interface Environment { evalCommit: string; } -export async function captureEnvironment(resultsDir: string): Promise { +export async function captureEnvironment(resultsDir: string): Promise { let evalBranch = "unknown"; let evalCommit = "unknown"; try { @@ -97,7 +97,7 @@ export async function captureEnvironment(resultsDir: string): Promise; +export interface AgentExecuteParams { + prompt: string; + projectPath: string; + variant: AgentVariant; + resultsDir: string; + logger: Logger; +} + +export interface AgentDriver { + name: AgentId; + execute(params: AgentExecuteParams): Promise; } // --- Project --- @@ -55,16 +56,16 @@ export interface TrialConfig { /** Which project to evaluate (cloned from its eval-baseline branch). */ project: Project; /** Agent, model, and effort level. */ - run: AgentRunConfig; + variant: AgentVariant; /** Prompt name — maps to `prompts/{name}.md` (e.g. "setup", "self-heal"). */ prompt: string; /** Log agent messages to stdout. */ verbose?: boolean; } -// --- Trial Paths --- +// --- Trial Workspace --- -export interface TrialPaths { +export interface TrialWorkspace { trialDir: string; repoRoot: string; projectPath: string; @@ -74,17 +75,16 @@ export interface TrialPaths { // --- Execution --- -export interface ExecutionResult { - run: AgentRunConfig; +export interface Execution { cost?: number; duration: number; durationApi?: number; turns: number; } -// --- Changed Files --- +// --- File Changes --- -export interface ChangedFile { +export interface FileChange { path: string; status: "A" | "M" | "D" | "R"; /** For renames, the original path before the move. 
*/ @@ -101,7 +101,7 @@ export interface SetupPattern { // --- Ghost Stories --- -export interface GhostStoriesResult { +export interface GhostStoryGrade { candidateCount: number; total: number; passed: number; @@ -110,34 +110,34 @@ export interface GhostStoriesResult { // --- Grading --- -export interface GradingResult { +export interface Grade { buildSuccess: boolean; buildError?: string; typeCheckErrors: number; typeCheckOutput?: string; - changedFiles: ChangedFile[]; - storybookFiles: ChangedFile[]; + fileChanges: FileChange[]; + storybookChanges: FileChange[]; setupPatterns: SetupPattern[]; - ghostStories?: GhostStoriesResult; + ghostStories?: GhostStoryGrade; } // --- Quality Score --- -export interface QualityWeights { +export interface ScoreWeights { ghostStories: number; build: number; typecheck: number; performance: number; } -export const DEFAULT_QUALITY_WEIGHTS: QualityWeights = { +export const DEFAULT_SCORE_WEIGHTS: ScoreWeights = { ghostStories: 0.4, build: 0.25, typecheck: 0.25, performance: 0.1, }; -export interface QualityResult { +export interface QualityScore { score: number; breakdown: { build: number; @@ -147,16 +147,16 @@ export interface QualityResult { }; } -// --- Trial Result --- +// --- Trial Report --- -export interface TrialResult { +export interface TrialReport { schemaVersion: 1; project: string; - run: AgentRunConfig; + variant: AgentVariant; prompt: string; timestamp: string; baselineCommit: string; - execution: ExecutionResult; - grading: GradingResult; - quality: QualityResult; + execution: Execution; + grade: Grade; + score: QualityScore; } From 920e6d3541bbb8361d7543670152346b1e1c5908 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Sun, 29 Mar 2026 21:19:48 +0700 Subject: [PATCH 44/63] Make Project.branch required and remove non-null assertion Every project needs a branch for cloning. The type now reflects that, and the `branch!` assertion in prepareTrial is no longer needed. 
--- scripts/eval/lib/prepare-trial.ts | 2 +- scripts/eval/types.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index e11f5956b61e..86db0f7f6107 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -22,7 +22,7 @@ export async function prepareTrial(project: Project, trialId: string, logger: Lo } else { logger.logStep(`Cloning ${project.repo}#${project.branch}...`); await mkdir(CACHE_DIR, { recursive: true }); - await x("git", ["clone", "--depth", "1", "--branch", project.branch!, project.repo, repoRoot], { + await x("git", ["clone", "--depth", "1", "--branch", project.branch, project.repo, repoRoot], { timeout: 120_000, }); const projectPath = project.projectDir ? join(repoRoot, project.projectDir) : repoRoot; diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index febb881ec194..be68b3c6fa4a 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -45,7 +45,7 @@ export interface AgentDriver { export interface Project { name: string; repo: string; - branch?: string; + branch: string; projectDir?: string; description?: string; } From b5f3a7bea2ba0388302ede62ac34fd46c6026703 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Mon, 30 Mar 2026 14:24:13 +0700 Subject: [PATCH 45/63] =?UTF-8?q?Refine=20eval=20API:=20discriminated=20Ag?= =?UTF-8?q?entVariant=20union,=20rename=20runTask=E2=86=92runTrial,=20thro?= =?UTF-8?q?w=20on=20ghost=20story=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Make AgentVariant a discriminated union on agent, with typed model/effort per agent - Rename runTask→runTrial and run-task.ts→run-trial.ts for consistent domain naming - Store full Project in TrialReport instead of just the name for reproducibility - Replace error-object returns with GhostStoryError throws in ghost-stories.ts - Fix successRate rounding to use Math.round(x*100)/100 
consistently - Extract scoring magic numbers into named constants - Validate git status chars against known set instead of blind casting - Truncate build/typecheck output at line boundaries --- scripts/eval/eval.ts | 389 +++++++++--------- scripts/eval/lib/ghost-stories.ts | 50 +-- scripts/eval/lib/grade.ts | 56 ++- scripts/eval/lib/grading-helpers.test.ts | 2 +- .../{run-task.test.ts => run-trial.test.ts} | 18 +- .../eval/lib/{run-task.ts => run-trial.ts} | 4 +- scripts/eval/types.ts | 23 +- 7 files changed, 281 insertions(+), 261 deletions(-) rename scripts/eval/lib/{run-task.test.ts => run-trial.test.ts} (93%) rename scripts/eval/lib/{run-task.ts => run-trial.ts} (97%) diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 44cd83d5ba98..9090d8a112ff 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -5,209 +5,206 @@ * via type stripping. Import specifiers use explicit .ts extensions. * * Usage: - * node eval/eval.ts -p mealdrop # single run (claude, default model) - * node eval/eval.ts -p mealdrop -m gpt-5.4 # single run (agent inferred from model) - * node eval/eval.ts -p mealdrop -m sonnet-4.6,gpt-5.4 # parallel runs - * node eval/eval.ts -p mealdrop -a claude,codex # parallel runs (default model each) - * node eval/eval.ts --list-projects # list projects - * node eval/eval.ts --list-models # list models - * node eval/eval.ts --list-prompts # list prompts + * node eval/eval.ts -p mealdrop # single run (claude, default model) + * node eval/eval.ts -p mealdrop -m gpt-5.4 # single run (agent inferred from model) + * node eval/eval.ts -p mealdrop -m sonnet-4.6 -m gpt-5.4 # parallel runs + * node eval/eval.ts -p mealdrop -a claude -a codex # parallel runs (default model each) + * node eval/eval.ts --list-projects # list projects + * node eval/eval.ts --list-models # list models + * node eval/eval.ts --list-prompts # list prompts */ -import { defineCommand, runMain } from "citty"; +import { parseArgs } from "node:util"; +import { z } from "zod"; 
import { randomUUID } from "node:crypto"; import pc from "picocolors"; import type { AgentId, TrialConfig, TrialReport } from "./types.ts"; import { AGENTS, PROJECTS } from "./config.ts"; -import { runTask } from "./lib/run-task.ts"; +import { runTrial } from "./lib/run-trial.ts"; import { createLogger, formatDuration, formatCost, formatTable, listPrompts } from "./lib/utils.ts"; -const main = defineCommand({ - meta: { - name: "eval", - description: "Storybook setup eval harness — measure AI agent quality on real-world projects", - }, - args: { - project: { type: "string", alias: "p", description: "Project name" }, - agent: { type: "string", alias: "a", description: "Agent(s), comma-separated" }, - model: { type: "string", alias: "m", description: "Model(s), comma-separated" }, - effort: { type: "string", alias: "e", description: "Effort level" }, - prompt: { type: "string", description: "Prompt name", default: "setup" }, - verbose: { type: "boolean", alias: "v", description: "Verbose output", default: false }, - listProjects: { type: "boolean", description: "List available projects", default: false }, - listModels: { type: "boolean", description: "List available models", default: false }, - listPrompts: { type: "boolean", description: "List available prompts", default: false }, - }, - async run({ args }) { - const logger = createLogger(); - - // --- List commands --- - - if (args.listProjects) { - for (const p of PROJECTS) logger.log(` ${pc.bold(p.name)} — ${p.description}`); - return; - } - if (args.listModels) { - for (const [agent, { models }] of Object.entries(AGENTS)) { - logger.log(`\n ${pc.bold(agent)}`); - for (const m of models) logger.log(` ${m}`); - } - return; - } - if (args.listPrompts) { - for (const name of listPrompts()) logger.log(` ${pc.bold(name)}`); - return; - } - - // --- Validate project --- - - const project = PROJECTS.find((p) => p.name === args.project); - if (!project) { - logger.log(pc.red(`Specify a project with -p. 
Available: ${PROJECTS.map((p) => p.name).join(", ")}`)); - process.exit(1); - } - - // --- Build configs (supports comma-separated values for parallel runs) --- - - const promptNames = args.prompt!.split(","); - const allModels = Object.values(AGENTS).flatMap((cfg) => cfg.models); - - // Determine agent → model pairs - let agentModels: Array<{ agent: AgentId; model: string }>; - - if (args.model) { - // Models specified — infer agent per model - agentModels = args.model.split(",").map((model) => { - const entry = Object.entries(AGENTS).find(([, cfg]) => cfg.models.includes(model)); - if (!entry) { - logger.log(pc.red(`Unknown model: ${model}. Available: ${allModels.join(", ")}`)); - process.exit(1); - } - return { agent: entry[0] as AgentId, model }; - }); - // If --agent is also specified, filter to matching agents - if (args.agent) { - const filter = args.agent.split(","); - agentModels = agentModels.filter((am) => filter.includes(am.agent)); - } - } else if (args.agent) { - // Agents specified — use default model per agent - agentModels = args.agent.split(",").map((name) => { - const cfg = AGENTS[name as AgentId]; - if (!cfg) { - logger.log(pc.red(`Unknown agent: ${name}. Options: ${Object.keys(AGENTS).join(", ")}`)); - process.exit(1); - } - return { agent: name as AgentId, model: cfg.defaultModel }; - }); - } else { - // Default: single claude run - agentModels = [{ agent: "claude", model: AGENTS.claude.defaultModel }]; - } - - // Expand to full configs: agent×model × prompt - const configs = agentModels.flatMap(({ agent, model }) => { - const cfg = AGENTS[agent]; - const effort = args.effort ?? cfg.defaultEffort; - if (!cfg.efforts.includes(effort)) { - logger.log(pc.red(`Unknown effort "${effort}" for ${agent}. 
Available: ${cfg.efforts.join(", ")}`)); - process.exit(1); - } - return promptNames.map((prompt) => ({ - config: { project, variant: { agent, model, effort }, prompt, verbose: args.verbose } as TrialConfig, - label: `${model}+${prompt}`, - })); - }); - - if (configs.length === 0) { - logger.log(pc.red("No matching agent/model/prompt combinations found.")); - process.exit(1); - } - - // --- Print header --- - - const runId = randomUUID().slice(0, 8); - logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); - if (configs.length === 1) { - const { variant: { agent, model, effort }, prompt } = configs[0].config; - logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${prompt}`); - } else { - logger.log(`${configs.length} parallel runs`); - for (const [agent, { models }] of Object.entries(AGENTS)) { - const active = models.filter((m) => configs.some((c) => c.config.variant.model === m)); - if (active.length > 0) logger.log(` ${agent}: ${active.join(", ")}`); - } - logger.log(` prompts: ${[...new Set(promptNames)].join(", ")}`); - } - logger.log(`Run: ${runId}\n`); - - // --- Execute (always use allSettled — works for 1 or N runs) --- - - const settled = await Promise.allSettled( - configs.map((c) => runTask(c.config, createLogger(configs.length > 1 ? c.label : undefined))), - ); - - const results: TrialReport[] = []; - for (const [i, s] of settled.entries()) { - if (s.status === "fulfilled") { - results.push(s.value); - } else { - logger.logError(`${configs[i].label}: ${s.reason instanceof Error ? s.reason.message : s.reason}`); - } - } - - if (results.length === 0) { - process.exit(1); - } - - // --- Print results --- - - if (results.length === 1) { - const r = results[0]; - const ghost = r.grade.ghostStories; - const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; - - logger.log(pc.bold("\nResult")); - logger.log(` Build: ${r.grade.buildSuccess ? 
pc.green("PASS") : pc.red("FAIL")}`); - logger.log(` Ghost: ${ghostStr}`); - logger.log(` TS Err: ${r.grade.typeCheckErrors}`); - logger.log(` Score: ${r.score.score}`); - logger.log(` Cost: ${formatCost(r.execution.cost)}`); - logger.log(` Time: ${formatDuration(r.execution.duration)}`); - logger.log(` Turns: ${r.execution.turns}`); - } else { - results.sort((a, b) => (b.grade.ghostStories?.successRate ?? -1) - (a.grade.ghostStories?.successRate ?? -1)); - - const headers = ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Score", "Cost", "Time", "Turns"]; - const rows = results.map((r) => { - const ghost = r.grade.ghostStories; - const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; - return [ - r.variant.agent, - r.variant.model, - r.prompt, - r.grade.buildSuccess ? pc.green("PASS") : pc.red("FAIL"), - ghostStr, - String(r.grade.typeCheckErrors), - String(r.score.score), - formatCost(r.execution.cost), - formatDuration(r.execution.duration), - String(r.execution.turns), - ]; - }); - - logger.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); - logger.log(formatTable(headers, rows)); - - const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); - const ghostRates = results.map((r) => r.grade.ghostStories?.successRate).filter((r): r is number => r != null); - const avgGhost = ghostRates.length > 0 ? 
ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; - - logger.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); - logger.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); - } - - logger.log("\nDone."); +// --- Derive valid options from config --- + +const PROJECT_NAMES = PROJECTS.map((p) => p.name) as [string, ...string[]]; +const AGENT_NAMES = Object.keys(AGENTS) as [string, ...string[]]; +const ALL_MODELS = Object.values(AGENTS).flatMap((a) => a.models) as [string, ...string[]]; +const ALL_EFFORTS = [...new Set(Object.values(AGENTS).flatMap((a) => a.efforts))] as [string, ...string[]]; + +// --- Parse & validate CLI args --- + +const argsSchema = z.object({ + project: z.enum(PROJECT_NAMES).optional(), + agent: z.array(z.enum(AGENT_NAMES)).optional(), + model: z.array(z.enum(ALL_MODELS)).optional(), + effort: z.enum(ALL_EFFORTS).optional(), + prompt: z.string().default("setup"), + verbose: z.boolean().default(false), + listProjects: z.boolean().default(false), + listModels: z.boolean().default(false), + listPrompts: z.boolean().default(false), +}); + +const { values } = parseArgs({ + options: { + project: { type: "string", short: "p" }, + agent: { type: "string", short: "a", multiple: true }, + model: { type: "string", short: "m", multiple: true }, + effort: { type: "string", short: "e" }, + prompt: { type: "string" }, + verbose: { type: "boolean", short: "v" }, + "list-projects": { type: "boolean" }, + "list-models": { type: "boolean" }, + "list-prompts": { type: "boolean" }, }, + args: process.argv.slice(2), + strict: true, +}); + +const parsed = argsSchema.safeParse({ + ...values, + listProjects: values["list-projects"], + listModels: values["list-models"], + listPrompts: values["list-prompts"], +}); + +if (!parsed.success) { + for (const issue of parsed.error.issues) { + console.error(pc.red(` ${issue.path.join(".")}: ${issue.message}`)); + } + process.exit(1); +} + +const args = parsed.data; +const logger = 
createLogger(); + +// --- List commands --- + +if (args.listProjects) { + for (const p of PROJECTS) logger.log(` ${pc.bold(p.name)} — ${p.description}`); + process.exit(0); +} +if (args.listModels) { + for (const [agent, { models }] of Object.entries(AGENTS)) { + logger.log(`\n ${pc.bold(agent)}`); + for (const m of models) logger.log(` ${m}`); + } + process.exit(0); +} +if (args.listPrompts) { + for (const name of listPrompts()) logger.log(` ${pc.bold(name)}`); + process.exit(0); +} + +// --- Validate project (required when not listing) --- + +if (!args.project) { + logger.log(pc.red(`Specify a project with -p. Available: ${PROJECT_NAMES.join(", ")}`)); + process.exit(1); +} +const project = PROJECTS.find((p) => p.name === args.project)!; + +// --- Build agent/model pairs (zod already validated individual values) --- + +function inferAgent(model: string): AgentId { + return Object.entries(AGENTS).find(([, cfg]) => cfg.models.includes(model))![0] as AgentId; +} + +const agentModels: Array<{ agent: AgentId; model: string }> = args.model + ? args.model.map((m) => ({ agent: inferAgent(m), model: m })) + .filter((am) => !args.agent || args.agent.includes(am.agent)) + : args.agent + ? args.agent.map((a) => ({ agent: a as AgentId, model: AGENTS[a as AgentId].defaultModel })) + : [{ agent: "claude", model: AGENTS.claude.defaultModel }]; + +const promptNames = args.prompt.split(","); +const configs = agentModels.flatMap(({ agent, model }) => { + const effort = args.effort ?? 
AGENTS[agent].defaultEffort; + return promptNames.map((prompt) => ({ + config: { project, variant: { agent, model, effort }, prompt, verbose: args.verbose } as TrialConfig, + label: `${model}+${prompt}`, + })); }); -runMain(main); +// --- Print header --- + +const runId = randomUUID().slice(0, 8); +logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); +if (configs.length === 1) { + const { variant: { agent, model, effort }, prompt } = configs[0].config; + logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${prompt}`); +} else { + logger.log(`${configs.length} parallel runs`); + for (const [agent, { models }] of Object.entries(AGENTS)) { + const active = models.filter((m) => configs.some((c) => c.config.variant.model === m)); + if (active.length > 0) logger.log(` ${agent}: ${active.join(", ")}`); + } + logger.log(` prompts: ${[...new Set(promptNames)].join(", ")}`); +} +logger.log(`Run: ${runId}\n`); + +// --- Execute (always use allSettled — works for 1 or N runs) --- + +const settled = await Promise.allSettled( + configs.map((c) => runTrial(c.config, createLogger(configs.length > 1 ? c.label : undefined))), +); + +const results: TrialReport[] = []; +for (const [i, s] of settled.entries()) { + if (s.status === "fulfilled") { + results.push(s.value); + } else { + logger.logError(`${configs[i].label}: ${s.reason instanceof Error ? s.reason.message : s.reason}`); + } +} + +if (results.length === 0) { + process.exit(1); +} + +// --- Print results --- + +if (results.length === 1) { + const r = results[0]; + const ghost = r.grade.ghostStories; + const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; + + logger.log(pc.bold("\nResult")); + logger.log(` Build: ${r.grade.buildSuccess ? 
pc.green("PASS") : pc.red("FAIL")}`); + logger.log(` Ghost: ${ghostStr}`); + logger.log(` TS Err: ${r.grade.typeCheckErrors}`); + logger.log(` Score: ${r.score.score}`); + logger.log(` Cost: ${formatCost(r.execution.cost)}`); + logger.log(` Time: ${formatDuration(r.execution.duration)}`); + logger.log(` Turns: ${r.execution.turns}`); +} else { + results.sort((a, b) => (b.grade.ghostStories?.successRate ?? -1) - (a.grade.ghostStories?.successRate ?? -1)); + + const headers = ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Score", "Cost", "Time", "Turns"]; + const rows = results.map((r) => { + const ghost = r.grade.ghostStories; + const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; + return [ + r.variant.agent, + r.variant.model, + r.prompt, + r.grade.buildSuccess ? pc.green("PASS") : pc.red("FAIL"), + ghostStr, + String(r.grade.typeCheckErrors), + String(r.score.score), + formatCost(r.execution.cost), + formatDuration(r.execution.duration), + String(r.execution.turns), + ]; + }); + + logger.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); + logger.log(formatTable(headers, rows)); + + const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); + const ghostRates = results.map((r) => r.grade.ghostStories?.successRate).filter((r): r is number => r != null); + const avgGhost = ghostRates.length > 0 ? 
ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; + + logger.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); + logger.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); +} + +logger.log("\nDone."); diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index 459cb22e280b..37edfe48e233 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -31,6 +31,13 @@ const IGNORE_PATTERNS = [ "**/stories/{button,header,page}.*", ]; +export class GhostStoryError extends Error { + constructor(message: string) { + super(message); + this.name = "GhostStoryError"; + } +} + /** * Find component files that are candidates for ghost story testing. * Uses glob-based discovery — sufficient for eval grading purposes. @@ -38,24 +45,19 @@ const IGNORE_PATTERNS = [ export async function findComponentCandidates(opts: { cwd: string; sampleSize?: number; -}): Promise<{ candidates: string[]; error?: string }> { +}): Promise { const { cwd, sampleSize = 20 } = opts; - try { - const files = await Array.fromAsync(glob(COMPONENT_GLOB, { - cwd, - exclude: IGNORE_PATTERNS, - })); - return { candidates: files.map((file) => resolve(cwd, file)).slice(0, sampleSize) }; - } catch { - return { candidates: [], error: "Failed to find component candidates" }; - } + const files = await Array.fromAsync(glob(COMPONENT_GLOB, { + cwd, + exclude: IGNORE_PATTERNS, + })); + return files.map((file) => resolve(cwd, file)).slice(0, sampleSize); } export interface GhostStoryOutput { total: number; passed: number; successRate: number; - runError?: string; } /** @@ -91,26 +93,28 @@ export async function runGhostStories( const stderr = (result.stderr ?? 
"").toLowerCase(); if (stderr.includes("browsertype.launch")) { - return { total: 0, passed: 0, successRate: 0, runError: "Playwright not installed" }; + throw new GhostStoryError("Playwright not installed"); } if (stderr.includes("no tests found")) { - return { total: 0, passed: 0, successRate: 0, runError: "No tests found" }; + throw new GhostStoryError("No tests found"); } if (!existsSync(outputFile)) { - return { total: 0, passed: 0, successRate: 0, runError: "JSON report not found" }; + throw new GhostStoryError("JSON report not found"); } + let report: any; try { - const report = JSON.parse(await readFile(outputFile, "utf-8")); - if (!report.testResults?.length) { - return { total: 0, passed: 0, successRate: 0, runError: "No test results in report" }; - } - const total: number = report.numTotalTests ?? 0; - const passed: number = report.numPassedTests ?? 0; - const successRate = total > 0 ? parseFloat((passed / total).toFixed(2)) : 0; - return { total, passed, successRate }; + report = JSON.parse(await readFile(outputFile, "utf-8")); } catch { - return { total: 0, passed: 0, successRate: 0, runError: "Failed to parse vitest report" }; + throw new GhostStoryError("Failed to parse vitest report"); + } + + if (!report.testResults?.length) { + throw new GhostStoryError("No test results in report"); } + const total: number = report.numTotalTests ?? 0; + const passed: number = report.numPassedTests ?? 0; + const successRate = total > 0 ? Math.round((passed / total) * 100) / 100 : 0; + return { total, passed, successRate }; } diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 4811417615a3..e6e9f3db359c 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -6,6 +6,13 @@ import { x } from "tinyexec"; import { detectSetupPatterns } from "./setup-patterns.ts"; import { findComponentCandidates, runGhostStories } from "./ghost-stories.ts"; +/** Maximum TypeScript errors before the typecheck score reaches 0. 
*/ +const MAX_TYPECHECK_ERRORS = 20; +/** Agent duration (seconds) at or below which performance scores 1.0. */ +const PERFECT_DURATION_S = 120; +/** Agent duration (seconds) at or above which performance scores 0. */ +const ZERO_SCORE_DURATION_S = 600; + /** Filter file changes to only storybook-related ones. */ export function filterStorybookFiles(fileChanges: FileChange[]): FileChange[] { const isStorybookPath = (path?: string) => @@ -33,10 +40,10 @@ export function computeQualityScore( weights: ScoreWeights = DEFAULT_SCORE_WEIGHTS, ): QualityScore { const buildScore = opts.buildSuccess ? 1 : 0; - const tcScore = Math.max(0, 1 - opts.typeCheckErrors / 20); + const tcScore = Math.max(0, 1 - opts.typeCheckErrors / MAX_TYPECHECK_ERRORS); const ghostScore = opts.ghostSuccessRate ?? 0; const d = opts.durationSeconds; - const perfScore = d == null ? 0 : Math.max(0, Math.min(1, 1 - (d - 120) / 480)); + const perfScore = d == null ? 0 : Math.max(0, Math.min(1, 1 - (d - PERFECT_DURATION_S) / (ZERO_SCORE_DURATION_S - PERFECT_DURATION_S))); const score = Math.round( (ghostScore * weights.ghostStories + @@ -69,7 +76,8 @@ export function parseChangedFiles(gitOutput: string): FileChange[] { .filter(Boolean) .map((line) => { const [status, ...parts] = line.split("\t"); - const normalizedStatus = (status?.charAt(0) || "M") as FileChange["status"]; + const firstChar = status?.charAt(0) ?? ""; + const normalizedStatus = (["A", "M", "D", "R"].includes(firstChar) ? firstChar : "M") as FileChange["status"]; if (normalizedStatus === "R" && parts.length >= 2) { const [previousPath, path] = parts; @@ -80,6 +88,14 @@ export function parseChangedFiles(gitOutput: string): FileChange[] { }); } +/** Truncate text to approximately maxChars, snapping to a line boundary. 
*/ +function truncateEnd(text: string, maxChars: number): string { + if (text.length <= maxChars) return text; + const truncated = text.slice(-maxChars); + const firstNewline = truncated.indexOf('\n'); + return firstNewline >= 0 ? truncated.slice(firstNewline + 1) : truncated; +} + export async function grade( workspace: TrialWorkspace, logger: Logger, @@ -134,9 +150,9 @@ export async function grade( const trialGrade: Grade = { buildSuccess, - buildError: buildSuccess ? undefined : buildOutput.slice(-2000), + buildError: buildSuccess ? undefined : truncateEnd(buildOutput, 2000), typeCheckErrors, - typeCheckOutput: typeCheckErrors > 0 ? tscOutput.slice(-2000) : undefined, + typeCheckOutput: typeCheckErrors > 0 ? truncateEnd(tscOutput, 2000) : undefined, fileChanges, storybookChanges, setupPatterns, @@ -167,20 +183,22 @@ async function getChangedFiles(repoRoot: string, baseline: string): Promise { logger.logStep("Running ghost stories..."); - const { candidates, error } = await findComponentCandidates({ sampleSize: 20, cwd: projectPath }); - if (error || candidates.length === 0) { - logger.logError(error ?? "No candidate components found"); + try { + const candidates = await findComponentCandidates({ sampleSize: 20, cwd: projectPath }); + if (candidates.length === 0) { + logger.logError("No candidate components found"); + return undefined; + } + logger.logStep(`Found ${candidates.length} candidate component(s)`); + + const result = await runGhostStories(candidates, { cwd: projectPath }); + if (result.total > 0) { + logger.logSuccess(`Ghost stories: ${result.passed}/${result.total} passed (${Math.round(result.successRate * 100)}%)`); + } + + return { candidateCount: candidates.length, total: result.total, passed: result.passed, successRate: result.successRate }; + } catch (error) { + logger.logError(`Ghost stories: ${error instanceof Error ? 
error.message : String(error)}`); return undefined; } - logger.logStep(`Found ${candidates.length} candidate component(s)`); - - const result = await runGhostStories(candidates, { cwd: projectPath }); - - if (result.runError) { - logger.logError(`Ghost stories: ${result.runError}`); - } else if (result.total > 0) { - logger.logSuccess(`Ghost stories: ${result.passed}/${result.total} passed (${Math.round(result.successRate * 100)}%)`); - } - - return { candidateCount: candidates.length, total: result.total, passed: result.passed, successRate: result.successRate }; } diff --git a/scripts/eval/lib/grading-helpers.test.ts b/scripts/eval/lib/grading-helpers.test.ts index 63d3921bcb42..b46b6df822db 100644 --- a/scripts/eval/lib/grading-helpers.test.ts +++ b/scripts/eval/lib/grading-helpers.test.ts @@ -38,7 +38,7 @@ function writeFile(relativePath: string, content: string) { } async function findCandidates(cwd: string) { - const { candidates } = await findComponentCandidates({ cwd, sampleSize: 20 }); + const candidates = await findComponentCandidates({ cwd, sampleSize: 20 }); return candidates.map((c) => c.replace(cwd + '/', '')); } diff --git a/scripts/eval/lib/run-task.test.ts b/scripts/eval/lib/run-trial.test.ts similarity index 93% rename from scripts/eval/lib/run-task.test.ts rename to scripts/eval/lib/run-trial.test.ts index 72acd8ce92d8..362bc458db24 100644 --- a/scripts/eval/lib/run-task.test.ts +++ b/scripts/eval/lib/run-trial.test.ts @@ -34,14 +34,14 @@ vi.mock('./agents/codex', () => ({ import { claudeAgent } from './agents/claude-code'; import { grade } from './grade'; import { prepareTrial } from './prepare-trial'; -import { runTask } from './run-task'; +import { runTrial } from './run-trial'; import { captureEnvironment } from './utils'; let TMP: string; beforeEach(() => { vi.clearAllMocks(); - TMP = join(tmpdir(), `eval-run-task-${Date.now()}`); + TMP = join(tmpdir(), `eval-run-trial-${Date.now()}`); mkdirSync(join(TMP, 'results'), { recursive: true }); }); 
@@ -94,15 +94,15 @@ const baseConfig: TrialConfig = { prompt: 'setup', }; -describe('runTask pipeline', () => { +describe('runTrial pipeline', () => { it('assembles a complete TrialReport from pipeline steps', async () => { setupMocks(); - const result = await runTask(baseConfig); + const result = await runTrial(baseConfig); expect(result).toMatchObject({ schemaVersion: 1, - project: 'test-project', + project: { name: 'test-project', repo: 'https://github.com/test/repo', branch: 'main' }, variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, prompt: 'setup', baselineCommit: 'deadbeef', @@ -129,7 +129,7 @@ describe('runTask pipeline', () => { project: { name: 'mealdrop', repo: 'https://github.com/test/mealdrop', branch: 'eval-baseline' }, }; - await runTask(config); + await runTrial(config); expect(vi.mocked(prepareTrial).mock.calls[0][0]).toMatchObject({ name: 'mealdrop', @@ -161,7 +161,7 @@ describe('runTask pipeline', () => { it('writes summary.json and prompt.md to results dir', async () => { setupMocks(); - await runTask(baseConfig); + await runTrial(baseConfig); const resultsDir = join(TMP, 'results'); @@ -179,7 +179,7 @@ describe('runTask pipeline', () => { it('propagates failed build into result', async () => { setupMocks({ buildSuccess: false, typeCheckErrors: 5 }); - await expect(runTask(baseConfig)).resolves.toMatchObject({ + await expect(runTrial(baseConfig)).resolves.toMatchObject({ grade: { buildSuccess: false, typeCheckErrors: 5 }, score: { score: 0.3 }, }); @@ -219,7 +219,7 @@ describe('runTask pipeline', () => { }; }); - await runTask(baseConfig); + await runTrial(baseConfig); expect(callOrder).toEqual(['prepare', 'agent', 'grade']); }); diff --git a/scripts/eval/lib/run-task.ts b/scripts/eval/lib/run-trial.ts similarity index 97% rename from scripts/eval/lib/run-task.ts rename to scripts/eval/lib/run-trial.ts index b2864a3b57c6..f874188ee01b 100644 --- a/scripts/eval/lib/run-task.ts +++ b/scripts/eval/lib/run-trial.ts @@ -15,7 +15,7 @@ 
const drivers: Record = { /** * Run a full eval trial: prepare -> execute agent -> grade -> save. */ -export async function runTask( +export async function runTrial( config: TrialConfig, logger?: Logger, ): Promise { @@ -57,7 +57,7 @@ export async function runTask( // 6. Assemble final report const report: TrialReport = { schemaVersion: 1, - project: project.name, + project, variant, timestamp, prompt: promptName || "setup", diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index be68b3c6fa4a..dbf90db52c7e 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -1,8 +1,8 @@ /** * Core types for the Storybook setup eval system. * - * Plain TypeScript interfaces — no runtime validation library. - * Validation happens at the boundaries (CLI parsing via citty). + * Plain TypeScript interfaces — runtime validation at the CLI boundary + * uses zod (see eval.ts). */ // --- Logger --- @@ -16,16 +16,17 @@ export interface Logger { // --- Agent --- -export type AgentId = "claude" | "codex"; +export type ClaudeModel = "sonnet-4.6" | "opus-4.6" | "haiku-4.5"; +export type CodexModel = "gpt-5.4"; +export type ClaudeEffort = "low" | "medium" | "high" | "max"; +export type CodexEffort = "low" | "medium" | "high" | "xhigh"; /** Agent + model + effort — the three values that define how the agent runs. */ -export interface AgentVariant { - agent: AgentId; - /** Friendly model name (e.g. "sonnet-4.6", "gpt-5.4"). Must exist in `AGENTS[agent].models`. */ - model: string; - /** Reasoning effort level. Must exist in `AGENTS[agent].efforts`. 
*/ - effort: string; -} +export type AgentVariant = + | { agent: "claude"; model: ClaudeModel; effort: ClaudeEffort } + | { agent: "codex"; model: CodexModel; effort: CodexEffort }; + +export type AgentId = AgentVariant["agent"]; export interface AgentExecuteParams { prompt: string; @@ -151,7 +152,7 @@ export interface QualityScore { export interface TrialReport { schemaVersion: 1; - project: string; + project: Project; variant: AgentVariant; prompt: string; timestamp: string; From b4bab02d5784f663afabc9aae50c784ac98480bd Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Mon, 30 Mar 2026 14:51:15 +0700 Subject: [PATCH 46/63] Fix CI: format eval files, fix effort type narrowing in Claude agent --- scripts/eval/config.ts | 78 +++++++------- scripts/eval/eval.ts | 126 ++++++++++++++--------- scripts/eval/lib/agents/claude-code.ts | 81 +++++++-------- scripts/eval/lib/agents/codex.ts | 58 ++++++----- scripts/eval/lib/ghost-stories.ts | 109 +++++++++++--------- scripts/eval/lib/grade.test.ts | 95 ++++++++++++++--- scripts/eval/lib/grade.ts | 117 +++++++++++++-------- scripts/eval/lib/grading-helpers.test.ts | 29 ++++-- scripts/eval/lib/package-manager.ts | 36 ++++--- scripts/eval/lib/prepare-trial.ts | 40 ++++--- scripts/eval/lib/run-trial.test.ts | 21 +++- scripts/eval/lib/run-trial.ts | 31 +++--- scripts/eval/lib/setup-patterns.test.ts | 10 +- scripts/eval/lib/setup-patterns.ts | 54 +++++++--- scripts/eval/lib/utils.test.ts | 24 +++-- scripts/eval/lib/utils.ts | 68 ++++++------ scripts/eval/types.test.ts | 4 +- scripts/eval/types.ts | 16 +-- 18 files changed, 594 insertions(+), 403 deletions(-) diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index 59cbd0af905c..6fcd8596229f 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -5,7 +5,7 @@ * and cost estimation utilities. 
*/ -import type { AgentId, Project } from "./types.ts"; +import type { AgentId, Project } from './types.ts'; // --- Pricing --- @@ -36,26 +36,26 @@ export interface AgentDefinition { export const AGENTS: Record = { claude: { - models: ["sonnet-4.6", "opus-4.6", "haiku-4.5"], - defaultModel: "sonnet-4.6", + models: ['sonnet-4.6', 'opus-4.6', 'haiku-4.5'], + defaultModel: 'sonnet-4.6', sdkModelIds: { - "sonnet-4.6": "claude-sonnet-4-6", - "opus-4.6": "claude-opus-4-6", - "haiku-4.5": "claude-haiku-4-5", + 'sonnet-4.6': 'claude-sonnet-4-6', + 'opus-4.6': 'claude-opus-4-6', + 'haiku-4.5': 'claude-haiku-4-5', }, pricing: {}, - efforts: ["low", "medium", "high", "max"], - defaultEffort: "high", + efforts: ['low', 'medium', 'high', 'max'], + defaultEffort: 'high', }, codex: { - models: ["gpt-5.4"], - defaultModel: "gpt-5.4", + models: ['gpt-5.4'], + defaultModel: 'gpt-5.4', sdkModelIds: {}, pricing: { - "gpt-5.4": { input: 2.5, cachedInput: 0.625, output: 10.0 }, + 'gpt-5.4': { input: 2.5, cachedInput: 0.625, output: 10.0 }, }, - efforts: ["low", "medium", "high", "xhigh"], - defaultEffort: "high", + efforts: ['low', 'medium', 'high', 'xhigh'], + defaultEffort: 'high', }, }; @@ -77,41 +77,41 @@ export function estimateCost(agent: AgentId, model: string, usage: TokenUsage): export const PROJECTS: Project[] = [ { - name: "mealdrop", - repo: "https://github.com/kasperpeulen/mealdrop", - branch: "eval-baseline", - description: "Styled components, Redux, React Router", + name: 'mealdrop', + repo: 'https://github.com/kasperpeulen/mealdrop', + branch: 'eval-baseline', + description: 'Styled components, Redux, React Router', }, { - name: "edgy", - repo: "https://github.com/kasperpeulen/edgy", - branch: "eval-baseline", - description: "Tailwind, HeadlessUI, React Router", + name: 'edgy', + repo: 'https://github.com/kasperpeulen/edgy', + branch: 'eval-baseline', + description: 'Tailwind, HeadlessUI, React Router', }, { - name: "wikitok", - repo: 
"https://github.com/kasperpeulen/wikitok", - branch: "eval-baseline", - projectDir: "frontend", - description: "Simple project with Tailwind", + name: 'wikitok', + repo: 'https://github.com/kasperpeulen/wikitok', + branch: 'eval-baseline', + projectDir: 'frontend', + description: 'Simple project with Tailwind', }, { - name: "baklava", - repo: "https://github.com/kasperpeulen/baklava", - branch: "eval-baseline", - description: "Component library with Zustand", + name: 'baklava', + repo: 'https://github.com/kasperpeulen/baklava', + branch: 'eval-baseline', + description: 'Component library with Zustand', }, { - name: "echarts", - repo: "https://github.com/kasperpeulen/echarts-react", - branch: "eval-baseline", - description: "ECharts React wrapper", + name: 'echarts', + repo: 'https://github.com/kasperpeulen/echarts-react', + branch: 'eval-baseline', + description: 'ECharts React wrapper', }, { - name: "evergreen-ci", - repo: "https://github.com/kasperpeulen/ui", - branch: "eval-baseline", - projectDir: "packages/lib", - description: "GraphQL", + name: 'evergreen-ci', + repo: 'https://github.com/kasperpeulen/ui', + branch: 'eval-baseline', + projectDir: 'packages/lib', + description: 'GraphQL', }, ]; diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 9090d8a112ff..9c2e33de2e99 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -13,21 +13,24 @@ * node eval/eval.ts --list-models # list models * node eval/eval.ts --list-prompts # list prompts */ -import { parseArgs } from "node:util"; -import { z } from "zod"; -import { randomUUID } from "node:crypto"; -import pc from "picocolors"; -import type { AgentId, TrialConfig, TrialReport } from "./types.ts"; -import { AGENTS, PROJECTS } from "./config.ts"; -import { runTrial } from "./lib/run-trial.ts"; -import { createLogger, formatDuration, formatCost, formatTable, listPrompts } from "./lib/utils.ts"; +import { parseArgs } from 'node:util'; +import { z } from 'zod'; +import { randomUUID } from 
'node:crypto'; +import pc from 'picocolors'; +import type { AgentId, TrialConfig, TrialReport } from './types.ts'; +import { AGENTS, PROJECTS } from './config.ts'; +import { runTrial } from './lib/run-trial.ts'; +import { createLogger, formatDuration, formatCost, formatTable, listPrompts } from './lib/utils.ts'; // --- Derive valid options from config --- const PROJECT_NAMES = PROJECTS.map((p) => p.name) as [string, ...string[]]; const AGENT_NAMES = Object.keys(AGENTS) as [string, ...string[]]; const ALL_MODELS = Object.values(AGENTS).flatMap((a) => a.models) as [string, ...string[]]; -const ALL_EFFORTS = [...new Set(Object.values(AGENTS).flatMap((a) => a.efforts))] as [string, ...string[]]; +const ALL_EFFORTS = [...new Set(Object.values(AGENTS).flatMap((a) => a.efforts))] as [ + string, + ...string[], +]; // --- Parse & validate CLI args --- @@ -36,7 +39,7 @@ const argsSchema = z.object({ agent: z.array(z.enum(AGENT_NAMES)).optional(), model: z.array(z.enum(ALL_MODELS)).optional(), effort: z.enum(ALL_EFFORTS).optional(), - prompt: z.string().default("setup"), + prompt: z.string().default('setup'), verbose: z.boolean().default(false), listProjects: z.boolean().default(false), listModels: z.boolean().default(false), @@ -45,15 +48,15 @@ const argsSchema = z.object({ const { values } = parseArgs({ options: { - project: { type: "string", short: "p" }, - agent: { type: "string", short: "a", multiple: true }, - model: { type: "string", short: "m", multiple: true }, - effort: { type: "string", short: "e" }, - prompt: { type: "string" }, - verbose: { type: "boolean", short: "v" }, - "list-projects": { type: "boolean" }, - "list-models": { type: "boolean" }, - "list-prompts": { type: "boolean" }, + project: { type: 'string', short: 'p' }, + agent: { type: 'string', short: 'a', multiple: true }, + model: { type: 'string', short: 'm', multiple: true }, + effort: { type: 'string', short: 'e' }, + prompt: { type: 'string' }, + verbose: { type: 'boolean', short: 'v' }, + 
'list-projects': { type: 'boolean' }, + 'list-models': { type: 'boolean' }, + 'list-prompts': { type: 'boolean' }, }, args: process.argv.slice(2), strict: true, @@ -61,14 +64,14 @@ const { values } = parseArgs({ const parsed = argsSchema.safeParse({ ...values, - listProjects: values["list-projects"], - listModels: values["list-models"], - listPrompts: values["list-prompts"], + listProjects: values['list-projects'], + listModels: values['list-models'], + listPrompts: values['list-prompts'], }); if (!parsed.success) { for (const issue of parsed.error.issues) { - console.error(pc.red(` ${issue.path.join(".")}: ${issue.message}`)); + console.error(pc.red(` ${issue.path.join('.')}: ${issue.message}`)); } process.exit(1); } @@ -97,7 +100,7 @@ if (args.listPrompts) { // --- Validate project (required when not listing) --- if (!args.project) { - logger.log(pc.red(`Specify a project with -p. Available: ${PROJECT_NAMES.join(", ")}`)); + logger.log(pc.red(`Specify a project with -p. Available: ${PROJECT_NAMES.join(', ')}`)); process.exit(1); } const project = PROJECTS.find((p) => p.name === args.project)!; @@ -109,17 +112,23 @@ function inferAgent(model: string): AgentId { } const agentModels: Array<{ agent: AgentId; model: string }> = args.model - ? args.model.map((m) => ({ agent: inferAgent(m), model: m })) + ? args.model + .map((m) => ({ agent: inferAgent(m), model: m })) .filter((am) => !args.agent || args.agent.includes(am.agent)) : args.agent ? args.agent.map((a) => ({ agent: a as AgentId, model: AGENTS[a as AgentId].defaultModel })) - : [{ agent: "claude", model: AGENTS.claude.defaultModel }]; + : [{ agent: 'claude', model: AGENTS.claude.defaultModel }]; -const promptNames = args.prompt.split(","); +const promptNames = args.prompt.split(','); const configs = agentModels.flatMap(({ agent, model }) => { const effort = args.effort ?? 
AGENTS[agent].defaultEffort; return promptNames.map((prompt) => ({ - config: { project, variant: { agent, model, effort }, prompt, verbose: args.verbose } as TrialConfig, + config: { + project, + variant: { agent, model, effort }, + prompt, + verbose: args.verbose, + } as TrialConfig, label: `${model}+${prompt}`, })); }); @@ -129,30 +138,35 @@ const configs = agentModels.flatMap(({ agent, model }) => { const runId = randomUUID().slice(0, 8); logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); if (configs.length === 1) { - const { variant: { agent, model, effort }, prompt } = configs[0].config; + const { + variant: { agent, model, effort }, + prompt, + } = configs[0].config; logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${prompt}`); } else { logger.log(`${configs.length} parallel runs`); for (const [agent, { models }] of Object.entries(AGENTS)) { const active = models.filter((m) => configs.some((c) => c.config.variant.model === m)); - if (active.length > 0) logger.log(` ${agent}: ${active.join(", ")}`); + if (active.length > 0) logger.log(` ${agent}: ${active.join(', ')}`); } - logger.log(` prompts: ${[...new Set(promptNames)].join(", ")}`); + logger.log(` prompts: ${[...new Set(promptNames)].join(', ')}`); } logger.log(`Run: ${runId}\n`); // --- Execute (always use allSettled — works for 1 or N runs) --- const settled = await Promise.allSettled( - configs.map((c) => runTrial(c.config, createLogger(configs.length > 1 ? c.label : undefined))), + configs.map((c) => runTrial(c.config, createLogger(configs.length > 1 ? c.label : undefined))) ); const results: TrialReport[] = []; for (const [i, s] of settled.entries()) { - if (s.status === "fulfilled") { + if (s.status === 'fulfilled') { results.push(s.value); } else { - logger.logError(`${configs[i].label}: ${s.reason instanceof Error ? s.reason.message : s.reason}`); + logger.logError( + `${configs[i].label}: ${s.reason instanceof Error ? 
s.reason.message : s.reason}` + ); } } @@ -165,10 +179,12 @@ if (results.length === 0) { if (results.length === 1) { const r = results[0]; const ghost = r.grade.ghostStories; - const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; + const ghostStr = ghost + ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` + : '-'; - logger.log(pc.bold("\nResult")); - logger.log(` Build: ${r.grade.buildSuccess ? pc.green("PASS") : pc.red("FAIL")}`); + logger.log(pc.bold('\nResult')); + logger.log(` Build: ${r.grade.buildSuccess ? pc.green('PASS') : pc.red('FAIL')}`); logger.log(` Ghost: ${ghostStr}`); logger.log(` TS Err: ${r.grade.typeCheckErrors}`); logger.log(` Score: ${r.score.score}`); @@ -176,17 +192,32 @@ if (results.length === 1) { logger.log(` Time: ${formatDuration(r.execution.duration)}`); logger.log(` Turns: ${r.execution.turns}`); } else { - results.sort((a, b) => (b.grade.ghostStories?.successRate ?? -1) - (a.grade.ghostStories?.successRate ?? -1)); - - const headers = ["Agent", "Model", "Prompt", "Build", "Ghost", "TS Err", "Score", "Cost", "Time", "Turns"]; + results.sort( + (a, b) => (b.grade.ghostStories?.successRate ?? -1) - (a.grade.ghostStories?.successRate ?? -1) + ); + + const headers = [ + 'Agent', + 'Model', + 'Prompt', + 'Build', + 'Ghost', + 'TS Err', + 'Score', + 'Cost', + 'Time', + 'Turns', + ]; const rows = results.map((r) => { const ghost = r.grade.ghostStories; - const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` : "-"; + const ghostStr = ghost + ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` + : '-'; return [ r.variant.agent, r.variant.model, r.prompt, - r.grade.buildSuccess ? pc.green("PASS") : pc.red("FAIL"), + r.grade.buildSuccess ? 
pc.green('PASS') : pc.red('FAIL'), ghostStr, String(r.grade.typeCheckErrors), String(r.score.score), @@ -196,15 +227,18 @@ if (results.length === 1) { ]; }); - logger.log(pc.bold("\n\nResults (sorted by ghost stories rate)")); + logger.log(pc.bold('\n\nResults (sorted by ghost stories rate)')); logger.log(formatTable(headers, rows)); const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); - const ghostRates = results.map((r) => r.grade.ghostStories?.successRate).filter((r): r is number => r != null); - const avgGhost = ghostRates.length > 0 ? ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; + const ghostRates = results + .map((r) => r.grade.ghostStories?.successRate) + .filter((r): r is number => r != null); + const avgGhost = + ghostRates.length > 0 ? ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; logger.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); logger.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); } -logger.log("\nDone."); +logger.log('\nDone.'); diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index 2d440d224127..ebb201489fd2 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -1,17 +1,17 @@ -import type { SDKMessage } from "@anthropic-ai/claude-agent-sdk"; -import { query } from "@anthropic-ai/claude-agent-sdk"; -import { writeFile } from "node:fs/promises"; -import { join } from "node:path"; -import { AGENTS } from "../../config.ts"; -import type { AgentDriver, Execution, Logger } from "../../types.ts"; +import type { SDKMessage } from '@anthropic-ai/claude-agent-sdk'; +import { query } from '@anthropic-ai/claude-agent-sdk'; +import { writeFile } from 'node:fs/promises'; +import { join } from 'node:path'; +import { AGENTS } from '../../config.ts'; +import type { AgentDriver, Execution, Logger } from '../../types.ts'; function logMessage(message: SDKMessage, logger: Logger) { switch 
(message.type) { - case "assistant": { + case 'assistant': { for (const block of message.message.content) { - if (block.type === "text") { + if (block.type === 'text') { logger.log(`💬 ${block.text}`); - } else if (block.type === "tool_use") { + } else if (block.type === 'tool_use') { logger.log(`🔧 ${block.name}(${JSON.stringify(block.input).slice(0, 200)})`); } } @@ -20,48 +20,52 @@ function logMessage(message: SDKMessage, logger: Logger) { } break; } - case "user": { + case 'user': { const content = message.message.content; if (!Array.isArray(content)) break; for (const block of content) { - if (block.type === "tool_result") { + if (block.type === 'tool_result') { const text = - typeof block.content === "string" + typeof block.content === 'string' ? block.content.slice(0, 200) : Array.isArray(block.content) ? block.content .map((b: { type: string; text?: string }) => - "text" in b ? b.text : `[${b.type}]`, + 'text' in b ? b.text : `[${b.type}]` ) - .join("") + .join('') .slice(0, 200) - : "[no content]"; + : '[no content]'; logger.log(`📎 tool_result(${block.tool_use_id?.slice(-8)}): ${text}`); } } break; } - case "result": - if (message.subtype === "success") { - logger.logSuccess(`Done — ${message.num_turns} turns, $${message.total_cost_usd?.toFixed(4)}`); + case 'result': + if (message.subtype === 'success') { + logger.logSuccess( + `Done — ${message.num_turns} turns, $${message.total_cost_usd?.toFixed(4)}` + ); } else { - logger.logError(`Error (${message.subtype}): ${message.errors?.join(", ")}`); + logger.logError(`Error (${message.subtype}): ${message.errors?.join(', ')}`); } break; - case "system": - if (message.subtype === "init") { + case 'system': + if (message.subtype === 'init') { logger.log(`🚀 Session started — model: ${message.model}`); - } else if (message.subtype === "api_retry") { + } else if (message.subtype === 'api_retry') { logger.log(`🔄 API retry: attempt ${message.attempt}/${message.max_retries}`); - } else if (message.subtype === "status") 
{ - logger.log(`📊 status: ${message.status ?? "unknown"}`); + } else if (message.subtype === 'status') { + logger.log(`📊 status: ${message.status ?? 'unknown'}`); } break; - case "tool_use_summary": + case 'tool_use_summary': logger.log(`📋 ${message.summary.slice(0, 200)}`); break; - case "rate_limit_event": - logger.log(`⏳ Rate limited — status: ${message.rate_limit_info?.status}, resets at: ${message.rate_limit_info?.resetsAt}`); + case 'rate_limit_event': + logger.log( + `⏳ Rate limited — status: ${message.rate_limit_info?.status}, resets at: ${message.rate_limit_info?.resetsAt}` + ); break; default: break; @@ -71,17 +75,12 @@ function logMessage(message: SDKMessage, logger: Logger) { const MAX_TURNS = 50; export const claudeAgent: AgentDriver = { - name: "claude", + name: 'claude', - async execute({ - prompt, - projectPath, - variant, - resultsDir, - logger, - }): Promise { + async execute({ prompt, projectPath, variant, resultsDir, logger }): Promise { const startTime = Date.now(); - const { model, effort } = variant; + const { model } = variant; + const effort = variant.effort as 'low' | 'medium' | 'high' | 'max'; const sdkModel = AGENTS.claude.sdkModelIds[model] ?? model; let cost: number | undefined; @@ -94,27 +93,27 @@ export const claudeAgent: AgentDriver = { options: { model: sdkModel, cwd: projectPath, - allowedTools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"], + allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep'], maxTurns: MAX_TURNS, effort, debug: true, - systemPrompt: { type: "preset", preset: "claude_code" }, + systemPrompt: { type: 'preset', preset: 'claude_code' }, }, })) { logMessage(message, logger); messages.push(message); - if (message.type === "result" && message.subtype === "success") { + if (message.type === 'result' && message.subtype === 'success') { cost = message.total_cost_usd as number | undefined; turns = (message.num_turns as number) ?? 0; durationApi = - typeof message.duration_api_ms === "number" ? 
message.duration_api_ms / 1000 : undefined; + typeof message.duration_api_ms === 'number' ? message.duration_api_ms / 1000 : undefined; } } const duration = (Date.now() - startTime) / 1000; - await writeFile(join(resultsDir, "transcript.json"), JSON.stringify(messages, null, 2)); + await writeFile(join(resultsDir, 'transcript.json'), JSON.stringify(messages, null, 2)); return { cost, diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index b74e2432c740..19de4777815a 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -1,19 +1,13 @@ -import { Codex, type ModelReasoningEffort } from "@openai/codex-sdk"; -import { writeFile } from "node:fs/promises"; -import { join } from "node:path"; -import type { AgentDriver, Execution } from "../../types.ts"; -import { estimateCost } from "../../config.ts"; +import { Codex, type ModelReasoningEffort } from '@openai/codex-sdk'; +import { writeFile } from 'node:fs/promises'; +import { join } from 'node:path'; +import type { AgentDriver, Execution } from '../../types.ts'; +import { estimateCost } from '../../config.ts'; export const codexAgent: AgentDriver = { - name: "codex", + name: 'codex', - async execute({ - prompt, - projectPath, - variant, - resultsDir, - logger, - }): Promise { + async execute({ prompt, projectPath, variant, resultsDir, logger }): Promise { const startTime = Date.now(); const { model, effort } = variant; @@ -22,7 +16,7 @@ export const codexAgent: AgentDriver = { model, modelReasoningEffort: effort as ModelReasoningEffort, workingDirectory: projectPath, - approvalPolicy: "never", + approvalPolicy: 'never', }); const { events } = await thread.runStreamed(prompt); @@ -34,52 +28,60 @@ export const codexAgent: AgentDriver = { for await (const event of events) { switch (event.type) { - case "item.completed": { + case 'item.completed': { const item = event.item; items.push(item); switch (item.type) { - case "agent_message": + case 'agent_message': 
logger.log(`💬 ${item.text.slice(0, 300)}`); break; - case "command_execution": - logger.log(`🔧 $ ${item.command} → exit ${item.exit_code ?? "?"}`); + case 'command_execution': + logger.log(`🔧 $ ${item.command} → exit ${item.exit_code ?? '?'}`); if (item.exit_code !== 0 && item.aggregated_output) { logger.log(` ${item.aggregated_output.slice(-200)}`); } break; - case "file_change": + case 'file_change': for (const c of item.changes) logger.log(`📝 ${c.kind} ${c.path}`); break; - case "reasoning": + case 'reasoning': logger.log(`🧠 ${item.text.slice(0, 200)}`); break; - case "error": + case 'error': logger.logError(item.message); break; } break; } - case "turn.completed": + case 'turn.completed': totalInput += event.usage.input_tokens; totalCached += event.usage.cached_input_tokens; totalOutput += event.usage.output_tokens; turns++; - logger.log(`📊 tokens: ${event.usage.input_tokens}in / ${event.usage.output_tokens}out (${event.usage.cached_input_tokens} cached)`); + logger.log( + `📊 tokens: ${event.usage.input_tokens}in / ${event.usage.output_tokens}out (${event.usage.cached_input_tokens} cached)` + ); break; - case "turn.failed": + case 'turn.failed': logger.logError(`Turn failed: ${event.error.message}`); break; - case "error": + case 'error': logger.logError(`Error: ${event.message}`); break; } } const duration = (Date.now() - startTime) / 1000; - const cost = estimateCost("codex", model, { inputTokens: totalInput, cachedInputTokens: totalCached, outputTokens: totalOutput }); - logger.logSuccess(`Done — ${turns} turns, ${Math.round(duration)}s, ${totalInput}in/${totalOutput}out tokens${cost != null ? `, $${cost.toFixed(4)}` : ""}`); + const cost = estimateCost('codex', model, { + inputTokens: totalInput, + cachedInputTokens: totalCached, + outputTokens: totalOutput, + }); + logger.logSuccess( + `Done — ${turns} turns, ${Math.round(duration)}s, ${totalInput}in/${totalOutput}out tokens${cost != null ? 
`, $${cost.toFixed(4)}` : ''}` + ); - await writeFile(join(resultsDir, "transcript.json"), JSON.stringify(items, null, 2)); + await writeFile(join(resultsDir, 'transcript.json'), JSON.stringify(items, null, 2)); return { cost, duration, turns }; }, diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts index 37edfe48e233..964cc22ed33f 100644 --- a/scripts/eval/lib/ghost-stories.ts +++ b/scripts/eval/lib/ghost-stories.ts @@ -7,34 +7,34 @@ * but decoupled so eval has no cross-package source imports. */ -import { existsSync } from "node:fs"; -import { glob, readFile } from "node:fs/promises"; -import { join, resolve } from "node:path"; -import { tmpdir } from "node:os"; -import { x } from "tinyexec"; +import { existsSync } from 'node:fs'; +import { glob, readFile } from 'node:fs/promises'; +import { join, resolve } from 'node:path'; +import { tmpdir } from 'node:os'; +import { x } from 'tinyexec'; -const COMPONENT_GLOB = "**/*.{tsx,jsx}"; +const COMPONENT_GLOB = '**/*.{tsx,jsx}'; const IGNORE_PATTERNS = [ - "**/node_modules/**", - "**/.git/**", - "**/dist/**", - "**/__mocks__/**", - "**/build/**", - "**/storybook-static/**", - "**/*.test.*", - "**/*.spec.*", - "**/*.stories.*", - "**/*.story.*", - "**/*.d.*", - "**/*.config.*", - "**/stories/{Button,Header,Page}.*", - "**/stories/{button,header,page}.*", + '**/node_modules/**', + '**/.git/**', + '**/dist/**', + '**/__mocks__/**', + '**/build/**', + '**/storybook-static/**', + '**/*.test.*', + '**/*.spec.*', + '**/*.stories.*', + '**/*.story.*', + '**/*.d.*', + '**/*.config.*', + '**/stories/{Button,Header,Page}.*', + '**/stories/{button,header,page}.*', ]; export class GhostStoryError extends Error { constructor(message: string) { super(message); - this.name = "GhostStoryError"; + this.name = 'GhostStoryError'; } } @@ -47,10 +47,12 @@ export async function findComponentCandidates(opts: { sampleSize?: number; }): Promise { const { cwd, sampleSize = 20 } = opts; - const files = await 
Array.fromAsync(glob(COMPONENT_GLOB, { - cwd, - exclude: IGNORE_PATTERNS, - })); + const files = await Array.fromAsync( + glob(COMPONENT_GLOB, { + cwd, + exclude: IGNORE_PATTERNS, + }) + ); return files.map((file) => resolve(cwd, file)).slice(0, sampleSize); } @@ -69,49 +71,54 @@ export interface GhostStoryOutput { */ export async function runGhostStories( candidates: string[], - opts: { cwd: string }, + opts: { cwd: string } ): Promise { const outputFile = join(tmpdir(), `ghost-stories-${Date.now()}.json`); - const result = await x("npx", [ - "vitest", "run", - "--reporter=json", - "--testTimeout=1000", - `--outputFile=${outputFile}`, - ...candidates, - ], { - throwOnError: false, - timeout: 300_000, - nodeOptions: { - cwd: opts.cwd, - env: { - ...process.env, - STORYBOOK_COMPONENT_PATHS: candidates.join(";"), + const result = await x( + 'npx', + [ + 'vitest', + 'run', + '--reporter=json', + '--testTimeout=1000', + `--outputFile=${outputFile}`, + ...candidates, + ], + { + throwOnError: false, + timeout: 300_000, + nodeOptions: { + cwd: opts.cwd, + env: { + ...process.env, + STORYBOOK_COMPONENT_PATHS: candidates.join(';'), + }, }, - }, - }); + } + ); - const stderr = (result.stderr ?? "").toLowerCase(); - if (stderr.includes("browsertype.launch")) { - throw new GhostStoryError("Playwright not installed"); + const stderr = (result.stderr ?? 
'').toLowerCase(); + if (stderr.includes('browsertype.launch')) { + throw new GhostStoryError('Playwright not installed'); } - if (stderr.includes("no tests found")) { - throw new GhostStoryError("No tests found"); + if (stderr.includes('no tests found')) { + throw new GhostStoryError('No tests found'); } if (!existsSync(outputFile)) { - throw new GhostStoryError("JSON report not found"); + throw new GhostStoryError('JSON report not found'); } let report: any; try { - report = JSON.parse(await readFile(outputFile, "utf-8")); + report = JSON.parse(await readFile(outputFile, 'utf-8')); } catch { - throw new GhostStoryError("Failed to parse vitest report"); + throw new GhostStoryError('Failed to parse vitest report'); } if (!report.testResults?.length) { - throw new GhostStoryError("No test results in report"); + throw new GhostStoryError('No test results in report'); } const total: number = report.numTotalTests ?? 0; const passed: number = report.numPassedTests ?? 0; diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts index 8145c9f5ca7e..4360204fcefa 100644 --- a/scripts/eval/lib/grade.test.ts +++ b/scripts/eval/lib/grade.test.ts @@ -61,7 +61,10 @@ describe('computeQualityScore', () => { it('returns 1.0 when everything passes and agent is fast', () => { const result = computeQualityScore({ - buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 60, + buildSuccess: true, + typeCheckErrors: 0, + ghostSuccessRate: 1.0, + durationSeconds: 60, }); expect(result.score).toBe(1); expect(result.breakdown).toEqual({ build: 1, typecheck: 1, ghostStories: 1, performance: 1 }); @@ -69,73 +72,132 @@ describe('computeQualityScore', () => { it('ghost stories have 40% weight', () => { const result = computeQualityScore({ - buildSuccess: false, typeCheckErrors: 20, ghostSuccessRate: 1.0, durationSeconds: 600, + buildSuccess: false, + typeCheckErrors: 20, + ghostSuccessRate: 1.0, + durationSeconds: 600, }); 
expect(result.score).toBe(0.4); }); it('build has 25% weight', () => { const result = computeQualityScore({ - buildSuccess: true, typeCheckErrors: 20, ghostSuccessRate: 0, durationSeconds: 600, + buildSuccess: true, + typeCheckErrors: 20, + ghostSuccessRate: 0, + durationSeconds: 600, }); expect(result.score).toBe(0.25); }); it('performance has 10% weight', () => { const result = computeQualityScore({ - buildSuccess: false, typeCheckErrors: 20, ghostSuccessRate: 0, durationSeconds: 60, + buildSuccess: false, + typeCheckErrors: 20, + ghostSuccessRate: 0, + durationSeconds: 60, }); expect(result.score).toBe(0.1); }); it('returns 0 when everything fails', () => { const result = computeQualityScore({ - buildSuccess: false, typeCheckErrors: 20, ghostSuccessRate: 0, durationSeconds: 600, + buildSuccess: false, + typeCheckErrors: 20, + ghostSuccessRate: 0, + durationSeconds: 600, }); expect(result.score).toBe(0); }); it('scales typecheck score linearly', () => { const result = computeQualityScore({ - buildSuccess: true, typeCheckErrors: 10, ghostSuccessRate: 1.0, durationSeconds: 60, + buildSuccess: true, + typeCheckErrors: 10, + ghostSuccessRate: 1.0, + durationSeconds: 60, }); expect(result.breakdown.typecheck).toBe(0.5); }); it('clamps typecheck score at 0 for >= 20 errors', () => { - const a = computeQualityScore({ buildSuccess: true, typeCheckErrors: 20, ghostSuccessRate: 1.0, durationSeconds: 60 }); - const b = computeQualityScore({ buildSuccess: true, typeCheckErrors: 50, ghostSuccessRate: 1.0, durationSeconds: 60 }); + const a = computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 20, + ghostSuccessRate: 1.0, + durationSeconds: 60, + }); + const b = computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 50, + ghostSuccessRate: 1.0, + durationSeconds: 60, + }); expect(a.breakdown.typecheck).toBe(0); expect(b.breakdown.typecheck).toBe(0); }); it('treats undefined ghost stories as 0', () => { - const a = computeQualityScore({ buildSuccess: true, 
typeCheckErrors: 0, ghostSuccessRate: 0, durationSeconds: 60 }); + const a = computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 0, + ghostSuccessRate: 0, + durationSeconds: 60, + }); const b = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, durationSeconds: 60 }); expect(a.score).toBe(b.score); }); it('performance: ≤120s scores 1.0', () => { - const a = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 0 }); - const b = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 120 }); + const a = computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 0, + ghostSuccessRate: 1.0, + durationSeconds: 0, + }); + const b = computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 0, + ghostSuccessRate: 1.0, + durationSeconds: 120, + }); expect(a.breakdown.performance).toBe(1); expect(b.breakdown.performance).toBe(1); }); it('performance: 360s scores 0.5', () => { - const r = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 360 }); + const r = computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 0, + ghostSuccessRate: 1.0, + durationSeconds: 360, + }); expect(r.breakdown.performance).toBe(0.5); }); it('performance: ≥600s scores 0', () => { - const a = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 600 }); - const b = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 1000 }); + const a = computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 0, + ghostSuccessRate: 1.0, + durationSeconds: 600, + }); + const b = computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 0, + ghostSuccessRate: 1.0, + durationSeconds: 1000, + }); expect(a.breakdown.performance).toBe(0); expect(b.breakdown.performance).toBe(0); }); it('performance: undefined duration scores 0', () => 
{ - const r = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0 }); + const r = computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 0, + ghostSuccessRate: 1.0, + }); expect(r.breakdown.performance).toBe(0); }); }); @@ -167,7 +229,8 @@ describe('countTypeCheckErrors', () => { describe('parseChangedFiles', () => { it('parses added, modified, deleted, and renamed files', () => { - const output = 'A\tsrc/new-file.ts\nM\tsrc/existing.ts\nD\tsrc/removed.ts\nR100\told.ts\tnew.ts'; + const output = + 'A\tsrc/new-file.ts\nM\tsrc/existing.ts\nD\tsrc/removed.ts\nR100\told.ts\tnew.ts'; expect(parseChangedFiles(output)).toMatchObject([ { path: 'src/new-file.ts', status: 'A' }, { path: 'src/existing.ts', status: 'M' }, diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index e6e9f3db359c..14fcaece7c0b 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,10 +1,18 @@ -import { writeFile } from "node:fs/promises"; -import { join } from "node:path"; -import type { Grade, GhostStoryGrade, QualityScore, ScoreWeights, TrialWorkspace, FileChange, Logger } from "../types.ts"; -import { DEFAULT_SCORE_WEIGHTS } from "../types.ts"; -import { x } from "tinyexec"; -import { detectSetupPatterns } from "./setup-patterns.ts"; -import { findComponentCandidates, runGhostStories } from "./ghost-stories.ts"; +import { writeFile } from 'node:fs/promises'; +import { join } from 'node:path'; +import type { + Grade, + GhostStoryGrade, + QualityScore, + ScoreWeights, + TrialWorkspace, + FileChange, + Logger, +} from '../types.ts'; +import { DEFAULT_SCORE_WEIGHTS } from '../types.ts'; +import { x } from 'tinyexec'; +import { detectSetupPatterns } from './setup-patterns.ts'; +import { findComponentCandidates, runGhostStories } from './ghost-stories.ts'; /** Maximum TypeScript errors before the typecheck score reaches 0. 
*/ const MAX_TYPECHECK_ERRORS = 20; @@ -16,11 +24,9 @@ const ZERO_SCORE_DURATION_S = 600; /** Filter file changes to only storybook-related ones. */ export function filterStorybookFiles(fileChanges: FileChange[]): FileChange[] { const isStorybookPath = (path?: string) => - path != null && (path.includes(".storybook/") || /\.(stories|story)\.[tj]sx?$/.test(path)); + path != null && (path.includes('.storybook/') || /\.(stories|story)\.[tj]sx?$/.test(path)); - return fileChanges.filter( - (f) => isStorybookPath(f.path) || isStorybookPath(f.previousPath), - ); + return fileChanges.filter((f) => isStorybookPath(f.path) || isStorybookPath(f.previousPath)); } /** @@ -37,20 +43,26 @@ export function computeQualityScore( ghostSuccessRate?: number; durationSeconds?: number; }, - weights: ScoreWeights = DEFAULT_SCORE_WEIGHTS, + weights: ScoreWeights = DEFAULT_SCORE_WEIGHTS ): QualityScore { const buildScore = opts.buildSuccess ? 1 : 0; const tcScore = Math.max(0, 1 - opts.typeCheckErrors / MAX_TYPECHECK_ERRORS); const ghostScore = opts.ghostSuccessRate ?? 0; const d = opts.durationSeconds; - const perfScore = d == null ? 0 : Math.max(0, Math.min(1, 1 - (d - PERFECT_DURATION_S) / (ZERO_SCORE_DURATION_S - PERFECT_DURATION_S))); + const perfScore = + d == null + ? 0 + : Math.max( + 0, + Math.min(1, 1 - (d - PERFECT_DURATION_S) / (ZERO_SCORE_DURATION_S - PERFECT_DURATION_S)) + ); const score = Math.round( (ghostScore * weights.ghostStories + buildScore * weights.build + tcScore * weights.typecheck + perfScore * weights.performance) * - 100, + 100 ) / 100; return { score, @@ -72,19 +84,21 @@ export function countTypeCheckErrors(tscOutput: string): number { export function parseChangedFiles(gitOutput: string): FileChange[] { return gitOutput .trim() - .split("\n") + .split('\n') .filter(Boolean) .map((line) => { - const [status, ...parts] = line.split("\t"); - const firstChar = status?.charAt(0) ?? ""; - const normalizedStatus = (["A", "M", "D", "R"].includes(firstChar) ? 
firstChar : "M") as FileChange["status"]; + const [status, ...parts] = line.split('\t'); + const firstChar = status?.charAt(0) ?? ''; + const normalizedStatus = ( + ['A', 'M', 'D', 'R'].includes(firstChar) ? firstChar : 'M' + ) as FileChange['status']; - if (normalizedStatus === "R" && parts.length >= 2) { + if (normalizedStatus === 'R' && parts.length >= 2) { const [previousPath, path] = parts; return { path, previousPath, status: normalizedStatus }; } - return { path: parts.join("\t"), status: normalizedStatus }; + return { path: parts.join('\t'), status: normalizedStatus }; }); } @@ -99,48 +113,59 @@ function truncateEnd(text: string, maxChars: number): string { export async function grade( workspace: TrialWorkspace, logger: Logger, - agentDuration?: number, + agentDuration?: number ): Promise<{ grade: Grade; score: QualityScore }> { const { repoRoot, projectPath, resultsDir, baselineCommit } = workspace; // Changed files - logger.logStep("Collecting agent changes..."); + logger.logStep('Collecting agent changes...'); const fileChanges = await getChangedFiles(repoRoot, baselineCommit); const storybookChanges = filterStorybookFiles(fileChanges); - logger.logSuccess(`${fileChanges.length} files changed (${storybookChanges.length} storybook-related)`); + logger.logSuccess( + `${fileChanges.length} files changed (${storybookChanges.length} storybook-related)` + ); // Setup patterns const setupPatterns = await detectSetupPatterns(projectPath); - if (setupPatterns.length > 0) logger.logSuccess(`Detected patterns: ${setupPatterns.map((p) => p.label).join(", ")}`); + if (setupPatterns.length > 0) + logger.logSuccess(`Detected patterns: ${setupPatterns.map((p) => p.label).join(', ')}`); // Storybook build + TypeScript check in parallel - logger.logStep("Running storybook build + typecheck..."); + logger.logStep('Running storybook build + typecheck...'); const [build, tsc] = await Promise.all([ - x("npx", ["storybook", "build", "--quiet"], { + x('npx', ['storybook', 
'build', '--quiet'], { throwOnError: false, timeout: 300_000, nodeOptions: { cwd: projectPath, - env: { ...process.env, STORYBOOK_DISABLE_TELEMETRY: "1", NODE_OPTIONS: "--max_old_space_size=4096" }, + env: { + ...process.env, + STORYBOOK_DISABLE_TELEMETRY: '1', + NODE_OPTIONS: '--max_old_space_size=4096', + }, }, }), - x("npx", ["tsc", "--noEmit"], { throwOnError: false, timeout: 120_000, nodeOptions: { cwd: projectPath } }), + x('npx', ['tsc', '--noEmit'], { + throwOnError: false, + timeout: 120_000, + nodeOptions: { cwd: projectPath }, + }), ]); const buildSuccess = build.exitCode === 0; - const buildOutput = build.stdout + "\n" + build.stderr; - await writeFile(join(resultsDir, "build-output.txt"), buildOutput); + const buildOutput = build.stdout + '\n' + build.stderr; + await writeFile(join(resultsDir, 'build-output.txt'), buildOutput); if (buildSuccess) { - logger.logSuccess("Storybook build succeeded"); + logger.logSuccess('Storybook build succeeded'); } else { logger.logError(`Storybook build failed (exit ${build.exitCode})`); } - const tscOutput = tsc.stdout + "\n" + tsc.stderr; - await writeFile(join(resultsDir, "typecheck-output.txt"), tscOutput); + const tscOutput = tsc.stdout + '\n' + tsc.stderr; + await writeFile(join(resultsDir, 'typecheck-output.txt'), tscOutput); const typeCheckErrors = countTypeCheckErrors(tscOutput); if (typeCheckErrors === 0) { - logger.logSuccess("No TypeScript errors"); + logger.logSuccess('No TypeScript errors'); } else { logger.logError(`${typeCheckErrors} TypeScript error(s)`); } @@ -172,31 +197,41 @@ export async function grade( async function getChangedFiles(repoRoot: string, baseline: string): Promise { // Stage all files so `git diff --cached` picks up new files the agent created. // Safe: this runs on an ephemeral trial copy, not the real repo. 
- await x("git", ["add", "-A"], { nodeOptions: { cwd: repoRoot } }); - const { stdout } = await x("git", ["diff", "--cached", "--name-status", baseline], { + await x('git', ['add', '-A'], { nodeOptions: { cwd: repoRoot } }); + const { stdout } = await x('git', ['diff', '--cached', '--name-status', baseline], { throwOnError: false, nodeOptions: { cwd: repoRoot }, }); return parseChangedFiles(stdout); } -async function gradeGhostStories(projectPath: string, logger: Logger): Promise { - logger.logStep("Running ghost stories..."); +async function gradeGhostStories( + projectPath: string, + logger: Logger +): Promise { + logger.logStep('Running ghost stories...'); try { const candidates = await findComponentCandidates({ sampleSize: 20, cwd: projectPath }); if (candidates.length === 0) { - logger.logError("No candidate components found"); + logger.logError('No candidate components found'); return undefined; } logger.logStep(`Found ${candidates.length} candidate component(s)`); const result = await runGhostStories(candidates, { cwd: projectPath }); if (result.total > 0) { - logger.logSuccess(`Ghost stories: ${result.passed}/${result.total} passed (${Math.round(result.successRate * 100)}%)`); + logger.logSuccess( + `Ghost stories: ${result.passed}/${result.total} passed (${Math.round(result.successRate * 100)}%)` + ); } - return { candidateCount: candidates.length, total: result.total, passed: result.passed, successRate: result.successRate }; + return { + candidateCount: candidates.length, + total: result.total, + passed: result.passed, + successRate: result.successRate, + }; } catch (error) { logger.logError(`Ghost stories: ${error instanceof Error ? 
error.message : String(error)}`); return undefined; diff --git a/scripts/eval/lib/grading-helpers.test.ts b/scripts/eval/lib/grading-helpers.test.ts index b46b6df822db..c002b9fea4fa 100644 --- a/scripts/eval/lib/grading-helpers.test.ts +++ b/scripts/eval/lib/grading-helpers.test.ts @@ -107,7 +107,12 @@ describe('grading helpers', () => { expect(changedFiles).toHaveLength(storybookFiles.length + 1); // Step 4: Build passed, no TS errors, 100% ghost stories, fast agent → perfect score - const quality = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 60 }); + const quality = computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 0, + ghostSuccessRate: 1.0, + durationSeconds: 60, + }); expect(quality.score).toBe(1); }); @@ -134,12 +139,17 @@ describe('grading helpers', () => { const tscLines = candidates.map( (c, i) => `${c}(${i + 1},1): error TS2304: Cannot find name 'React'.` ); - tscLines.push("src/App.tsx(10,5): error TS2345: Argument not assignable."); + tscLines.push('src/App.tsx(10,5): error TS2345: Argument not assignable.'); const errorCount = countTypeCheckErrors(tscLines.join('\n')); expect(errorCount).toBe(candidates.length + 1); // Build failed, no ghost stories, errors, slow → low quality - const quality = computeQualityScore({ buildSuccess: false, typeCheckErrors: errorCount, ghostSuccessRate: 0, durationSeconds: 600 }); + const quality = computeQualityScore({ + buildSuccess: false, + typeCheckErrors: errorCount, + ghostSuccessRate: 0, + durationSeconds: 600, + }); expect(quality.score).toBeLessThan(0.3); expect(quality.breakdown.build).toBe(0); }); @@ -166,14 +176,19 @@ describe('grading helpers', () => { expect(patterns.map((p) => p.id)).toContain('router-provider'); // Agent wrote one story per candidate — all storybook-related - const gitOutput = candidates - .map((c) => `A\t${c.replace(/\.tsx$/, '.stories.tsx')}`) - .join('\n'); + const gitOutput = candidates.map((c) => 
`A\t${c.replace(/\.tsx$/, '.stories.tsx')}`).join('\n'); const storybookFiles = filterStorybookFiles(parseChangedFiles(gitOutput)); expect(storybookFiles).toHaveLength(candidates.length); // Clean build + 100% ghost stories + fast → perfect - expect(computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, ghostSuccessRate: 1.0, durationSeconds: 60 }).score).toBe(1); + expect( + computeQualityScore({ + buildSuccess: true, + typeCheckErrors: 0, + ghostSuccessRate: 1.0, + durationSeconds: 60, + }).score + ).toBe(1); }); }); diff --git a/scripts/eval/lib/package-manager.ts b/scripts/eval/lib/package-manager.ts index c2993f004f27..9112e0834fd1 100644 --- a/scripts/eval/lib/package-manager.ts +++ b/scripts/eval/lib/package-manager.ts @@ -4,29 +4,33 @@ * Used by trial preparation and any other eval flows that need a * package-manager-aware install step. */ -import { existsSync } from "node:fs"; -import { join } from "node:path"; -import { x } from "tinyexec"; -import type { Logger } from "../types.ts"; +import { existsSync } from 'node:fs'; +import { join } from 'node:path'; +import { x } from 'tinyexec'; +import type { Logger } from '../types.ts'; /** Detect the package manager from lock files in a directory. 
*/ export function detectPackageManager(dir: string): string { - if (existsSync(join(dir, "pnpm-lock.yaml")) || existsSync(join(dir, "pnpm-workspace.yaml"))) return "pnpm"; - if (existsSync(join(dir, "yarn.lock"))) return "yarn"; - if (existsSync(join(dir, "bun.lockb")) || existsSync(join(dir, "bun.lock"))) return "bun"; - return "npm"; + if (existsSync(join(dir, 'pnpm-lock.yaml')) || existsSync(join(dir, 'pnpm-workspace.yaml'))) + return 'pnpm'; + if (existsSync(join(dir, 'yarn.lock'))) return 'yarn'; + if (existsSync(join(dir, 'bun.lockb')) || existsSync(join(dir, 'bun.lock'))) return 'bun'; + return 'npm'; } function getInstallArgs(pm: string, dir: string): [string, string[]] { switch (pm) { - case "pnpm": - return ["pnpm", ["install", "--no-frozen-lockfile"]]; - case "yarn": - return ["yarn", existsSync(join(dir, ".yarnrc.yml")) ? ["install", "--no-immutable"] : ["install"]]; - case "bun": - return ["bun", ["install"]]; + case 'pnpm': + return ['pnpm', ['install', '--no-frozen-lockfile']]; + case 'yarn': + return [ + 'yarn', + existsSync(join(dir, '.yarnrc.yml')) ? 
['install', '--no-immutable'] : ['install'], + ]; + case 'bun': + return ['bun', ['install']]; default: - return ["npm", ["install", "--ignore-scripts"]]; + return ['npm', ['install', '--ignore-scripts']]; } } @@ -34,7 +38,7 @@ function getInstallArgs(pm: string, dir: string): [string, string[]] { export async function installDeps( dir: string, logger: Logger, - env?: Record, + env?: Record ): Promise { const pm = detectPackageManager(dir); const [cmd, args] = getInstallArgs(pm, dir); diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index 86db0f7f6107..58a75bb29c8f 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -1,42 +1,48 @@ -import { existsSync } from "node:fs"; -import { cp, mkdir } from "node:fs/promises"; -import { join } from "node:path"; -import type { Project, TrialWorkspace, Logger } from "../types.ts"; -import { x } from "tinyexec"; -import { installDeps } from "./package-manager.ts"; -import { CACHE_DIR, TRIALS_DIR } from "./utils.ts"; +import { existsSync } from 'node:fs'; +import { cp, mkdir } from 'node:fs/promises'; +import { join } from 'node:path'; +import type { Project, TrialWorkspace, Logger } from '../types.ts'; +import { x } from 'tinyexec'; +import { installDeps } from './package-manager.ts'; +import { CACHE_DIR, TRIALS_DIR } from './utils.ts'; /** * First run: clone eval-baseline -> install deps -> cache it. * Subsequent runs: copy from cache. Agent starts immediately. 
*/ -export async function prepareTrial(project: Project, trialId: string, logger: Logger): Promise { +export async function prepareTrial( + project: Project, + trialId: string, + logger: Logger +): Promise { const cacheDir = join(CACHE_DIR, project.name); const trialDir = join(TRIALS_DIR, trialId); - const repoRoot = join(trialDir, "project"); + const repoRoot = join(trialDir, 'project'); await mkdir(trialDir, { recursive: true }); - if (existsSync(join(cacheDir, ".git"))) { - logger.logStep("Copying from cache..."); + if (existsSync(join(cacheDir, '.git'))) { + logger.logStep('Copying from cache...'); await cp(cacheDir, repoRoot, { recursive: true }); } else { logger.logStep(`Cloning ${project.repo}#${project.branch}...`); await mkdir(CACHE_DIR, { recursive: true }); - await x("git", ["clone", "--depth", "1", "--branch", project.branch, project.repo, repoRoot], { + await x('git', ['clone', '--depth', '1', '--branch', project.branch, project.repo, repoRoot], { timeout: 120_000, }); const projectPath = project.projectDir ? join(repoRoot, project.projectDir) : repoRoot; await installDeps(projectPath, logger); - logger.logSuccess("Dependencies installed"); - logger.logStep("Caching for future runs..."); + logger.logSuccess('Dependencies installed'); + logger.logStep('Caching for future runs...'); await cp(repoRoot, cacheDir, { recursive: true }); } - const baselineCommit = (await x("git", ["rev-parse", "HEAD"], { nodeOptions: { cwd: repoRoot } })).stdout.trim(); + const baselineCommit = ( + await x('git', ['rev-parse', 'HEAD'], { nodeOptions: { cwd: repoRoot } }) + ).stdout.trim(); const projectPath = project.projectDir ? 
join(repoRoot, project.projectDir) : repoRoot; - const resultsDir = join(trialDir, "results"); + const resultsDir = join(trialDir, 'results'); await mkdir(resultsDir, { recursive: true }); - logger.logSuccess("Trial ready"); + logger.logSuccess('Trial ready'); return { trialDir, repoRoot, projectPath, resultsDir, baselineCommit }; } diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts index 362bc458db24..33962f2deb05 100644 --- a/scripts/eval/lib/run-trial.test.ts +++ b/scripts/eval/lib/run-trial.test.ts @@ -82,9 +82,14 @@ function setupMocks(overrides?: { { path: '.storybook/preview.tsx', status: 'A' }, { path: 'src/Button.stories.tsx', status: 'A' }, ], - setupPatterns: [{ id: 'tailwind', label: 'Tailwind CSS', sourceFiles: ['.storybook/preview.ts'] }], + setupPatterns: [ + { id: 'tailwind', label: 'Tailwind CSS', sourceFiles: ['.storybook/preview.ts'] }, + ], + }, + score: { + score: buildSuccess ? 1 : 0.3, + breakdown: { build: buildSuccess ? 1 : 0, typecheck: 1, ghostStories: 0, performance: 0 }, }, - score: { score: buildSuccess ? 1 : 0.3, breakdown: { build: buildSuccess ? 
1 : 0, typecheck: 1, ghostStories: 0, performance: 0 } }, }); } @@ -126,7 +131,11 @@ describe('runTrial pipeline', () => { const config: TrialConfig = { ...baseConfig, - project: { name: 'mealdrop', repo: 'https://github.com/test/mealdrop', branch: 'eval-baseline' }, + project: { + name: 'mealdrop', + repo: 'https://github.com/test/mealdrop', + branch: 'eval-baseline', + }, }; await runTrial(config); @@ -140,7 +149,7 @@ describe('runTrial pipeline', () => { expect(vi.mocked(captureEnvironment).mock.calls[0][0]).toBe(join(TMP, 'results')); - const params = vi.mocked(claudeAgent.execute).mock.calls[0][0] as Record; + const params = vi.mocked(claudeAgent.execute).mock.calls[0][0]; expect(params).toMatchObject({ prompt: expect.stringContaining('Storybook setup'), projectPath: TMP, @@ -165,7 +174,9 @@ describe('runTrial pipeline', () => { const resultsDir = join(TMP, 'results'); - const summary: TrialReport = JSON.parse(readFileSync(join(resultsDir, 'summary.json'), 'utf-8')); + const summary: TrialReport = JSON.parse( + readFileSync(join(resultsDir, 'summary.json'), 'utf-8') + ); expect(summary).toMatchObject({ schemaVersion: 1, execution: { cost: 0.42 }, diff --git a/scripts/eval/lib/run-trial.ts b/scripts/eval/lib/run-trial.ts index f874188ee01b..4d656adb389b 100644 --- a/scripts/eval/lib/run-trial.ts +++ b/scripts/eval/lib/run-trial.ts @@ -1,11 +1,11 @@ -import { writeFile } from "node:fs/promises"; -import { join } from "node:path"; -import type { AgentId, Logger, TrialConfig, TrialReport, AgentDriver } from "../types.ts"; -import { claudeAgent } from "./agents/claude-code.ts"; -import { codexAgent } from "./agents/codex.ts"; -import { prepareTrial } from "./prepare-trial.ts"; -import { grade } from "./grade.ts"; -import { generateTrialId, loadPrompt, captureEnvironment, createLogger } from "./utils.ts"; +import { writeFile } from 'node:fs/promises'; +import { join } from 'node:path'; +import type { AgentId, Logger, TrialConfig, TrialReport, AgentDriver } from 
'../types.ts'; +import { claudeAgent } from './agents/claude-code.ts'; +import { codexAgent } from './agents/codex.ts'; +import { prepareTrial } from './prepare-trial.ts'; +import { grade } from './grade.ts'; +import { generateTrialId, loadPrompt, captureEnvironment, createLogger } from './utils.ts'; const drivers: Record = { claude: claudeAgent, @@ -15,14 +15,11 @@ const drivers: Record = { /** * Run a full eval trial: prepare -> execute agent -> grade -> save. */ -export async function runTrial( - config: TrialConfig, - logger?: Logger, -): Promise { +export async function runTrial(config: TrialConfig, logger?: Logger): Promise { const { project, variant, prompt: promptName } = config; const { agent: agentName, model } = variant; const log = logger ?? createLogger(); - const trialId = generateTrialId(project.name, agentName, model, promptName || "setup"); + const trialId = generateTrialId(project.name, agentName, model, promptName || 'setup'); const timestamp = new Date().toISOString(); log.log(`Preparing ${project.name}...`); @@ -35,7 +32,7 @@ export async function runTrial( // 3. Load the prompt const prompt = loadPrompt(promptName); - await writeFile(join(workspace.resultsDir, "prompt.md"), prompt); + await writeFile(join(workspace.resultsDir, 'prompt.md'), prompt); // 4. Execute the agent log.log(` Running ${agentName} (${model}, effort=${variant.effort})...`); @@ -48,7 +45,7 @@ export async function runTrial( logger: log, }); log.logSuccess( - `Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : "cost N/A"}, ${execution.turns} turns)`, + `Agent completed (${Math.round(execution.duration)}s, ${execution.cost ? `$${execution.cost.toFixed(2)}` : 'cost N/A'}, ${execution.turns} turns)` ); // 5. 
Grade the results (pass agent duration for performance scoring) @@ -60,14 +57,14 @@ export async function runTrial( project, variant, timestamp, - prompt: promptName || "setup", + prompt: promptName || 'setup', baselineCommit: workspace.baselineCommit, execution, grade: trialGrade, score, }; - await writeFile(join(workspace.resultsDir, "summary.json"), JSON.stringify(report, null, 2)); + await writeFile(join(workspace.resultsDir, 'summary.json'), JSON.stringify(report, null, 2)); log.logSuccess(`Results saved to ${workspace.resultsDir}`); return report; diff --git a/scripts/eval/lib/setup-patterns.test.ts b/scripts/eval/lib/setup-patterns.test.ts index 47ca1d9a3dcb..e88ae3916ce9 100644 --- a/scripts/eval/lib/setup-patterns.test.ts +++ b/scripts/eval/lib/setup-patterns.test.ts @@ -53,10 +53,7 @@ describe('detectSetupPatterns', () => { }); it('detects Redux provider', async () => { - writeConfig( - 'preview.tsx', - `import { Provider } from 'react-redux';\n` - ); + writeConfig('preview.tsx', `import { Provider } from 'react-redux';\n`); expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('redux-provider'); }); @@ -117,10 +114,7 @@ describe('detectSetupPatterns', () => { it('does not detect patterns in files outside .storybook/', async () => { // Write a router import in a source file, not in .storybook/ mkdirSync(join(TMP, 'src'), { recursive: true }); - writeFileSync( - join(TMP, 'src', 'App.tsx'), - `import { BrowserRouter } from 'react-router-dom';` - ); + writeFileSync(join(TMP, 'src', 'App.tsx'), `import { BrowserRouter } from 'react-router-dom';`); // .storybook/ has no patterns writeConfig('main.ts', `export default { stories: ['../src/**/*.stories.tsx'] };`); diff --git a/scripts/eval/lib/setup-patterns.ts b/scripts/eval/lib/setup-patterns.ts index f6ade6a64a8e..037c7362015f 100644 --- a/scripts/eval/lib/setup-patterns.ts +++ b/scripts/eval/lib/setup-patterns.ts @@ -1,24 +1,44 @@ -import { readFile, readdir } from "node:fs/promises"; -import { 
existsSync } from "node:fs"; -import { join, relative } from "node:path"; -import type { SetupPattern } from "../types.ts"; +import { readFile, readdir } from 'node:fs/promises'; +import { existsSync } from 'node:fs'; +import { join, relative } from 'node:path'; +import type { SetupPattern } from '../types.ts'; const RULES = [ - { id: "global-css", label: "Global CSS import", pattern: /import\s+['"][^'"]+\.(css|scss|sass|less)['"]|import\s+['"]tailwindcss/ }, - { id: "tailwind", label: "Tailwind CSS", pattern: /@tailwind|tailwindcss|tailwind\.css/ }, - { id: "styled-components", label: "Styled Components", pattern: /styled-components|createGlobalStyle/ }, - { id: "router-provider", label: "React Router", pattern: /MemoryRouter|BrowserRouter|RouterProvider/ }, - { id: "redux-provider", label: "Redux Provider", pattern: /react-redux.*Provider| { - const dir = join(projectPath, ".storybook"); + const dir = join(projectPath, '.storybook'); if (!existsSync(dir)) return []; // Read all entries recursively, then attempt to read each as a file @@ -27,11 +47,11 @@ export async function detectSetupPatterns(projectPath: string): Promise { const fullPath = join(dir, entry); try { - return { path: fullPath, content: await readFile(fullPath, "utf-8") }; + return { path: fullPath, content: await readFile(fullPath, 'utf-8') }; } catch { return null; // directories or unreadable files } - }), + }) ); const files = fileContents.filter((f): f is { path: string; content: string } => f !== null); diff --git a/scripts/eval/lib/utils.test.ts b/scripts/eval/lib/utils.test.ts index a230b00740bb..55f68bbc034d 100644 --- a/scripts/eval/lib/utils.test.ts +++ b/scripts/eval/lib/utils.test.ts @@ -1,6 +1,13 @@ import { describe, expect, it } from 'vitest'; -import { formatDuration, formatCost, generateTrialId, loadPrompt, listPrompts, formatTable } from './utils'; +import { + formatDuration, + formatCost, + generateTrialId, + loadPrompt, + listPrompts, + formatTable, +} from './utils'; 
describe('formatDuration', () => { it('formats seconds under a minute', () => { @@ -103,7 +110,10 @@ describe('formatTable', () => { it('formats a simple table with aligned columns', () => { const result = formatTable( ['Name', 'Score'], - [['Alice', '100'], ['Bob', '95']], + [ + ['Alice', '100'], + ['Bob', '95'], + ] ); const lines = result.split('\n'); expect(lines).toHaveLength(4); // header + divider + 2 rows @@ -115,10 +125,7 @@ describe('formatTable', () => { }); it('auto-sizes columns to fit content', () => { - const result = formatTable( - ['X', 'Y'], - [['short', 'a-much-longer-value']], - ); + const result = formatTable(['X', 'Y'], [['short', 'a-much-longer-value']]); const lines = result.split('\n'); // Header column for Y should be padded to match the data width const headerCols = lines[0].split(' | '); @@ -128,10 +135,7 @@ describe('formatTable', () => { it('handles ANSI escape codes in cells', () => { const green = '\x1b[32mPASS\x1b[39m'; - const result = formatTable( - ['Status'], - [[green], ['FAIL']], - ); + const result = formatTable(['Status'], [[green], ['FAIL']]); const lines = result.split('\n'); // Both rows should be the same visible width // The ANSI row has extra invisible chars but should still align diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index ca2ee217cfc6..1439edef3f38 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -1,25 +1,25 @@ -import { readFileSync, existsSync, readdirSync } from "node:fs"; -import { writeFile } from "node:fs/promises"; -import { resolve, basename, join } from "node:path"; -import pc from "picocolors"; -import { x } from "tinyexec"; -import type { Logger } from "../types.ts"; - -export const REPO_ROOT = resolve(import.meta.dirname, "..", "..", ".."); -export const EVAL_ROOT = resolve(REPO_ROOT, "..", "storybook-eval"); -export const CACHE_DIR = resolve(EVAL_ROOT, ".cache", "repos"); -export const TRIALS_DIR = resolve(EVAL_ROOT, "trials"); -export const 
PROMPTS_DIR = resolve(import.meta.dirname, "..", "prompts"); +import { readFileSync, existsSync, readdirSync } from 'node:fs'; +import { writeFile } from 'node:fs/promises'; +import { resolve, basename, join } from 'node:path'; +import pc from 'picocolors'; +import { x } from 'tinyexec'; +import type { Logger } from '../types.ts'; + +export const REPO_ROOT = resolve(import.meta.dirname, '..', '..', '..'); +export const EVAL_ROOT = resolve(REPO_ROOT, '..', 'storybook-eval'); +export const CACHE_DIR = resolve(EVAL_ROOT, '.cache', 'repos'); +export const TRIALS_DIR = resolve(EVAL_ROOT, 'trials'); +export const PROMPTS_DIR = resolve(import.meta.dirname, '..', 'prompts'); // --- Logging --- export function createLogger(prefix?: string): Logger { - const p = prefix ? pc.dim(`[${prefix}]`) + " " : ""; + const p = prefix ? pc.dim(`[${prefix}]`) + ' ' : ''; return { log: (msg: string) => console.log(`${p}${msg}`), - logStep: (msg: string) => console.log(`${p} ${pc.cyan(">")} ${msg}`), - logSuccess: (msg: string) => console.log(`${p} ${pc.green("✓")} ${msg}`), - logError: (msg: string) => console.log(`${p} ${pc.red("✗")} ${msg}`), + logStep: (msg: string) => console.log(`${p} ${pc.cyan('>')} ${msg}`), + logSuccess: (msg: string) => console.log(`${p} ${pc.green('✓')} ${msg}`), + logError: (msg: string) => console.log(`${p} ${pc.red('✗')} ${msg}`), }; } @@ -28,54 +28,54 @@ export function createLogger(prefix?: string): Logger { export const formatDuration = (s: number) => s < 60 ? `${Math.round(s)}s` : `${Math.floor(s / 60)}m${Math.round(s % 60)}s`; -export const formatCost = (cost?: number) => (cost == null ? "-" : `$${cost.toFixed(2)}`); +export const formatCost = (cost?: number) => (cost == null ? 
'-' : `$${cost.toFixed(2)}`); export function generateTrialId(project: string, agent: string, model: string, prompt: string) { - const ts = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); + const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); return `${ts}-${project}-${agent}-${model}-${prompt}-${crypto.randomUUID().slice(0, 8)}`; } // --- Table formatting --- /** Strip ANSI escape codes for accurate width calculation. */ -const stripAnsi = (str: string) => str.replace(/\x1b\[[0-9;]*m/g, ""); +const stripAnsi = (str: string) => str.replace(/\x1b\[[0-9;]*m/g, ''); /** Format data as an aligned table with automatic column widths. */ export function formatTable(headers: string[], rows: string[][]): string { const widths = headers.map((h, i) => - Math.max(h.length, ...rows.map((r) => stripAnsi(r[i] ?? "").length)), + Math.max(h.length, ...rows.map((r) => stripAnsi(r[i] ?? '').length)) ); const pad = (str: string, width: number) => { const visible = stripAnsi(str).length; - return str + " ".repeat(Math.max(0, width - visible)); + return str + ' '.repeat(Math.max(0, width - visible)); }; - const sep = " | "; + const sep = ' | '; return [ headers.map((h, i) => pad(h, widths[i])).join(sep), - widths.map((w) => "-".repeat(w)).join("-+-"), + widths.map((w) => '-'.repeat(w)).join('-+-'), ...rows.map((row) => row.map((cell, i) => pad(cell, widths[i])).join(sep)), - ].join("\n"); + ].join('\n'); } // --- Prompts --- /** Load a prompt by name from prompts/{name}.md. */ -export function loadPrompt(name = "setup"): string { +export function loadPrompt(name = 'setup'): string { const file = resolve(PROMPTS_DIR, `${name}.md`); if (!existsSync(file)) { - throw new Error(`Prompt not found: ${file}\nAvailable: ${listPrompts().join(", ")}`); + throw new Error(`Prompt not found: ${file}\nAvailable: ${listPrompts().join(', ')}`); } - return readFileSync(file, "utf-8").trim(); + return readFileSync(file, 'utf-8').trim(); } /** List available prompt names. 
*/ export function listPrompts(): string[] { if (!existsSync(PROMPTS_DIR)) return []; return readdirSync(PROMPTS_DIR) - .filter((f) => f.endsWith(".md")) - .map((f) => basename(f, ".md")); + .filter((f) => f.endsWith('.md')) + .map((f) => basename(f, '.md')); } // --- Environment capture --- @@ -89,15 +89,15 @@ export interface EvalEnvironment { } export async function captureEnvironment(resultsDir: string): Promise { - let evalBranch = "unknown"; - let evalCommit = "unknown"; + let evalBranch = 'unknown'; + let evalCommit = 'unknown'; try { - evalBranch = (await x("git", ["rev-parse", "--abbrev-ref", "HEAD"])).stdout.trim(); - evalCommit = (await x("git", ["rev-parse", "HEAD"])).stdout.trim(); + evalBranch = (await x('git', ['rev-parse', '--abbrev-ref', 'HEAD'])).stdout.trim(); + evalCommit = (await x('git', ['rev-parse', 'HEAD'])).stdout.trim(); } catch { /* not in a git repo */ } const env: EvalEnvironment = { nodeVersion: process.version, evalBranch, evalCommit }; - await writeFile(join(resultsDir, "environment.json"), JSON.stringify(env, null, 2)); + await writeFile(join(resultsDir, 'environment.json'), JSON.stringify(env, null, 2)); return env; } diff --git a/scripts/eval/types.test.ts b/scripts/eval/types.test.ts index eca9509fc7e6..1af38c595900 100644 --- a/scripts/eval/types.test.ts +++ b/scripts/eval/types.test.ts @@ -21,7 +21,7 @@ describe('AGENTS', () => { defaultModel: 'sonnet-4.6', defaultEffort: 'high', sdkModelIds: Object.fromEntries( - AGENTS.claude.models.map((model) => [model, expect.any(String)]), + AGENTS.claude.models.map((model) => [model, expect.any(String)]) ), }); }); @@ -38,7 +38,7 @@ describe('AGENTS', () => { cachedInput: expect.any(Number), output: expect.any(Number), }, - ]), + ]) ), }); }); diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index dbf90db52c7e..8c82e3edeefd 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -16,17 +16,17 @@ export interface Logger { // --- Agent --- -export type ClaudeModel = 
"sonnet-4.6" | "opus-4.6" | "haiku-4.5"; -export type CodexModel = "gpt-5.4"; -export type ClaudeEffort = "low" | "medium" | "high" | "max"; -export type CodexEffort = "low" | "medium" | "high" | "xhigh"; +export type ClaudeModel = 'sonnet-4.6' | 'opus-4.6' | 'haiku-4.5'; +export type CodexModel = 'gpt-5.4'; +export type ClaudeEffort = 'low' | 'medium' | 'high' | 'max'; +export type CodexEffort = 'low' | 'medium' | 'high' | 'xhigh'; /** Agent + model + effort — the three values that define how the agent runs. */ export type AgentVariant = - | { agent: "claude"; model: ClaudeModel; effort: ClaudeEffort } - | { agent: "codex"; model: CodexModel; effort: CodexEffort }; + | { agent: 'claude'; model: ClaudeModel; effort: ClaudeEffort } + | { agent: 'codex'; model: CodexModel; effort: CodexEffort }; -export type AgentId = AgentVariant["agent"]; +export type AgentId = AgentVariant['agent']; export interface AgentExecuteParams { prompt: string; @@ -87,7 +87,7 @@ export interface Execution { export interface FileChange { path: string; - status: "A" | "M" | "D" | "R"; + status: 'A' | 'M' | 'D' | 'R'; /** For renames, the original path before the move. */ previousPath?: string; } From 35561d2bf257f31337d22e77dfe05eb4b68a8ddd Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Mon, 30 Mar 2026 18:27:40 +0700 Subject: [PATCH 47/63] Eval: import ghost stories from core-server, grade without empty renders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Export `getComponentCandidates` and `runGhostStories` from `storybook/internal/core-server` so eval uses the real implementation instead of a duplicate. - Delete `scripts/eval/lib/ghost-stories.ts` — no more wrapper file. - Grade ghost stories using `successRateWithoutEmptyRender` so components that render an empty DOM don't count as passing. - Simplify eval CLI to single-trial runs with Zod discriminated union. - Derive agent model/effort types from const arrays. 
- Expand setup prompt with step-by-step instructions and self-healing loop. - Remove unused self-heal prompt. --- code/core/src/core-server/index.ts | 3 + scripts/eval/config.ts | 21 +- scripts/eval/eval.ts | 240 ++++++++--------------- scripts/eval/lib/ghost-stories.ts | 127 ------------ scripts/eval/lib/grade.ts | 23 ++- scripts/eval/lib/grading-helpers.test.ts | 4 +- scripts/eval/lib/utils.test.ts | 11 +- scripts/eval/prompts/self-heal.md | 11 -- scripts/eval/prompts/setup.md | 207 +++++++++++++++++-- scripts/eval/types.ts | 29 ++- scripts/package.json | 1 + scripts/tsconfig.json | 1 + yarn.lock | 1 + 13 files changed, 330 insertions(+), 349 deletions(-) delete mode 100644 scripts/eval/lib/ghost-stories.ts delete mode 100644 scripts/eval/prompts/self-heal.md diff --git a/code/core/src/core-server/index.ts b/code/core/src/core-server/index.ts index f475fa6166ca..b1669cb685c1 100644 --- a/code/core/src/core-server/index.ts +++ b/code/core/src/core-server/index.ts @@ -32,3 +32,6 @@ export { } from './stores/test-provider'; export { getServerPort } from './utils/server-address'; + +export { getComponentCandidates } from './utils/ghost-stories/get-candidates'; +export { runGhostStories } from './utils/ghost-stories/run-story-tests'; diff --git a/scripts/eval/config.ts b/scripts/eval/config.ts index 6fcd8596229f..12582790b9fe 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/config.ts @@ -5,7 +5,14 @@ * and cost estimation utilities. */ -import type { AgentId, Project } from './types.ts'; +import { + CLAUDE_MODELS, + CODEX_MODELS, + CLAUDE_EFFORTS, + CODEX_EFFORTS, + type AgentId, + type Project, +} from './types.ts'; // --- Pricing --- @@ -24,19 +31,19 @@ export interface TokenUsage { // --- Agent Definition --- export interface AgentDefinition { - models: string[]; + models: readonly string[]; defaultModel: string; /** Map friendly model names to SDK-specific model IDs (e.g. "sonnet-4.6" → "claude-sonnet-4-6"). 
*/ sdkModelIds: Record; /** Per-million-token pricing for manual cost estimation (agents that don't report cost natively). */ pricing: Record; - efforts: string[]; + efforts: readonly string[]; defaultEffort: string; } export const AGENTS: Record = { claude: { - models: ['sonnet-4.6', 'opus-4.6', 'haiku-4.5'], + models: CLAUDE_MODELS, defaultModel: 'sonnet-4.6', sdkModelIds: { 'sonnet-4.6': 'claude-sonnet-4-6', @@ -44,17 +51,17 @@ export const AGENTS: Record = { 'haiku-4.5': 'claude-haiku-4-5', }, pricing: {}, - efforts: ['low', 'medium', 'high', 'max'], + efforts: CLAUDE_EFFORTS, defaultEffort: 'high', }, codex: { - models: ['gpt-5.4'], + models: CODEX_MODELS, defaultModel: 'gpt-5.4', sdkModelIds: {}, pricing: { 'gpt-5.4': { input: 2.5, cachedInput: 0.625, output: 10.0 }, }, - efforts: ['low', 'medium', 'high', 'xhigh'], + efforts: CODEX_EFFORTS, defaultEffort: 'high', }, }; diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 9c2e33de2e99..4ca81dc856fd 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -1,56 +1,79 @@ /** - * Eval harness entry point — single or parallel trial runs. + * Eval harness entry point. * * Runs with `node ./eval/eval.ts` (no jiti). Node 22+ supports .ts natively * via type stripping. Import specifiers use explicit .ts extensions. 
* * Usage: - * node eval/eval.ts -p mealdrop # single run (claude, default model) - * node eval/eval.ts -p mealdrop -m gpt-5.4 # single run (agent inferred from model) - * node eval/eval.ts -p mealdrop -m sonnet-4.6 -m gpt-5.4 # parallel runs - * node eval/eval.ts -p mealdrop -a claude -a codex # parallel runs (default model each) - * node eval/eval.ts --list-projects # list projects - * node eval/eval.ts --list-models # list models - * node eval/eval.ts --list-prompts # list prompts + * node eval/eval.ts -p mealdrop # claude defaults + * node eval/eval.ts -p mealdrop -a codex # codex defaults + * node eval/eval.ts -p mealdrop -m gpt-5.4 # codex (inferred) + * node eval/eval.ts -p mealdrop -a claude -e max # claude with max effort + * node eval/eval.ts --list-projects + * node eval/eval.ts --list-models + * node eval/eval.ts --list-prompts */ import { parseArgs } from 'node:util'; import { z } from 'zod'; -import { randomUUID } from 'node:crypto'; import pc from 'picocolors'; -import type { AgentId, TrialConfig, TrialReport } from './types.ts'; +import { + AGENT_IDS, + CLAUDE_MODELS, + CLAUDE_EFFORTS, + CODEX_MODELS, + CODEX_EFFORTS, + type AgentId, + type AgentVariant, + type TrialConfig, +} from './types.ts'; import { AGENTS, PROJECTS } from './config.ts'; import { runTrial } from './lib/run-trial.ts'; -import { createLogger, formatDuration, formatCost, formatTable, listPrompts } from './lib/utils.ts'; +import { createLogger, formatDuration, formatCost, listPrompts } from './lib/utils.ts'; -// --- Derive valid options from config --- +// --- Helpers --- const PROJECT_NAMES = PROJECTS.map((p) => p.name) as [string, ...string[]]; -const AGENT_NAMES = Object.keys(AGENTS) as [string, ...string[]]; -const ALL_MODELS = Object.values(AGENTS).flatMap((a) => a.models) as [string, ...string[]]; -const ALL_EFFORTS = [...new Set(Object.values(AGENTS).flatMap((a) => a.efforts))] as [ - string, - ...string[], -]; -// --- Parse & validate CLI args --- +function 
inferAgent(model: string): AgentId { + for (const id of AGENT_IDS) { + if (AGENTS[id].models.includes(model)) return id; + } + throw new Error(`No agent found for model: ${model}`); +} -const argsSchema = z.object({ +// --- CLI schema: base options + discriminated union on agent --- + +const base = { project: z.enum(PROJECT_NAMES).optional(), - agent: z.array(z.enum(AGENT_NAMES)).optional(), - model: z.array(z.enum(ALL_MODELS)).optional(), - effort: z.enum(ALL_EFFORTS).optional(), prompt: z.string().default('setup'), verbose: z.boolean().default(false), listProjects: z.boolean().default(false), listModels: z.boolean().default(false), listPrompts: z.boolean().default(false), -}); +}; + +const argsSchema = z.discriminatedUnion('agent', [ + z.object({ + ...base, + agent: z.literal('claude'), + model: z.enum(CLAUDE_MODELS).default('sonnet-4.6'), + effort: z.enum(CLAUDE_EFFORTS).default('high'), + }), + z.object({ + ...base, + agent: z.literal('codex'), + model: z.enum(CODEX_MODELS).default('gpt-5.4'), + effort: z.enum(CODEX_EFFORTS).default('high'), + }), +]); + +// --- Parse CLI --- const { values } = parseArgs({ options: { project: { type: 'string', short: 'p' }, - agent: { type: 'string', short: 'a', multiple: true }, - model: { type: 'string', short: 'm', multiple: true }, + agent: { type: 'string', short: 'a' }, + model: { type: 'string', short: 'm' }, effort: { type: 'string', short: 'e' }, prompt: { type: 'string' }, verbose: { type: 'boolean', short: 'v' }, @@ -62,8 +85,12 @@ const { values } = parseArgs({ strict: true, }); +// Resolve the discriminator: explicit --agent, inferred from --model, or default to claude. +const agent = values.agent ?? (values.model ? 
inferAgent(values.model) : 'claude'); + const parsed = argsSchema.safeParse({ ...values, + agent, listProjects: values['list-projects'], listModels: values['list-models'], listPrompts: values['list-prompts'], @@ -86,8 +113,8 @@ if (args.listProjects) { process.exit(0); } if (args.listModels) { - for (const [agent, { models }] of Object.entries(AGENTS)) { - logger.log(`\n ${pc.bold(agent)}`); + for (const [name, { models }] of Object.entries(AGENTS)) { + logger.log(`\n ${pc.bold(name)}`); for (const m of models) logger.log(` ${m}`); } process.exit(0); @@ -97,7 +124,7 @@ if (args.listPrompts) { process.exit(0); } -// --- Validate project (required when not listing) --- +// --- Validate project --- if (!args.project) { logger.log(pc.red(`Specify a project with -p. Available: ${PROJECT_NAMES.join(', ')}`)); @@ -105,140 +132,37 @@ if (!args.project) { } const project = PROJECTS.find((p) => p.name === args.project)!; -// --- Build agent/model pairs (zod already validated individual values) --- - -function inferAgent(model: string): AgentId { - return Object.entries(AGENTS).find(([, cfg]) => cfg.models.includes(model))![0] as AgentId; -} - -const agentModels: Array<{ agent: AgentId; model: string }> = args.model - ? args.model - .map((m) => ({ agent: inferAgent(m), model: m })) - .filter((am) => !args.agent || args.agent.includes(am.agent)) - : args.agent - ? args.agent.map((a) => ({ agent: a as AgentId, model: AGENTS[a as AgentId].defaultModel })) - : [{ agent: 'claude', model: AGENTS.claude.defaultModel }]; - -const promptNames = args.prompt.split(','); -const configs = agentModels.flatMap(({ agent, model }) => { - const effort = args.effort ?? 
AGENTS[agent].defaultEffort; - return promptNames.map((prompt) => ({ - config: { - project, - variant: { agent, model, effort }, - prompt, - verbose: args.verbose, - } as TrialConfig, - label: `${model}+${prompt}`, - })); -}); +// --- Run trial --- -// --- Print header --- +const variant: AgentVariant = + args.agent === 'claude' + ? { agent: args.agent, model: args.model, effort: args.effort } + : { agent: args.agent, model: args.model, effort: args.effort }; -const runId = randomUUID().slice(0, 8); logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); -if (configs.length === 1) { - const { - variant: { agent, model, effort }, - prompt, - } = configs[0].config; - logger.log(`Agent: ${agent} | Model: ${model} | Effort: ${effort} | Prompt: ${prompt}`); -} else { - logger.log(`${configs.length} parallel runs`); - for (const [agent, { models }] of Object.entries(AGENTS)) { - const active = models.filter((m) => configs.some((c) => c.config.variant.model === m)); - if (active.length > 0) logger.log(` ${agent}: ${active.join(', ')}`); - } - logger.log(` prompts: ${[...new Set(promptNames)].join(', ')}`); -} -logger.log(`Run: ${runId}\n`); - -// --- Execute (always use allSettled — works for 1 or N runs) --- - -const settled = await Promise.allSettled( - configs.map((c) => runTrial(c.config, createLogger(configs.length > 1 ? c.label : undefined))) +logger.log( + `Agent: ${variant.agent} | Model: ${variant.model} | Effort: ${variant.effort} | Prompt: ${args.prompt}\n` ); -const results: TrialReport[] = []; -for (const [i, s] of settled.entries()) { - if (s.status === 'fulfilled') { - results.push(s.value); - } else { - logger.logError( - `${configs[i].label}: ${s.reason instanceof Error ? 
s.reason.message : s.reason}` - ); - } -} - -if (results.length === 0) { - process.exit(1); -} +const result = await runTrial( + { project, variant, prompt: args.prompt, verbose: args.verbose } satisfies TrialConfig, + logger +); -// --- Print results --- - -if (results.length === 1) { - const r = results[0]; - const ghost = r.grade.ghostStories; - const ghostStr = ghost - ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` - : '-'; - - logger.log(pc.bold('\nResult')); - logger.log(` Build: ${r.grade.buildSuccess ? pc.green('PASS') : pc.red('FAIL')}`); - logger.log(` Ghost: ${ghostStr}`); - logger.log(` TS Err: ${r.grade.typeCheckErrors}`); - logger.log(` Score: ${r.score.score}`); - logger.log(` Cost: ${formatCost(r.execution.cost)}`); - logger.log(` Time: ${formatDuration(r.execution.duration)}`); - logger.log(` Turns: ${r.execution.turns}`); -} else { - results.sort( - (a, b) => (b.grade.ghostStories?.successRate ?? -1) - (a.grade.ghostStories?.successRate ?? -1) - ); - - const headers = [ - 'Agent', - 'Model', - 'Prompt', - 'Build', - 'Ghost', - 'TS Err', - 'Score', - 'Cost', - 'Time', - 'Turns', - ]; - const rows = results.map((r) => { - const ghost = r.grade.ghostStories; - const ghostStr = ghost - ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` - : '-'; - return [ - r.variant.agent, - r.variant.model, - r.prompt, - r.grade.buildSuccess ? pc.green('PASS') : pc.red('FAIL'), - ghostStr, - String(r.grade.typeCheckErrors), - String(r.score.score), - formatCost(r.execution.cost), - formatDuration(r.execution.duration), - String(r.execution.turns), - ]; - }); - - logger.log(pc.bold('\n\nResults (sorted by ghost stories rate)')); - logger.log(formatTable(headers, rows)); - - const totalCost = results.reduce((s, r) => s + (r.execution.cost || 0), 0); - const ghostRates = results - .map((r) => r.grade.ghostStories?.successRate) - .filter((r): r is number => r != null); - const avgGhost = - ghostRates.length > 0 ? 
ghostRates.reduce((s, r) => s + r, 0) / ghostRates.length : 0; - - logger.log(`\nGhost stories avg: ${pc.bold(`${Math.round(avgGhost * 100)}%`)}`); - logger.log(`Total cost: ${pc.bold(formatCost(totalCost))}`); -} +// --- Print result --- + +const ghost = result.grade.ghostStories; +const ghostStr = ghost + ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` + : '-'; + +logger.log(pc.bold('\nResult')); +logger.log(` Build: ${result.grade.buildSuccess ? pc.green('PASS') : pc.red('FAIL')}`); +logger.log(` Ghost: ${ghostStr}`); +logger.log(` TS Err: ${result.grade.typeCheckErrors}`); +logger.log(` Score: ${result.score.score}`); +logger.log(` Cost: ${formatCost(result.execution.cost)}`); +logger.log(` Time: ${formatDuration(result.execution.duration)}`); +logger.log(` Turns: ${result.execution.turns}`); logger.log('\nDone.'); diff --git a/scripts/eval/lib/ghost-stories.ts b/scripts/eval/lib/ghost-stories.ts deleted file mode 100644 index 964cc22ed33f..000000000000 --- a/scripts/eval/lib/ghost-stories.ts +++ /dev/null @@ -1,127 +0,0 @@ -/** - * Ghost stories: discover component candidates and run vitest-based - * ghost story tests to measure how many components render successfully. - * - * Self-contained — does not import from code/core. Uses the same vitest - * + STORYBOOK_COMPONENT_PATHS approach that core-server uses internally, - * but decoupled so eval has no cross-package source imports. 
- */ - -import { existsSync } from 'node:fs'; -import { glob, readFile } from 'node:fs/promises'; -import { join, resolve } from 'node:path'; -import { tmpdir } from 'node:os'; -import { x } from 'tinyexec'; - -const COMPONENT_GLOB = '**/*.{tsx,jsx}'; -const IGNORE_PATTERNS = [ - '**/node_modules/**', - '**/.git/**', - '**/dist/**', - '**/__mocks__/**', - '**/build/**', - '**/storybook-static/**', - '**/*.test.*', - '**/*.spec.*', - '**/*.stories.*', - '**/*.story.*', - '**/*.d.*', - '**/*.config.*', - '**/stories/{Button,Header,Page}.*', - '**/stories/{button,header,page}.*', -]; - -export class GhostStoryError extends Error { - constructor(message: string) { - super(message); - this.name = 'GhostStoryError'; - } -} - -/** - * Find component files that are candidates for ghost story testing. - * Uses glob-based discovery — sufficient for eval grading purposes. - */ -export async function findComponentCandidates(opts: { - cwd: string; - sampleSize?: number; -}): Promise { - const { cwd, sampleSize = 20 } = opts; - const files = await Array.fromAsync( - glob(COMPONENT_GLOB, { - cwd, - exclude: IGNORE_PATTERNS, - }) - ); - return files.map((file) => resolve(cwd, file)).slice(0, sampleSize); -} - -export interface GhostStoryOutput { - total: number; - passed: number; - successRate: number; -} - -/** - * Run ghost stories by executing vitest with STORYBOOK_COMPONENT_PATHS. - * - * The storybook vitest plugin auto-generates and tests stories for the - * specified component files. Non-zero exit from vitest is expected when - * some stories fail — we parse the JSON report for actual results. 
- */ -export async function runGhostStories( - candidates: string[], - opts: { cwd: string } -): Promise { - const outputFile = join(tmpdir(), `ghost-stories-${Date.now()}.json`); - - const result = await x( - 'npx', - [ - 'vitest', - 'run', - '--reporter=json', - '--testTimeout=1000', - `--outputFile=${outputFile}`, - ...candidates, - ], - { - throwOnError: false, - timeout: 300_000, - nodeOptions: { - cwd: opts.cwd, - env: { - ...process.env, - STORYBOOK_COMPONENT_PATHS: candidates.join(';'), - }, - }, - } - ); - - const stderr = (result.stderr ?? '').toLowerCase(); - if (stderr.includes('browsertype.launch')) { - throw new GhostStoryError('Playwright not installed'); - } - if (stderr.includes('no tests found')) { - throw new GhostStoryError('No tests found'); - } - - if (!existsSync(outputFile)) { - throw new GhostStoryError('JSON report not found'); - } - - let report: any; - try { - report = JSON.parse(await readFile(outputFile, 'utf-8')); - } catch { - throw new GhostStoryError('Failed to parse vitest report'); - } - - if (!report.testResults?.length) { - throw new GhostStoryError('No test results in report'); - } - const total: number = report.numTotalTests ?? 0; - const passed: number = report.numPassedTests ?? 0; - const successRate = total > 0 ? Math.round((passed / total) * 100) / 100 : 0; - return { total, passed, successRate }; -} diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 14fcaece7c0b..08481c99b38b 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -12,7 +12,7 @@ import type { import { DEFAULT_SCORE_WEIGHTS } from '../types.ts'; import { x } from 'tinyexec'; import { detectSetupPatterns } from './setup-patterns.ts'; -import { findComponentCandidates, runGhostStories } from './ghost-stories.ts'; +import { getComponentCandidates, runGhostStories } from 'storybook/internal/core-server'; /** Maximum TypeScript errors before the typecheck score reaches 0. 
*/ const MAX_TYPECHECK_ERRORS = 20; @@ -212,7 +212,7 @@ async function gradeGhostStories( logger.logStep('Running ghost stories...'); try { - const candidates = await findComponentCandidates({ sampleSize: 20, cwd: projectPath }); + const { candidates } = await getComponentCandidates({ sampleSize: 20, cwd: projectPath }); if (candidates.length === 0) { logger.logError('No candidate components found'); return undefined; @@ -220,17 +220,26 @@ async function gradeGhostStories( logger.logStep(`Found ${candidates.length} candidate component(s)`); const result = await runGhostStories(candidates, { cwd: projectPath }); - if (result.total > 0) { + + if (result.runError) { + logger.logError(`Ghost stories: ${result.runError}`); + return undefined; + } + + const summary = 'summary' in result ? result.summary : undefined; + + if (summary && summary.total > 0) { + const realPassed = summary.passed - summary.passedButEmptyRender; logger.logSuccess( - `Ghost stories: ${result.passed}/${result.total} passed (${Math.round(result.successRate * 100)}%)` + `Ghost stories: ${realPassed}/${summary.total} passed (${Math.round(summary.successRateWithoutEmptyRender * 100)}%)${summary.passedButEmptyRender > 0 ? ` (${summary.passedButEmptyRender} empty renders excluded)` : ''}` ); } return { candidateCount: candidates.length, - total: result.total, - passed: result.passed, - successRate: result.successRate, + total: summary?.total ?? 0, + passed: (summary?.passed ?? 0) - (summary?.passedButEmptyRender ?? 0), + successRate: summary?.successRateWithoutEmptyRender ?? 0, }; } catch (error) { logger.logError(`Ghost stories: ${error instanceof Error ? 
error.message : String(error)}`); diff --git a/scripts/eval/lib/grading-helpers.test.ts b/scripts/eval/lib/grading-helpers.test.ts index c002b9fea4fa..5ea8477d2faa 100644 --- a/scripts/eval/lib/grading-helpers.test.ts +++ b/scripts/eval/lib/grading-helpers.test.ts @@ -4,7 +4,7 @@ import { tmpdir } from 'node:os'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { findComponentCandidates } from './ghost-stories'; +import { getComponentCandidates } from 'storybook/internal/core-server'; import { computeQualityScore, countTypeCheckErrors, @@ -38,7 +38,7 @@ function writeFile(relativePath: string, content: string) { } async function findCandidates(cwd: string) { - const candidates = await findComponentCandidates({ cwd, sampleSize: 20 }); + const { candidates } = await getComponentCandidates({ cwd, sampleSize: 20 }); return candidates.map((c) => c.replace(cwd + '/', '')); } diff --git a/scripts/eval/lib/utils.test.ts b/scripts/eval/lib/utils.test.ts index 55f68bbc034d..7b4ebe4e5024 100644 --- a/scripts/eval/lib/utils.test.ts +++ b/scripts/eval/lib/utils.test.ts @@ -67,7 +67,6 @@ describe('listPrompts', () => { it('lists available prompt names', () => { const prompts = listPrompts(); expect(prompts).toContain('setup'); - expect(prompts).toContain('self-heal'); }); it('returns only names without .md extension', () => { @@ -86,14 +85,8 @@ describe('loadPrompt', () => { it('loads setup prompt by name', () => { const prompt = loadPrompt('setup'); - expect(prompt).toContain('Storybook setup'); - expect(prompt).not.toContain('React + Vite'); - }); - - it('loads self-heal prompt', () => { - const prompt = loadPrompt('self-heal'); - expect(prompt).toContain('Self-healing'); - expect(prompt).toContain('vitest'); + expect(prompt).toContain('Storybook'); + expect(prompt).toContain('### Step 1'); }); it('throws for unknown prompt', () => { diff --git a/scripts/eval/prompts/self-heal.md b/scripts/eval/prompts/self-heal.md deleted file mode 100644 index 
a5950f8263de..000000000000 --- a/scripts/eval/prompts/self-heal.md +++ /dev/null @@ -1,11 +0,0 @@ -## Self-healing loop - -Storybook init created a Vitest integration (`npx vitest --project=storybook`). Use it to verify your setup: - -1. Run `npx vitest run --project=storybook` to test if stories render. -2. Read the error output carefully — it tells you exactly which stories fail and why. -3. Make the smallest fix that addresses the root cause (missing provider, missing CSS, wrong alias, etc.). -4. Re-run `npx vitest run --project=storybook`. -5. Repeat until all stories pass or remaining failures are clearly outside Storybook setup scope. - -Do not stop after the first partial improvement. Keep iterating. diff --git a/scripts/eval/prompts/setup.md b/scripts/eval/prompts/setup.md index 342ec4749339..4021e67becde 100644 --- a/scripts/eval/prompts/setup.md +++ b/scripts/eval/prompts/setup.md @@ -1,27 +1,196 @@ -You are finishing Storybook setup for an existing codebase. +Attention: The following instructions must be followed in order to successfully set up Storybook in this project. Do not skip steps or attempt to do them out of order. -## Starting state +Your goal is to make Storybook fully functional in this project by analyzing the codebase, +configuring the preview with the right decorators, and writing example stories for 9 components. -- Storybook was already installed with `npx storybook@latest init --yes`. -- Do not rerun `storybook init`. -- The goal is not to create a demo app. The goal is to make Storybook work for the actual project code. +Work through these steps in order. After each story file, run Vitest to verify it renders. +If the test fails, read the error, fix the issue, and re-run until it passes before moving on. -## Objectives +### Step 1: Analyze the codebase -1. Make Storybook render the project's real components with the providers, globals, aliases, styles, mocks, and environment they need. -2. 
Replace or remove init placeholder stories/components when they stop being useful. -3. Add or update a small representative set of stories for existing components from the project. -4. Prefer reusable setup in `.storybook` over per-story hacks. +Before writing any stories, understand what the components need to render: -## Constraints +- Scan the project for context providers, theme systems, routers, stores, and i18n setups. + Look at the app's entry point (e.g. `App.tsx`, `main.tsx`, `layout.tsx`) to see what + providers wrap the component tree. +- Identify global CSS or style imports required for components to look correct. +- Note any path aliases configured in tsconfig or bundler config. +- Read `.storybook/main.ts` (or `main.js`) to find the `stories` glob patterns. + Your story files must match those patterns to be picked up by Storybook. -- Keep changes focused on Storybook setup and the minimum related support files. -- Avoid changing product source unless genuinely required. -- Reuse existing app providers and styling entry points when possible. +### Step 2: Configure `.storybook/preview.ts` with decorators -## Verification +Add decorators that wrap every story with the providers your components need. +Without this, most non-trivial components will crash. -- Use `npx vitest run --project=storybook` to verify stories render correctly. -- Use `npx storybook build` as a secondary check. -- Fix the highest-signal Storybook problem first. -- Iterate until the setup is stable enough that another user can keep writing stories without additional setup work. 
+If the project uses CSF Factory (look for `definePreview` in `.storybook/preview.ts`): +```ts +// .storybook/preview.ts +import '../src/index.css'; // import global styles + +import { definePreview } from 'storybook/preview'; + +export const config = definePreview({ + decorators: [ + (Story) => ( + <ThemeProvider> + <MemoryRouter> + <Story /> + </MemoryRouter> + </ThemeProvider> + ), + ], +}); +``` + +Otherwise: +```ts +// .storybook/preview.ts +import '../src/index.css'; // import global styles + +const preview = { + decorators: [ + (Story) => ( + <ThemeProvider> + <MemoryRouter> + <Story /> + </MemoryRouter> + </ThemeProvider> + ), + ], +}; +export default preview; +``` + +Common decorators to add: +- **Theme providers** (e.g. ThemeProvider, MUI ThemeProvider, styled-components, Tailwind) +- **Router** (e.g. MemoryRouter, BrowserRouter mock) +- **State stores** (e.g. Redux Provider, Zustand, Jotai) +- **i18n** (e.g. IntlProvider, I18nextProvider) +- **Global CSS** — import global stylesheets at the top of preview.ts + +### Step 3: Write stories for 9 components + +Pick 9 real components from the codebase, 3 of each complexity level. +Use the title prefix `AI Generated/<Complexity>/<ComponentName>` so they are grouped +together in the Storybook sidebar. + +**Simple (3 components)** — Presentational with few props, no internal state. +Examples: Button, Badge, Avatar, Icon, Label, Chip. +Title format: `AI Generated/Simple/<ComponentName>` + +**Medium (3 components)** — Multiple visual variants or composed from simpler components. +Examples: Card, Alert, Input, Select, Tooltip, Tabs. +Title format: `AI Generated/Medium/<ComponentName>` + +**Complex (3 components)** — Internal state, side effects, or deep composition. +Examples: Modal, DataTable, Form, Dropdown, Accordion, Sidebar. +Title format: `AI Generated/Complex/<ComponentName>` + +For each component, create a `.stories.ts` file next to the component. +Each file must have at least 2 story exports covering the component's main states. +Make sure the file location and naming matches the `stories` patterns in `.storybook/main.ts`. 
+ +If the project uses CSF Factory (look for `definePreview` / `config.meta` patterns): + +Story format (CSF Factory — this project uses CSF factories): +```ts +import { config } from '#.storybook/preview'; +import { Button } from './Button'; + +const meta = config.meta({ + title: 'AI Generated/Simple/Button', + component: Button, +}); + +export const Default = meta.story({ + args: { + label: 'Click me', + }, +}); + +export const Disabled = meta.story({ + args: { + label: 'Disabled', + disabled: true, + }, +}); +``` + +Otherwise: + +Story format (CSF): +```ts +import type { Meta, StoryObj } from '@storybook/react'; +import { Button } from './Button'; + +const meta = { + title: 'AI Generated/Simple/Button', + component: Button, +} satisfies Meta<typeof Button>; + +export default meta; +type Story = StoryObj<typeof meta>; + +export const Default: Story = { + args: { + label: 'Click me', + }, +}; + +export const Disabled: Story = { + args: { + label: 'Disabled', + disabled: true, + }, +}; +``` + +Rules: +- Every named export is a story. Use `args` to set props. +- Provide all required props via `args` — check the component's types. +- If a component needs per-story decorators (beyond the global ones), add them in the meta. +- Do NOT use `any` types. Use the component's prop types for type safety. + +Reference: https://storybook.js.org/docs/latest/writing-stories + +### Step 4: Verify each story with Vitest + +After writing each story file, immediately verify it: + +```bash +npx vitest --project storybook <path-to-story-file> +``` + +**Self-healing loop — repeat for every story file:** +1. Write/update the story file +2. Run `npx vitest --project storybook <path-to-story-file>` +3. If it fails: read the error output carefully + - Missing provider → add a decorator in `.storybook/preview.ts` or in the story meta + - Missing prop → add the required prop to `args` + - Import error → fix the import path + - CSS/asset error → add static dirs or import the stylesheet +4. Fix the issue and go back to step 2 +5. 
Once the test passes, move to the next component + +After all 9 story files pass individually, run the full suite: +```bash +npx vitest --project storybook +``` + +### Checklist + +- [ ] Analyzed codebase for providers, global styles, and path aliases +- [ ] Read story patterns from `.storybook/main.ts` +- [ ] Configured `.storybook/preview.ts` with necessary decorators +- [ ] Simple component 1: story written and passing +- [ ] Simple component 2: story written and passing +- [ ] Simple component 3: story written and passing +- [ ] Medium component 1: story written and passing +- [ ] Medium component 2: story written and passing +- [ ] Medium component 3: story written and passing +- [ ] Complex component 1: story written and passing +- [ ] Complex component 2: story written and passing +- [ ] Complex component 3: story written and passing +- [ ] Full Vitest suite passes: `npx vitest --project storybook` +- [ ] Run `npx storybook doctor` to check for common issues (version mismatches, duplicated deps, etc.) diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts index 8c82e3edeefd..b3a6b0bcffad 100644 --- a/scripts/eval/types.ts +++ b/scripts/eval/types.ts @@ -1,8 +1,9 @@ /** * Core types for the Storybook setup eval system. * - * Plain TypeScript interfaces — runtime validation at the CLI boundary - * uses zod (see eval.ts). + * Pure TypeScript — no runtime validation. The CLI boundary (eval.ts) uses + * Zod with a discriminated union to parse args; after that, these types flow + * through the system via normal TypeScript narrowing. 
*/ // --- Logger --- @@ -14,14 +15,24 @@ export interface Logger { logError: (msg: string) => void; } -// --- Agent --- +// --- Agent (const arrays → derived types) --- -export type ClaudeModel = 'sonnet-4.6' | 'opus-4.6' | 'haiku-4.5'; -export type CodexModel = 'gpt-5.4'; -export type ClaudeEffort = 'low' | 'medium' | 'high' | 'max'; -export type CodexEffort = 'low' | 'medium' | 'high' | 'xhigh'; +export const CLAUDE_MODELS = ['sonnet-4.6', 'opus-4.6', 'haiku-4.5'] as const; +export const CODEX_MODELS = ['gpt-5.4'] as const; +export const ALL_MODELS = [...CLAUDE_MODELS, ...CODEX_MODELS] as const; -/** Agent + model + effort — the three values that define how the agent runs. */ +export const CLAUDE_EFFORTS = ['low', 'medium', 'high', 'max'] as const; +export const CODEX_EFFORTS = ['low', 'medium', 'high', 'xhigh'] as const; +export const ALL_EFFORTS = ['low', 'medium', 'high', 'max', 'xhigh'] as const; + +export const AGENT_IDS = ['claude', 'codex'] as const; + +export type ClaudeModel = (typeof CLAUDE_MODELS)[number]; +export type CodexModel = (typeof CODEX_MODELS)[number]; +export type ClaudeEffort = (typeof CLAUDE_EFFORTS)[number]; +export type CodexEffort = (typeof CODEX_EFFORTS)[number]; + +/** Agent + model + effort — validated as a discriminated union at the CLI boundary. */ export type AgentVariant = | { agent: 'claude'; model: ClaudeModel; effort: ClaudeEffort } | { agent: 'codex'; model: CodexModel; effort: CodexEffort }; @@ -58,7 +69,7 @@ export interface TrialConfig { project: Project; /** Agent, model, and effort level. */ variant: AgentVariant; - /** Prompt name — maps to `prompts/{name}.md` (e.g. "setup", "self-heal"). */ + /** Prompt name — maps to `prompts/{name}.md` (e.g. "setup"). */ prompt: string; /** Log agent messages to stdout. 
*/ verbose?: boolean; diff --git a/scripts/package.json b/scripts/package.json index 48fbc54c8704..09987b86a4de 100644 --- a/scripts/package.json +++ b/scripts/package.json @@ -147,6 +147,7 @@ "simple-git": "^3.30.0", "slash": "^3.0.0", "sort-package-json": "^3.5.0", + "storybook": "workspace:*", "tiny-invariant": "^1.3.3", "tinyexec": "^0.3.0", "trash": "^7.2.0", diff --git a/scripts/tsconfig.json b/scripts/tsconfig.json index 98673817b79d..84dcf9469414 100644 --- a/scripts/tsconfig.json +++ b/scripts/tsconfig.json @@ -1,6 +1,7 @@ { "compileOnSave": false, "compilerOptions": { + "customConditions": ["code"], "baseUrl": ".", "noEmit": true, "incremental": false, diff --git a/yarn.lock b/yarn.lock index aec95012dc21..1187762a6a66 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8942,6 +8942,7 @@ __metadata: simple-git: "npm:^3.30.0" slash: "npm:^3.0.0" sort-package-json: "npm:^3.5.0" + storybook: "workspace:*" tiny-invariant: "npm:^1.3.3" tinyexec: "npm:^0.3.0" trash: "npm:^7.2.0" From 3011b469f84a86b718d6be301e91dc89a1a4a1de Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Mon, 30 Mar 2026 18:35:53 +0700 Subject: [PATCH 48/63] Update AGENTS.md: use yarn fmt:write from repo root --- AGENTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 9854c7b6515b..a538c6cdb6c0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -235,7 +235,7 @@ When writing tests: After changing files: -1. Format with `cd code && oxfmt` +1. Format with `yarn fmt:write` (run from the repo root) 2. Lint with `yarn --cwd code lint:js:cmd --fix` or `cd code && yarn lint:js:cmd ` 3. 
Run relevant tests before submitting a PR From 37192c075fd021c49e6a6e979e94d299195572b3 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Mon, 30 Mar 2026 18:56:58 +0700 Subject: [PATCH 49/63] Eval: add --manual flag to prepare trial without running the agent Prepares the trial (clone, install, cache) then prints the project path, prompt file location, and the exact CLI command to run the agent yourself interactively. --- scripts/eval/eval.ts | 94 +++++++++++++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 23 deletions(-) diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 4ca81dc856fd..a5e0c5de598d 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -9,10 +9,13 @@ * node eval/eval.ts -p mealdrop -a codex # codex defaults * node eval/eval.ts -p mealdrop -m gpt-5.4 # codex (inferred) * node eval/eval.ts -p mealdrop -a claude -e max # claude with max effort + * node eval/eval.ts -p mealdrop --manual # prepare only, print instructions * node eval/eval.ts --list-projects * node eval/eval.ts --list-models * node eval/eval.ts --list-prompts */ +import { writeFile } from 'node:fs/promises'; +import { join } from 'node:path'; import { parseArgs } from 'node:util'; import { z } from 'zod'; import pc from 'picocolors'; @@ -28,7 +31,16 @@ import { } from './types.ts'; import { AGENTS, PROJECTS } from './config.ts'; import { runTrial } from './lib/run-trial.ts'; -import { createLogger, formatDuration, formatCost, listPrompts } from './lib/utils.ts'; +import { prepareTrial } from './lib/prepare-trial.ts'; +import { + createLogger, + formatDuration, + formatCost, + listPrompts, + loadPrompt, + generateTrialId, + captureEnvironment, +} from './lib/utils.ts'; // --- Helpers --- @@ -41,12 +53,23 @@ function inferAgent(model: string): AgentId { throw new Error(`No agent found for model: ${model}`); } +function buildManualCommand(variant: AgentVariant, promptPath: string): string { + const promptArg = `"$(cat ${promptPath})"`; + if 
(variant.agent === 'claude') { + const sdkModel = AGENTS.claude.sdkModelIds[variant.model] ?? variant.model; + return `claude --model ${sdkModel} ${promptArg}`; + } + // codex + return `codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`; +} + // --- CLI schema: base options + discriminated union on agent --- const base = { project: z.enum(PROJECT_NAMES).optional(), prompt: z.string().default('setup'), verbose: z.boolean().default(false), + manual: z.boolean().default(false), listProjects: z.boolean().default(false), listModels: z.boolean().default(false), listPrompts: z.boolean().default(false), @@ -77,6 +100,7 @@ const { values } = parseArgs({ effort: { type: 'string', short: 'e' }, prompt: { type: 'string' }, verbose: { type: 'boolean', short: 'v' }, + manual: { type: 'boolean' }, 'list-projects': { type: 'boolean' }, 'list-models': { type: 'boolean' }, 'list-prompts': { type: 'boolean' }, @@ -144,25 +168,49 @@ logger.log( `Agent: ${variant.agent} | Model: ${variant.model} | Effort: ${variant.effort} | Prompt: ${args.prompt}\n` ); -const result = await runTrial( - { project, variant, prompt: args.prompt, verbose: args.verbose } satisfies TrialConfig, - logger -); - -// --- Print result --- - -const ghost = result.grade.ghostStories; -const ghostStr = ghost - ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` - : '-'; - -logger.log(pc.bold('\nResult')); -logger.log(` Build: ${result.grade.buildSuccess ? 
pc.green('PASS') : pc.red('FAIL')}`); -logger.log(` Ghost: ${ghostStr}`); -logger.log(` TS Err: ${result.grade.typeCheckErrors}`); -logger.log(` Score: ${result.score.score}`); -logger.log(` Cost: ${formatCost(result.execution.cost)}`); -logger.log(` Time: ${formatDuration(result.execution.duration)}`); -logger.log(` Turns: ${result.execution.turns}`); - -logger.log('\nDone.'); +if (args.manual) { + // --- Manual mode: prepare only, print instructions --- + + const trialId = generateTrialId(project.name, variant.agent, variant.model, args.prompt); + const workspace = await prepareTrial(project, trialId, logger); + await captureEnvironment(workspace.resultsDir); + + const prompt = loadPrompt(args.prompt); + const promptPath = join(workspace.resultsDir, 'prompt.md'); + await writeFile(promptPath, prompt); + + const cliCommand = buildManualCommand(variant, promptPath); + + logger.log(pc.bold('\n── Manual mode ──')); + logger.log(`\n Trial dir: ${pc.cyan(workspace.trialDir)}`); + logger.log(` Project dir: ${pc.cyan(workspace.projectPath)}`); + logger.log(` Prompt file: ${pc.cyan(promptPath)}`); + logger.log(pc.bold('\nRun the agent yourself:\n')); + logger.log(` ${pc.green('cd')} ${workspace.projectPath}`); + logger.log(` ${pc.green(cliCommand)}\n`); +} else { + // --- Automatic mode: run trial end-to-end --- + + const result = await runTrial( + { project, variant, prompt: args.prompt, verbose: args.verbose } satisfies TrialConfig, + logger + ); + + // --- Print result --- + + const ghost = result.grade.ghostStories; + const ghostStr = ghost + ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` + : '-'; + + logger.log(pc.bold('\nResult')); + logger.log(` Build: ${result.grade.buildSuccess ? 
pc.green('PASS') : pc.red('FAIL')}`); + logger.log(` Ghost: ${ghostStr}`); + logger.log(` TS Err: ${result.grade.typeCheckErrors}`); + logger.log(` Score: ${result.score.score}`); + logger.log(` Cost: ${formatCost(result.execution.cost)}`); + logger.log(` Time: ${formatDuration(result.execution.duration)}`); + logger.log(` Turns: ${result.execution.turns}`); + + logger.log('\nDone.'); +} From 788773257163d5a1d126a351a7cd11d7efbb3ffb Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Mon, 30 Mar 2026 19:05:41 +0700 Subject: [PATCH 50/63] Make review-pr skill project-local and contributor-friendly - Write output to .pr-review/ instead of ~/life/slideshows/ - Add .pr-review to .gitignore - Replace eval-specific examples with generic ones - Add mkdir -p step before writing output files --- .agents/skills/review-pr/SKILL.md | 60 ++++++++++++++++--------------- .gitignore | 3 ++ 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/.agents/skills/review-pr/SKILL.md b/.agents/skills/review-pr/SKILL.md index 5a655dbeafff..5c06bb8f60a5 100644 --- a/.agents/skills/review-pr/SKILL.md +++ b/.agents/skills/review-pr/SKILL.md @@ -14,7 +14,7 @@ Generate a scrollable single-page HTML document that reviews a PR as a readable 1. **Two layers per area.** The top layer is a curated, readable walkthrough — API surface, key test assertions, and core implementation logic woven together with prose. Only the important parts. Below it, the full files are collapsed in `
` for reference. 2. **High-level to low-level.** Order areas from entry points and orchestration down to utilities and types. The reader understands architecture before details. -3. **API → Tests → Implementation.** Within each area's readable section, show the API first (types, interfaces, exports), then the tests (what does it do?), then the implementation (how?). **Show full interface bodies** — not just names. The reader should see every field of `TrialResult`, `AgentConfig`, etc. in the walkthrough where they're first relevant. Don't defer to "see types.ts". +3. **API → Tests → Implementation.** Within each area's readable section, show the API first (types, interfaces, exports), then the tests (what does it do?), then the implementation (how?). **Show full interface bodies** — not just names. The reader should see every field of key interfaces in the walkthrough where they're first relevant. Don't defer to "see types.ts". 4. **Review readability.** For each file, assess: logical order? Clear names? Comments where the *why* isn't obvious? Tests readable enough to serve as docs? Flag issues as smell-boxes. Call out well-written tests with note-boxes. 5. **Cover everything.** Every changed file appears somewhere. @@ -50,7 +50,13 @@ Use narrative `

` tags between snippets to explain what the reader is looking Below the walkthrough, include every file in the area as a collapsed `

` block with the complete file content (or diff for modified files). The reader expands these for reference. -Write to `~/life/slideshows/pr-/index.html`. +First create the output directory: + +```bash +mkdir -p .pr-review/pr- +``` + +Write to `.pr-review/pr-/index.html` (relative to the repo root). **Verify every file from `gh pr diff --name-only` appears in the page.** @@ -234,38 +240,36 @@ Show full interface bodies where they're first relevant — not just names: ```html
-

API: The pipeline takes a config and returns a full result:

+

API: The entry point takes a config and returns a result:

-
export async function runTask(config: TrialConfig): Promise<TrialResult>
-
-export interface TrialConfig {
-  project: Project;
-  agent: AgentName;    // "claude" | "codex"
-  model: string;
-  effort: string;
-  prompt: string;
+  
export async function processStory(config: StoryConfig): Promise<StoryResult>
+
+export interface StoryConfig {
+  id: string;
+  title: string;
+  component: ComponentType;
+  args: Record<string, unknown>;
+  parameters: Parameters;
 }
 
-export interface TrialResult {
-  schemaVersion: 1;
-  project: string;
-  agent: string;
-  model: string;
-  execution: ExecutionResult;
-  grading: GradingResult;
-  quality: QualityResult;
+export interface StoryResult {
+  status: 'success' | 'error';
+  rendered: boolean;
+  duration: number;
+  errors: string[];
 }
-

Tests: The ordering test makes the sequential contract clear:

+

Tests: The happy-path test shows the expected flow:

-
await runTask(baseConfig);
-expect(callOrder).toEqual(['prepare', 'agent', 'grade']);
+
const result = await processStory(baseConfig);
+expect(result.status).toBe('success');
+expect(result.rendered).toBe(true);
-

Implementation: The pipeline is strictly sequential — grade needs the agent's file changes:

+

Implementation: The pipeline is sequential — rendering depends on preparation:

-
const paths = await prepareTrial(config.project, trialId, logger);
-const execution = await agent.execute({ prompt, projectPath, ... });
-const { grading, quality } = await grade(paths, logger, execution.duration);
+
const context = await prepare(config);
+const canvas = await render(context);
+return summarize(canvas, config);
``` @@ -357,7 +361,7 @@ Kill any existing server, write a static server, start it: lsof -ti:3000 | xargs kill -9 2>/dev/null || true ``` -Write to `~/life/slideshows/pr-/server.mjs`: +Write to `.pr-review/pr-/server.mjs`: ```javascript import { createServer } from 'node:http'; @@ -387,7 +391,7 @@ createServer((req, res) => { ``` ```bash -node ~/life/slideshows/pr-/server.mjs & # run_in_background: true +node .pr-review/pr-/server.mjs & # run_in_background: true open http://localhost:3000 ``` diff --git a/.gitignore b/.gitignore index d2c36bd45454..26b22b0e8c0e 100644 --- a/.gitignore +++ b/.gitignore @@ -84,3 +84,6 @@ CLAUDE.local.md # Eval system scripts/eval/.cache scripts/eval/results + +# PR review output +.pr-review From a55f40a479defa46a762ffe96cbc61e2403237aa Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Mon, 30 Mar 2026 22:13:40 +0700 Subject: [PATCH 51/63] Colocate eval types and config, remove setup-patterns, drop section comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move types from centralized types.ts into their owning modules: - Agent types/config → lib/agents/config.ts - Project/PROJECTS → lib/projects.ts - Logger → lib/utils.ts - Grade/scoring types → lib/grade.ts - TrialConfig/TrialReport → lib/run-trial.ts - TrialWorkspace → lib/prepare-trial.ts Remove setup-patterns (detectSetupPatterns, SetupPattern) entirely. Strip all // --- section separator comments. 
--- scripts/eval/eval.ts | 27 +-- scripts/eval/lib/agents/claude-code.ts | 4 +- scripts/eval/lib/agents/codex.ts | 3 +- .../agents/config.test.ts} | 31 +--- scripts/eval/{ => lib/agents}/config.ts | 103 +++++------ scripts/eval/lib/grade.test.ts | 2 +- scripts/eval/lib/grade.ts | 67 +++++-- scripts/eval/lib/grading-helpers.test.ts | 42 +---- scripts/eval/lib/package-manager.ts | 2 +- scripts/eval/lib/prepare-trial.ts | 11 +- scripts/eval/lib/projects.test.ts | 32 ++++ scripts/eval/lib/projects.ts | 48 +++++ scripts/eval/lib/run-trial.test.ts | 10 +- scripts/eval/lib/run-trial.ts | 29 ++- scripts/eval/lib/setup-patterns.test.ts | 123 ------------- scripts/eval/lib/setup-patterns.ts | 68 ------- scripts/eval/lib/utils.ts | 18 +- scripts/eval/types.ts | 174 ------------------ 18 files changed, 233 insertions(+), 561 deletions(-) rename scripts/eval/{types.test.ts => lib/agents/config.test.ts} (57%) rename scripts/eval/{ => lib/agents}/config.ts (53%) create mode 100644 scripts/eval/lib/projects.test.ts create mode 100644 scripts/eval/lib/projects.ts delete mode 100644 scripts/eval/lib/setup-patterns.test.ts delete mode 100644 scripts/eval/lib/setup-patterns.ts delete mode 100644 scripts/eval/types.ts diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index a5e0c5de598d..46dd7f05067f 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -25,12 +25,12 @@ import { CLAUDE_EFFORTS, CODEX_MODELS, CODEX_EFFORTS, + AGENTS, type AgentId, type AgentVariant, - type TrialConfig, -} from './types.ts'; -import { AGENTS, PROJECTS } from './config.ts'; -import { runTrial } from './lib/run-trial.ts'; +} from './lib/agents/config.ts'; +import { PROJECTS } from './lib/projects.ts'; +import { runTrial, type TrialConfig } from './lib/run-trial.ts'; import { prepareTrial } from './lib/prepare-trial.ts'; import { createLogger, @@ -42,8 +42,6 @@ import { captureEnvironment, } from './lib/utils.ts'; -// --- Helpers --- - const PROJECT_NAMES = PROJECTS.map((p) => p.name) as 
[string, ...string[]]; function inferAgent(model: string): AgentId { @@ -59,12 +57,9 @@ function buildManualCommand(variant: AgentVariant, promptPath: string): string { const sdkModel = AGENTS.claude.sdkModelIds[variant.model] ?? variant.model; return `claude --model ${sdkModel} ${promptArg}`; } - // codex return `codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`; } -// --- CLI schema: base options + discriminated union on agent --- - const base = { project: z.enum(PROJECT_NAMES).optional(), prompt: z.string().default('setup'), @@ -90,8 +85,6 @@ const argsSchema = z.discriminatedUnion('agent', [ }), ]); -// --- Parse CLI --- - const { values } = parseArgs({ options: { project: { type: 'string', short: 'p' }, @@ -130,8 +123,6 @@ if (!parsed.success) { const args = parsed.data; const logger = createLogger(); -// --- List commands --- - if (args.listProjects) { for (const p of PROJECTS) logger.log(` ${pc.bold(p.name)} — ${p.description}`); process.exit(0); @@ -148,16 +139,12 @@ if (args.listPrompts) { process.exit(0); } -// --- Validate project --- - if (!args.project) { logger.log(pc.red(`Specify a project with -p. Available: ${PROJECT_NAMES.join(', ')}`)); process.exit(1); } const project = PROJECTS.find((p) => p.name === args.project)!; -// --- Run trial --- - const variant: AgentVariant = args.agent === 'claude' ? 
{ agent: args.agent, model: args.model, effort: args.effort } @@ -169,8 +156,6 @@ logger.log( ); if (args.manual) { - // --- Manual mode: prepare only, print instructions --- - const trialId = generateTrialId(project.name, variant.agent, variant.model, args.prompt); const workspace = await prepareTrial(project, trialId, logger); await captureEnvironment(workspace.resultsDir); @@ -189,15 +174,11 @@ if (args.manual) { logger.log(` ${pc.green('cd')} ${workspace.projectPath}`); logger.log(` ${pc.green(cliCommand)}\n`); } else { - // --- Automatic mode: run trial end-to-end --- - const result = await runTrial( { project, variant, prompt: args.prompt, verbose: args.verbose } satisfies TrialConfig, logger ); - // --- Print result --- - const ghost = result.grade.ghostStories; const ghostStr = ghost ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index ebb201489fd2..e5f6882b59b8 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -2,8 +2,8 @@ import type { SDKMessage } from '@anthropic-ai/claude-agent-sdk'; import { query } from '@anthropic-ai/claude-agent-sdk'; import { writeFile } from 'node:fs/promises'; import { join } from 'node:path'; -import { AGENTS } from '../../config.ts'; -import type { AgentDriver, Execution, Logger } from '../../types.ts'; +import { AGENTS, type AgentDriver, type Execution } from './config.ts'; +import type { Logger } from '../utils.ts'; function logMessage(message: SDKMessage, logger: Logger) { switch (message.type) { diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index 19de4777815a..beb292cb94ab 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -1,8 +1,7 @@ import { Codex, type ModelReasoningEffort } from '@openai/codex-sdk'; import { writeFile } from 'node:fs/promises'; import { join } from 'node:path'; 
-import type { AgentDriver, Execution } from '../../types.ts'; -import { estimateCost } from '../../config.ts'; +import { estimateCost, type AgentDriver, type Execution } from './config.ts'; export const codexAgent: AgentDriver = { name: 'codex', diff --git a/scripts/eval/types.test.ts b/scripts/eval/lib/agents/config.test.ts similarity index 57% rename from scripts/eval/types.test.ts rename to scripts/eval/lib/agents/config.test.ts index 1af38c595900..e364a8d942fb 100644 --- a/scripts/eval/types.test.ts +++ b/scripts/eval/lib/agents/config.test.ts @@ -1,8 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { AGENTS, PROJECTS } from './config'; - -const githubRepoUrl = /^https:\/\/github\.com\/[^/]+\/[^/]+$/; +import { AGENTS } from './config'; describe('AGENTS', () => { it('keeps each agent default inside its supported model and effort lists', () => { @@ -43,30 +41,3 @@ describe('AGENTS', () => { }); }); }); - -describe('PROJECTS', () => { - it('pins every benchmark project to a pre-initialized eval-baseline repo', () => { - expect(PROJECTS.length).toBeGreaterThan(0); - - for (const project of PROJECTS) { - expect(project).toMatchObject({ - branch: 'eval-baseline', - repo: expect.stringMatching(githubRepoUrl), - description: expect.any(String), - }); - } - }); - - it('keeps benchmark project metadata unambiguous', () => { - const names = PROJECTS.map((p) => p.name); - const repos = PROJECTS.map((p) => p.repo); - - expect(new Set(names).size).toBe(names.length); - expect(new Set(repos).size).toBe(repos.length); - - for (const project of PROJECTS) { - if (!project.projectDir) continue; - expect(project.projectDir).toMatch(/^(?!\/)(?!\.\.?(?:\/|$)).+/); - } - }); -}); diff --git a/scripts/eval/config.ts b/scripts/eval/lib/agents/config.ts similarity index 53% rename from scripts/eval/config.ts rename to scripts/eval/lib/agents/config.ts index 12582790b9fe..bebf71b21c63 100644 --- a/scripts/eval/config.ts +++ b/scripts/eval/lib/agents/config.ts @@ -1,20 
+1,50 @@ /** - * Runtime configuration for the Storybook eval system. - * - * Agent configs, model mappings, pricing, benchmark project definitions, - * and cost estimation utilities. + * Agent definitions, model mappings, pricing, and cost estimation. */ -import { - CLAUDE_MODELS, - CODEX_MODELS, - CLAUDE_EFFORTS, - CODEX_EFFORTS, - type AgentId, - type Project, -} from './types.ts'; +import type { Logger } from '../utils.ts'; -// --- Pricing --- +export const CLAUDE_MODELS = ['sonnet-4.6', 'opus-4.6', 'haiku-4.5'] as const; +export const CODEX_MODELS = ['gpt-5.4'] as const; +export const ALL_MODELS = [...CLAUDE_MODELS, ...CODEX_MODELS] as const; + +export const CLAUDE_EFFORTS = ['low', 'medium', 'high', 'max'] as const; +export const CODEX_EFFORTS = ['low', 'medium', 'high', 'xhigh'] as const; +export const ALL_EFFORTS = ['low', 'medium', 'high', 'max', 'xhigh'] as const; + +export const AGENT_IDS = ['claude', 'codex'] as const; + +export type ClaudeModel = (typeof CLAUDE_MODELS)[number]; +export type CodexModel = (typeof CODEX_MODELS)[number]; +export type ClaudeEffort = (typeof CLAUDE_EFFORTS)[number]; +export type CodexEffort = (typeof CODEX_EFFORTS)[number]; + +/** Agent + model + effort — validated as a discriminated union at the CLI boundary. 
*/ +export type AgentVariant = + | { agent: 'claude'; model: ClaudeModel; effort: ClaudeEffort } + | { agent: 'codex'; model: CodexModel; effort: CodexEffort }; + +export type AgentId = AgentVariant['agent']; + +export interface Execution { + cost?: number; + duration: number; + durationApi?: number; + turns: number; +} + +export interface AgentExecuteParams { + prompt: string; + projectPath: string; + variant: AgentVariant; + resultsDir: string; + logger: Logger; +} + +export interface AgentDriver { + name: AgentId; + execute(params: AgentExecuteParams): Promise; +} export interface TokenPricing { input: number; @@ -28,8 +58,6 @@ export interface TokenUsage { outputTokens: number; } -// --- Agent Definition --- - export interface AgentDefinition { models: readonly string[]; defaultModel: string; @@ -66,8 +94,6 @@ export const AGENTS: Record = { }, }; -// --- Cost Estimation --- - /** Estimate cost from token usage using the pricing table. */ export function estimateCost(agent: AgentId, model: string, usage: TokenUsage): number | undefined { const pricing = AGENTS[agent].pricing[model]; @@ -79,46 +105,3 @@ export function estimateCost(agent: AgentId, model: string, usage: TokenUsage): (usage.outputTokens / 1_000_000) * pricing.output ); } - -// --- Projects --- - -export const PROJECTS: Project[] = [ - { - name: 'mealdrop', - repo: 'https://github.com/kasperpeulen/mealdrop', - branch: 'eval-baseline', - description: 'Styled components, Redux, React Router', - }, - { - name: 'edgy', - repo: 'https://github.com/kasperpeulen/edgy', - branch: 'eval-baseline', - description: 'Tailwind, HeadlessUI, React Router', - }, - { - name: 'wikitok', - repo: 'https://github.com/kasperpeulen/wikitok', - branch: 'eval-baseline', - projectDir: 'frontend', - description: 'Simple project with Tailwind', - }, - { - name: 'baklava', - repo: 'https://github.com/kasperpeulen/baklava', - branch: 'eval-baseline', - description: 'Component library with Zustand', - }, - { - name: 'echarts', - 
repo: 'https://github.com/kasperpeulen/echarts-react', - branch: 'eval-baseline', - description: 'ECharts React wrapper', - }, - { - name: 'evergreen-ci', - repo: 'https://github.com/kasperpeulen/ui', - branch: 'eval-baseline', - projectDir: 'packages/lib', - description: 'GraphQL', - }, -]; diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts index 4360204fcefa..0ad1c87ae3cd 100644 --- a/scripts/eval/lib/grade.test.ts +++ b/scripts/eval/lib/grade.test.ts @@ -6,7 +6,7 @@ import { countTypeCheckErrors, parseChangedFiles, } from './grade'; -import type { FileChange } from '../types'; +import type { FileChange } from './grade'; describe('filterStorybookFiles', () => { it('matches files in .storybook/ directory', () => { diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 08481c99b38b..8656d7d343b3 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,19 +1,58 @@ import { writeFile } from 'node:fs/promises'; import { join } from 'node:path'; -import type { - Grade, - GhostStoryGrade, - QualityScore, - ScoreWeights, - TrialWorkspace, - FileChange, - Logger, -} from '../types.ts'; -import { DEFAULT_SCORE_WEIGHTS } from '../types.ts'; import { x } from 'tinyexec'; -import { detectSetupPatterns } from './setup-patterns.ts'; +import type { Logger } from './utils.ts'; +import type { TrialWorkspace } from './prepare-trial.ts'; import { getComponentCandidates, runGhostStories } from 'storybook/internal/core-server'; +export interface FileChange { + path: string; + status: 'A' | 'M' | 'D' | 'R'; + /** For renames, the original path before the move. 
*/ + previousPath?: string; +} + +export interface GhostStoryGrade { + candidateCount: number; + total: number; + passed: number; + successRate: number; +} + +export interface ScoreWeights { + ghostStories: number; + build: number; + typecheck: number; + performance: number; +} + +export const DEFAULT_SCORE_WEIGHTS: ScoreWeights = { + ghostStories: 0.4, + build: 0.25, + typecheck: 0.25, + performance: 0.1, +}; + +export interface QualityScore { + score: number; + breakdown: { + build: number; + typecheck: number; + ghostStories: number; + performance: number; + }; +} + +export interface Grade { + buildSuccess: boolean; + buildError?: string; + typeCheckErrors: number; + typeCheckOutput?: string; + fileChanges: FileChange[]; + storybookChanges: FileChange[]; + ghostStories?: GhostStoryGrade; +} + /** Maximum TypeScript errors before the typecheck score reaches 0. */ const MAX_TYPECHECK_ERRORS = 20; /** Agent duration (seconds) at or below which performance scores 1.0. */ @@ -125,11 +164,6 @@ export async function grade( `${fileChanges.length} files changed (${storybookChanges.length} storybook-related)` ); - // Setup patterns - const setupPatterns = await detectSetupPatterns(projectPath); - if (setupPatterns.length > 0) - logger.logSuccess(`Detected patterns: ${setupPatterns.map((p) => p.label).join(', ')}`); - // Storybook build + TypeScript check in parallel logger.logStep('Running storybook build + typecheck...'); const [build, tsc] = await Promise.all([ @@ -180,7 +214,6 @@ export async function grade( typeCheckOutput: typeCheckErrors > 0 ? 
truncateEnd(tscOutput, 2000) : undefined, fileChanges, storybookChanges, - setupPatterns, ghostStories, }; diff --git a/scripts/eval/lib/grading-helpers.test.ts b/scripts/eval/lib/grading-helpers.test.ts index 5ea8477d2faa..fe4880fa42ac 100644 --- a/scripts/eval/lib/grading-helpers.test.ts +++ b/scripts/eval/lib/grading-helpers.test.ts @@ -11,11 +11,9 @@ import { filterStorybookFiles, parseChangedFiles, } from './grade'; -import { detectSetupPatterns } from './setup-patterns'; - /** * Helper-level test: compose grading helpers on a fake project directory. - * This exercises candidate discovery, setup-pattern detection, git-output parsing, + * This exercises candidate discovery, git-output parsing, * and quality-score calculation without pretending to cover the full grade() flow. */ @@ -83,14 +81,7 @@ describe('grading helpers', () => { const candidates = await findCandidates(TMP); expect(candidates).toHaveLength(2); - // Step 2: Detect patterns — config references CSS, theme, staticDirs - const patterns = await detectSetupPatterns(TMP); - const patternIds = patterns.map((p) => p.id); - expect(patternIds).toContain('global-css'); - expect(patternIds).toContain('theme-provider'); - expect(patternIds).toContain('static-dirs'); - - // Step 3: Simulate git output where the agent added storybook config + one + // Step 2: Simulate git output where the agent added storybook config + one // story per discovered candidate, plus modified package.json const gitLines = [ 'A\t.storybook/preview.tsx', @@ -106,7 +97,7 @@ describe('grading helpers', () => { // Total includes package.json expect(changedFiles).toHaveLength(storybookFiles.length + 1); - // Step 4: Build passed, no TS errors, 100% ghost stories, fast agent → perfect score + // Step 3: Build passed, no TS errors, 100% ghost stories, fast agent → perfect score const quality = computeQualityScore({ buildSuccess: true, typeCheckErrors: 0, @@ -131,10 +122,6 @@ describe('grading helpers', () => { const candidates = await 
findCandidates(TMP); expect(candidates).toHaveLength(1); - // Agent didn't create any .storybook config - rmSync(join(TMP, '.storybook'), { recursive: true }); - expect(await detectSetupPatterns(TMP)).toEqual([]); - // Simulate tsc output with errors proportional to candidate count const tscLines = candidates.map( (c, i) => `${c}(${i + 1},1): error TS2304: Cannot find name 'React'.` @@ -172,9 +159,6 @@ describe('grading helpers', () => { const candidates = await findCandidates(TMP); expect(candidates).toHaveLength(5); - const patterns = await detectSetupPatterns(TMP); - expect(patterns.map((p) => p.id)).toContain('router-provider'); - // Agent wrote one story per candidate — all storybook-related const gitOutput = candidates.map((c) => `A\t${c.replace(/\.tsx$/, '.stories.tsx')}`).join('\n'); const storybookFiles = filterStorybookFiles(parseChangedFiles(gitOutput)); @@ -191,23 +175,3 @@ describe('grading helpers', () => { ).toBe(1); }); }); - -describe('setup-patterns only scans .storybook/', () => { - it('does not detect patterns in component source files', async () => { - // Router usage in a component should NOT be detected as a setup pattern - writeFile( - 'src/App.tsx', - [ - `import React from 'react';`, - `import { BrowserRouter } from 'react-router-dom';`, - `export function App() {`, - ` return
;`, - `}`, - ].join('\n') - ); - // Empty .storybook config with no patterns - writeFile('.storybook/main.ts', `export default { stories: ['../src/**/*.stories.tsx'] };`); - - expect((await detectSetupPatterns(TMP)).map((p) => p.id)).not.toContain('router-provider'); - }); -}); diff --git a/scripts/eval/lib/package-manager.ts b/scripts/eval/lib/package-manager.ts index 9112e0834fd1..db2c2c9797aa 100644 --- a/scripts/eval/lib/package-manager.ts +++ b/scripts/eval/lib/package-manager.ts @@ -7,7 +7,7 @@ import { existsSync } from 'node:fs'; import { join } from 'node:path'; import { x } from 'tinyexec'; -import type { Logger } from '../types.ts'; +import type { Logger } from './utils.ts'; /** Detect the package manager from lock files in a directory. */ export function detectPackageManager(dir: string): string { diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index 58a75bb29c8f..eb39317a6659 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -1,11 +1,20 @@ import { existsSync } from 'node:fs'; import { cp, mkdir } from 'node:fs/promises'; import { join } from 'node:path'; -import type { Project, TrialWorkspace, Logger } from '../types.ts'; +import type { Logger } from './utils.ts'; +import type { Project } from './projects.ts'; import { x } from 'tinyexec'; import { installDeps } from './package-manager.ts'; import { CACHE_DIR, TRIALS_DIR } from './utils.ts'; +export interface TrialWorkspace { + trialDir: string; + repoRoot: string; + projectPath: string; + resultsDir: string; + baselineCommit: string; +} + /** * First run: clone eval-baseline -> install deps -> cache it. * Subsequent runs: copy from cache. Agent starts immediately. 
diff --git a/scripts/eval/lib/projects.test.ts b/scripts/eval/lib/projects.test.ts new file mode 100644 index 000000000000..b80238500f8e --- /dev/null +++ b/scripts/eval/lib/projects.test.ts @@ -0,0 +1,32 @@ +import { describe, expect, it } from 'vitest'; + +import { PROJECTS } from './projects'; + +const githubRepoUrl = /^https:\/\/github\.com\/[^/]+\/[^/]+$/; + +describe('PROJECTS', () => { + it('pins every benchmark project to a pre-initialized eval-baseline repo', () => { + expect(PROJECTS.length).toBeGreaterThan(0); + + for (const project of PROJECTS) { + expect(project).toMatchObject({ + branch: 'eval-baseline', + repo: expect.stringMatching(githubRepoUrl), + description: expect.any(String), + }); + } + }); + + it('keeps benchmark project metadata unambiguous', () => { + const names = PROJECTS.map((p) => p.name); + const repos = PROJECTS.map((p) => p.repo); + + expect(new Set(names).size).toBe(names.length); + expect(new Set(repos).size).toBe(repos.length); + + for (const project of PROJECTS) { + if (!project.projectDir) continue; + expect(project.projectDir).toMatch(/^(?!\/)(?!\.\.?(?:\/|$)).+/); + } + }); +}); diff --git a/scripts/eval/lib/projects.ts b/scripts/eval/lib/projects.ts new file mode 100644 index 000000000000..0046ed30bac4 --- /dev/null +++ b/scripts/eval/lib/projects.ts @@ -0,0 +1,48 @@ +export interface Project { + name: string; + repo: string; + branch: string; + projectDir?: string; + description?: string; +} + +export const PROJECTS: Project[] = [ + { + name: 'mealdrop', + repo: 'https://github.com/kasperpeulen/mealdrop', + branch: 'eval-baseline', + description: 'Styled components, Redux, React Router', + }, + { + name: 'edgy', + repo: 'https://github.com/kasperpeulen/edgy', + branch: 'eval-baseline', + description: 'Tailwind, HeadlessUI, React Router', + }, + { + name: 'wikitok', + repo: 'https://github.com/kasperpeulen/wikitok', + branch: 'eval-baseline', + projectDir: 'frontend', + description: 'Simple project with Tailwind', + }, + { + 
name: 'baklava', + repo: 'https://github.com/kasperpeulen/baklava', + branch: 'eval-baseline', + description: 'Component library with Zustand', + }, + { + name: 'echarts', + repo: 'https://github.com/kasperpeulen/echarts-react', + branch: 'eval-baseline', + description: 'ECharts React wrapper', + }, + { + name: 'evergreen-ci', + repo: 'https://github.com/kasperpeulen/ui', + branch: 'eval-baseline', + projectDir: 'packages/lib', + description: 'GraphQL', + }, +]; diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts index 33962f2deb05..297c62cbee8a 100644 --- a/scripts/eval/lib/run-trial.test.ts +++ b/scripts/eval/lib/run-trial.test.ts @@ -4,7 +4,7 @@ import { tmpdir } from 'node:os'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; -import type { TrialConfig, TrialReport } from '../types'; +import type { TrialConfig, TrialReport } from './run-trial'; // Mock external dependencies to avoid real git/storybook/vitest calls vi.mock('./prepare-trial', () => ({ @@ -82,9 +82,6 @@ function setupMocks(overrides?: { { path: '.storybook/preview.tsx', status: 'A' }, { path: 'src/Button.stories.tsx', status: 'A' }, ], - setupPatterns: [ - { id: 'tailwind', label: 'Tailwind CSS', sourceFiles: ['.storybook/preview.ts'] }, - ], }, score: { score: buildSuccess ? 
1 : 0.3, @@ -151,7 +148,7 @@ describe('runTrial pipeline', () => { const params = vi.mocked(claudeAgent.execute).mock.calls[0][0]; expect(params).toMatchObject({ - prompt: expect.stringContaining('Storybook setup'), + prompt: expect.stringContaining('set up Storybook'), projectPath: TMP, variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, resultsDir: join(TMP, 'results'), @@ -184,7 +181,7 @@ describe('runTrial pipeline', () => { }); const promptContent = readFileSync(join(resultsDir, 'prompt.md'), 'utf-8'); - expect(promptContent).toContain('Storybook setup'); + expect(promptContent).toContain('set up Storybook'); }); it('propagates failed build into result', async () => { @@ -224,7 +221,6 @@ describe('runTrial pipeline', () => { typeCheckErrors: 0, fileChanges: [], storybookChanges: [], - setupPatterns: [], }, score: { score: 1, breakdown: { build: 1, typecheck: 1, ghostStories: 0, performance: 0 } }, }; diff --git a/scripts/eval/lib/run-trial.ts b/scripts/eval/lib/run-trial.ts index 4d656adb389b..fc8dde20fff8 100644 --- a/scripts/eval/lib/run-trial.ts +++ b/scripts/eval/lib/run-trial.ts @@ -1,12 +1,37 @@ import { writeFile } from 'node:fs/promises'; import { join } from 'node:path'; -import type { AgentId, Logger, TrialConfig, TrialReport, AgentDriver } from '../types.ts'; +import type { Logger } from './utils.ts'; +import type { AgentId, AgentDriver, AgentVariant, Execution } from './agents/config.ts'; +import type { Project } from './projects.ts'; +import { grade, type Grade, type QualityScore } from './grade.ts'; import { claudeAgent } from './agents/claude-code.ts'; import { codexAgent } from './agents/codex.ts'; import { prepareTrial } from './prepare-trial.ts'; -import { grade } from './grade.ts'; import { generateTrialId, loadPrompt, captureEnvironment, createLogger } from './utils.ts'; +export interface TrialConfig { + /** Which project to evaluate (cloned from its eval-baseline branch). 
*/ + project: Project; + /** Agent, model, and effort level. */ + variant: AgentVariant; + /** Prompt name — maps to `prompts/{name}.md` (e.g. "setup"). */ + prompt: string; + /** Log agent messages to stdout. */ + verbose?: boolean; +} + +export interface TrialReport { + schemaVersion: 1; + project: Project; + variant: AgentVariant; + prompt: string; + timestamp: string; + baselineCommit: string; + execution: Execution; + grade: Grade; + score: QualityScore; +} + const drivers: Record = { claude: claudeAgent, codex: codexAgent, diff --git a/scripts/eval/lib/setup-patterns.test.ts b/scripts/eval/lib/setup-patterns.test.ts deleted file mode 100644 index e88ae3916ce9..000000000000 --- a/scripts/eval/lib/setup-patterns.test.ts +++ /dev/null @@ -1,123 +0,0 @@ -import { mkdirSync, writeFileSync, rmSync } from 'node:fs'; -import { join } from 'node:path'; -import { tmpdir } from 'node:os'; - -import { afterEach, beforeEach, describe, expect, it } from 'vitest'; - -import { detectSetupPatterns } from './setup-patterns'; - -let TMP: string; - -beforeEach(() => { - TMP = join(tmpdir(), `eval-setup-patterns-${Date.now()}`); - mkdirSync(join(TMP, '.storybook'), { recursive: true }); -}); - -afterEach(() => { - rmSync(TMP, { recursive: true, force: true }); -}); - -function writeConfig(name: string, content: string) { - writeFileSync(join(TMP, '.storybook', name), content); -} - -describe('detectSetupPatterns', () => { - it('returns empty when no .storybook dir', async () => { - rmSync(join(TMP, '.storybook'), { recursive: true }); - expect(await detectSetupPatterns(TMP)).toEqual([]); - }); - - it('returns empty when .storybook has no matching patterns', async () => { - writeConfig('main.ts', 'export default { stories: ["../src/**/*.stories.@(ts|tsx)"] };'); - expect(await detectSetupPatterns(TMP)).toEqual([]); - }); - - it('detects Tailwind CSS', async () => { - writeConfig('preview.ts', `import 'tailwindcss/tailwind.css';`); - expect((await detectSetupPatterns(TMP)).map((p) 
=> p.id)).toContain('tailwind'); - }); - - it('detects global CSS imports', async () => { - writeConfig('preview.ts', `import '../src/styles/globals.css';`); - expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('global-css'); - }); - - it('detects styled-components', async () => { - writeConfig('preview.tsx', `import { createGlobalStyle } from 'styled-components';`); - expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('styled-components'); - }); - - it('detects React Router', async () => { - writeConfig('preview.tsx', `import { MemoryRouter } from 'react-router-dom';`); - expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('router-provider'); - }); - - it('detects Redux provider', async () => { - writeConfig('preview.tsx', `import { Provider } from 'react-redux';\n`); - expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('redux-provider'); - }); - - it('detects Zustand', async () => { - writeConfig('preview.ts', `import { create } from 'zustand';`); - expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('zustand'); - }); - - it('detects GraphQL/Apollo', async () => { - writeConfig('preview.tsx', `import { MockedProvider } from '@apollo/client/testing';`); - expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('graphql'); - }); - - it('detects theme providers', async () => { - writeConfig('preview.tsx', `import { ThemeProvider } from '@emotion/react';`); - expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('theme-provider'); - }); - - it('detects staticDirs', async () => { - writeConfig('main.ts', `export default { staticDirs: ['../public'] };`); - expect((await detectSetupPatterns(TMP)).map((p) => p.id)).toContain('static-dirs'); - }); - - it('detects vite alias config', async () => { - writeConfig( - 'main.ts', - `export default { viteFinal: (config) => ({ ...config, resolve: { alias: { '@': './src' } } }) };` - ); - expect((await 
detectSetupPatterns(TMP)).map((p) => p.id)).toContain('vite-alias'); - }); - - it('detects multiple patterns in the same file', async () => { - writeConfig( - 'preview.tsx', - [ - `import '../src/index.css';`, - `import { MemoryRouter } from 'react-router-dom';`, - `import { ThemeProvider } from '@emotion/react';`, - ].join('\n') - ); - const ids = (await detectSetupPatterns(TMP)).map((p) => p.id); - expect(ids).toContain('global-css'); - expect(ids).toContain('router-provider'); - expect(ids).toContain('theme-provider'); - }); - - it('includes sourceFiles relative to project path', async () => { - writeConfig('preview.ts', `import 'tailwindcss';`); - const tailwind = (await detectSetupPatterns(TMP)).find((p) => p.id === 'tailwind'); - expect(tailwind?.sourceFiles).toEqual(['.storybook/preview.ts']); - }); - - it('does not false-positive on unrelated React hooks', async () => { - writeConfig('preview.ts', `import { useState, useEffect } from 'react';`); - expect(await detectSetupPatterns(TMP)).toEqual([]); - }); - - it('does not detect patterns in files outside .storybook/', async () => { - // Write a router import in a source file, not in .storybook/ - mkdirSync(join(TMP, 'src'), { recursive: true }); - writeFileSync(join(TMP, 'src', 'App.tsx'), `import { BrowserRouter } from 'react-router-dom';`); - // .storybook/ has no patterns - writeConfig('main.ts', `export default { stories: ['../src/**/*.stories.tsx'] };`); - - expect((await detectSetupPatterns(TMP)).map((p) => p.id)).not.toContain('router-provider'); - }); -}); diff --git a/scripts/eval/lib/setup-patterns.ts b/scripts/eval/lib/setup-patterns.ts deleted file mode 100644 index 037c7362015f..000000000000 --- a/scripts/eval/lib/setup-patterns.ts +++ /dev/null @@ -1,68 +0,0 @@ -import { readFile, readdir } from 'node:fs/promises'; -import { existsSync } from 'node:fs'; -import { join, relative } from 'node:path'; -import type { SetupPattern } from '../types.ts'; - -const RULES = [ - { - id: 'global-css', - 
label: 'Global CSS import', - pattern: /import\s+['"][^'"]+\.(css|scss|sass|less)['"]|import\s+['"]tailwindcss/, - }, - { id: 'tailwind', label: 'Tailwind CSS', pattern: /@tailwind|tailwindcss|tailwind\.css/ }, - { - id: 'styled-components', - label: 'Styled Components', - pattern: /styled-components|createGlobalStyle/, - }, - { - id: 'router-provider', - label: 'React Router', - pattern: /MemoryRouter|BrowserRouter|RouterProvider/, - }, - { - id: 'redux-provider', - label: 'Redux Provider', - pattern: /react-redux.*Provider| { - const dir = join(projectPath, '.storybook'); - if (!existsSync(dir)) return []; - - // Read all entries recursively, then attempt to read each as a file - const entries = await readdir(dir, { recursive: true }); - const fileContents = await Promise.all( - entries.map(async (entry) => { - const fullPath = join(dir, entry); - try { - return { path: fullPath, content: await readFile(fullPath, 'utf-8') }; - } catch { - return null; // directories or unreadable files - } - }) - ); - - const files = fileContents.filter((f): f is { path: string; content: string } => f !== null); - - const results: SetupPattern[] = []; - for (const { id, label, pattern } of RULES) { - const matches = files.filter((f) => pattern.test(f.content)); - if (matches.length > 0) { - results.push({ id, label, sourceFiles: matches.map((f) => relative(projectPath, f.path)) }); - } - } - - return results; -} diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index 1439edef3f38..ddb60e9c394b 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -3,7 +3,13 @@ import { writeFile } from 'node:fs/promises'; import { resolve, basename, join } from 'node:path'; import pc from 'picocolors'; import { x } from 'tinyexec'; -import type { Logger } from '../types.ts'; + +export interface Logger { + log: (msg: string) => void; + logStep: (msg: string) => void; + logSuccess: (msg: string) => void; + logError: (msg: string) => void; +} export const 
REPO_ROOT = resolve(import.meta.dirname, '..', '..', '..'); export const EVAL_ROOT = resolve(REPO_ROOT, '..', 'storybook-eval'); @@ -11,8 +17,6 @@ export const CACHE_DIR = resolve(EVAL_ROOT, '.cache', 'repos'); export const TRIALS_DIR = resolve(EVAL_ROOT, 'trials'); export const PROMPTS_DIR = resolve(import.meta.dirname, '..', 'prompts'); -// --- Logging --- - export function createLogger(prefix?: string): Logger { const p = prefix ? pc.dim(`[${prefix}]`) + ' ' : ''; return { @@ -23,8 +27,6 @@ export function createLogger(prefix?: string): Logger { }; } -// --- Formatting --- - export const formatDuration = (s: number) => s < 60 ? `${Math.round(s)}s` : `${Math.floor(s / 60)}m${Math.round(s % 60)}s`; @@ -35,8 +37,6 @@ export function generateTrialId(project: string, agent: string, model: string, p return `${ts}-${project}-${agent}-${model}-${prompt}-${crypto.randomUUID().slice(0, 8)}`; } -// --- Table formatting --- - /** Strip ANSI escape codes for accurate width calculation. */ const stripAnsi = (str: string) => str.replace(/\x1b\[[0-9;]*m/g, ''); @@ -59,8 +59,6 @@ export function formatTable(headers: string[], rows: string[][]): string { ].join('\n'); } -// --- Prompts --- - /** Load a prompt by name from prompts/{name}.md. */ export function loadPrompt(name = 'setup'): string { const file = resolve(PROMPTS_DIR, `${name}.md`); @@ -78,8 +76,6 @@ export function listPrompts(): string[] { .map((f) => basename(f, '.md')); } -// --- Environment capture --- - export interface EvalEnvironment { nodeVersion: string; /** Git branch of the eval harness (storybook monorepo), not the evaluated project. */ diff --git a/scripts/eval/types.ts b/scripts/eval/types.ts deleted file mode 100644 index b3a6b0bcffad..000000000000 --- a/scripts/eval/types.ts +++ /dev/null @@ -1,174 +0,0 @@ -/** - * Core types for the Storybook setup eval system. - * - * Pure TypeScript — no runtime validation. 
The CLI boundary (eval.ts) uses - * Zod with a discriminated union to parse args; after that, these types flow - * through the system via normal TypeScript narrowing. - */ - -// --- Logger --- - -export interface Logger { - log: (msg: string) => void; - logStep: (msg: string) => void; - logSuccess: (msg: string) => void; - logError: (msg: string) => void; -} - -// --- Agent (const arrays → derived types) --- - -export const CLAUDE_MODELS = ['sonnet-4.6', 'opus-4.6', 'haiku-4.5'] as const; -export const CODEX_MODELS = ['gpt-5.4'] as const; -export const ALL_MODELS = [...CLAUDE_MODELS, ...CODEX_MODELS] as const; - -export const CLAUDE_EFFORTS = ['low', 'medium', 'high', 'max'] as const; -export const CODEX_EFFORTS = ['low', 'medium', 'high', 'xhigh'] as const; -export const ALL_EFFORTS = ['low', 'medium', 'high', 'max', 'xhigh'] as const; - -export const AGENT_IDS = ['claude', 'codex'] as const; - -export type ClaudeModel = (typeof CLAUDE_MODELS)[number]; -export type CodexModel = (typeof CODEX_MODELS)[number]; -export type ClaudeEffort = (typeof CLAUDE_EFFORTS)[number]; -export type CodexEffort = (typeof CODEX_EFFORTS)[number]; - -/** Agent + model + effort — validated as a discriminated union at the CLI boundary. */ -export type AgentVariant = - | { agent: 'claude'; model: ClaudeModel; effort: ClaudeEffort } - | { agent: 'codex'; model: CodexModel; effort: CodexEffort }; - -export type AgentId = AgentVariant['agent']; - -export interface AgentExecuteParams { - prompt: string; - projectPath: string; - variant: AgentVariant; - resultsDir: string; - logger: Logger; -} - -export interface AgentDriver { - name: AgentId; - execute(params: AgentExecuteParams): Promise; -} - -// --- Project --- - -export interface Project { - name: string; - repo: string; - branch: string; - projectDir?: string; - description?: string; -} - -// --- Trial Config --- - -export interface TrialConfig { - /** Which project to evaluate (cloned from its eval-baseline branch). 
*/ - project: Project; - /** Agent, model, and effort level. */ - variant: AgentVariant; - /** Prompt name — maps to `prompts/{name}.md` (e.g. "setup"). */ - prompt: string; - /** Log agent messages to stdout. */ - verbose?: boolean; -} - -// --- Trial Workspace --- - -export interface TrialWorkspace { - trialDir: string; - repoRoot: string; - projectPath: string; - resultsDir: string; - baselineCommit: string; -} - -// --- Execution --- - -export interface Execution { - cost?: number; - duration: number; - durationApi?: number; - turns: number; -} - -// --- File Changes --- - -export interface FileChange { - path: string; - status: 'A' | 'M' | 'D' | 'R'; - /** For renames, the original path before the move. */ - previousPath?: string; -} - -// --- Setup Patterns --- - -export interface SetupPattern { - id: string; - label: string; - sourceFiles: string[]; -} - -// --- Ghost Stories --- - -export interface GhostStoryGrade { - candidateCount: number; - total: number; - passed: number; - successRate: number; -} - -// --- Grading --- - -export interface Grade { - buildSuccess: boolean; - buildError?: string; - typeCheckErrors: number; - typeCheckOutput?: string; - fileChanges: FileChange[]; - storybookChanges: FileChange[]; - setupPatterns: SetupPattern[]; - ghostStories?: GhostStoryGrade; -} - -// --- Quality Score --- - -export interface ScoreWeights { - ghostStories: number; - build: number; - typecheck: number; - performance: number; -} - -export const DEFAULT_SCORE_WEIGHTS: ScoreWeights = { - ghostStories: 0.4, - build: 0.25, - typecheck: 0.25, - performance: 0.1, -}; - -export interface QualityScore { - score: number; - breakdown: { - build: number; - typecheck: number; - ghostStories: number; - performance: number; - }; -} - -// --- Trial Report --- - -export interface TrialReport { - schemaVersion: 1; - project: Project; - variant: AgentVariant; - prompt: string; - timestamp: string; - baselineCommit: string; - execution: Execution; - grade: Grade; - score: 
QualityScore; -} From a11a78387ff571b49e3fffd90823e21b95556bd1 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Mon, 30 Mar 2026 22:21:02 +0700 Subject: [PATCH 52/63] Update review-pr skill instructions --- .agents/skills/review-pr/SKILL.md | 66 ++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/.agents/skills/review-pr/SKILL.md b/.agents/skills/review-pr/SKILL.md index 5c06bb8f60a5..253cdfb8c3c0 100644 --- a/.agents/skills/review-pr/SKILL.md +++ b/.agents/skills/review-pr/SKILL.md @@ -12,11 +12,43 @@ Generate a scrollable single-page HTML document that reviews a PR as a readable ## Principles -1. **Two layers per area.** The top layer is a curated, readable walkthrough — API surface, key test assertions, and core implementation logic woven together with prose. Only the important parts. Below it, the full files are collapsed in `
` for reference. -2. **High-level to low-level.** Order areas from entry points and orchestration down to utilities and types. The reader understands architecture before details. -3. **API → Tests → Implementation.** Within each area's readable section, show the API first (types, interfaces, exports), then the tests (what does it do?), then the implementation (how?). **Show full interface bodies** — not just names. The reader should see every field of key interfaces in the walkthrough where they're first relevant. Don't defer to "see types.ts". -4. **Review readability.** For each file, assess: logical order? Clear names? Comments where the *why* isn't obvious? Tests readable enough to serve as docs? Flag issues as smell-boxes. Call out well-written tests with note-boxes. -5. **Cover everything.** Every changed file appears somewhere. +The purpose of this page is to help the **human reviewer** understand and review a PR quickly. The page is a reading aid — it presents the code clearly so the reviewer can form their own opinion. + +### Optimize for the reviewer's time + +The reviewer should be able to: +- **Skim** the page and grasp what the PR does in 30 seconds (big picture section). +- **Read** any area and understand what that code does without opening their editor. +- **Zoom in** to full files or diffs when they want to inspect details. + +### Two layers per area + +Each area has two layers: +- **Layer 1 (always visible):** A curated walkthrough — prose explanation with cherry-picked code snippets. Only the parts that matter for understanding. +- **Layer 2 (collapsed):** Full file contents or diffs in `
` blocks. The reviewer expands these to zoom in. + +### High-level to low-level + +Order areas following the **call graph** from entry points down. The reviewer understands the big picture before details. For example: CLI entry point → orchestration → each pipeline step → helpers → utilities → types. + +### Within each area: Explain → Contract → Tests → Implementation + +Structure each area's walkthrough in this order: + +1. **Explanation** — Plain prose first. What does this module do? Why does it exist? How does it fit into the bigger picture? The reviewer should understand the *purpose* before seeing any code. +2. **Functions & data structures** — Show function signatures and the key types/interfaces they use. This is the contract — what goes in, what comes out. Show full interface bodies inline where they're first relevant. Don't defer to "see types.ts". +3. **Tests** — Cherry-pick the test cases that make the behavior concrete. Tests are executable documentation — they turn abstract descriptions into specific examples. +4. **Implementation** — The interesting parts of *how* it works. Skip boilerplate, show the core logic. + +Use narrative `

` tags between snippets to guide the reviewer through each transition. + +### Flag obvious issues, but don't force opinions + +If you notice something clearly wrong (bug, missing error handling, naming mismatch), flag it with a smell-box. If something is notably well done, use a note-box. But don't manufacture opinions — if the code is fine, just present it clearly. The reviewer will decide what matters. + +### Cover everything + +Every changed file appears somewhere — either in a walkthrough snippet or in a collapsed full-file block. ## Step 1 — Gather PR data @@ -38,13 +70,14 @@ For each area, write two layers: ### Layer 1: Readable walkthrough (always visible) -A curated narrative that mixes prose with **short code snippets** — only the important parts. Structure it as: +A curated narrative that mixes prose with **short code snippets**. Structure it following the principle order: -1. **API** — key types, interfaces, function signatures, exports. The contract. -2. **Tests** — the most important test cases. What the behavior is. Cherry-pick the assertions that explain the module. -3. **Implementation** — the core logic. Skip boilerplate, show the interesting parts. +1. **Explanation** — plain prose describing what this area does, why it exists, and how it fits. +2. **Functions & data structures** — key function signatures and the types they use. Show the contract. +3. **Tests** — cherry-picked test cases that make the behavior concrete with specific examples. +4. **Implementation** — the core logic. Skip boilerplate, show the interesting parts. -Use narrative `

` tags between snippets to explain what the reader is looking at and review readability. +Use narrative `

` tags between snippets to guide the reviewer through each transition. Add smell-boxes or note-boxes only when something genuinely stands out. ### Layer 2: Full files (always collapsed) @@ -67,7 +100,7 @@ Sticky topbar (nav links) Header (title, author, stats) Big picture section Area 1 - Readable walkthrough (API → Tests → Implementation snippets) + Readable walkthrough (Explain → Contract → Tests → Implementation) Full files (collapsed) Area 2 ... @@ -235,12 +268,14 @@ document.querySelectorAll('code[data-diff]').forEach(block => { **Layer 1 — Readable walkthrough snippet** (curated excerpt with prose): -Show full interface bodies where they're first relevant — not just names: +Start with explanation, then show the contract (function + types), then tests, then implementation. Show full interface bodies inline — not just names: ```html

-

API: The entry point takes a config and returns a result:

+

processStory is the core rendering pipeline. It takes a story config, prepares the + rendering context, mounts the component, and returns a result with status and timing.

+

The function signature and the types it uses:

export async function processStory(config: StoryConfig): Promise<StoryResult>
 
@@ -259,13 +294,13 @@ export interface StoryResult {
   errors: string[];
 }
-

Tests: The happy-path test shows the expected flow:

+

The happy-path test shows the expected flow concretely:

const result = await processStory(baseConfig);
 expect(result.status).toBe('success');
 expect(result.rendered).toBe(true);
-

Implementation: The pipeline is sequential — rendering depends on preparation:

+

The implementation is a sequential pipeline — rendering depends on preparation:

const context = await prepare(config);
 const canvas = await render(context);
@@ -328,6 +363,7 @@ Use `language-typescript data-diff` — this gives TypeScript syntax highlightin
 
These test names read like a specification — good documentation.
``` + ### Badge reference | Badge | Class | Use for | From b4e9cb8a5266c8f0db8483106d75a6a25e830ae9 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Mon, 30 Mar 2026 23:48:16 +0700 Subject: [PATCH 53/63] Improve eval install detection and Codex pricing --- scripts/eval/lib/agents/config.test.ts | 17 +++--- scripts/eval/lib/agents/config.ts | 2 +- scripts/eval/lib/package-manager.test.ts | 68 +++++++++++++++++++++++ scripts/eval/lib/package-manager.ts | 69 ++++++++++++++++++++---- scripts/eval/lib/prepare-trial.ts | 2 +- 5 files changed, 136 insertions(+), 22 deletions(-) create mode 100644 scripts/eval/lib/package-manager.test.ts diff --git a/scripts/eval/lib/agents/config.test.ts b/scripts/eval/lib/agents/config.test.ts index e364a8d942fb..24e967d9abed 100644 --- a/scripts/eval/lib/agents/config.test.ts +++ b/scripts/eval/lib/agents/config.test.ts @@ -28,16 +28,13 @@ describe('AGENTS', () => { expect(AGENTS.codex).toMatchObject({ defaultModel: 'gpt-5.4', defaultEffort: 'high', - pricing: Object.fromEntries( - AGENTS.codex.models.map((model) => [ - model, - { - input: expect.any(Number), - cachedInput: expect.any(Number), - output: expect.any(Number), - }, - ]) - ), + pricing: { + 'gpt-5.4': { + input: 2.5, + cachedInput: 0.25, + output: 15, + }, + }, }); }); }); diff --git a/scripts/eval/lib/agents/config.ts b/scripts/eval/lib/agents/config.ts index bebf71b21c63..3322437b324d 100644 --- a/scripts/eval/lib/agents/config.ts +++ b/scripts/eval/lib/agents/config.ts @@ -87,7 +87,7 @@ export const AGENTS: Record = { defaultModel: 'gpt-5.4', sdkModelIds: {}, pricing: { - 'gpt-5.4': { input: 2.5, cachedInput: 0.625, output: 10.0 }, + 'gpt-5.4': { input: 2.5, cachedInput: 0.25, output: 15.0 }, }, efforts: CODEX_EFFORTS, defaultEffort: 'high', diff --git a/scripts/eval/lib/package-manager.test.ts b/scripts/eval/lib/package-manager.test.ts new file mode 100644 index 000000000000..ea1bcea75924 --- /dev/null +++ b/scripts/eval/lib/package-manager.test.ts @@ -0,0 +1,68 @@ 
+import { mkdirSync, rmSync, writeFileSync } from 'node:fs'; +import { dirname, join } from 'node:path'; +import { tmpdir } from 'node:os'; + +import { afterEach, describe, expect, it } from 'vitest'; + +import { detectPackageManager, resolveInstallRoot } from './package-manager'; + +const TEMP_DIRS: string[] = []; + +function createTempDir(name: string) { + const dir = join(tmpdir(), `storybook-eval-${name}-${Date.now()}-${Math.random().toString(16).slice(2)}`); + mkdirSync(dir, { recursive: true }); + TEMP_DIRS.push(dir); + return dir; +} + +function writeFile(relativePath: string, root: string) { + const fullPath = join(root, relativePath); + mkdirSync(dirname(fullPath), { recursive: true }); + writeFileSync(fullPath, ''); +} + +afterEach(() => { + for (const dir of TEMP_DIRS.splice(0)) { + rmSync(dir, { recursive: true, force: true }); + } +}); + +describe('detectPackageManager', () => { + it('recognizes npm from package-lock files', () => { + const root = createTempDir('npm-lock'); + writeFile('package-lock.json', root); + + expect(detectPackageManager(root)).toBe('npm'); + }); +}); + +describe('resolveInstallRoot', () => { + it('keeps nested standalone apps on their own install root', () => { + const repoRoot = createTempDir('nested-bun'); + const projectDir = join(repoRoot, 'frontend'); + mkdirSync(projectDir, { recursive: true }); + writeFile('frontend/bun.lock', repoRoot); + + expect(resolveInstallRoot(projectDir, repoRoot)).toBe(projectDir); + }); + + it('walks up to the repo workspace root when lockfiles live above projectDir', () => { + const repoRoot = createTempDir('pnpm-workspace'); + const projectDir = join(repoRoot, 'packages', 'lib'); + mkdirSync(projectDir, { recursive: true }); + writeFile('pnpm-lock.yaml', repoRoot); + writeFile('pnpm-workspace.yaml', repoRoot); + + expect(resolveInstallRoot(projectDir, repoRoot)).toBe(repoRoot); + }); + + it('does not walk above the cloned repo root', () => { + const parent = createTempDir('parent-lock'); + 
const repoRoot = join(parent, 'repo'); + const projectDir = join(repoRoot, 'packages', 'lib'); + mkdirSync(projectDir, { recursive: true }); + writeFile('yarn.lock', parent); + + expect(resolveInstallRoot(projectDir, repoRoot)).toBe(projectDir); + }); +}); diff --git a/scripts/eval/lib/package-manager.ts b/scripts/eval/lib/package-manager.ts index db2c2c9797aa..aa66c4a4467a 100644 --- a/scripts/eval/lib/package-manager.ts +++ b/scripts/eval/lib/package-manager.ts @@ -5,19 +5,62 @@ * package-manager-aware install step. */ import { existsSync } from 'node:fs'; -import { join } from 'node:path'; +import { dirname, join, resolve } from 'node:path'; import { x } from 'tinyexec'; import type { Logger } from './utils.ts'; +const PACKAGE_MANAGER_MARKERS = { + pnpm: ['pnpm-lock.yaml', 'pnpm-workspace.yaml'], + yarn: ['yarn.lock'], + bun: ['bun.lockb', 'bun.lock'], + npm: ['package-lock.json', 'npm-shrinkwrap.json'], +} as const; + +function hasAnyMarker(dir: string): boolean { + return Object.values(PACKAGE_MANAGER_MARKERS).some((files) => + files.some((file) => existsSync(join(dir, file))) + ); +} + /** Detect the package manager from lock files in a directory. */ export function detectPackageManager(dir: string): string { - if (existsSync(join(dir, 'pnpm-lock.yaml')) || existsSync(join(dir, 'pnpm-workspace.yaml'))) - return 'pnpm'; - if (existsSync(join(dir, 'yarn.lock'))) return 'yarn'; - if (existsSync(join(dir, 'bun.lockb')) || existsSync(join(dir, 'bun.lock'))) return 'bun'; + if (PACKAGE_MANAGER_MARKERS.pnpm.some((file) => existsSync(join(dir, file)))) return 'pnpm'; + if (PACKAGE_MANAGER_MARKERS.yarn.some((file) => existsSync(join(dir, file)))) return 'yarn'; + if (PACKAGE_MANAGER_MARKERS.bun.some((file) => existsSync(join(dir, file)))) return 'bun'; + if (PACKAGE_MANAGER_MARKERS.npm.some((file) => existsSync(join(dir, file)))) return 'npm'; return 'npm'; } +/** + * Resolve the directory where dependency installation should run. 
+ * + * For nested projects inside a workspace, the lockfile often lives above `dir`. + * We walk upward until we find the closest package-manager marker, stopping at + * the cloned repo root so we do not accidentally use markers from outside the trial. + */ +export function resolveInstallRoot(dir: string, stopAt?: string): string { + const start = resolve(dir); + const boundary = stopAt ? resolve(stopAt) : undefined; + + let current = start; + while (true) { + if (hasAnyMarker(current)) { + return current; + } + + if (boundary && current === boundary) { + return start; + } + + const parent = dirname(current); + if (parent === current) { + return start; + } + + current = parent; + } +} + function getInstallArgs(pm: string, dir: string): [string, string[]] { switch (pm) { case 'pnpm': @@ -38,13 +81,19 @@ function getInstallArgs(pm: string, dir: string): [string, string[]] { export async function installDeps( dir: string, logger: Logger, - env?: Record + env?: Record, + options?: { stopAt?: string } ): Promise { - const pm = detectPackageManager(dir); - const [cmd, args] = getInstallArgs(pm, dir); - logger.logStep(`Installing with ${pm}...`); + const installRoot = resolveInstallRoot(dir, options?.stopAt); + const pm = detectPackageManager(installRoot); + const [cmd, args] = getInstallArgs(pm, installRoot); + logger.logStep( + installRoot === resolve(dir) + ? `Installing with ${pm}...` + : `Installing with ${pm} from ${installRoot}...` + ); await x(cmd, args, { timeout: 300_000, - nodeOptions: { cwd: dir, ...(env && { env: env as NodeJS.ProcessEnv }) }, + nodeOptions: { cwd: installRoot, ...(env && { env: env as NodeJS.ProcessEnv }) }, }); } diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index eb39317a6659..0e59a7ca1843 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -39,7 +39,7 @@ export async function prepareTrial( timeout: 120_000, }); const projectPath = project.projectDir ? 
join(repoRoot, project.projectDir) : repoRoot; - await installDeps(projectPath, logger); + await installDeps(projectPath, logger, undefined, { stopAt: repoRoot }); logger.logSuccess('Dependencies installed'); logger.logStep('Caching for future runs...'); await cp(repoRoot, cacheDir, { recursive: true }); From 9a850af66e6d8c97240f2e720a2caa412134595b Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Tue, 31 Mar 2026 00:04:42 +0700 Subject: [PATCH 54/63] Changes from Codex --- .../utils/ghost-stories/get-candidates.ts | 96 +++++++------- .../ghost-stories/parse-vitest-report.ts | 110 ++++++++-------- .../shared/checklist-store/checklistData.tsx | 65 ++++----- .../shared/utils/categorize-render-errors.ts | 86 ++++++------ code/lib/cli-storybook/src/bin/run.ts | 123 +++++++++--------- scripts/eval/eval.ts | 32 ++--- scripts/eval/lib/agents/claude-code.ts | 104 +++++++-------- scripts/eval/lib/grade.ts | 16 +-- scripts/eval/lib/grading-helpers.test.ts | 22 ++-- scripts/eval/lib/package-manager.test.ts | 29 +++-- scripts/eval/lib/package-manager.ts | 44 +++---- scripts/eval/lib/run-trial.test.ts | 82 ++++++------ scripts/eval/lib/utils.ts | 8 +- 13 files changed, 413 insertions(+), 404 deletions(-) diff --git a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts index 661196a3ebea..e230fd10d103 100644 --- a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts +++ b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts @@ -7,54 +7,6 @@ import { glob } from 'glob'; import { getComponentComplexity } from './component-analyzer.ts'; -// A valid candidate includes React code and at least one export -function isValidCandidate(source: string): boolean { - const ast = babelParse(source); - - let hasJSX = false; - let hasExport = false; - - traverse(ast, { - JSXElement(path) { - hasJSX = true; - - if (hasExport) { - path.stop(); - } - }, - JSXFragment(path) { - hasJSX = true; - - if 
(hasExport) { - path.stop(); - } - }, - ExportNamedDeclaration(path) { - hasExport = true; - - if (hasJSX) { - path.stop(); - } - }, - ExportDefaultDeclaration(path) { - hasExport = true; - - if (hasJSX) { - path.stop(); - } - }, - ExportAllDeclaration(path) { - hasExport = true; - - if (hasJSX) { - path.stop(); - } - }, - }); - - return hasJSX && hasExport; -} - /** * Based on a list of files, analyze them to find potential candidates to generate story files for. * this is based on whether the file has JSX and exports and how many runtime LOC and imports it @@ -195,3 +147,51 @@ export async function getComponentCandidates({ }; } } + +// A valid candidate includes React code and at least one export +function isValidCandidate(source: string): boolean { + const ast = babelParse(source); + + let hasJSX = false; + let hasExport = false; + + traverse(ast, { + JSXElement(path) { + hasJSX = true; + + if (hasExport) { + path.stop(); + } + }, + JSXFragment(path) { + hasJSX = true; + + if (hasExport) { + path.stop(); + } + }, + ExportNamedDeclaration(path) { + hasExport = true; + + if (hasJSX) { + path.stop(); + } + }, + ExportDefaultDeclaration(path) { + hasExport = true; + + if (hasJSX) { + path.stop(); + } + }, + ExportAllDeclaration(path) { + hasExport = true; + + if (hasJSX) { + path.stop(); + } + }, + }); + + return hasJSX && hasExport; +} diff --git a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts index e0bd41cc53a6..27a28dca91d5 100644 --- a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts +++ b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts @@ -6,61 +6,6 @@ import { type TestRunSummary, } from './types.ts'; -/** - * For a given list of test results: - * - * - Go through failures - * - Categorize errors into categories - * - Return structured data about the run, with categorized errors instead of the actual error - * messages - */ 
-function extractCategorizedErrors(testResults: StoryTestResult[]): ErrorCategorizationResult { - const failed = testResults.filter((r) => r.status === 'FAIL' && r.error); - - // Map: category -> { count, uniqueErrors: Set, matchedDependencies } - const map = new Map< - ErrorCategory, - { count: number; uniqueErrors: Set; matchedDependencies: Set } - >(); - - // To count unique error messages (by their message, not by category) - const uniqueErrorMessages = new Set(); - - for (const r of failed) { - const { category, matchedDependencies } = categorizeError(r.error!, r.stack); - - if (!map.has(category)) { - map.set(category, { count: 0, uniqueErrors: new Set(), matchedDependencies: new Set() }); - } - - const data = map.get(category)!; - data.count++; - matchedDependencies.forEach((dep) => data.matchedDependencies.add(dep)); - - // Use the full error message for unique error message counting - uniqueErrorMessages.add(r.error!); - data.uniqueErrors.add(r.error!); - } - - const categorizedErrors = Array.from(map.entries()).reduce>( - (acc, [category, data]) => { - acc[category] = { - uniqueCount: data.uniqueErrors.size, - count: data.count, - matchedDependencies: Array.from(data.matchedDependencies).sort(), - }; - return acc; - }, - {} - ); - - return { - totalErrors: failed.length, - uniqueErrorCount: uniqueErrorMessages.size, - categorizedErrors, - }; -} - /** Transform the Vitest test results to our expected format and return a TestRunSummary */ export function parseVitestResults(report: any): TestRunSummary { // Transform the Vitest test results to our expected format @@ -123,3 +68,58 @@ export function parseVitestResults(report: any): TestRunSummary { }, }; } + +/** + * For a given list of test results: + * + * - Go through failures + * - Categorize errors into categories + * - Return structured data about the run, with categorized errors instead of the actual error + * messages + */ +function extractCategorizedErrors(testResults: StoryTestResult[]): 
ErrorCategorizationResult { + const failed = testResults.filter((r) => r.status === 'FAIL' && r.error); + + // Map: category -> { count, uniqueErrors: Set, matchedDependencies } + const map = new Map< + ErrorCategory, + { count: number; uniqueErrors: Set; matchedDependencies: Set } + >(); + + // To count unique error messages (by their message, not by category) + const uniqueErrorMessages = new Set(); + + for (const r of failed) { + const { category, matchedDependencies } = categorizeError(r.error!, r.stack); + + if (!map.has(category)) { + map.set(category, { count: 0, uniqueErrors: new Set(), matchedDependencies: new Set() }); + } + + const data = map.get(category)!; + data.count++; + matchedDependencies.forEach((dep) => data.matchedDependencies.add(dep)); + + // Use the full error message for unique error message counting + uniqueErrorMessages.add(r.error!); + data.uniqueErrors.add(r.error!); + } + + const categorizedErrors = Array.from(map.entries()).reduce>( + (acc, [category, data]) => { + acc[category] = { + uniqueCount: data.uniqueErrors.size, + count: data.count, + matchedDependencies: Array.from(data.matchedDependencies).sort(), + }; + return acc; + }, + {} + ); + + return { + totalErrors: failed.length, + uniqueErrorCount: uniqueErrorMessages.size, + categorizedErrors, + }; +} diff --git a/code/core/src/shared/checklist-store/checklistData.tsx b/code/core/src/shared/checklist-store/checklistData.tsx index 7d8752877a0a..a1b3301c48c4 100644 --- a/code/core/src/shared/checklist-store/checklistData.tsx +++ b/code/core/src/shared/checklist-store/checklistData.tsx @@ -45,14 +45,6 @@ const CodeWrapper = styled.div(({ theme }) => ({ }, })); -const CodeSnippet = (props: ComponentProps) => ( - - - - - -); - type ItemId = keyof (typeof initialState)['items']; export interface ChecklistData { @@ -121,29 +113,6 @@ export interface ChecklistData { }[]; } -const isExample = (id: string) => - id.startsWith('example-') || id.startsWith('configure-your-project--'); - 
-const subscribeToIndex: ( - condition: (entries: Record) => boolean -) => ChecklistData['sections'][number]['items'][number]['subscribe'] = - (condition) => - ({ api, done }) => { - const check = () => - condition( - Object.entries(api.getIndex()?.entries || {}).reduce( - (acc, [id, entry]) => (isExample(entry.id) ? acc : Object.assign(acc, { [id]: entry })), - {} as Record - ) - ); - if (check()) { - done(); - } else { - api.once(PREVIEW_INITIALIZED, () => check() && done()); - return api.on(STORY_INDEX_INVALIDATED, () => check() && done()); - } - }; - export const checklistData = { sections: [ { @@ -1272,3 +1241,37 @@ npm install @my/awesome-project }, ], } as const satisfies ChecklistData; + +function CodeSnippet(props: ComponentProps) { + return ( + + + + + + ); +} + +function isExample(id: string) { + return id.startsWith('example-') || id.startsWith('configure-your-project--'); +} + +function subscribeToIndex( + condition: (entries: Record) => boolean +): ChecklistData['sections'][number]['items'][number]['subscribe'] { + return ({ api, done }) => { + const check = () => + condition( + Object.entries(api.getIndex()?.entries || {}).reduce( + (acc, [id, entry]) => (isExample(entry.id) ? 
acc : Object.assign(acc, { [id]: entry })), + {} as Record + ) + ); + if (check()) { + done(); + } else { + api.once(PREVIEW_INITIALIZED, () => check() && done()); + return api.on(STORY_INDEX_INVALIDATED, () => check() && done()); + } + }; +} diff --git a/code/core/src/shared/utils/categorize-render-errors.ts b/code/core/src/shared/utils/categorize-render-errors.ts index 2bf36b1086a3..e5a3c7aa5c08 100644 --- a/code/core/src/shared/utils/categorize-render-errors.ts +++ b/code/core/src/shared/utils/categorize-render-errors.ts @@ -41,34 +41,6 @@ interface CategorizationRule { match: (ctx: ErrorContext) => boolean; } -// From a message and stack, return a context for each category matchers -function buildErrorContext(message: string, stack?: string): ErrorContext { - const normalizedMessage = message.toLowerCase(); - const normalizedStack = (stack ?? '').toLowerCase(); - - const stackDeps = new Set(); - const stackLines = normalizedStack.split('\n').filter(Boolean); - - for (const line of stackLines) { - // Extracts any module name between '/deps/' and '.js' - // e.g. http://localhost:63315/node_modules/.cache/storybook/490ab5/sb-vitest/deps/@emotion/react.js:500:10 - // would become '@emotion/react' - // NOTE this is Vite dependent for now. - const depMatch = line.match(/\/deps\/([^:]+)\.js/); - if (depMatch) { - stackDeps.add(depMatch[1]); - } - } - - return { - message, - stack, - normalizedMessage, - normalizedStack, - stackDeps, - }; -} - /** * Each rule is a category matcher with a priority. The higher the priority, the more specific the * rule is. 
For instance you might have an error message that matches two categories @@ -216,21 +188,6 @@ export function categorizeError( return { category: rule.category, matchedDependencies }; } -function getMatchedDependencies(category: ErrorCategory, ctx: ErrorContext): string[] { - switch (category) { - case ERROR_CATEGORIES.MISSING_STATE_PROVIDER: - return Array.from(ctx.stackDeps).filter(isStateManagementPackage); - case ERROR_CATEGORIES.MISSING_ROUTER_PROVIDER: - return Array.from(ctx.stackDeps).filter(isRouterPackage); - case ERROR_CATEGORIES.MISSING_THEME_PROVIDER: - return Array.from(ctx.stackDeps).filter(isStylingPackage); - case ERROR_CATEGORIES.MISSING_TRANSLATION_PROVIDER: - return Array.from(ctx.stackDeps).filter(isI18nPackage); - default: - return []; - } -} - /** For a given category, return a description of the error for better legibility. */ export function getCategoryDescription(category: ErrorCategory): string { switch (category) { @@ -274,3 +231,46 @@ export function getCategoryDescription(category: ErrorCategory): string { return 'Error could not be categorized'; } } + +function getMatchedDependencies(category: ErrorCategory, ctx: ErrorContext): string[] { + switch (category) { + case ERROR_CATEGORIES.MISSING_STATE_PROVIDER: + return Array.from(ctx.stackDeps).filter(isStateManagementPackage); + case ERROR_CATEGORIES.MISSING_ROUTER_PROVIDER: + return Array.from(ctx.stackDeps).filter(isRouterPackage); + case ERROR_CATEGORIES.MISSING_THEME_PROVIDER: + return Array.from(ctx.stackDeps).filter(isStylingPackage); + case ERROR_CATEGORIES.MISSING_TRANSLATION_PROVIDER: + return Array.from(ctx.stackDeps).filter(isI18nPackage); + default: + return []; + } +} + +// From a message and stack, return a context for each category matchers +function buildErrorContext(message: string, stack?: string): ErrorContext { + const normalizedMessage = message.toLowerCase(); + const normalizedStack = (stack ?? 
'').toLowerCase(); + + const stackDeps = new Set(); + const stackLines = normalizedStack.split('\n').filter(Boolean); + + for (const line of stackLines) { + // Extracts any module name between '/deps/' and '.js' + // e.g. http://localhost:63315/node_modules/.cache/storybook/490ab5/sb-vitest/deps/@emotion/react.js:500:10 + // would become '@emotion/react' + // NOTE this is Vite dependent for now. + const depMatch = line.match(/\/deps\/([^:]+)\.js/); + if (depMatch) { + stackDeps.add(depMatch[1]); + } + } + + return { + message, + stack, + normalizedMessage, + normalizedStack, + stackDeps, + }; +} diff --git a/code/lib/cli-storybook/src/bin/run.ts b/code/lib/cli-storybook/src/bin/run.ts index 794ab3560251..3a7249c34469 100644 --- a/code/lib/cli-storybook/src/bin/run.ts +++ b/code/lib/cli-storybook/src/bin/run.ts @@ -28,67 +28,6 @@ import { type UpgradeOptions, upgrade } from '../upgrade'; addToGlobalContext('cliVersion', versions.storybook); -// Return a failed exit code but write the logs to a file first -const handleCommandFailure = - (logFilePath: string | boolean | undefined) => - async (error: unknown): Promise => { - if (!(error instanceof HandledError)) { - logger.error(String(error)); - } - - try { - const logFile = await logTracker.writeToFile(logFilePath); - logger.log(`Debug logs are written to: ${logFile}`); - } catch {} - logger.outro(''); - process.exit(1); - }; - -const command = (name: string) => - program - .command(name) - .option( - '--disable-telemetry', - 'Disable sending telemetry data', - optionalEnvToBoolean(process.env.STORYBOOK_DISABLE_TELEMETRY) - ) - .option('--debug', 'Get more logs in debug mode', false) - .option('--enable-crash-reports', 'Enable sending crash reports to telemetry data') - .option( - '--logfile [path]', - 'Write all debug logs to the specified file at the end of the run. 
Defaults to debug-storybook.log when [path] is not provided' - ) - .option('--loglevel ', 'Define log level', 'info') - .hook('preAction', async (self) => { - const options = self.opts(); - if (options.debug) { - logger.setLogLevel('debug'); - } - - if (options.loglevel) { - logger.setLogLevel(options.loglevel); - } - - if (options.logfile) { - logTracker.enableLogWriting(); - } - - try { - await globalSettings(); - } catch (e) { - logger.error('Error loading global settings:\n' + String(e)); - } - }) - .hook('postAction', async (command) => { - if (logTracker.shouldWriteLogsToFile) { - try { - const logFile = await logTracker.writeToFile(command.getOptionValue('logfile')); - logger.log(`Debug logs are written to: ${logFile}`); - } catch {} - logger.outro(CLI_COLORS.success('Done!')); - } - }); - command('init') .description('Initialize Storybook into your project') .option('-f --force', 'Force add Storybook') @@ -331,4 +270,66 @@ program.on('command:*', ([invalidCmd]) => { process.exit(1); }); +function command(name: string) { + return program + .command(name) + .option( + '--disable-telemetry', + 'Disable sending telemetry data', + optionalEnvToBoolean(process.env.STORYBOOK_DISABLE_TELEMETRY) + ) + .option('--debug', 'Get more logs in debug mode', false) + .option('--enable-crash-reports', 'Enable sending crash reports to telemetry data') + .option( + '--logfile [path]', + 'Write all debug logs to the specified file at the end of the run. 
Defaults to debug-storybook.log when [path] is not provided' + ) + .option('--loglevel ', 'Define log level', 'info') + .hook('preAction', async (self) => { + const options = self.opts(); + if (options.debug) { + logger.setLogLevel('debug'); + } + + if (options.loglevel) { + logger.setLogLevel(options.loglevel); + } + + if (options.logfile) { + logTracker.enableLogWriting(); + } + + try { + await globalSettings(); + } catch (e) { + logger.error('Error loading global settings:\n' + String(e)); + } + }) + .hook('postAction', async (command) => { + if (logTracker.shouldWriteLogsToFile) { + try { + const logFile = await logTracker.writeToFile(command.getOptionValue('logfile')); + logger.log(`Debug logs are written to: ${logFile}`); + } catch {} + logger.outro(CLI_COLORS.success('Done!')); + } + }); +} + +// Return a failed exit code but write the logs to a file first +function handleCommandFailure(logFilePath: string | boolean | undefined) { + return async (error: unknown): Promise => { + if (!(error instanceof HandledError)) { + logger.error(String(error)); + } + + try { + const logFile = await logTracker.writeToFile(logFilePath); + logger.log(`Debug logs are written to: ${logFile}`); + } catch {} + logger.outro(''); + process.exit(1); + }; +} + program.usage(' [options]').version(String(version)).parse(process.argv); diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 46dd7f05067f..3ff4ba57d1bc 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -44,22 +44,6 @@ import { const PROJECT_NAMES = PROJECTS.map((p) => p.name) as [string, ...string[]]; -function inferAgent(model: string): AgentId { - for (const id of AGENT_IDS) { - if (AGENTS[id].models.includes(model)) return id; - } - throw new Error(`No agent found for model: ${model}`); -} - -function buildManualCommand(variant: AgentVariant, promptPath: string): string { - const promptArg = `"$(cat ${promptPath})"`; - if (variant.agent === 'claude') { - const sdkModel = 
AGENTS.claude.sdkModelIds[variant.model] ?? variant.model; - return `claude --model ${sdkModel} ${promptArg}`; - } - return `codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`; -} - const base = { project: z.enum(PROJECT_NAMES).optional(), prompt: z.string().default('setup'), @@ -195,3 +179,19 @@ if (args.manual) { logger.log('\nDone.'); } + +function inferAgent(model: string): AgentId { + for (const id of AGENT_IDS) { + if (AGENTS[id].models.includes(model)) return id; + } + throw new Error(`No agent found for model: ${model}`); +} + +function buildManualCommand(variant: AgentVariant, promptPath: string): string { + const promptArg = `"$(cat ${promptPath})"`; + if (variant.agent === 'claude') { + const sdkModel = AGENTS.claude.sdkModelIds[variant.model] ?? variant.model; + return `claude --model ${sdkModel} ${promptArg}`; + } + return `codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`; +} diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index e5f6882b59b8..ab5d7c3f844d 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -5,6 +5,58 @@ import { join } from 'node:path'; import { AGENTS, type AgentDriver, type Execution } from './config.ts'; import type { Logger } from '../utils.ts'; +const MAX_TURNS = 50; + +export const claudeAgent: AgentDriver = { + name: 'claude', + + async execute({ prompt, projectPath, variant, resultsDir, logger }): Promise { + const startTime = Date.now(); + const { model } = variant; + const effort = variant.effort as 'low' | 'medium' | 'high' | 'max'; + const sdkModel = AGENTS.claude.sdkModelIds[model] ?? 
model; + + let cost: number | undefined; + let turns = 0; + let durationApi: number | undefined; + const messages: unknown[] = []; + + for await (const message of query({ + prompt, + options: { + model: sdkModel, + cwd: projectPath, + allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep'], + maxTurns: MAX_TURNS, + effort, + debug: true, + systemPrompt: { type: 'preset', preset: 'claude_code' }, + }, + })) { + logMessage(message, logger); + messages.push(message); + + if (message.type === 'result' && message.subtype === 'success') { + cost = message.total_cost_usd as number | undefined; + turns = (message.num_turns as number) ?? 0; + durationApi = + typeof message.duration_api_ms === 'number' ? message.duration_api_ms / 1000 : undefined; + } + } + + const duration = (Date.now() - startTime) / 1000; + + await writeFile(join(resultsDir, 'transcript.json'), JSON.stringify(messages, null, 2)); + + return { + cost, + duration, + durationApi, + turns, + }; + }, +}; + function logMessage(message: SDKMessage, logger: Logger) { switch (message.type) { case 'assistant': { @@ -71,55 +123,3 @@ function logMessage(message: SDKMessage, logger: Logger) { break; } } - -const MAX_TURNS = 50; - -export const claudeAgent: AgentDriver = { - name: 'claude', - - async execute({ prompt, projectPath, variant, resultsDir, logger }): Promise { - const startTime = Date.now(); - const { model } = variant; - const effort = variant.effort as 'low' | 'medium' | 'high' | 'max'; - const sdkModel = AGENTS.claude.sdkModelIds[model] ?? 
model; - - let cost: number | undefined; - let turns = 0; - let durationApi: number | undefined; - const messages: unknown[] = []; - - for await (const message of query({ - prompt, - options: { - model: sdkModel, - cwd: projectPath, - allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep'], - maxTurns: MAX_TURNS, - effort, - debug: true, - systemPrompt: { type: 'preset', preset: 'claude_code' }, - }, - })) { - logMessage(message, logger); - messages.push(message); - - if (message.type === 'result' && message.subtype === 'success') { - cost = message.total_cost_usd as number | undefined; - turns = (message.num_turns as number) ?? 0; - durationApi = - typeof message.duration_api_ms === 'number' ? message.duration_api_ms / 1000 : undefined; - } - } - - const duration = (Date.now() - startTime) / 1000; - - await writeFile(join(resultsDir, 'transcript.json'), JSON.stringify(messages, null, 2)); - - return { - cost, - duration, - durationApi, - turns, - }; - }, -}; diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index 8656d7d343b3..f480a4488a7a 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -141,14 +141,6 @@ export function parseChangedFiles(gitOutput: string): FileChange[] { }); } -/** Truncate text to approximately maxChars, snapping to a line boundary. */ -function truncateEnd(text: string, maxChars: number): string { - if (text.length <= maxChars) return text; - const truncated = text.slice(-maxChars); - const firstNewline = truncated.indexOf('\n'); - return firstNewline >= 0 ? truncated.slice(firstNewline + 1) : truncated; -} - export async function grade( workspace: TrialWorkspace, logger: Logger, @@ -279,3 +271,11 @@ async function gradeGhostStories( return undefined; } } + +/** Truncate text to approximately maxChars, snapping to a line boundary. 
*/ +function truncateEnd(text: string, maxChars: number): string { + if (text.length <= maxChars) return text; + const truncated = text.slice(-maxChars); + const firstNewline = truncated.indexOf('\n'); + return firstNewline >= 0 ? truncated.slice(firstNewline + 1) : truncated; +} diff --git a/scripts/eval/lib/grading-helpers.test.ts b/scripts/eval/lib/grading-helpers.test.ts index fe4880fa42ac..8d883da92f7a 100644 --- a/scripts/eval/lib/grading-helpers.test.ts +++ b/scripts/eval/lib/grading-helpers.test.ts @@ -29,17 +29,6 @@ afterEach(() => { rmSync(TMP, { recursive: true, force: true }); }); -function writeFile(relativePath: string, content: string) { - const fullPath = join(TMP, relativePath); - mkdirSync(join(fullPath, '..'), { recursive: true }); - writeFileSync(fullPath, content); -} - -async function findCandidates(cwd: string) { - const { candidates } = await getComponentCandidates({ cwd, sampleSize: 20 }); - return candidates.map((c) => c.replace(cwd + '/', '')); -} - describe('grading helpers', () => { it('composes helper signals for a well-configured project', async () => { // Set up a realistic project with components and storybook config @@ -175,3 +164,14 @@ describe('grading helpers', () => { ).toBe(1); }); }); + +function writeFile(relativePath: string, content: string) { + const fullPath = join(TMP, relativePath); + mkdirSync(join(fullPath, '..'), { recursive: true }); + writeFileSync(fullPath, content); +} + +async function findCandidates(cwd: string) { + const { candidates } = await getComponentCandidates({ cwd, sampleSize: 20 }); + return candidates.map((c) => c.replace(cwd + '/', '')); +} diff --git a/scripts/eval/lib/package-manager.test.ts b/scripts/eval/lib/package-manager.test.ts index ea1bcea75924..4d958198d3d7 100644 --- a/scripts/eval/lib/package-manager.test.ts +++ b/scripts/eval/lib/package-manager.test.ts @@ -8,19 +8,6 @@ import { detectPackageManager, resolveInstallRoot } from './package-manager'; const TEMP_DIRS: string[] = []; 
-function createTempDir(name: string) { - const dir = join(tmpdir(), `storybook-eval-${name}-${Date.now()}-${Math.random().toString(16).slice(2)}`); - mkdirSync(dir, { recursive: true }); - TEMP_DIRS.push(dir); - return dir; -} - -function writeFile(relativePath: string, root: string) { - const fullPath = join(root, relativePath); - mkdirSync(dirname(fullPath), { recursive: true }); - writeFileSync(fullPath, ''); -} - afterEach(() => { for (const dir of TEMP_DIRS.splice(0)) { rmSync(dir, { recursive: true, force: true }); @@ -66,3 +53,19 @@ describe('resolveInstallRoot', () => { expect(resolveInstallRoot(projectDir, repoRoot)).toBe(projectDir); }); }); + +function createTempDir(name: string) { + const dir = join( + tmpdir(), + `storybook-eval-${name}-${Date.now()}-${Math.random().toString(16).slice(2)}` + ); + mkdirSync(dir, { recursive: true }); + TEMP_DIRS.push(dir); + return dir; +} + +function writeFile(relativePath: string, root: string) { + const fullPath = join(root, relativePath); + mkdirSync(dirname(fullPath), { recursive: true }); + writeFileSync(fullPath, ''); +} diff --git a/scripts/eval/lib/package-manager.ts b/scripts/eval/lib/package-manager.ts index aa66c4a4467a..ea61a5444e4f 100644 --- a/scripts/eval/lib/package-manager.ts +++ b/scripts/eval/lib/package-manager.ts @@ -16,12 +16,6 @@ const PACKAGE_MANAGER_MARKERS = { npm: ['package-lock.json', 'npm-shrinkwrap.json'], } as const; -function hasAnyMarker(dir: string): boolean { - return Object.values(PACKAGE_MANAGER_MARKERS).some((files) => - files.some((file) => existsSync(join(dir, file))) - ); -} - /** Detect the package manager from lock files in a directory. 
*/ export function detectPackageManager(dir: string): string { if (PACKAGE_MANAGER_MARKERS.pnpm.some((file) => existsSync(join(dir, file)))) return 'pnpm'; @@ -61,22 +55,6 @@ export function resolveInstallRoot(dir: string, stopAt?: string): string { } } -function getInstallArgs(pm: string, dir: string): [string, string[]] { - switch (pm) { - case 'pnpm': - return ['pnpm', ['install', '--no-frozen-lockfile']]; - case 'yarn': - return [ - 'yarn', - existsSync(join(dir, '.yarnrc.yml')) ? ['install', '--no-immutable'] : ['install'], - ]; - case 'bun': - return ['bun', ['install']]; - default: - return ['npm', ['install', '--ignore-scripts']]; - } -} - /** Install dependencies using the detected package manager. */ export async function installDeps( dir: string, @@ -97,3 +75,25 @@ export async function installDeps( nodeOptions: { cwd: installRoot, ...(env && { env: env as NodeJS.ProcessEnv }) }, }); } + +function hasAnyMarker(dir: string): boolean { + return Object.values(PACKAGE_MANAGER_MARKERS).some((files) => + files.some((file) => existsSync(join(dir, file))) + ); +} + +function getInstallArgs(pm: string, dir: string): [string, string[]] { + switch (pm) { + case 'pnpm': + return ['pnpm', ['install', '--no-frozen-lockfile']]; + case 'yarn': + return [ + 'yarn', + existsSync(join(dir, '.yarnrc.yml')) ? ['install', '--no-immutable'] : ['install'], + ]; + case 'bun': + return ['bun', ['install']]; + default: + return ['npm', ['install', '--ignore-scripts']]; + } +} diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts index 297c62cbee8a..417e5b616be5 100644 --- a/scripts/eval/lib/run-trial.test.ts +++ b/scripts/eval/lib/run-trial.test.ts @@ -49,47 +49,6 @@ afterEach(() => { rmSync(TMP, { recursive: true, force: true }); }); -function setupMocks(overrides?: { - buildSuccess?: boolean; - typeCheckErrors?: number; - cost?: number; -}) { - const { buildSuccess = true, typeCheckErrors = 0, cost = 0.42 } = overrides ?? 
{}; - - vi.mocked(prepareTrial).mockResolvedValue({ - trialDir: TMP, - repoRoot: TMP, - projectPath: TMP, - resultsDir: join(TMP, 'results'), - baselineCommit: 'deadbeef', - }); - - vi.mocked(claudeAgent.execute).mockResolvedValue({ - cost, - duration: 45.2, - turns: 12, - }); - - vi.mocked(grade).mockResolvedValue({ - grade: { - buildSuccess, - typeCheckErrors, - fileChanges: [ - { path: '.storybook/preview.tsx', status: 'A' }, - { path: 'src/Button.stories.tsx', status: 'A' }, - ], - storybookChanges: [ - { path: '.storybook/preview.tsx', status: 'A' }, - { path: 'src/Button.stories.tsx', status: 'A' }, - ], - }, - score: { - score: buildSuccess ? 1 : 0.3, - breakdown: { build: buildSuccess ? 1 : 0, typecheck: 1, ghostStories: 0, performance: 0 }, - }, - }); -} - const baseConfig: TrialConfig = { project: { name: 'test-project', repo: 'https://github.com/test/repo', branch: 'main' }, variant: { agent: 'claude', model: 'sonnet-4.6', effort: 'high' }, @@ -231,3 +190,44 @@ describe('runTrial pipeline', () => { expect(callOrder).toEqual(['prepare', 'agent', 'grade']); }); }); + +function setupMocks(overrides?: { + buildSuccess?: boolean; + typeCheckErrors?: number; + cost?: number; +}) { + const { buildSuccess = true, typeCheckErrors = 0, cost = 0.42 } = overrides ?? {}; + + vi.mocked(prepareTrial).mockResolvedValue({ + trialDir: TMP, + repoRoot: TMP, + projectPath: TMP, + resultsDir: join(TMP, 'results'), + baselineCommit: 'deadbeef', + }); + + vi.mocked(claudeAgent.execute).mockResolvedValue({ + cost, + duration: 45.2, + turns: 12, + }); + + vi.mocked(grade).mockResolvedValue({ + grade: { + buildSuccess, + typeCheckErrors, + fileChanges: [ + { path: '.storybook/preview.tsx', status: 'A' }, + { path: 'src/Button.stories.tsx', status: 'A' }, + ], + storybookChanges: [ + { path: '.storybook/preview.tsx', status: 'A' }, + { path: 'src/Button.stories.tsx', status: 'A' }, + ], + }, + score: { + score: buildSuccess ? 1 : 0.3, + breakdown: { build: buildSuccess ? 
1 : 0, typecheck: 1, ghostStories: 0, performance: 0 }, + }, + }); +} diff --git a/scripts/eval/lib/utils.ts b/scripts/eval/lib/utils.ts index ddb60e9c394b..79d24891f227 100644 --- a/scripts/eval/lib/utils.ts +++ b/scripts/eval/lib/utils.ts @@ -37,9 +37,6 @@ export function generateTrialId(project: string, agent: string, model: string, p return `${ts}-${project}-${agent}-${model}-${prompt}-${crypto.randomUUID().slice(0, 8)}`; } -/** Strip ANSI escape codes for accurate width calculation. */ -const stripAnsi = (str: string) => str.replace(/\x1b\[[0-9;]*m/g, ''); - /** Format data as an aligned table with automatic column widths. */ export function formatTable(headers: string[], rows: string[][]): string { const widths = headers.map((h, i) => @@ -97,3 +94,8 @@ export async function captureEnvironment(resultsDir: string): Promise Date: Tue, 31 Mar 2026 00:12:26 +0700 Subject: [PATCH 55/63] Changes from Codex --- .../utils/ghost-stories/get-candidates.ts | 96 +++++++------- .../ghost-stories/parse-vitest-report.ts | 110 ++++++++-------- .../shared/checklist-store/checklistData.tsx | 68 +++++----- .../shared/utils/categorize-render-errors.ts | 86 ++++++------ code/lib/cli-storybook/src/bin/run.ts | 124 +++++++++--------- 5 files changed, 242 insertions(+), 242 deletions(-) diff --git a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts index e230fd10d103..661196a3ebea 100644 --- a/code/core/src/core-server/utils/ghost-stories/get-candidates.ts +++ b/code/core/src/core-server/utils/ghost-stories/get-candidates.ts @@ -7,6 +7,54 @@ import { glob } from 'glob'; import { getComponentComplexity } from './component-analyzer.ts'; +// A valid candidate includes React code and at least one export +function isValidCandidate(source: string): boolean { + const ast = babelParse(source); + + let hasJSX = false; + let hasExport = false; + + traverse(ast, { + JSXElement(path) { + hasJSX = true; + + if 
(hasExport) { + path.stop(); + } + }, + JSXFragment(path) { + hasJSX = true; + + if (hasExport) { + path.stop(); + } + }, + ExportNamedDeclaration(path) { + hasExport = true; + + if (hasJSX) { + path.stop(); + } + }, + ExportDefaultDeclaration(path) { + hasExport = true; + + if (hasJSX) { + path.stop(); + } + }, + ExportAllDeclaration(path) { + hasExport = true; + + if (hasJSX) { + path.stop(); + } + }, + }); + + return hasJSX && hasExport; +} + /** * Based on a list of files, analyze them to find potential candidates to generate story files for. * this is based on whether the file has JSX and exports and how many runtime LOC and imports it @@ -147,51 +195,3 @@ export async function getComponentCandidates({ }; } } - -// A valid candidate includes React code and at least one export -function isValidCandidate(source: string): boolean { - const ast = babelParse(source); - - let hasJSX = false; - let hasExport = false; - - traverse(ast, { - JSXElement(path) { - hasJSX = true; - - if (hasExport) { - path.stop(); - } - }, - JSXFragment(path) { - hasJSX = true; - - if (hasExport) { - path.stop(); - } - }, - ExportNamedDeclaration(path) { - hasExport = true; - - if (hasJSX) { - path.stop(); - } - }, - ExportDefaultDeclaration(path) { - hasExport = true; - - if (hasJSX) { - path.stop(); - } - }, - ExportAllDeclaration(path) { - hasExport = true; - - if (hasJSX) { - path.stop(); - } - }, - }); - - return hasJSX && hasExport; -} diff --git a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts index 27a28dca91d5..e0bd41cc53a6 100644 --- a/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts +++ b/code/core/src/core-server/utils/ghost-stories/parse-vitest-report.ts @@ -6,6 +6,61 @@ import { type TestRunSummary, } from './types.ts'; +/** + * For a given list of test results: + * + * - Go through failures + * - Categorize errors into categories + * - Return structured data 
about the run, with categorized errors instead of the actual error + * messages + */ +function extractCategorizedErrors(testResults: StoryTestResult[]): ErrorCategorizationResult { + const failed = testResults.filter((r) => r.status === 'FAIL' && r.error); + + // Map: category -> { count, uniqueErrors: Set, matchedDependencies } + const map = new Map< + ErrorCategory, + { count: number; uniqueErrors: Set; matchedDependencies: Set } + >(); + + // To count unique error messages (by their message, not by category) + const uniqueErrorMessages = new Set(); + + for (const r of failed) { + const { category, matchedDependencies } = categorizeError(r.error!, r.stack); + + if (!map.has(category)) { + map.set(category, { count: 0, uniqueErrors: new Set(), matchedDependencies: new Set() }); + } + + const data = map.get(category)!; + data.count++; + matchedDependencies.forEach((dep) => data.matchedDependencies.add(dep)); + + // Use the full error message for unique error message counting + uniqueErrorMessages.add(r.error!); + data.uniqueErrors.add(r.error!); + } + + const categorizedErrors = Array.from(map.entries()).reduce>( + (acc, [category, data]) => { + acc[category] = { + uniqueCount: data.uniqueErrors.size, + count: data.count, + matchedDependencies: Array.from(data.matchedDependencies).sort(), + }; + return acc; + }, + {} + ); + + return { + totalErrors: failed.length, + uniqueErrorCount: uniqueErrorMessages.size, + categorizedErrors, + }; +} + /** Transform the Vitest test results to our expected format and return a TestRunSummary */ export function parseVitestResults(report: any): TestRunSummary { // Transform the Vitest test results to our expected format @@ -68,58 +123,3 @@ export function parseVitestResults(report: any): TestRunSummary { }, }; } - -/** - * For a given list of test results: - * - * - Go through failures - * - Categorize errors into categories - * - Return structured data about the run, with categorized errors instead of the actual error - * messages 
- */ -function extractCategorizedErrors(testResults: StoryTestResult[]): ErrorCategorizationResult { - const failed = testResults.filter((r) => r.status === 'FAIL' && r.error); - - // Map: category -> { count, uniqueErrors: Set, matchedDependencies } - const map = new Map< - ErrorCategory, - { count: number; uniqueErrors: Set; matchedDependencies: Set } - >(); - - // To count unique error messages (by their message, not by category) - const uniqueErrorMessages = new Set(); - - for (const r of failed) { - const { category, matchedDependencies } = categorizeError(r.error!, r.stack); - - if (!map.has(category)) { - map.set(category, { count: 0, uniqueErrors: new Set(), matchedDependencies: new Set() }); - } - - const data = map.get(category)!; - data.count++; - matchedDependencies.forEach((dep) => data.matchedDependencies.add(dep)); - - // Use the full error message for unique error message counting - uniqueErrorMessages.add(r.error!); - data.uniqueErrors.add(r.error!); - } - - const categorizedErrors = Array.from(map.entries()).reduce>( - (acc, [category, data]) => { - acc[category] = { - uniqueCount: data.uniqueErrors.size, - count: data.count, - matchedDependencies: Array.from(data.matchedDependencies).sort(), - }; - return acc; - }, - {} - ); - - return { - totalErrors: failed.length, - uniqueErrorCount: uniqueErrorMessages.size, - categorizedErrors, - }; -} diff --git a/code/core/src/shared/checklist-store/checklistData.tsx b/code/core/src/shared/checklist-store/checklistData.tsx index a1b3301c48c4..71f490ae6357 100644 --- a/code/core/src/shared/checklist-store/checklistData.tsx +++ b/code/core/src/shared/checklist-store/checklistData.tsx @@ -45,6 +45,16 @@ const CodeWrapper = styled.div(({ theme }) => ({ }, })); +function CodeSnippet(props: ComponentProps) { + return ( + + + + + + ); +} + type ItemId = keyof (typeof initialState)['items']; export interface ChecklistData { @@ -113,6 +123,30 @@ export interface ChecklistData { }[]; } +function isExample(id: 
string) { + return id.startsWith('example-') || id.startsWith('configure-your-project--'); +} + +function subscribeToIndex( + condition: (entries: Record) => boolean +): ChecklistData['sections'][number]['items'][number]['subscribe'] { + return ({ api, done }) => { + const check = () => + condition( + Object.entries(api.getIndex()?.entries || {}).reduce( + (acc, [id, entry]) => (isExample(entry.id) ? acc : Object.assign(acc, { [id]: entry })), + {} as Record + ) + ); + if (check()) { + done(); + } else { + api.once(PREVIEW_INITIALIZED, () => check() && done()); + return api.on(STORY_INDEX_INVALIDATED, () => check() && done()); + } + }; +} + export const checklistData = { sections: [ { @@ -1241,37 +1275,3 @@ npm install @my/awesome-project }, ], } as const satisfies ChecklistData; - -function CodeSnippet(props: ComponentProps) { - return ( - - - - - - ); -} - -function isExample(id: string) { - return id.startsWith('example-') || id.startsWith('configure-your-project--'); -} - -function subscribeToIndex( - condition: (entries: Record) => boolean -): ChecklistData['sections'][number]['items'][number]['subscribe'] { - return ({ api, done }) => { - const check = () => - condition( - Object.entries(api.getIndex()?.entries || {}).reduce( - (acc, [id, entry]) => (isExample(entry.id) ? 
acc : Object.assign(acc, { [id]: entry })), - {} as Record - ) - ); - if (check()) { - done(); - } else { - api.once(PREVIEW_INITIALIZED, () => check() && done()); - return api.on(STORY_INDEX_INVALIDATED, () => check() && done()); - } - }; -} diff --git a/code/core/src/shared/utils/categorize-render-errors.ts b/code/core/src/shared/utils/categorize-render-errors.ts index e5a3c7aa5c08..2bf36b1086a3 100644 --- a/code/core/src/shared/utils/categorize-render-errors.ts +++ b/code/core/src/shared/utils/categorize-render-errors.ts @@ -41,6 +41,34 @@ interface CategorizationRule { match: (ctx: ErrorContext) => boolean; } +// From a message and stack, return a context for each category matchers +function buildErrorContext(message: string, stack?: string): ErrorContext { + const normalizedMessage = message.toLowerCase(); + const normalizedStack = (stack ?? '').toLowerCase(); + + const stackDeps = new Set(); + const stackLines = normalizedStack.split('\n').filter(Boolean); + + for (const line of stackLines) { + // Extracts any module name between '/deps/' and '.js' + // e.g. http://localhost:63315/node_modules/.cache/storybook/490ab5/sb-vitest/deps/@emotion/react.js:500:10 + // would become '@emotion/react' + // NOTE this is Vite dependent for now. + const depMatch = line.match(/\/deps\/([^:]+)\.js/); + if (depMatch) { + stackDeps.add(depMatch[1]); + } + } + + return { + message, + stack, + normalizedMessage, + normalizedStack, + stackDeps, + }; +} + /** * Each rule is a category matcher with a priority. The higher the priority, the more specific the * rule is. 
For instance you might have an error message that matches two categories @@ -188,6 +216,21 @@ export function categorizeError( return { category: rule.category, matchedDependencies }; } +function getMatchedDependencies(category: ErrorCategory, ctx: ErrorContext): string[] { + switch (category) { + case ERROR_CATEGORIES.MISSING_STATE_PROVIDER: + return Array.from(ctx.stackDeps).filter(isStateManagementPackage); + case ERROR_CATEGORIES.MISSING_ROUTER_PROVIDER: + return Array.from(ctx.stackDeps).filter(isRouterPackage); + case ERROR_CATEGORIES.MISSING_THEME_PROVIDER: + return Array.from(ctx.stackDeps).filter(isStylingPackage); + case ERROR_CATEGORIES.MISSING_TRANSLATION_PROVIDER: + return Array.from(ctx.stackDeps).filter(isI18nPackage); + default: + return []; + } +} + /** For a given category, return a description of the error for better legibility. */ export function getCategoryDescription(category: ErrorCategory): string { switch (category) { @@ -231,46 +274,3 @@ export function getCategoryDescription(category: ErrorCategory): string { return 'Error could not be categorized'; } } - -function getMatchedDependencies(category: ErrorCategory, ctx: ErrorContext): string[] { - switch (category) { - case ERROR_CATEGORIES.MISSING_STATE_PROVIDER: - return Array.from(ctx.stackDeps).filter(isStateManagementPackage); - case ERROR_CATEGORIES.MISSING_ROUTER_PROVIDER: - return Array.from(ctx.stackDeps).filter(isRouterPackage); - case ERROR_CATEGORIES.MISSING_THEME_PROVIDER: - return Array.from(ctx.stackDeps).filter(isStylingPackage); - case ERROR_CATEGORIES.MISSING_TRANSLATION_PROVIDER: - return Array.from(ctx.stackDeps).filter(isI18nPackage); - default: - return []; - } -} - -// From a message and stack, return a context for each category matchers -function buildErrorContext(message: string, stack?: string): ErrorContext { - const normalizedMessage = message.toLowerCase(); - const normalizedStack = (stack ?? 
'').toLowerCase(); - - const stackDeps = new Set(); - const stackLines = normalizedStack.split('\n').filter(Boolean); - - for (const line of stackLines) { - // Extracts any module name between '/deps/' and '.js' - // e.g. http://localhost:63315/node_modules/.cache/storybook/490ab5/sb-vitest/deps/@emotion/react.js:500:10 - // would become '@emotion/react' - // NOTE this is Vite dependent for now. - const depMatch = line.match(/\/deps\/([^:]+)\.js/); - if (depMatch) { - stackDeps.add(depMatch[1]); - } - } - - return { - message, - stack, - normalizedMessage, - normalizedStack, - stackDeps, - }; -} diff --git a/code/lib/cli-storybook/src/bin/run.ts b/code/lib/cli-storybook/src/bin/run.ts index 3a7249c34469..babf1f4eadf7 100644 --- a/code/lib/cli-storybook/src/bin/run.ts +++ b/code/lib/cli-storybook/src/bin/run.ts @@ -28,6 +28,68 @@ import { type UpgradeOptions, upgrade } from '../upgrade'; addToGlobalContext('cliVersion', versions.storybook); +function command(name: string) { + return program + .command(name) + .option( + '--disable-telemetry', + 'Disable sending telemetry data', + optionalEnvToBoolean(process.env.STORYBOOK_DISABLE_TELEMETRY) + ) + .option('--debug', 'Get more logs in debug mode', false) + .option('--enable-crash-reports', 'Enable sending crash reports to telemetry data') + .option( + '--logfile [path]', + 'Write all debug logs to the specified file at the end of the run. 
Defaults to debug-storybook.log when [path] is not provided' + ) + .option('--loglevel ', 'Define log level', 'info') + .hook('preAction', async (self) => { + const options = self.opts(); + if (options.debug) { + logger.setLogLevel('debug'); + } + + if (options.loglevel) { + logger.setLogLevel(options.loglevel); + } + + if (options.logfile) { + logTracker.enableLogWriting(); + } + + try { + await globalSettings(); + } catch (e) { + logger.error('Error loading global settings:\n' + String(e)); + } + }) + .hook('postAction', async (command) => { + if (logTracker.shouldWriteLogsToFile) { + try { + const logFile = await logTracker.writeToFile(command.getOptionValue('logfile')); + logger.log(`Debug logs are written to: ${logFile}`); + } catch {} + logger.outro(CLI_COLORS.success('Done!')); + } + }); +} + +// Return a failed exit code but write the logs to a file first +function handleCommandFailure(logFilePath: string | boolean | undefined) { + return async (error: unknown): Promise => { + if (!(error instanceof HandledError)) { + logger.error(String(error)); + } + + try { + const logFile = await logTracker.writeToFile(logFilePath); + logger.log(`Debug logs are written to: ${logFile}`); + } catch {} + logger.outro(''); + process.exit(1); + }; +} + command('init') .description('Initialize Storybook into your project') .option('-f --force', 'Force add Storybook') @@ -270,66 +332,4 @@ program.on('command:*', ([invalidCmd]) => { process.exit(1); }); -function command(name: string) { - return program - .command(name) - .option( - '--disable-telemetry', - 'Disable sending telemetry data', - optionalEnvToBoolean(process.env.STORYBOOK_DISABLE_TELEMETRY) - ) - .option('--debug', 'Get more logs in debug mode', false) - .option('--enable-crash-reports', 'Enable sending crash reports to telemetry data') - .option( - '--logfile [path]', - 'Write all debug logs to the specified file at the end of the run. 
Defaults to debug-storybook.log when [path] is not provided' - ) - .option('--loglevel ', 'Define log level', 'info') - .hook('preAction', async (self) => { - const options = self.opts(); - if (options.debug) { - logger.setLogLevel('debug'); - } - - if (options.loglevel) { - logger.setLogLevel(options.loglevel); - } - - if (options.logfile) { - logTracker.enableLogWriting(); - } - - try { - await globalSettings(); - } catch (e) { - logger.error('Error loading global settings:\n' + String(e)); - } - }) - .hook('postAction', async (command) => { - if (logTracker.shouldWriteLogsToFile) { - try { - const logFile = await logTracker.writeToFile(command.getOptionValue('logfile')); - logger.log(`Debug logs are written to: ${logFile}`); - } catch {} - logger.outro(CLI_COLORS.success('Done!')); - } - }); -} - -// Return a failed exit code but write the logs to a file first -function handleCommandFailure(logFilePath: string | boolean | undefined) { - return async (error: unknown): Promise => { - if (!(error instanceof HandledError)) { - logger.error(String(error)); - } - - try { - const logFile = await logTracker.writeToFile(logFilePath); - logger.log(`Debug logs are written to: ${logFile}`); - } catch {} - logger.outro(''); - process.exit(1); - }; -} - program.usage(' [options]').version(String(version)).parse(process.argv); From db7a142cc4e47498ea2996db74f3de59f0daa966 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Tue, 31 Mar 2026 09:29:36 +0700 Subject: [PATCH 56/63] Restore helper ordering in modified PR files --- .../shared/checklist-store/checklistData.tsx | 29 +++++++-------- code/lib/cli-storybook/src/bin/run.ts | 37 +++++++++---------- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/code/core/src/shared/checklist-store/checklistData.tsx b/code/core/src/shared/checklist-store/checklistData.tsx index 71f490ae6357..7d8752877a0a 100644 --- a/code/core/src/shared/checklist-store/checklistData.tsx +++ 
b/code/core/src/shared/checklist-store/checklistData.tsx @@ -45,15 +45,13 @@ const CodeWrapper = styled.div(({ theme }) => ({ }, })); -function CodeSnippet(props: ComponentProps) { - return ( - - - - - - ); -} +const CodeSnippet = (props: ComponentProps) => ( + + + + + +); type ItemId = keyof (typeof initialState)['items']; @@ -123,14 +121,14 @@ export interface ChecklistData { }[]; } -function isExample(id: string) { - return id.startsWith('example-') || id.startsWith('configure-your-project--'); -} +const isExample = (id: string) => + id.startsWith('example-') || id.startsWith('configure-your-project--'); -function subscribeToIndex( +const subscribeToIndex: ( condition: (entries: Record) => boolean -): ChecklistData['sections'][number]['items'][number]['subscribe'] { - return ({ api, done }) => { +) => ChecklistData['sections'][number]['items'][number]['subscribe'] = + (condition) => + ({ api, done }) => { const check = () => condition( Object.entries(api.getIndex()?.entries || {}).reduce( @@ -145,7 +143,6 @@ function subscribeToIndex( return api.on(STORY_INDEX_INVALIDATED, () => check() && done()); } }; -} export const checklistData = { sections: [ diff --git a/code/lib/cli-storybook/src/bin/run.ts b/code/lib/cli-storybook/src/bin/run.ts index babf1f4eadf7..794ab3560251 100644 --- a/code/lib/cli-storybook/src/bin/run.ts +++ b/code/lib/cli-storybook/src/bin/run.ts @@ -28,8 +28,24 @@ import { type UpgradeOptions, upgrade } from '../upgrade'; addToGlobalContext('cliVersion', versions.storybook); -function command(name: string) { - return program +// Return a failed exit code but write the logs to a file first +const handleCommandFailure = + (logFilePath: string | boolean | undefined) => + async (error: unknown): Promise => { + if (!(error instanceof HandledError)) { + logger.error(String(error)); + } + + try { + const logFile = await logTracker.writeToFile(logFilePath); + logger.log(`Debug logs are written to: ${logFile}`); + } catch {} + logger.outro(''); + 
process.exit(1); + }; + +const command = (name: string) => + program .command(name) .option( '--disable-telemetry', @@ -72,23 +88,6 @@ function command(name: string) { logger.outro(CLI_COLORS.success('Done!')); } }); -} - -// Return a failed exit code but write the logs to a file first -function handleCommandFailure(logFilePath: string | boolean | undefined) { - return async (error: unknown): Promise => { - if (!(error instanceof HandledError)) { - logger.error(String(error)); - } - - try { - const logFile = await logTracker.writeToFile(logFilePath); - logger.log(`Debug logs are written to: ${logFile}`); - } catch {} - logger.outro(''); - process.exit(1); - }; -} command('init') .description('Initialize Storybook into your project') From 2bd91694b4d4f0c06756668052a86c03097855c0 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Tue, 31 Mar 2026 17:56:24 +0700 Subject: [PATCH 57/63] Tune eval defaults and CI resources --- .circleci/config.yml | 2 +- scripts/eval/lib/agents/config.test.ts | 4 ++-- scripts/eval/lib/agents/config.ts | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 59dd245f7676..4a7ca4089c94 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -32,7 +32,7 @@ jobs: generate-and-run-config: executor: name: node/default - resource_class: small + resource_class: medium steps: - node/install: install-yarn: true diff --git a/scripts/eval/lib/agents/config.test.ts b/scripts/eval/lib/agents/config.test.ts index 24e967d9abed..29d205bd0b57 100644 --- a/scripts/eval/lib/agents/config.test.ts +++ b/scripts/eval/lib/agents/config.test.ts @@ -17,7 +17,7 @@ describe('AGENTS', () => { it('keeps Claude models fully remappable to SDK model ids', () => { expect(AGENTS.claude).toMatchObject({ defaultModel: 'sonnet-4.6', - defaultEffort: 'high', + defaultEffort: 'medium', sdkModelIds: Object.fromEntries( AGENTS.claude.models.map((model) => [model, expect.any(String)]) ), @@ -27,7 +27,7 @@ 
describe('AGENTS', () => { it('keeps Codex models fully priceable from token usage', () => { expect(AGENTS.codex).toMatchObject({ defaultModel: 'gpt-5.4', - defaultEffort: 'high', + defaultEffort: 'medium', pricing: { 'gpt-5.4': { input: 2.5, diff --git a/scripts/eval/lib/agents/config.ts b/scripts/eval/lib/agents/config.ts index 3322437b324d..f76a9ad03d12 100644 --- a/scripts/eval/lib/agents/config.ts +++ b/scripts/eval/lib/agents/config.ts @@ -80,7 +80,7 @@ export const AGENTS: Record = { }, pricing: {}, efforts: CLAUDE_EFFORTS, - defaultEffort: 'high', + defaultEffort: 'medium', }, codex: { models: CODEX_MODELS, @@ -90,7 +90,7 @@ export const AGENTS: Record = { 'gpt-5.4': { input: 2.5, cachedInput: 0.25, output: 15.0 }, }, efforts: CODEX_EFFORTS, - defaultEffort: 'high', + defaultEffort: 'medium', }, }; From 0c5f06a85d0da80aa254aa5ccaa58e1960af91b1 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Tue, 31 Mar 2026 18:09:29 +0700 Subject: [PATCH 58/63] Increase CircleCI resources for config generation --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4a7ca4089c94..d2798c319911 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -32,7 +32,7 @@ jobs: generate-and-run-config: executor: name: node/default - resource_class: medium + resource_class: large steps: - node/install: install-yarn: true From 7a8e2d5ccf639ba928501c6f56ae0f048f72a17e Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Tue, 31 Mar 2026 19:16:15 +0700 Subject: [PATCH 59/63] Refine eval harness execution and cache refresh --- .gitignore | 2 +- scripts/eval/eval.ts | 168 ++++++++++++++++--------- scripts/eval/lib/agents/claude-code.ts | 67 ++++++---- scripts/eval/lib/agents/codex.ts | 108 +++++++++------- scripts/eval/lib/agents/config.test.ts | 24 +++- scripts/eval/lib/agents/config.ts | 77 ++++++++++-- scripts/eval/lib/grade.test.ts | 54 ++++---- scripts/eval/lib/grade.ts | 26 ++-- 
scripts/eval/lib/prepare-trial.test.ts | 48 +++++++ scripts/eval/lib/prepare-trial.ts | 121 +++++++++++++++++- scripts/eval/lib/run-trial.test.ts | 8 +- scripts/package.json | 1 - yarn.lock | 1 - 13 files changed, 522 insertions(+), 183 deletions(-) create mode 100644 scripts/eval/lib/prepare-trial.test.ts diff --git a/.gitignore b/.gitignore index 26b22b0e8c0e..ecb034fa9189 100644 --- a/.gitignore +++ b/.gitignore @@ -85,5 +85,5 @@ CLAUDE.local.md scripts/eval/.cache scripts/eval/results -# PR review output +# review-pr skill output .pr-review diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 3ff4ba57d1bc..9f6055a8474d 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -21,11 +21,12 @@ import { z } from 'zod'; import pc from 'picocolors'; import { AGENT_IDS, + AGENTS, CLAUDE_MODELS, CLAUDE_EFFORTS, CODEX_MODELS, CODEX_EFFORTS, - AGENTS, + resolveClaudeSdkModel, type AgentId, type AgentVariant, } from './lib/agents/config.ts'; @@ -43,30 +44,50 @@ import { } from './lib/utils.ts'; const PROJECT_NAMES = PROJECTS.map((p) => p.name) as [string, ...string[]]; - -const base = { - project: z.enum(PROJECT_NAMES).optional(), +const LIST_MODE_FLAGS = [ + ['listProjects', 'list-projects'], + ['listModels', 'list-models'], + ['listPrompts', 'list-prompts'], +] as const; +type ListMode = (typeof LIST_MODE_FLAGS)[number][0]; +const LIST_MODE_NAMES = LIST_MODE_FLAGS.map(([name]) => name) as [ + ListMode, + ...ListMode[], +]; + +const runArgsBase = { + kind: z.literal('run'), + project: z.enum(PROJECT_NAMES), prompt: z.string().default('setup'), verbose: z.boolean().default(false), manual: z.boolean().default(false), - listProjects: z.boolean().default(false), - listModels: z.boolean().default(false), - listPrompts: z.boolean().default(false), }; -const argsSchema = z.discriminatedUnion('agent', [ - z.object({ - ...base, - agent: z.literal('claude'), - model: z.enum(CLAUDE_MODELS).default('sonnet-4.6'), - effort: z.enum(CLAUDE_EFFORTS).default('high'), - 
}), - z.object({ - ...base, - agent: z.literal('codex'), - model: z.enum(CODEX_MODELS).default('gpt-5.4'), - effort: z.enum(CODEX_EFFORTS).default('high'), - }), +const listArgsSchema = z.object({ + kind: z.literal('list'), + listMode: z.enum(LIST_MODE_NAMES), +}); + +const claudeRunArgsSchema = z.object({ + ...runArgsBase, + agent: z.literal('claude'), + model: z.enum(CLAUDE_MODELS).default(AGENTS.claude.defaultModel), + effort: z.enum(CLAUDE_EFFORTS).default(AGENTS.claude.defaultEffort), +}); + +const codexRunArgsSchema = z.object({ + ...runArgsBase, + agent: z.literal('codex'), + model: z.enum(CODEX_MODELS).default(AGENTS.codex.defaultModel), + effort: z.enum(CODEX_EFFORTS).default(AGENTS.codex.defaultEffort), +}); + +type RunArgs = z.infer | z.infer; + +const cliArgsSchema = z.discriminatedUnion('kind', [ + listArgsSchema, + claudeRunArgsSchema, + codexRunArgsSchema, ]); const { values } = parseArgs({ @@ -86,16 +107,13 @@ const { values } = parseArgs({ strict: true, }); -// Resolve the discriminator: explicit --agent, inferred from --model, or default to claude. -const agent = values.agent ?? (values.model ? 
inferAgent(values.model) : 'claude'); +const cliInput = resolveCliInput(values); +if ('error' in cliInput) { + console.error(pc.red(` ${cliInput.error}`)); + process.exit(1); +} -const parsed = argsSchema.safeParse({ - ...values, - agent, - listProjects: values['list-projects'], - listModels: values['list-models'], - listPrompts: values['list-prompts'], -}); +const parsed = cliArgsSchema.safeParse(cliInput); if (!parsed.success) { for (const issue of parsed.error.issues) { @@ -107,44 +125,26 @@ if (!parsed.success) { const args = parsed.data; const logger = createLogger(); -if (args.listProjects) { - for (const p of PROJECTS) logger.log(` ${pc.bold(p.name)} — ${p.description}`); - process.exit(0); -} -if (args.listModels) { - for (const [name, { models }] of Object.entries(AGENTS)) { - logger.log(`\n ${pc.bold(name)}`); - for (const m of models) logger.log(` ${m}`); - } - process.exit(0); -} -if (args.listPrompts) { - for (const name of listPrompts()) logger.log(` ${pc.bold(name)}`); +if (args.kind === 'list') { + runListMode(args.listMode, logger); process.exit(0); } -if (!args.project) { - logger.log(pc.red(`Specify a project with -p. Available: ${PROJECT_NAMES.join(', ')}`)); - process.exit(1); -} +const runArgs: RunArgs = args; const project = PROJECTS.find((p) => p.name === args.project)!; - -const variant: AgentVariant = - args.agent === 'claude' - ? 
{ agent: args.agent, model: args.model, effort: args.effort } - : { agent: args.agent, model: args.model, effort: args.effort }; +const variant = toVariant(runArgs); logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); logger.log( - `Agent: ${variant.agent} | Model: ${variant.model} | Effort: ${variant.effort} | Prompt: ${args.prompt}\n` + `Agent: ${variant.agent} | Model: ${variant.model} | Effort: ${variant.effort} | Prompt: ${runArgs.prompt}\n` ); -if (args.manual) { - const trialId = generateTrialId(project.name, variant.agent, variant.model, args.prompt); +if (runArgs.manual) { + const trialId = generateTrialId(project.name, variant.agent, variant.model, runArgs.prompt); const workspace = await prepareTrial(project, trialId, logger); await captureEnvironment(workspace.resultsDir); - const prompt = loadPrompt(args.prompt); + const prompt = loadPrompt(runArgs.prompt); const promptPath = join(workspace.resultsDir, 'prompt.md'); await writeFile(promptPath, prompt); @@ -159,7 +159,7 @@ if (args.manual) { logger.log(` ${pc.green(cliCommand)}\n`); } else { const result = await runTrial( - { project, variant, prompt: args.prompt, verbose: args.verbose } satisfies TrialConfig, + { project, variant, prompt: runArgs.prompt, verbose: runArgs.verbose } satisfies TrialConfig, logger ); @@ -182,7 +182,7 @@ if (args.manual) { function inferAgent(model: string): AgentId { for (const id of AGENT_IDS) { - if (AGENTS[id].models.includes(model)) return id; + if (AGENTS[id].models.some((candidate) => candidate === model)) return id; } throw new Error(`No agent found for model: ${model}`); } @@ -190,8 +190,58 @@ function inferAgent(model: string): AgentId { function buildManualCommand(variant: AgentVariant, promptPath: string): string { const promptArg = `"$(cat ${promptPath})"`; if (variant.agent === 'claude') { - const sdkModel = AGENTS.claude.sdkModelIds[variant.model] ?? 
variant.model; - return `claude --model ${sdkModel} ${promptArg}`; + return `claude --model ${resolveClaudeSdkModel(variant.model)} ${promptArg}`; } return `codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`; } + +function resolveCliInput(values: Record) { + const listModes = LIST_MODE_FLAGS.filter(([, flag]) => values[flag]).map(([name]) => name); + if (listModes.length > 1) { + return { + error: `Choose only one list mode at a time: ${listModes.join(', ')}`, + } as const; + } + if (listModes.length === 1) { + return { + kind: 'list', + listMode: listModes[0], + } as const; + } + + const agent: AgentId = + values.agent === 'claude' || values.agent === 'codex' + ? values.agent + : values.model + ? inferAgent(values.model as string) + : 'claude'; + + return { + kind: 'run', + ...values, + agent, + } as const; +} + +function runListMode(listMode: ListMode, logger: ReturnType) { + switch (listMode) { + case 'listProjects': + for (const p of PROJECTS) logger.log(` ${pc.bold(p.name)} — ${p.description}`); + break; + case 'listModels': + for (const [name, { models }] of Object.entries(AGENTS)) { + logger.log(`\n ${pc.bold(name)}`); + for (const model of models) logger.log(` ${model}`); + } + break; + case 'listPrompts': + for (const name of listPrompts()) logger.log(` ${pc.bold(name)}`); + break; + } +} + +function toVariant(args: RunArgs): AgentVariant { + return args.agent === 'claude' + ? 
{ agent: 'claude', model: args.model, effort: args.effort } + : { agent: 'codex', model: args.model, effort: args.effort }; +} diff --git a/scripts/eval/lib/agents/claude-code.ts b/scripts/eval/lib/agents/claude-code.ts index ab5d7c3f844d..1cd03f43b177 100644 --- a/scripts/eval/lib/agents/claude-code.ts +++ b/scripts/eval/lib/agents/claude-code.ts @@ -2,52 +2,59 @@ import type { SDKMessage } from '@anthropic-ai/claude-agent-sdk'; import { query } from '@anthropic-ai/claude-agent-sdk'; import { writeFile } from 'node:fs/promises'; import { join } from 'node:path'; -import { AGENTS, type AgentDriver, type Execution } from './config.ts'; +import { AGENTS, resolveClaudeSdkModel, type AgentDriver, type Execution } from './config.ts'; import type { Logger } from '../utils.ts'; -const MAX_TURNS = 50; - export const claudeAgent: AgentDriver = { name: 'claude', async execute({ prompt, projectPath, variant, resultsDir, logger }): Promise { + if (variant.agent !== 'claude') { + throw new Error(`Claude driver received unsupported variant: ${variant.agent}`); + } + const startTime = Date.now(); + const settings = AGENTS.claude.execution; const { model } = variant; const effort = variant.effort as 'low' | 'medium' | 'high' | 'max'; - const sdkModel = AGENTS.claude.sdkModelIds[model] ?? 
model; + const sdkModel = resolveClaudeSdkModel(model); let cost: number | undefined; let turns = 0; let durationApi: number | undefined; const messages: unknown[] = []; - for await (const message of query({ - prompt, - options: { - model: sdkModel, - cwd: projectPath, - allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep'], - maxTurns: MAX_TURNS, - effort, - debug: true, - systemPrompt: { type: 'preset', preset: 'claude_code' }, - }, - })) { - logMessage(message, logger); - messages.push(message); + try { + for await (const message of query({ + prompt, + options: { + model: sdkModel, + cwd: projectPath, + allowedTools: [...settings.allowedTools], + maxTurns: settings.maxTurns, + effort, + debug: settings.debug, + systemPrompt: settings.systemPrompt, + }, + })) { + logMessage(message, logger); + messages.push(message); - if (message.type === 'result' && message.subtype === 'success') { - cost = message.total_cost_usd as number | undefined; - turns = (message.num_turns as number) ?? 0; - durationApi = - typeof message.duration_api_ms === 'number' ? message.duration_api_ms / 1000 : undefined; + if (message.type === 'result' && message.subtype === 'success') { + cost = message.total_cost_usd as number | undefined; + turns = (message.num_turns as number) ?? 0; + durationApi = + typeof message.duration_api_ms === 'number' + ? 
message.duration_api_ms / 1000 + : undefined; + } } + } finally { + await writeTranscript(resultsDir, messages, logger); } const duration = (Date.now() - startTime) / 1000; - await writeFile(join(resultsDir, 'transcript.json'), JSON.stringify(messages, null, 2)); - return { cost, duration, @@ -123,3 +130,13 @@ function logMessage(message: SDKMessage, logger: Logger) { break; } } + +async function writeTranscript(resultsDir: string, messages: unknown[], logger: Logger) { + try { + await writeFile(join(resultsDir, 'transcript.json'), JSON.stringify(messages, null, 2)); + } catch (error) { + logger.logError( + `Failed to persist transcript: ${error instanceof Error ? error.message : String(error)}` + ); + } +} diff --git a/scripts/eval/lib/agents/codex.ts b/scripts/eval/lib/agents/codex.ts index beb292cb94ab..09cbdc00ee7b 100644 --- a/scripts/eval/lib/agents/codex.ts +++ b/scripts/eval/lib/agents/codex.ts @@ -1,13 +1,19 @@ import { Codex, type ModelReasoningEffort } from '@openai/codex-sdk'; import { writeFile } from 'node:fs/promises'; import { join } from 'node:path'; -import { estimateCost, type AgentDriver, type Execution } from './config.ts'; +import { AGENTS, estimateCost, type AgentDriver, type Execution } from './config.ts'; +import type { Logger } from '../utils.ts'; export const codexAgent: AgentDriver = { name: 'codex', async execute({ prompt, projectPath, variant, resultsDir, logger }): Promise { + if (variant.agent !== 'codex') { + throw new Error(`Codex driver received unsupported variant: ${variant.agent}`); + } + const startTime = Date.now(); + const settings = AGENTS.codex.execution; const { model, effort } = variant; const codex = new Codex(); @@ -15,9 +21,8 @@ export const codexAgent: AgentDriver = { model, modelReasoningEffort: effort as ModelReasoningEffort, workingDirectory: projectPath, - approvalPolicy: 'never', + approvalPolicy: settings.approvalPolicy, }); - const { events } = await thread.runStreamed(prompt); const items: unknown[] = []; let 
totalInput = 0; @@ -25,49 +30,54 @@ export const codexAgent: AgentDriver = { let totalOutput = 0; let turns = 0; - for await (const event of events) { - switch (event.type) { - case 'item.completed': { - const item = event.item; - items.push(item); - switch (item.type) { - case 'agent_message': - logger.log(`💬 ${item.text.slice(0, 300)}`); - break; - case 'command_execution': - logger.log(`🔧 $ ${item.command} → exit ${item.exit_code ?? '?'}`); - if (item.exit_code !== 0 && item.aggregated_output) { - logger.log(` ${item.aggregated_output.slice(-200)}`); - } - break; - case 'file_change': - for (const c of item.changes) logger.log(`📝 ${c.kind} ${c.path}`); - break; - case 'reasoning': - logger.log(`🧠 ${item.text.slice(0, 200)}`); - break; - case 'error': - logger.logError(item.message); - break; + try { + const { events } = await thread.runStreamed(prompt); + for await (const event of events) { + switch (event.type) { + case 'item.completed': { + const item = event.item; + items.push(item); + switch (item.type) { + case 'agent_message': + logger.log(`💬 ${item.text.slice(0, 300)}`); + break; + case 'command_execution': + logger.log(`🔧 $ ${item.command} → exit ${item.exit_code ?? 
'?'}`); + if (item.exit_code !== 0 && item.aggregated_output) { + logger.log(` ${item.aggregated_output.slice(-200)}`); + } + break; + case 'file_change': + for (const c of item.changes) logger.log(`📝 ${c.kind} ${c.path}`); + break; + case 'reasoning': + logger.log(`🧠 ${item.text.slice(0, 200)}`); + break; + case 'error': + logger.logError(item.message); + break; + } + break; } - break; + case 'turn.completed': + totalInput += event.usage.input_tokens; + totalCached += event.usage.cached_input_tokens; + totalOutput += event.usage.output_tokens; + turns++; + logger.log( + `📊 tokens: ${event.usage.input_tokens}in / ${event.usage.output_tokens}out (${event.usage.cached_input_tokens} cached)` + ); + break; + case 'turn.failed': + logger.logError(`Turn failed: ${event.error.message}`); + break; + case 'error': + logger.logError(`Error: ${event.message}`); + break; } - case 'turn.completed': - totalInput += event.usage.input_tokens; - totalCached += event.usage.cached_input_tokens; - totalOutput += event.usage.output_tokens; - turns++; - logger.log( - `📊 tokens: ${event.usage.input_tokens}in / ${event.usage.output_tokens}out (${event.usage.cached_input_tokens} cached)` - ); - break; - case 'turn.failed': - logger.logError(`Turn failed: ${event.error.message}`); - break; - case 'error': - logger.logError(`Error: ${event.message}`); - break; } + } finally { + await writeTranscript(resultsDir, items, logger); } const duration = (Date.now() - startTime) / 1000; @@ -80,8 +90,16 @@ export const codexAgent: AgentDriver = { `Done — ${turns} turns, ${Math.round(duration)}s, ${totalInput}in/${totalOutput}out tokens${cost != null ? 
`, $${cost.toFixed(4)}` : ''}` ); - await writeFile(join(resultsDir, 'transcript.json'), JSON.stringify(items, null, 2)); - return { cost, duration, turns }; }, }; + +async function writeTranscript(resultsDir: string, items: unknown[], logger: Logger) { + try { + await writeFile(join(resultsDir, 'transcript.json'), JSON.stringify(items, null, 2)); + } catch (error) { + logger.logError( + `Failed to persist transcript: ${error instanceof Error ? error.message : String(error)}` + ); + } +} diff --git a/scripts/eval/lib/agents/config.test.ts b/scripts/eval/lib/agents/config.test.ts index 29d205bd0b57..1236689d05cd 100644 --- a/scripts/eval/lib/agents/config.test.ts +++ b/scripts/eval/lib/agents/config.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { AGENTS } from './config'; +import { AGENTS, getDefaultVariant } from './config'; describe('AGENTS', () => { it('keeps each agent default inside its supported model and effort lists', () => { @@ -18,6 +18,11 @@ describe('AGENTS', () => { expect(AGENTS.claude).toMatchObject({ defaultModel: 'sonnet-4.6', defaultEffort: 'medium', + execution: { + maxTurns: 50, + allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep'], + permissionModel: 'tool-allowlist', + }, sdkModelIds: Object.fromEntries( AGENTS.claude.models.map((model) => [model, expect.any(String)]) ), @@ -28,6 +33,10 @@ describe('AGENTS', () => { expect(AGENTS.codex).toMatchObject({ defaultModel: 'gpt-5.4', defaultEffort: 'medium', + execution: { + approvalPolicy: 'never', + permissionModel: 'approval-policy-never', + }, pricing: { 'gpt-5.4': { input: 2.5, @@ -37,4 +46,17 @@ describe('AGENTS', () => { }, }); }); + + it('derives default variants from the central agent definitions', () => { + expect(getDefaultVariant('claude')).toEqual({ + agent: 'claude', + model: 'sonnet-4.6', + effort: 'medium', + }); + expect(getDefaultVariant('codex')).toEqual({ + agent: 'codex', + model: 'gpt-5.4', + effort: 'medium', + }); + }); }); diff 
--git a/scripts/eval/lib/agents/config.ts b/scripts/eval/lib/agents/config.ts index f76a9ad03d12..eb13a52686a9 100644 --- a/scripts/eval/lib/agents/config.ts +++ b/scripts/eval/lib/agents/config.ts @@ -58,18 +58,48 @@ export interface TokenUsage { outputTokens: number; } -export interface AgentDefinition { - models: readonly string[]; - defaultModel: string; +export type ClaudeTool = 'Read' | 'Write' | 'Edit' | 'Bash' | 'Glob' | 'Grep'; + +export interface ClaudeExecutionConfig { + maxTurns: number; + /** + * Bash is toggled here at the harness level, but individual shell commands still execute through + * Claude's Bash tool rather than through a separate command allowlist. + */ + allowedTools: readonly ClaudeTool[]; + debug: boolean; + systemPrompt: { type: 'preset'; preset: 'claude_code' }; + /** Claude access is controlled through the explicit tool allowlist above. */ + permissionModel: 'tool-allowlist'; +} + +export interface CodexExecutionConfig { + /** Codex runs non-interactively so benchmark runs never block on approval prompts. */ + approvalPolicy: 'never'; + permissionModel: 'approval-policy-never'; +} + +export interface AgentDefinition { + models: readonly TModel[]; + defaultModel: TModel; /** Map friendly model names to SDK-specific model IDs (e.g. "sonnet-4.6" → "claude-sonnet-4-6"). */ - sdkModelIds: Record; + sdkModelIds: Partial>; /** Per-million-token pricing for manual cost estimation (agents that don't report cost natively). 
*/ - pricing: Record; - efforts: readonly string[]; - defaultEffort: string; + pricing: Partial>; + efforts: readonly TEffort[]; + defaultEffort: TEffort; + execution: TExecution; +} + +export type ClaudeDefinition = AgentDefinition; +export type CodexDefinition = AgentDefinition; + +export interface AgentDefinitions { + claude: ClaudeDefinition; + codex: CodexDefinition; } -export const AGENTS: Record = { +export const AGENTS: AgentDefinitions = { claude: { models: CLAUDE_MODELS, defaultModel: 'sonnet-4.6', @@ -81,6 +111,13 @@ export const AGENTS: Record = { pricing: {}, efforts: CLAUDE_EFFORTS, defaultEffort: 'medium', + execution: { + maxTurns: 50, + allowedTools: ['Read', 'Write', 'Edit', 'Bash', 'Glob', 'Grep'], + debug: true, + systemPrompt: { type: 'preset', preset: 'claude_code' }, + permissionModel: 'tool-allowlist', + }, }, codex: { models: CODEX_MODELS, @@ -91,12 +128,34 @@ export const AGENTS: Record = { }, efforts: CODEX_EFFORTS, defaultEffort: 'medium', + execution: { + approvalPolicy: 'never', + permissionModel: 'approval-policy-never', + }, }, }; +export function getDefaultVariant( + agent: T +): Extract { + const definition = AGENTS[agent]; + return { + agent, + model: definition.defaultModel, + effort: definition.defaultEffort, + } as Extract; +} + +export function resolveClaudeSdkModel(model: ClaudeModel): string { + return AGENTS.claude.sdkModelIds[model] ?? model; +} + /** Estimate cost from token usage using the pricing table. */ export function estimateCost(agent: AgentId, model: string, usage: TokenUsage): number | undefined { - const pricing = AGENTS[agent].pricing[model]; + const pricing = + agent === 'claude' + ? 
AGENTS.claude.pricing[model as ClaudeModel] + : AGENTS.codex.pricing[model as CodexModel]; if (!pricing) return undefined; const freshInput = usage.inputTokens - usage.cachedInputTokens; return ( diff --git a/scripts/eval/lib/grade.test.ts b/scripts/eval/lib/grade.test.ts index 0ad1c87ae3cd..adcf2d85667d 100644 --- a/scripts/eval/lib/grade.test.ts +++ b/scripts/eval/lib/grade.test.ts @@ -1,4 +1,12 @@ -import { describe, expect, it } from 'vitest'; +import { describe, expect, it, vi } from 'vitest'; + +vi.mock('../../../code/core/src/core-server/utils/ghost-stories/get-candidates.ts', () => ({ + getComponentCandidates: vi.fn(), +})); + +vi.mock('../../../code/core/src/core-server/utils/ghost-stories/run-story-tests.ts', () => ({ + runGhostStories: vi.fn(), +})); import { filterStorybookFiles, @@ -11,32 +19,32 @@ import type { FileChange } from './grade'; describe('filterStorybookFiles', () => { it('matches files in .storybook/ directory', () => { const files: FileChange[] = [ - { path: '.storybook/main.ts', status: 'M' }, - { path: '.storybook/preview.tsx', status: 'A' }, - { path: 'src/App.tsx', status: 'M' }, + { path: '.storybook/main.ts', gitStatus: 'M' }, + { path: '.storybook/preview.tsx', gitStatus: 'A' }, + { path: 'src/App.tsx', gitStatus: 'M' }, ]; expect(filterStorybookFiles(files)).toMatchObject([ - { path: '.storybook/main.ts', status: 'M' }, - { path: '.storybook/preview.tsx', status: 'A' }, + { path: '.storybook/main.ts', gitStatus: 'M' }, + { path: '.storybook/preview.tsx', gitStatus: 'A' }, ]); }); it('matches story files with various extensions', () => { const files: FileChange[] = [ - { path: 'src/Button.stories.tsx', status: 'A' }, - { path: 'src/Header.stories.ts', status: 'A' }, - { path: 'src/Page.story.jsx', status: 'A' }, - { path: 'src/utils.stories.js', status: 'A' }, - { path: 'src/Button.tsx', status: 'M' }, - { path: 'src/Button.test.tsx', status: 'M' }, + { path: 'src/Button.stories.tsx', gitStatus: 'A' }, + { path: 
'src/Header.stories.ts', gitStatus: 'A' }, + { path: 'src/Page.story.jsx', gitStatus: 'A' }, + { path: 'src/utils.stories.js', gitStatus: 'A' }, + { path: 'src/Button.tsx', gitStatus: 'M' }, + { path: 'src/Button.test.tsx', gitStatus: 'M' }, ]; expect(filterStorybookFiles(files)).toMatchObject(files.slice(0, 4)); }); it('returns empty for no storybook files', () => { const files: FileChange[] = [ - { path: 'src/App.tsx', status: 'M' }, - { path: 'package.json', status: 'M' }, + { path: 'src/App.tsx', gitStatus: 'M' }, + { path: 'package.json', gitStatus: 'M' }, ]; expect(filterStorybookFiles(files)).toHaveLength(0); }); @@ -47,9 +55,9 @@ describe('filterStorybookFiles', () => { it('matches renamed files using either side of the rename', () => { const files: FileChange[] = [ - { path: 'src/Button.tsx', previousPath: 'src/Button.stories.tsx', status: 'R' }, - { path: '.storybook/preview.tsx', previousPath: 'config/preview.tsx', status: 'R' }, - { path: 'src/App.tsx', previousPath: 'src/Main.tsx', status: 'R' }, + { path: 'src/Button.tsx', previousPath: 'src/Button.stories.tsx', gitStatus: 'R' }, + { path: '.storybook/preview.tsx', previousPath: 'config/preview.tsx', gitStatus: 'R' }, + { path: 'src/App.tsx', previousPath: 'src/Main.tsx', gitStatus: 'R' }, ]; expect(filterStorybookFiles(files)).toMatchObject(files.slice(0, 2)); @@ -232,10 +240,10 @@ describe('parseChangedFiles', () => { const output = 'A\tsrc/new-file.ts\nM\tsrc/existing.ts\nD\tsrc/removed.ts\nR100\told.ts\tnew.ts'; expect(parseChangedFiles(output)).toMatchObject([ - { path: 'src/new-file.ts', status: 'A' }, - { path: 'src/existing.ts', status: 'M' }, - { path: 'src/removed.ts', status: 'D' }, - { path: 'new.ts', previousPath: 'old.ts', status: 'R' }, + { path: 'src/new-file.ts', gitStatus: 'A' }, + { path: 'src/existing.ts', gitStatus: 'M' }, + { path: 'src/removed.ts', gitStatus: 'D' }, + { path: 'new.ts', previousPath: 'old.ts', gitStatus: 'R' }, ]); }); @@ -245,6 +253,8 @@ 
describe('parseChangedFiles', () => { }); it('handles single file', () => { - expect(parseChangedFiles('M\tpackage.json')).toEqual([{ path: 'package.json', status: 'M' }]); + expect(parseChangedFiles('M\tpackage.json')).toEqual([ + { path: 'package.json', gitStatus: 'M' }, + ]); }); }); diff --git a/scripts/eval/lib/grade.ts b/scripts/eval/lib/grade.ts index f480a4488a7a..0bf3259b56da 100644 --- a/scripts/eval/lib/grade.ts +++ b/scripts/eval/lib/grade.ts @@ -1,13 +1,17 @@ import { writeFile } from 'node:fs/promises'; import { join } from 'node:path'; import { x } from 'tinyexec'; +import { getComponentCandidates } from '../../../code/core/src/core-server/utils/ghost-stories/get-candidates.ts'; +import { runGhostStories } from '../../../code/core/src/core-server/utils/ghost-stories/run-story-tests.ts'; import type { Logger } from './utils.ts'; import type { TrialWorkspace } from './prepare-trial.ts'; -import { getComponentCandidates, runGhostStories } from 'storybook/internal/core-server'; + +/** Git `--name-status` codes: A=added, M=modified, D=deleted, R=renamed. */ +export type GitDiffStatus = 'A' | 'M' | 'D' | 'R'; export interface FileChange { path: string; - status: 'A' | 'M' | 'D' | 'R'; + gitStatus: GitDiffStatus; /** For renames, the original path before the move. */ previousPath?: string; } @@ -127,17 +131,14 @@ export function parseChangedFiles(gitOutput: string): FileChange[] { .filter(Boolean) .map((line) => { const [status, ...parts] = line.split('\t'); - const firstChar = status?.charAt(0) ?? ''; - const normalizedStatus = ( - ['A', 'M', 'D', 'R'].includes(firstChar) ? 
firstChar : 'M' - ) as FileChange['status']; + const gitStatus = parseGitDiffStatus(status); - if (normalizedStatus === 'R' && parts.length >= 2) { + if (gitStatus === 'R' && parts.length >= 2) { const [previousPath, path] = parts; - return { path, previousPath, status: normalizedStatus }; + return { path, previousPath, gitStatus }; } - return { path: parts.join('\t'), status: normalizedStatus }; + return { path: parts.join('\t'), gitStatus }; }); } @@ -279,3 +280,10 @@ function truncateEnd(text: string, maxChars: number): string { const firstNewline = truncated.indexOf('\n'); return firstNewline >= 0 ? truncated.slice(firstNewline + 1) : truncated; } + +function parseGitDiffStatus(rawStatus?: string): GitDiffStatus { + const firstChar = rawStatus?.charAt(0); + return firstChar === 'A' || firstChar === 'M' || firstChar === 'D' || firstChar === 'R' + ? firstChar + : 'M'; +} diff --git a/scripts/eval/lib/prepare-trial.test.ts b/scripts/eval/lib/prepare-trial.test.ts new file mode 100644 index 000000000000..af45783998b5 --- /dev/null +++ b/scripts/eval/lib/prepare-trial.test.ts @@ -0,0 +1,48 @@ +import { describe, expect, it } from 'vitest'; + +import { getCacheRefreshReason, type TrialCacheInfo } from './prepare-trial'; +import type { Project } from './projects'; + +const project: Project = { + name: 'mealdrop', + repo: 'https://github.com/example/mealdrop', + branch: 'eval-baseline', +}; + +const cacheInfo: TrialCacheInfo = { + repo: project.repo, + branch: project.branch, + baselineCommit: '0123456789abcdef', +}; + +describe('getCacheRefreshReason', () => { + it('keeps the cache when repo, branch, and baseline still match', () => { + expect(getCacheRefreshReason(project, cacheInfo, cacheInfo.baselineCommit)).toBeUndefined(); + }); + + it('refreshes when the repo URL changes', () => { + expect( + getCacheRefreshReason( + { ...project, repo: 'https://github.com/example/mealdrop-fork' }, + cacheInfo, + cacheInfo.baselineCommit + ) + ).toContain('repo changed'); + }); 
+ + it('refreshes when the tracked branch changes', () => { + expect( + getCacheRefreshReason({ ...project, branch: 'next' }, cacheInfo, cacheInfo.baselineCommit) + ).toContain('branch changed'); + }); + + it('refreshes when the remote branch head advances', () => { + expect(getCacheRefreshReason(project, cacheInfo, 'fedcba9876543210')).toContain( + 'baseline branch advanced' + ); + }); + + it('keeps the cache if the remote branch cannot be verified', () => { + expect(getCacheRefreshReason(project, cacheInfo)).toBeUndefined(); + }); +}); diff --git a/scripts/eval/lib/prepare-trial.ts b/scripts/eval/lib/prepare-trial.ts index 0e59a7ca1843..a39eedd40f64 100644 --- a/scripts/eval/lib/prepare-trial.ts +++ b/scripts/eval/lib/prepare-trial.ts @@ -1,5 +1,5 @@ import { existsSync } from 'node:fs'; -import { cp, mkdir } from 'node:fs/promises'; +import { cp, mkdir, readFile, rm, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; import type { Logger } from './utils.ts'; import type { Project } from './projects.ts'; @@ -7,6 +7,8 @@ import { x } from 'tinyexec'; import { installDeps } from './package-manager.ts'; import { CACHE_DIR, TRIALS_DIR } from './utils.ts'; +const CACHE_INFO_SUFFIX = '.json'; + export interface TrialWorkspace { trialDir: string; repoRoot: string; @@ -15,6 +17,12 @@ export interface TrialWorkspace { baselineCommit: string; } +export interface TrialCacheInfo { + repo: string; + branch: string; + baselineCommit: string; +} + /** * First run: clone eval-baseline -> install deps -> cache it. * Subsequent runs: copy from cache. Agent starts immediately. 
@@ -25,11 +33,12 @@ export async function prepareTrial( logger: Logger ): Promise { const cacheDir = join(CACHE_DIR, project.name); + const cacheInfoPath = join(CACHE_DIR, `${project.name}${CACHE_INFO_SUFFIX}`); const trialDir = join(TRIALS_DIR, trialId); const repoRoot = join(trialDir, 'project'); await mkdir(trialDir, { recursive: true }); - if (existsSync(join(cacheDir, '.git'))) { + if (await canReuseCache(project, cacheDir, cacheInfoPath, logger)) { logger.logStep('Copying from cache...'); await cp(cacheDir, repoRoot, { recursive: true }); } else { @@ -42,12 +51,15 @@ export async function prepareTrial( await installDeps(projectPath, logger, undefined, { stopAt: repoRoot }); logger.logSuccess('Dependencies installed'); logger.logStep('Caching for future runs...'); - await cp(repoRoot, cacheDir, { recursive: true }); + const baselineCommit = await getGitHead(repoRoot); + await persistCache(cacheDir, cacheInfoPath, repoRoot, { + repo: project.repo, + branch: project.branch, + baselineCommit, + }); } - const baselineCommit = ( - await x('git', ['rev-parse', 'HEAD'], { nodeOptions: { cwd: repoRoot } }) - ).stdout.trim(); + const baselineCommit = await getGitHead(repoRoot); const projectPath = project.projectDir ? 
join(repoRoot, project.projectDir) : repoRoot; const resultsDir = join(trialDir, 'results'); await mkdir(resultsDir, { recursive: true }); @@ -55,3 +67,100 @@ export async function prepareTrial( logger.logSuccess('Trial ready'); return { trialDir, repoRoot, projectPath, resultsDir, baselineCommit }; } + +export function getCacheRefreshReason( + project: Project, + cacheInfo: TrialCacheInfo, + remoteHead?: string +): string | undefined { + if (cacheInfo.repo !== project.repo) { + return `repo changed (${cacheInfo.repo} → ${project.repo})`; + } + if (cacheInfo.branch !== project.branch) { + return `branch changed (${cacheInfo.branch} → ${project.branch})`; + } + if (remoteHead && cacheInfo.baselineCommit !== remoteHead) { + return `baseline branch advanced (${cacheInfo.baselineCommit.slice(0, 7)} → ${remoteHead.slice(0, 7)})`; + } + return undefined; +} + +async function canReuseCache( + project: Project, + cacheDir: string, + cacheInfoPath: string, + logger: Logger +): Promise { + if (!existsSync(join(cacheDir, '.git'))) { + return false; + } + + const cacheInfo = await readCacheInfo(cacheInfoPath); + if (!cacheInfo) { + logger.logStep('Refreshing cache (missing or invalid cache metadata)...'); + await clearCache(cacheDir, cacheInfoPath); + return false; + } + + const remoteHead = await getRemoteBranchHead(project.repo, project.branch, logger); + const refreshReason = getCacheRefreshReason(project, cacheInfo, remoteHead); + if (!refreshReason) { + return true; + } + + logger.logStep(`Refreshing cache (${refreshReason})...`); + await clearCache(cacheDir, cacheInfoPath); + return false; +} + +async function persistCache( + cacheDir: string, + cacheInfoPath: string, + repoRoot: string, + cacheInfo: TrialCacheInfo +) { + await clearCache(cacheDir, cacheInfoPath); + await cp(repoRoot, cacheDir, { recursive: true }); + await writeFile(cacheInfoPath, JSON.stringify(cacheInfo, null, 2)); +} + +async function readCacheInfo(cacheInfoPath: string): Promise { + if 
(!existsSync(cacheInfoPath)) { + return undefined; + } + + try { + return JSON.parse(await readFile(cacheInfoPath, 'utf-8')) as TrialCacheInfo; + } catch { + return undefined; + } +} + +async function getGitHead(cwd: string): Promise { + return (await x('git', ['rev-parse', 'HEAD'], { nodeOptions: { cwd } })).stdout.trim(); +} + +async function getRemoteBranchHead( + repo: string, + branch: string, + logger: Logger +): Promise { + const result = await x('git', ['ls-remote', repo, `refs/heads/${branch}`], { + throwOnError: false, + timeout: 120_000, + }); + if (result.exitCode !== 0) { + logger.logStep(`Could not verify remote HEAD for ${repo}#${branch}; reusing cache as-is.`); + return undefined; + } + + const line = result.stdout.trim().split('\n').find(Boolean); + return line?.split('\t')[0]?.trim() || undefined; +} + +async function clearCache(cacheDir: string, cacheInfoPath: string) { + await Promise.all([ + rm(cacheDir, { recursive: true, force: true }), + rm(cacheInfoPath, { force: true }), + ]); +} diff --git a/scripts/eval/lib/run-trial.test.ts b/scripts/eval/lib/run-trial.test.ts index 417e5b616be5..8b7f79bd07c0 100644 --- a/scripts/eval/lib/run-trial.test.ts +++ b/scripts/eval/lib/run-trial.test.ts @@ -217,12 +217,12 @@ function setupMocks(overrides?: { buildSuccess, typeCheckErrors, fileChanges: [ - { path: '.storybook/preview.tsx', status: 'A' }, - { path: 'src/Button.stories.tsx', status: 'A' }, + { path: '.storybook/preview.tsx', gitStatus: 'A' }, + { path: 'src/Button.stories.tsx', gitStatus: 'A' }, ], storybookChanges: [ - { path: '.storybook/preview.tsx', status: 'A' }, - { path: 'src/Button.stories.tsx', status: 'A' }, + { path: '.storybook/preview.tsx', gitStatus: 'A' }, + { path: 'src/Button.stories.tsx', gitStatus: 'A' }, ], }, score: { diff --git a/scripts/package.json b/scripts/package.json index 09987b86a4de..48fbc54c8704 100644 --- a/scripts/package.json +++ b/scripts/package.json @@ -147,7 +147,6 @@ "simple-git": "^3.30.0", "slash": 
"^3.0.0", "sort-package-json": "^3.5.0", - "storybook": "workspace:*", "tiny-invariant": "^1.3.3", "tinyexec": "^0.3.0", "trash": "^7.2.0", diff --git a/yarn.lock b/yarn.lock index 1187762a6a66..aec95012dc21 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8942,7 +8942,6 @@ __metadata: simple-git: "npm:^3.30.0" slash: "npm:^3.0.0" sort-package-json: "npm:^3.5.0" - storybook: "workspace:*" tiny-invariant: "npm:^1.3.3" tinyexec: "npm:^0.3.0" trash: "npm:^7.2.0" From eccfb785ebbfeb1694c682bddeecea52d5bd7ae2 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Tue, 31 Mar 2026 20:04:06 +0700 Subject: [PATCH 60/63] Simplify eval CLI flow --- scripts/eval/eval.ts | 178 ++++++++++++++++--------------------------- 1 file changed, 66 insertions(+), 112 deletions(-) diff --git a/scripts/eval/eval.ts b/scripts/eval/eval.ts index 9f6055a8474d..048e5efb75ca 100644 --- a/scripts/eval/eval.ts +++ b/scripts/eval/eval.ts @@ -6,10 +6,10 @@ * * Usage: * node eval/eval.ts -p mealdrop # claude defaults - * node eval/eval.ts -p mealdrop -a codex # codex defaults - * node eval/eval.ts -p mealdrop -m gpt-5.4 # codex (inferred) - * node eval/eval.ts -p mealdrop -a claude -e max # claude with max effort - * node eval/eval.ts -p mealdrop --manual # prepare only, print instructions + * node eval/eval.ts -p mealdrop -a codex # codex defaults + * node eval/eval.ts -p mealdrop -m gpt-5.4 # codex (inferred) + * node eval/eval.ts -p mealdrop -a claude -e max # claude with max effort + * node eval/eval.ts -p mealdrop --manual # prepare only, print instructions * node eval/eval.ts --list-projects * node eval/eval.ts --list-models * node eval/eval.ts --list-prompts @@ -22,72 +22,51 @@ import pc from 'picocolors'; import { AGENT_IDS, AGENTS, - CLAUDE_MODELS, CLAUDE_EFFORTS, - CODEX_MODELS, + CLAUDE_MODELS, CODEX_EFFORTS, - resolveClaudeSdkModel, + CODEX_MODELS, type AgentId, type AgentVariant, } from './lib/agents/config.ts'; +import { prepareTrial } from './lib/prepare-trial.ts'; import { PROJECTS } from 
'./lib/projects.ts'; import { runTrial, type TrialConfig } from './lib/run-trial.ts'; -import { prepareTrial } from './lib/prepare-trial.ts'; import { + captureEnvironment, createLogger, - formatDuration, formatCost, + formatDuration, + generateTrialId, listPrompts, loadPrompt, - generateTrialId, - captureEnvironment, } from './lib/utils.ts'; const PROJECT_NAMES = PROJECTS.map((p) => p.name) as [string, ...string[]]; -const LIST_MODE_FLAGS = [ - ['listProjects', 'list-projects'], - ['listModels', 'list-models'], - ['listPrompts', 'list-prompts'], -] as const; -type ListMode = (typeof LIST_MODE_FLAGS)[number][0]; -const LIST_MODE_NAMES = LIST_MODE_FLAGS.map(([name]) => name) as [ - ListMode, - ...ListMode[], -]; - -const runArgsBase = { - kind: z.literal('run'), - project: z.enum(PROJECT_NAMES), + +const base = { + project: z.enum(PROJECT_NAMES).optional(), prompt: z.string().default('setup'), verbose: z.boolean().default(false), manual: z.boolean().default(false), + listProjects: z.boolean().default(false), + listModels: z.boolean().default(false), + listPrompts: z.boolean().default(false), }; -const listArgsSchema = z.object({ - kind: z.literal('list'), - listMode: z.enum(LIST_MODE_NAMES), -}); - -const claudeRunArgsSchema = z.object({ - ...runArgsBase, - agent: z.literal('claude'), - model: z.enum(CLAUDE_MODELS).default(AGENTS.claude.defaultModel), - effort: z.enum(CLAUDE_EFFORTS).default(AGENTS.claude.defaultEffort), -}); - -const codexRunArgsSchema = z.object({ - ...runArgsBase, - agent: z.literal('codex'), - model: z.enum(CODEX_MODELS).default(AGENTS.codex.defaultModel), - effort: z.enum(CODEX_EFFORTS).default(AGENTS.codex.defaultEffort), -}); - -type RunArgs = z.infer | z.infer; - -const cliArgsSchema = z.discriminatedUnion('kind', [ - listArgsSchema, - claudeRunArgsSchema, - codexRunArgsSchema, +const argsSchema = z.discriminatedUnion('agent', [ + z.object({ + ...base, + agent: z.literal('claude'), + model: 
z.enum(CLAUDE_MODELS).default(AGENTS.claude.defaultModel), + effort: z.enum(CLAUDE_EFFORTS).default(AGENTS.claude.defaultEffort), + }), + z.object({ + ...base, + agent: z.literal('codex'), + model: z.enum(CODEX_MODELS).default(AGENTS.codex.defaultModel), + effort: z.enum(CODEX_EFFORTS).default(AGENTS.codex.defaultEffort), + }), ]); const { values } = parseArgs({ @@ -107,13 +86,16 @@ const { values } = parseArgs({ strict: true, }); -const cliInput = resolveCliInput(values); -if ('error' in cliInput) { - console.error(pc.red(` ${cliInput.error}`)); - process.exit(1); -} +// Resolve the discriminator: explicit --agent, inferred from --model, or default to claude. +const agent = values.agent ?? (values.model ? inferAgent(values.model) : 'claude'); -const parsed = cliArgsSchema.safeParse(cliInput); +const parsed = argsSchema.safeParse({ + ...values, + agent, + listProjects: values['list-projects'], + listModels: values['list-models'], + listPrompts: values['list-prompts'], +}); if (!parsed.success) { for (const issue of parsed.error.issues) { @@ -125,26 +107,42 @@ if (!parsed.success) { const args = parsed.data; const logger = createLogger(); -if (args.kind === 'list') { - runListMode(args.listMode, logger); +if (args.listProjects) { + for (const project of PROJECTS) { + logger.log(` ${pc.bold(project.name)} — ${project.description}`); + } + process.exit(0); +} +if (args.listModels) { + for (const [name, { models }] of Object.entries(AGENTS)) { + logger.log(`\n ${pc.bold(name)}`); + for (const model of models) logger.log(` ${model}`); + } + process.exit(0); +} +if (args.listPrompts) { + for (const name of listPrompts()) logger.log(` ${pc.bold(name)}`); process.exit(0); } -const runArgs: RunArgs = args; +if (!args.project) { + logger.log(pc.red(`Specify a project with -p. 
Available: ${PROJECT_NAMES.join(', ')}`)); + process.exit(1); +} const project = PROJECTS.find((p) => p.name === args.project)!; -const variant = toVariant(runArgs); +const variant = toVariant(args); logger.log(pc.bold(`\nStorybook Setup Eval — ${project.name}`)); logger.log( - `Agent: ${variant.agent} | Model: ${variant.model} | Effort: ${variant.effort} | Prompt: ${runArgs.prompt}\n` + `Agent: ${variant.agent} | Model: ${variant.model} | Effort: ${variant.effort} | Prompt: ${args.prompt}\n` ); -if (runArgs.manual) { - const trialId = generateTrialId(project.name, variant.agent, variant.model, runArgs.prompt); +if (args.manual) { + const trialId = generateTrialId(project.name, variant.agent, variant.model, args.prompt); const workspace = await prepareTrial(project, trialId, logger); await captureEnvironment(workspace.resultsDir); - const prompt = loadPrompt(runArgs.prompt); + const prompt = loadPrompt(args.prompt); const promptPath = join(workspace.resultsDir, 'prompt.md'); await writeFile(promptPath, prompt); @@ -159,7 +157,7 @@ if (runArgs.manual) { logger.log(` ${pc.green(cliCommand)}\n`); } else { const result = await runTrial( - { project, variant, prompt: runArgs.prompt, verbose: runArgs.verbose } satisfies TrialConfig, + { project, variant, prompt: args.prompt, verbose: args.verbose } satisfies TrialConfig, logger ); @@ -190,57 +188,13 @@ function inferAgent(model: string): AgentId { function buildManualCommand(variant: AgentVariant, promptPath: string): string { const promptArg = `"$(cat ${promptPath})"`; if (variant.agent === 'claude') { - return `claude --model ${resolveClaudeSdkModel(variant.model)} ${promptArg}`; + const sdkModel = AGENTS.claude.sdkModelIds[variant.model] ?? 
variant.model; + return `claude --model ${sdkModel} ${promptArg}`; } return `codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`; } -function resolveCliInput(values: Record) { - const listModes = LIST_MODE_FLAGS.filter(([, flag]) => values[flag]).map(([name]) => name); - if (listModes.length > 1) { - return { - error: `Choose only one list mode at a time: ${listModes.join(', ')}`, - } as const; - } - if (listModes.length === 1) { - return { - kind: 'list', - listMode: listModes[0], - } as const; - } - - const agent: AgentId = - values.agent === 'claude' || values.agent === 'codex' - ? values.agent - : values.model - ? inferAgent(values.model as string) - : 'claude'; - - return { - kind: 'run', - ...values, - agent, - } as const; -} - -function runListMode(listMode: ListMode, logger: ReturnType) { - switch (listMode) { - case 'listProjects': - for (const p of PROJECTS) logger.log(` ${pc.bold(p.name)} — ${p.description}`); - break; - case 'listModels': - for (const [name, { models }] of Object.entries(AGENTS)) { - logger.log(`\n ${pc.bold(name)}`); - for (const model of models) logger.log(` ${model}`); - } - break; - case 'listPrompts': - for (const name of listPrompts()) logger.log(` ${pc.bold(name)}`); - break; - } -} - -function toVariant(args: RunArgs): AgentVariant { +function toVariant(args: z.infer): AgentVariant { return args.agent === 'claude' ? 
{ agent: 'claude', model: args.model, effort: args.effort } : { agent: 'codex', model: args.model, effort: args.effort }; From 8de10d240b54c092cbc8718e4a6e9737cea0f4fb Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Tue, 31 Mar 2026 20:06:21 +0700 Subject: [PATCH 61/63] Remove eval script from tsconfig exclude list --- scripts/tsconfig.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tsconfig.json b/scripts/tsconfig.json index 84dcf9469414..9c5b78519b8b 100644 --- a/scripts/tsconfig.json +++ b/scripts/tsconfig.json @@ -30,6 +30,6 @@ "noFallthroughCasesInSwitch": true, "resolveJsonModule": true }, - "exclude": ["dist", "**/dist", "node_modules", "**/node_modules", "eval/google-apps-script.js"], + "exclude": ["dist", "**/dist", "node_modules", "**/node_modules"], "include": ["./**/*", "./.eslintrc.cjs"] } From 68d0d5a1117f029ebd0416a996d1f656732c0180 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Tue, 31 Mar 2026 20:29:48 +0700 Subject: [PATCH 62/63] Increase CircleCI memory for format check --- scripts/ci/common-jobs.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/common-jobs.ts b/scripts/ci/common-jobs.ts index c47d03d26a4e..349dfde43048 100644 --- a/scripts/ci/common-jobs.ts +++ b/scripts/ci/common-jobs.ts @@ -67,7 +67,7 @@ export const build_linux = defineJob('Build (linux)', (workflowName) => ({ export const fmt = defineJob('Format check', () => ({ executor: { name: 'sb_node_22_classic', - class: 'medium+', + class: 'large', }, steps: [ git.checkout(), From 65893e93a77b07503963d52d5c7a78fa040149f9 Mon Sep 17 00:00:00 2001 From: Kasper Peulen Date: Tue, 31 Mar 2026 20:34:39 +0700 Subject: [PATCH 63/63] Increase CircleCI memory for format check again --- scripts/ci/common-jobs.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/common-jobs.ts b/scripts/ci/common-jobs.ts index 349dfde43048..2b8a1f0c0f23 100644 --- a/scripts/ci/common-jobs.ts +++ 
b/scripts/ci/common-jobs.ts @@ -67,7 +67,7 @@ export const build_linux = defineJob('Build (linux)', (workflowName) => ({ export const fmt = defineJob('Format check', () => ({ executor: { name: 'sb_node_22_classic', - class: 'large', + class: 'xlarge', }, steps: [ git.checkout(),