-
-
Notifications
You must be signed in to change notification settings - Fork 10.1k
Build: Add eval harness for Storybook agentic setup #34365
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1303e34
143577e
20cc6b9
dce536d
3e74467
7a8d08b
6c3e716
e11b9bd
2be54f4
5aabbda
986988a
1ee462d
06c5f9a
2336c46
ca03d7c
8629948
1606025
47e64e3
f6671a1
5701e8d
bdbae36
8819ae2
3caafda
4e04c66
5051a6d
f397085
9fb35ca
460fc5d
3dd2246
663b8e9
5452a10
45acc9b
bf5855b
da1f96e
9b6085b
cabe15a
73d7415
98a2f74
35d5699
842ac28
6e5fcf4
87abae4
c0720ee
920e6d3
b5f3a7b
b4bab02
35561d2
3011b46
37192c0
7887732
a55f40a
a11a783
b4e9cb8
9a850af
48a6e57
db7a142
2bd9169
0c5f06a
7a8e2d5
eccfb78
8de10d2
68d0d5a
65893e9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,201 @@ | ||
| /** | ||
| * Eval harness entry point. | ||
| * | ||
| * Runs with `node ./eval/eval.ts` (no jiti). Node 22+ supports .ts natively | ||
| * via type stripping. Import specifiers use explicit .ts extensions. | ||
| * | ||
| * Usage: | ||
| * node eval/eval.ts -p mealdrop # claude defaults | ||
| * node eval/eval.ts -p mealdrop -a codex # codex defaults | ||
| * node eval/eval.ts -p mealdrop -m gpt-5.4 # codex (inferred) | ||
| * node eval/eval.ts -p mealdrop -a claude -e max # claude with max effort | ||
| * node eval/eval.ts -p mealdrop --manual # prepare only, print instructions | ||
| * node eval/eval.ts --list-projects | ||
| * node eval/eval.ts --list-models | ||
| * node eval/eval.ts --list-prompts | ||
| */ | ||
| import { writeFile } from 'node:fs/promises'; | ||
| import { join } from 'node:path'; | ||
| import { parseArgs } from 'node:util'; | ||
| import { z } from 'zod'; | ||
| import pc from 'picocolors'; | ||
| import { | ||
| AGENT_IDS, | ||
| AGENTS, | ||
| CLAUDE_EFFORTS, | ||
| CLAUDE_MODELS, | ||
| CODEX_EFFORTS, | ||
| CODEX_MODELS, | ||
| type AgentId, | ||
| type AgentVariant, | ||
| } from './lib/agents/config.ts'; | ||
| import { prepareTrial } from './lib/prepare-trial.ts'; | ||
| import { PROJECTS } from './lib/projects.ts'; | ||
| import { runTrial, type TrialConfig } from './lib/run-trial.ts'; | ||
| import { | ||
| captureEnvironment, | ||
| createLogger, | ||
| formatCost, | ||
| formatDuration, | ||
| generateTrialId, | ||
| listPrompts, | ||
| loadPrompt, | ||
| } from './lib/utils.ts'; | ||
|
|
||
/**
 * Names of every registered project, asserted to be a non-empty tuple so the
 * list can be handed to `z.enum`.
 */
const PROJECT_NAMES = PROJECTS.map((entry) => entry.name) as [string, ...string[]];

/** Boolean CLI switch that is `false` unless the flag is passed. */
const flag = () => z.boolean().default(false);

/** Options accepted regardless of which agent is selected. */
const commonOptions = {
  project: z.enum(PROJECT_NAMES).optional(),
  prompt: z.string().default('setup'),
  verbose: flag(),
  manual: flag(),
  listProjects: flag(),
  listModels: flag(),
  listPrompts: flag(),
};

/** Claude runs: model and effort are limited to the Claude lists and fall back to the Claude defaults. */
const claudeArgs = z.object({
  ...commonOptions,
  agent: z.literal('claude'),
  model: z.enum(CLAUDE_MODELS).default(AGENTS.claude.defaultModel),
  effort: z.enum(CLAUDE_EFFORTS).default(AGENTS.claude.defaultEffort),
});

/** Codex runs: model and effort are limited to the Codex lists and fall back to the Codex defaults. */
const codexArgs = z.object({
  ...commonOptions,
  agent: z.literal('codex'),
  model: z.enum(CODEX_MODELS).default(AGENTS.codex.defaultModel),
  effort: z.enum(CODEX_EFFORTS).default(AGENTS.codex.defaultEffort),
});

/**
 * Full CLI argument schema, discriminated on `agent` so that `model` and
 * `effort` are validated against the selected agent's own lists.
 */
const argsSchema = z.discriminatedUnion('agent', [claudeArgs, codexArgs]);
|
|
||
// Read the raw CLI flags. With `strict: true`, parseArgs rejects flags that
// are not declared below.
const { values } = parseArgs({
  options: {
    project: { type: 'string', short: 'p' },
    agent: { type: 'string', short: 'a' },
    model: { type: 'string', short: 'm' },
    effort: { type: 'string', short: 'e' },
    prompt: { type: 'string' },
    verbose: { type: 'boolean', short: 'v' },
    manual: { type: 'boolean' },
    'list-projects': { type: 'boolean' },
    'list-models': { type: 'boolean' },
    'list-prompts': { type: 'boolean' },
  },
  args: process.argv.slice(2),
  strict: true,
});

// Resolve the discriminator: explicit --agent, inferred from --model, or default to claude.
// inferAgent throws for a model no agent lists; report that the same way as any
// other invalid argument instead of letting it surface as an uncaught exception.
let agent: string;
try {
  agent = values.agent ?? (values.model ? inferAgent(values.model) : 'claude');
} catch (error: unknown) {
  const reason = error instanceof Error ? error.message : String(error);
  console.error(pc.red(` model: ${reason}`));
  process.exit(1);
}

// Validate the flags against the schema. The kebab-case list flags are mapped
// onto the camelCase keys the schema declares.
const parsed = argsSchema.safeParse({
  ...values,
  agent,
  listProjects: values['list-projects'],
  listModels: values['list-models'],
  listPrompts: values['list-prompts'],
});

// On validation failure, print one red line per issue and exit non-zero.
if (!parsed.success) {
  for (const issue of parsed.error.issues) {
    console.error(pc.red(` ${issue.path.join('.')}: ${issue.message}`));
  }
  process.exit(1);
}
|
|
||
const args = parsed.data;
const logger = createLogger();

// Informational flags: each prints its listing and exits with status 0
// before any trial work starts.
if (args.listProjects) {
  PROJECTS.forEach(({ name, description }) => {
    logger.log(` ${pc.bold(name)} — ${description}`);
  });
  process.exit(0);
}
if (args.listModels) {
  for (const [agentName, agentConfig] of Object.entries(AGENTS)) {
    logger.log(`\n ${pc.bold(agentName)}`);
    for (const modelName of agentConfig.models) {
      logger.log(` ${modelName}`);
    }
  }
  process.exit(0);
}
if (args.listPrompts) {
  for (const promptName of listPrompts()) {
    logger.log(` ${pc.bold(promptName)}`);
  }
  process.exit(0);
}

// Everything below needs a project, so bail out with the available names if none was given.
if (!args.project) {
  const available = PROJECT_NAMES.join(', ');
  logger.log(pc.red(`Specify a project with -p. Available: ${available}`));
  process.exit(1);
}
// The schema only accepts names taken from PROJECTS, so the lookup is asserted non-null.
const project = PROJECTS.find((candidate) => candidate.name === args.project)!;
const variant = toVariant(args);

// Run header: which project is evaluated and with which agent settings.
const title = `\nStorybook Setup Eval — ${project.name}`;
const settings = `Agent: ${variant.agent} | Model: ${variant.model} | Effort: ${variant.effort} | Prompt: ${args.prompt}\n`;
logger.log(pc.bold(title));
logger.log(settings);
|
|
||
| if (args.manual) { | ||
| const trialId = generateTrialId(project.name, variant.agent, variant.model, args.prompt); | ||
| const workspace = await prepareTrial(project, trialId, logger); | ||
| await captureEnvironment(workspace.resultsDir); | ||
|
|
||
| const prompt = loadPrompt(args.prompt); | ||
| const promptPath = join(workspace.resultsDir, 'prompt.md'); | ||
| await writeFile(promptPath, prompt); | ||
|
|
||
| const cliCommand = buildManualCommand(variant, promptPath); | ||
|
|
||
| logger.log(pc.bold('\n── Manual mode ──')); | ||
| logger.log(`\n Trial dir: ${pc.cyan(workspace.trialDir)}`); | ||
| logger.log(` Project dir: ${pc.cyan(workspace.projectPath)}`); | ||
| logger.log(` Prompt file: ${pc.cyan(promptPath)}`); | ||
| logger.log(pc.bold('\nRun the agent yourself:\n')); | ||
| logger.log(` ${pc.green('cd')} ${workspace.projectPath}`); | ||
| logger.log(` ${pc.green(cliCommand)}\n`); | ||
| } else { | ||
| const result = await runTrial( | ||
| { project, variant, prompt: args.prompt, verbose: args.verbose } satisfies TrialConfig, | ||
| logger | ||
| ); | ||
|
|
||
| const ghost = result.grade.ghostStories; | ||
| const ghostStr = ghost | ||
| ? `${ghost.passed}/${ghost.total} (${Math.round(ghost.successRate * 100)}%)` | ||
| : '-'; | ||
|
|
||
| logger.log(pc.bold('\nResult')); | ||
| logger.log(` Build: ${result.grade.buildSuccess ? pc.green('PASS') : pc.red('FAIL')}`); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are there ways to add info related to:
These can be done later:
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not yet in this pass. The main follow-up I still see here is expanding the saved report schema with model/time/context/before-after ghost metrics. |
||
| logger.log(` Ghost: ${ghostStr}`); | ||
| logger.log(` TS Err: ${result.grade.typeCheckErrors}`); | ||
| logger.log(` Score: ${result.score.score}`); | ||
| logger.log(` Cost: ${formatCost(result.execution.cost)}`); | ||
| logger.log(` Time: ${formatDuration(result.execution.duration)}`); | ||
| logger.log(` Turns: ${result.execution.turns}`); | ||
|
|
||
| logger.log('\nDone.'); | ||
| } | ||
|
|
||
/**
 * Find the agent whose model list contains `model`.
 *
 * @param model - Model name as given on the command line.
 * @returns The first id in `AGENT_IDS` whose `AGENTS[id].models` includes `model`.
 * @throws Error when no agent lists the model.
 */
function inferAgent(model: string): AgentId {
  const owner = AGENT_IDS.find((id) => AGENTS[id].models.some((known) => known === model));
  if (owner === undefined) {
    throw new Error(`No agent found for model: ${model}`);
  }
  return owner;
}
|
|
||
/**
 * Build the shell command a person can paste to run the agent by hand.
 *
 * The prompt is passed as one argument by expanding the prompt file with
 * `"$(cat <path>)"`.
 *
 * @param variant - Selected agent, model and effort.
 * @param promptPath - Path of the prompt file written for this trial.
 * @returns A single-line command for a POSIX-style shell.
 */
function buildManualCommand(variant: AgentVariant, promptPath: string): string {
  // Single-quote the path so spaces or shell metacharacters in the trial
  // directory cannot split or alter the command. An embedded ' is written as
  // '\'' (close the quote, escaped quote, reopen).
  const quotedPath = `'${promptPath.replace(/'/g, `'\\''`)}'`;
  const promptArg = `"$(cat ${quotedPath})"`;

  if (variant.agent === 'claude') {
    // Use the SDK model id when one is mapped, otherwise the name as given.
    // Note: the effort setting is not forwarded in the claude command.
    const sdkModel = AGENTS.claude.sdkModelIds[variant.model] ?? variant.model;
    return `claude --model ${sdkModel} ${promptArg}`;
  }
  return `codex --model ${variant.model} --reasoning-effort ${variant.effort} ${promptArg}`;
}
|
|
||
/**
 * Reduce validated CLI arguments to the agent variant (agent, model, effort).
 *
 * Each branch builds its own literal so the discriminated union is narrowed
 * and `model`/`effort` keep the types of the selected agent.
 *
 * @param args - Arguments that already passed `argsSchema`.
 * @returns The variant for the selected agent.
 */
function toVariant(args: z.infer<typeof argsSchema>): AgentVariant {
  if (args.agent === 'claude') {
    const { model, effort } = args;
    return { agent: 'claude', model, effort };
  }
  const { model, effort } = args;
  return { agent: 'codex', model, effort };
}
Uh oh!
There was an error while loading. Please reload this page.