diff --git a/devvit.json b/devvit.json index 2628c98..14d24a8 100644 --- a/devvit.json +++ b/devvit.json @@ -20,10 +20,14 @@ "type": "select", "label": "OpenAI model (developer-set default)", "options": [ - { "label": "gpt-5.4-nano (recommended — latest, cheapest, json_object support)", "value": "gpt-5.4-nano" }, - { "label": "gpt-5.4-mini (latest, higher quality, ~7× more expensive)", "value": "gpt-5.4-mini" }, - { "label": "gpt-5-mini (previous-gen, fallback)", "value": "gpt-5-mini" }, - { "label": "gpt-5-nano (previous-gen, cheapest fallback)", "value": "gpt-5-nano" } + { + "label": "gpt-5.4-nano (recommended — fast ~1.4s, cheapest, 7/7 on the rule-compile smoke test)", + "value": "gpt-5.4-nano" + }, + { + "label": "gpt-5.4-mini (fast ~1.2s, equal quality, a bit pricier — pick this if cost is free for you)", + "value": "gpt-5.4-mini" + } ], "defaultValue": "gpt-5.4-nano" } diff --git a/scripts/openai-smoketest.ts b/scripts/openai-smoketest.ts index 08f3182..cb35687 100644 --- a/scripts/openai-smoketest.ts +++ b/scripts/openai-smoketest.ts @@ -4,24 +4,25 @@ // examples and a handful of representative moderator rules, then validates each // response against the Rule schema (or detects the clarification escape hatch). // Verifies the prompt ↔ schema ↔ model triad — the one thing the mocked route -// tests can't: does `gpt-5.4-nano` exist, honour response_format: json_object, -// and emit schema-valid rules from this prompt? Also reports token usage / $. +// tests can't: does the model exist, honour response_format: json_object, and +// emit schema-valid rules from this prompt? Reports per-call latency, token +// usage and a $ estimate, and (with OPENAI_MODELS) compares several models. // // OPENAI_API_KEY=sk-... 
npm run openai:smoketest -// OPENAI_MODEL=gpt-5.4-mini npm run openai:smoketest # override the model +// OPENAI_MODEL=gpt-5.4-mini npm run openai:smoketest # one model +// OPENAI_MODELS=gpt-5.4-nano,gpt-5.4-mini,gpt-4.1-mini npm run openai:smoketest # compare // -// The key is read from the OPENAI_API_KEY env var, or — if not set — from a -// `.env` file in the repo root (which is git-ignored). Never commit a key, and -// don't paste it into chat: put it in `.env` or `export` it in your shell. +// The key is read from $OPENAI_API_KEY, or — if not set — from a git-ignored +// `.env` at the repo root. Never commit a key; don't paste it into chat. // -// NOTE: this is intentionally NOT part of `npm run check` or CI — it costs a few -// fractions of a cent per run and needs a real key. +// NOT part of `npm run check` / CI (needs a real key). // // IMPORTANT: keep the request payload below in sync with `callOpenAI` in -// src/server/index.ts (same model default, response_format, max_completion_tokens). +// src/server/index.ts (same response_format, max_completion_tokens, no temperature). 
import { readFileSync, existsSync } from 'node:fs'; import { join } from 'node:path'; +import { performance } from 'node:perf_hooks'; import { VIBE_MOD_SYSTEM_PROMPT, FEW_SHOT_EXAMPLES } from '../src/shared/system-prompt'; import { Rule, checkTreeDepth, type RuleType } from '../src/shared/rule-schema'; @@ -38,7 +39,10 @@ function loadEnvFile(): void { loadEnvFile(); const API_KEY = process.env.OPENAI_API_KEY?.trim(); -const MODEL = process.env.OPENAI_MODEL?.trim() || 'gpt-5.4-nano'; +const MODELS = (process.env.OPENAI_MODELS?.trim() || process.env.OPENAI_MODEL?.trim() || 'gpt-5.4-nano') + .split(',') + .map((s) => s.trim()) + .filter(Boolean); if (!API_KEY) { console.error( @@ -56,7 +60,6 @@ const CASES: Case[] = [ }, { rule: 'Remove posts that contain discord.gg links from accounts with under 50 karma', expect: 'rule' }, { - // explicit numeric threshold → should compile against content.title.upperCaseRatio rule: 'If a post title is at least 12 characters and more than 70% capital letters, add the flair "Edit your title?"', expect: 'rule', }, @@ -71,12 +74,12 @@ const CASES: Case[] = [ ]; type ApiResult = - | { kind: 'rule'; rule: RuleType; tokensIn: number; tokensOut: number } - | { kind: 'clarification'; question: string; tokensIn: number; tokensOut: number } - | { kind: 'invalid'; raw: unknown; reason: string; tokensIn: number; tokensOut: number } - | { kind: 'http_error'; status: number; code?: string; message?: string }; + | { kind: 'rule'; rule: RuleType; tokensIn: number; tokensOut: number; ms: number } + | { kind: 'clarification'; question: string; tokensIn: number; tokensOut: number; ms: number } + | { kind: 'invalid'; raw: unknown; reason: string; tokensIn: number; tokensOut: number; ms: number } + | { kind: 'http_error'; status: number; code?: string; message?: string; ms: number }; -async function compile(userRule: string): Promise { +async function compile(model: string, userRule: string): Promise { // Mirrors callOpenAI() in src/server/index.ts. 
const messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }> = [ { role: 'system', content: VIBE_MOD_SYSTEM_PROMPT }, @@ -87,16 +90,11 @@ async function compile(userRule: string): Promise { } messages.push({ role: 'user', content: userRule }); + const t0 = performance.now(); const resp = await fetch('https://api.openai.com/v1/chat/completions', { method: 'POST', headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${API_KEY}` }, - body: JSON.stringify({ - model: MODEL, - response_format: { type: 'json_object' }, - messages, - // Newer OpenAI models (gpt-5.x family) require max_completion_tokens, not max_tokens. - max_completion_tokens: 700, - }), + body: JSON.stringify({ model, response_format: { type: 'json_object' }, messages, max_completion_tokens: 700 }), }); if (!resp.ok) { let code: string | undefined; @@ -108,13 +106,14 @@ async function compile(userRule: string): Promise { } catch { /* body wasn't JSON */ } - return { kind: 'http_error', status: resp.status, code, message }; + return { kind: 'http_error', status: resp.status, code, message, ms: performance.now() - t0 }; } const data = (await resp.json()) as { choices: Array<{ message: { content: string } }>; usage?: { prompt_tokens?: number; completion_tokens?: number }; }; + const ms = performance.now() - t0; const tokensIn = data.usage?.prompt_tokens ?? 0; const tokensOut = data.usage?.completion_tokens ?? 0; let parsed: unknown; @@ -127,9 +126,9 @@ async function compile(userRule: string): Promise { reason: `not JSON: ${e instanceof Error ? e.message : String(e)}`, tokensIn, tokensOut, + ms, }; } - if ( parsed && typeof parsed === 'object' && @@ -140,6 +139,7 @@ async function compile(userRule: string): Promise { question: String((parsed as { question?: string }).question ?? 
''), tokensIn, tokensOut, + ms, }; } try { @@ -152,83 +152,119 @@ async function compile(userRule: string): Promise { }; const rule = Rule.parse(augmented); checkTreeDepth(rule.when as Parameters[0]); - return { kind: 'rule', rule, tokensIn, tokensOut }; + return { kind: 'rule', rule, tokensIn, tokensOut, ms }; } catch (e) { - return { kind: 'invalid', raw: parsed, reason: e instanceof Error ? e.message : String(e), tokensIn, tokensOut }; + return { + kind: 'invalid', + raw: parsed, + reason: e instanceof Error ? e.message : String(e), + tokensIn, + tokensOut, + ms, + }; } } // gpt-5.4-nano list price (developers.openai.com/api/docs/pricing) — adjust if you change MODEL. const PRICE_PER_M = { in: 0.05, out: 0.4 }; // USD per 1M tokens -(async () => { - console.log(`OpenAI smoke test — model: ${MODEL}\n`); - let pass = 0; - let fail = 0; - let totIn = 0; - let totOut = 0; +type ModelSummary = { + model: string; + pass: number; + total: number; + tokensIn: number; + tokensOut: number; + ms: number[]; + fatal: boolean; +}; +async function runModel(model: string): Promise { + console.log(`════════ ${model} ════════\n`); + const s: ModelSummary = { model, pass: 0, total: CASES.length, tokensIn: 0, tokensOut: 0, ms: [], fatal: false }; for (const c of CASES) { - process.stdout.write(`• "${c.rule.slice(0, 70)}${c.rule.length > 70 ? '…' : ''}"\n`); + process.stdout.write(`• "${c.rule.slice(0, 64)}${c.rule.length > 64 ? '…' : ''}"\n`); let r: ApiResult; try { - r = await compile(c.rule); + r = await compile(model, c.rule); } catch (e) { console.log(` ✗ request threw: ${e instanceof Error ? e.message : String(e)}\n`); - fail++; continue; } if (r.kind === 'http_error') { const billingCodes = ['insufficient_quota', 'billing_not_active', 'account_deactivated']; - const fatal = r.status === 401 || r.code === 'model_not_found' || billingCodes.includes(r.code ?? 
''); + const fatal = + r.status === 401 || r.status === 403 || r.code === 'model_not_found' || billingCodes.includes(r.code ?? ''); const hint = - r.status === 404 || r.code === 'model_not_found' - ? ` → model "${MODEL}" not available to this key (try OPENAI_MODEL=gpt-5-mini, or check the exact model name)` + r.status === 404 || r.status === 403 || r.code === 'model_not_found' + ? ` → model "${model}" not available to this key` : r.status === 401 ? ' → invalid API key' : billingCodes.includes(r.code ?? '') - ? ' → OpenAI account billing is not active — add a payment method at platform.openai.com/account/billing (the key is fine; the account just is not enabled for API use yet)' + ? ' → OpenAI account billing is not active — add a payment method at platform.openai.com/account/billing' : r.status === 429 - ? ' → rate-limited — wait and re-run, or raise your usage limits' + ? ' → rate-limited — wait and re-run' : ''; console.log( ` ✗ HTTP ${r.status}${r.code ? ` (${r.code})` : ''}${r.message ? ` — ${r.message}` : ''}${hint}\n`, ); - fail++; - // No point hammering the API if it's a key/model/billing problem rather than a transient blip. if (fatal) { - console.log('Aborting remaining cases — fix the above and re-run.\n'); + s.fatal = true; + console.log('Skipping remaining cases for this model.\n'); break; } continue; } - totIn += r.tokensIn; - totOut += r.tokensOut; + s.tokensIn += r.tokensIn; + s.tokensOut += r.tokensOut; + s.ms.push(r.ms); const ok = (r.kind === 'rule' && c.expect === 'rule') || (r.kind === 'clarification' && c.expect === 'clarification'); - if (ok) pass++; - else fail++; + if (ok) s.pass++; const mark = ok ? '✓' : '✗'; + const t = `${Math.round(r.ms)}ms`; if (r.kind === 'rule') console.log( - ` ${mark} rule id=${r.rule.id} on=[${r.rule.on.join(',')}] then=[${r.rule.then.map((a) => a.action).join(',')}] (in ${r.tokensIn} / out ${r.tokensOut})${c.expect !== 'rule' ? 
' ← expected clarification' : ''}`, + ` ${mark} ${t} rule id=${r.rule.id} on=[${r.rule.on.join(',')}] then=[${r.rule.then.map((a) => a.action).join(',')}] (in ${r.tokensIn}/out ${r.tokensOut})${c.expect !== 'rule' ? ' ← expected clarification' : ''}`, ); else if (r.kind === 'clarification') console.log( - ` ${mark} clarification: "${r.question.slice(0, 90)}" (in ${r.tokensIn} / out ${r.tokensOut})${c.expect !== 'clarification' ? ' ← expected a rule' : ''}`, + ` ${mark} ${t} clarification: "${r.question.slice(0, 80)}" (in ${r.tokensIn}/out ${r.tokensOut})${c.expect !== 'clarification' ? ' ← expected a rule' : ''}`, ); else console.log( - ` ✗ invalid output — ${r.reason}\n raw: ${JSON.stringify(r.raw).slice(0, 300)} (in ${r.tokensIn} / out ${r.tokensOut})`, + ` ✗ ${t} invalid — ${r.reason} (in ${r.tokensIn}/out ${r.tokensOut})\n raw: ${JSON.stringify(r.raw).slice(0, 240)}`, ); console.log(''); } - - const cost = (totIn / 1_000_000) * PRICE_PER_M.in + (totOut / 1_000_000) * PRICE_PER_M.out; - console.log('────────────────────────────────────────────────────────'); + const cost = (s.tokensIn / 1_000_000) * PRICE_PER_M.in + (s.tokensOut / 1_000_000) * PRICE_PER_M.out; + const med = s.ms.length ? [...s.ms].sort((a, b) => a - b)[Math.floor(s.ms.length / 2)] : 0; console.log( - `${pass}/${CASES.length} cases as expected. tokens: ${totIn} in / ${totOut} out ≈ $${cost.toFixed(5)} (at ${MODEL} list price)`, + ` → ${s.pass}/${s.total} as expected · latency median ${Math.round(med)}ms (min ${Math.round(s.ms.length ? Math.min(...s.ms) : 0)} / max ${Math.round(Math.max(...s.ms, 0))}) · tokens ${s.tokensIn} in / ${s.tokensOut} out · ≈ $${cost.toFixed(5)} (at gpt-5.4-nano list price)\n`, ); - console.log('────────────────────────────────────────────────────────'); - process.exit(fail > 0 ? 
1 : 0); + return s; +} + +(async () => { + const results: ModelSummary[] = []; + for (const m of MODELS) results.push(await runModel(m)); + + if (results.length > 1) { + console.log('════════ comparison ════════'); + console.log('model'.padEnd(22) + 'pass'.padEnd(8) + 'median'.padEnd(10) + 'max'.padEnd(10) + 'avg out tok'); + for (const r of results) { + const med = r.ms.length ? [...r.ms].sort((a, b) => a - b)[Math.floor(r.ms.length / 2)] : 0; + const avgOut = r.ms.length ? Math.round(r.tokensOut / r.ms.length) : 0; + console.log( + r.model.padEnd(22) + + `${r.pass}/${r.total}`.padEnd(8) + + `${Math.round(med)}ms`.padEnd(10) + + `${r.ms.length ? Math.round(Math.max(...r.ms)) : 0}ms`.padEnd(10) + + `${avgOut}${r.fatal ? ' (unavailable)' : ''}`, + ); + } + console.log(''); + } + + const anyFail = results.some((r) => r.pass < r.total || r.fatal); + process.exit(anyFail ? 1 : 0); })();