Two-Weeks-Team · ComBba · May 12, 2026 · May 12, 2026 · gemini-code-assist · May 12, 2026
diff --git a/devvit.json b/devvit.json
@@ -20,10 +20,14 @@
         "type": "select",
         "label": "OpenAI model (developer-set default)",
         "options": [
-          { "label": "gpt-5.4-nano (recommended — latest, cheapest, json_object support)", "value": "gpt-5.4-nano" },
-          { "label": "gpt-5.4-mini (latest, higher quality, ~7× more expensive)", "value": "gpt-5.4-mini" },
-          { "label": "gpt-5-mini (previous-gen, fallback)", "value": "gpt-5-mini" },
-          { "label": "gpt-5-nano (previous-gen, cheapest fallback)", "value": "gpt-5-nano" }
+          {
+            "label": "gpt-5.4-nano (recommended — fast ~1.4s, cheapest, 7/7 on the rule-compile smoke test)",
+            "value": "gpt-5.4-nano"
+          },
+          {
+            "label": "gpt-5.4-mini (fast ~1.2s, equal quality, a bit pricier — pick this if cost is not a concern)",
+            "value": "gpt-5.4-mini"
+          }
         ],
         "defaultValue": "gpt-5.4-nano"
       }

diff --git a/scripts/openai-smoketest.ts b/scripts/openai-smoketest.ts
@@ -4,24 +4,25 @@
 // examples and a handful of representative moderator rules, then validates each
 // response against the Rule schema (or detects the clarification escape hatch).
 // Verifies the prompt ↔ schema ↔ model triad — the one thing the mocked route
-// tests can't: does `gpt-5.4-nano` exist, honour response_format: json_object,
-// and emit schema-valid rules from this prompt? Also reports token usage / $.
+// tests can't: does the model exist, honour response_format: json_object, and
+// emit schema-valid rules from this prompt? Reports per-call latency, token usage
+// and a $ estimate (gpt-5.4-nano list price for every model); OPENAI_MODELS compares several.
 //
 //   OPENAI_API_KEY=sk-...  npm run openai:smoketest
-//   OPENAI_MODEL=gpt-5.4-mini  npm run openai:smoketest        # override the model
+//   OPENAI_MODEL=gpt-5.4-mini  npm run openai:smoketest                       # one model
+//   OPENAI_MODELS=gpt-5.4-nano,gpt-5.4-mini,gpt-4.1-mini  npm run openai:smoketest   # compare
 //
-// The key is read from the OPENAI_API_KEY env var, or — if not set — from a
-// `.env` file in the repo root (which is git-ignored). Never commit a key, and
-// don't paste it into chat: put it in `.env` or `export` it in your shell.
+// The key is read from $OPENAI_API_KEY, or — if not set — from a git-ignored
+// `.env` at the repo root. Never commit a key; don't paste it into chat.
 //
-// NOTE: this is intentionally NOT part of `npm run check` or CI — it costs a few
-// fractions of a cent per run and needs a real key.
+// NOT part of `npm run check` / CI (needs a real key).
 //
 // IMPORTANT: keep the request payload below in sync with `callOpenAI` in
-// src/server/index.ts (same model default, response_format, max_completion_tokens).
+// src/server/index.ts (same response_format, max_completion_tokens, no temperature).
 
 import { readFileSync, existsSync } from 'node:fs';
 import { join } from 'node:path';
+import { performance } from 'node:perf_hooks';
 import { VIBE_MOD_SYSTEM_PROMPT, FEW_SHOT_EXAMPLES } from '../src/shared/system-prompt';
 import { Rule, checkTreeDepth, type RuleType } from '../src/shared/rule-schema';
 
@@ -38,7 +39,10 @@ function loadEnvFile(): void {
 loadEnvFile();
 
 const API_KEY = process.env.OPENAI_API_KEY?.trim();
-const MODEL = process.env.OPENAI_MODEL?.trim() || 'gpt-5.4-nano';
+const MODELS = (process.env.OPENAI_MODELS?.trim() || process.env.OPENAI_MODEL?.trim() || 'gpt-5.4-nano')
+  .split(',')
+  .map((s) => s.trim())
+  .filter(Boolean);
 
 if (!API_KEY) {
   console.error(
@@ -56,7 +60,6 @@ const CASES: Case[] = [
   },
   { rule: 'Remove posts that contain discord.gg links from accounts with under 50 karma', expect: 'rule' },
   {
-    // explicit numeric threshold → should compile against content.title.upperCaseRatio
     rule: 'If a post title is at least 12 characters and more than 70% capital letters, add the flair "Edit your title?"',
     expect: 'rule',
   },
@@ -71,12 +74,12 @@ const CASES: Case[] = [
 ];
 
 type ApiResult =
-  | { kind: 'rule'; rule: RuleType; tokensIn: number; tokensOut: number }
-  | { kind: 'clarification'; question: string; tokensIn: number; tokensOut: number }
-  | { kind: 'invalid'; raw: unknown; reason: string; tokensIn: number; tokensOut: number }
-  | { kind: 'http_error'; status: number; code?: string; message?: string };
+  | { kind: 'rule'; rule: RuleType; tokensIn: number; tokensOut: number; ms: number }
+  | { kind: 'clarification'; question: string; tokensIn: number; tokensOut: number; ms: number }
+  | { kind: 'invalid'; raw: unknown; reason: string; tokensIn: number; tokensOut: number; ms: number }
+  | { kind: 'http_error'; status: number; code?: string; message?: string; ms: number };
 
-async function compile(userRule: string): Promise<ApiResult> {
+async function compile(model: string, userRule: string): Promise<ApiResult> {
   // Mirrors callOpenAI() in src/server/index.ts.
   const messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }> = [
     { role: 'system', content: VIBE_MOD_SYSTEM_PROMPT },
@@ -87,16 +90,11 @@ async function compile(userRule: string): Promise<ApiResult> {
   }
   messages.push({ role: 'user', content: userRule });
 
+  const t0 = performance.now();
   const resp = await fetch('https://api.openai.com/v1/chat/completions', {
     method: 'POST',
     headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${API_KEY}` },
-    body: JSON.stringify({
-      model: MODEL,
-      response_format: { type: 'json_object' },
-      messages,
-      // Newer OpenAI models (gpt-5.x family) require max_completion_tokens, not max_tokens.
-      max_completion_tokens: 700,
-    }),
+    body: JSON.stringify({ model, response_format: { type: 'json_object' }, messages, max_completion_tokens: 700 }),
   });
   if (!resp.ok) {
     let code: string | undefined;
@@ -108,13 +106,14 @@ async function compile(userRule: string): Promise<ApiResult> {
     } catch {
       /* body wasn't JSON */
     }
-    return { kind: 'http_error', status: resp.status, code, message };
+    return { kind: 'http_error', status: resp.status, code, message, ms: performance.now() - t0 };
   }
 
   const data = (await resp.json()) as {
     choices: Array<{ message: { content: string } }>;
     usage?: { prompt_tokens?: number; completion_tokens?: number };
   };
+  const ms = performance.now() - t0;
   const tokensIn = data.usage?.prompt_tokens ?? 0;
   const tokensOut = data.usage?.completion_tokens ?? 0;
   let parsed: unknown;
@@ -127,9 +126,9 @@ async function compile(userRule: string): Promise<ApiResult> {
       reason: `not JSON: ${e instanceof Error ? e.message : String(e)}`,
       tokensIn,
       tokensOut,
+      ms,
     };
   }
-
   if (
     parsed &&
     typeof parsed === 'object' &&
@@ -140,6 +139,7 @@ async function compile(userRule: string): Promise<ApiResult> {
       question: String((parsed as { question?: string }).question ?? ''),
       tokensIn,
       tokensOut,
+      ms,
     };
   }
   try {
@@ -152,83 +152,119 @@ async function compile(userRule: string): Promise<ApiResult> {
     };
     const rule = Rule.parse(augmented);
     checkTreeDepth(rule.when as Parameters<typeof checkTreeDepth>[0]);
-    return { kind: 'rule', rule, tokensIn, tokensOut };
+    return { kind: 'rule', rule, tokensIn, tokensOut, ms };
   } catch (e) {
-    return { kind: 'invalid', raw: parsed, reason: e instanceof Error ? e.message : String(e), tokensIn, tokensOut };
+    return {
+      kind: 'invalid',
+      raw: parsed,
+      reason: e instanceof Error ? e.message : String(e),
+      tokensIn,
+      tokensOut,
+      ms,
+    };
   }
 }
 
 // gpt-5.4-nano list price (developers.openai.com/api/docs/pricing) — adjust if you change MODEL.
 const PRICE_PER_M = { in: 0.05, out: 0.4 }; // USD per 1M tokens
 
-(async () => {
-  console.log(`OpenAI smoke test — model: ${MODEL}\n`);
-  let pass = 0;
-  let fail = 0;
-  let totIn = 0;
-  let totOut = 0;
+type ModelSummary = {
+  model: string;
+  pass: number;
+  total: number;
+  tokensIn: number;
+  tokensOut: number;
+  ms: number[];
+  fatal: boolean;
+};
 
+async function runModel(model: string): Promise<ModelSummary> {
+  console.log(`════════ ${model} ════════\n`);
+  const s: ModelSummary = { model, pass: 0, total: CASES.length, tokensIn: 0, tokensOut: 0, ms: [], fatal: false };
   for (const c of CASES) {
-    process.stdout.write(`• "${c.rule.slice(0, 70)}${c.rule.length > 70 ? '…' : ''}"\n`);
+    process.stdout.write(`• "${c.rule.slice(0, 64)}${c.rule.length > 64 ? '…' : ''}"\n`);
     let r: ApiResult;
     try {
-      r = await compile(c.rule);
+      r = await compile(model, c.rule);
     } catch (e) {
       console.log(`    ✗ request threw: ${e instanceof Error ? e.message : String(e)}\n`);
-      fail++;
       continue;
     }
     if (r.kind === 'http_error') {
       const billingCodes = ['insufficient_quota', 'billing_not_active', 'account_deactivated'];
-      const fatal = r.status === 401 || r.code === 'model_not_found' || billingCodes.includes(r.code ?? '');
+      const fatal =
+        r.status === 401 || r.status === 403 || r.code === 'model_not_found' || billingCodes.includes(r.code ?? '');
       const hint =
-        r.status === 404 || r.code === 'model_not_found'
-          ? `  → model "${MODEL}" not available to this key (try OPENAI_MODEL=gpt-5-mini, or check the exact model name)`
+        r.status === 404 || r.status === 403 || r.code === 'model_not_found'
+          ? `  → model "${model}" not available to this key`
           : r.status === 401
             ? '  → invalid API key'
             : billingCodes.includes(r.code ?? '')
-              ? '  → OpenAI account billing is not active — add a payment method at platform.openai.com/account/billing (the key is fine; the account just is not enabled for API use yet)'
+              ? '  → OpenAI account billing is not active — add a payment method at platform.openai.com/account/billing'
               : r.status === 429
-                ? '  → rate-limited — wait and re-run, or raise your usage limits'
+                ? '  → rate-limited — wait and re-run'
                 : '';
       console.log(
         `    ✗ HTTP ${r.status}${r.code ? ` (${r.code})` : ''}${r.message ? ` — ${r.message}` : ''}${hint}\n`,
       );
-      fail++;
-      // No point hammering the API if it's a key/model/billing problem rather than a transient blip.
       if (fatal) {
-        console.log('Aborting remaining cases — fix the above and re-run.\n');
+        s.fatal = true;
+        console.log('Skipping remaining cases for this model.\n');
         break;
       }
       continue;
     }
-    totIn += r.tokensIn;
-    totOut += r.tokensOut;
+    s.tokensIn += r.tokensIn;
+    s.tokensOut += r.tokensOut;
+    s.ms.push(r.ms);
     const ok =
       (r.kind === 'rule' && c.expect === 'rule') || (r.kind === 'clarification' && c.expect === 'clarification');
-    if (ok) pass++;
-    else fail++;
+    if (ok) s.pass++;
     const mark = ok ? '✓' : '✗';
+    const t = `${Math.round(r.ms)}ms`;
     if (r.kind === 'rule')
       console.log(
-        `    ${mark} rule  id=${r.rule.id}  on=[${r.rule.on.join(',')}]  then=[${r.rule.then.map((a) => a.action).join(',')}]  (in ${r.tokensIn} / out ${r.tokensOut})${c.expect !== 'rule' ? '  ← expected clarification' : ''}`,
+        `    ${mark} ${t}  rule  id=${r.rule.id}  on=[${r.rule.on.join(',')}]  then=[${r.rule.then.map((a) => a.action).join(',')}]  (in ${r.tokensIn}/out ${r.tokensOut})${c.expect !== 'rule' ? '  ← expected clarification' : ''}`,
       );
     else if (r.kind === 'clarification')
       console.log(
-        `    ${mark} clarification: "${r.question.slice(0, 90)}"  (in ${r.tokensIn} / out ${r.tokensOut})${c.expect !== 'clarification' ? '  ← expected a rule' : ''}`,
+        `    ${mark} ${t}  clarification: "${r.question.slice(0, 80)}"  (in ${r.tokensIn}/out ${r.tokensOut})${c.expect !== 'clarification' ? '  ← expected a rule' : ''}`,
       );
     else
       console.log(
-        `    ✗ invalid output — ${r.reason}\n        raw: ${JSON.stringify(r.raw).slice(0, 300)}  (in ${r.tokensIn} / out ${r.tokensOut})`,
+        `    ✗ ${t}  invalid — ${r.reason}  (in ${r.tokensIn}/out ${r.tokensOut})\n        raw: ${JSON.stringify(r.raw).slice(0, 240)}`,
       );
     console.log('');
   }
-
-  const cost = (totIn / 1_000_000) * PRICE_PER_M.in + (totOut / 1_000_000) * PRICE_PER_M.out;
-  console.log('────────────────────────────────────────────────────────');
+  const cost = (s.tokensIn / 1_000_000) * PRICE_PER_M.in + (s.tokensOut / 1_000_000) * PRICE_PER_M.out;
+  const med = s.ms.length ? [...s.ms].sort((a, b) => a - b)[Math.floor(s.ms.length / 2)] : 0;
   console.log(
-    `${pass}/${CASES.length} cases as expected.  tokens: ${totIn} in / ${totOut} out  ≈ $${cost.toFixed(5)} (at ${MODEL} list price)`,
+    `  → ${s.pass}/${s.total} as expected  ·  latency median ${Math.round(med)}ms (min ${s.ms.length ? Math.round(Math.min(...s.ms)) : 0} / max ${s.ms.length ? Math.round(Math.max(...s.ms)) : 0})  ·  tokens ${s.tokensIn} in / ${s.tokensOut} out  ·  ≈ $${cost.toFixed(5)} (at gpt-5.4-nano list price)\n`,
   );
-  console.log('────────────────────────────────────────────────────────');
-  process.exit(fail > 0 ? 1 : 0);
+  return s;
+}
+
+(async () => {
+  const results: ModelSummary[] = [];
+  for (const m of MODELS) results.push(await runModel(m));
+
+  if (results.length > 1) {
+    console.log('════════ comparison ════════');
+    console.log('model'.padEnd(22) + 'pass'.padEnd(8) + 'median'.padEnd(10) + 'max'.padEnd(10) + 'avg out tok');
+    for (const r of results) {
+      const med = r.ms.length ? [...r.ms].sort((a, b) => a - b)[Math.floor(r.ms.length / 2)] : 0;
+      const avgOut = r.ms.length ? Math.round(r.tokensOut / r.ms.length) : 0;
+      console.log(
+        r.model.padEnd(22) +
+          `${r.pass}/${r.total}`.padEnd(8) +
+          `${Math.round(med)}ms`.padEnd(10) +
+          `${r.ms.length ? Math.round(Math.max(...r.ms)) : 0}ms`.padEnd(10) +
+          `${avgOut}${r.fatal ? '  (unavailable)' : ''}`,
+      );
+    }
+    console.log('');
+  }
+
+  const anyFail = results.some((r) => r.pass < r.total || r.fatal);
+  process.exit(anyFail ? 1 : 0);
 })();