From 8196e7966d449c628ab67002c9aac3d2ee0b6c0b Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:14:11 +0900 Subject: [PATCH 01/12] feat: structured debate with pro/con, confidence, and early-stop (#144) Phase 1-3 of calibration pipeline improvements: - Evidence schema: add confidence, pro, con, decision fields - Critic: receives converter assessment + gaps + prior evidence, outputs structured pro/con arguments per proposal - Arbitrator: holds low-confidence decisions instead of applying - Early-stop: skip Arbitrator when all proposals rejected with high confidence - Cross-run evidence enriched with pro/con for better future reviews - CLAUDE.md calibration section updated Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/agents/calibration/arbitrator.md | 20 ++++++-- .claude/agents/calibration/critic.md | 48 ++++++++++++++++++-- .claude/commands/calibrate-loop.md | 58 +++++++++++++++++++++--- CLAUDE.md | 16 ++++++- src/agents/contracts/evidence.ts | 10 ++++ src/agents/evidence-collector.test.ts | 4 ++ src/agents/evidence-collector.ts | 15 ++++++ 7 files changed, 155 insertions(+), 16 deletions(-) diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index bff26849..4dd3b0fa 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -16,6 +16,10 @@ You receive the Runner's proposals and the Critic's reviews, and make final deci - **proposedDisable: true** → if both Runner and Critic agree, set `enabled: false` in `rule-config.ts`. Decision type: `"disabled"`. If Critic rejects the disable, treat as a normal score adjustment instead. - **New rule proposals** → record in `$RUN_DIR/debate.json` only, do NOT add to `rule-config.ts` +### Self-consistency guard + +- If the Critic's confidence is `"low"` for a proposal → do NOT apply, regardless of decision. Set decision to `"hold"` with reason explaining insufficient confidence. 
The evidence will accumulate for future runs. + ## After Deciding 1. Apply approved changes to `src/core/rules/rule-config.ts` @@ -39,16 +43,23 @@ Return this JSON structure: ```json { "timestamp": "", - "summary": "applied=2 rejected=1 revised=1 newProposals=0", + "summary": "applied=2 rejected=1 hold=1 newProposals=0", + "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", "decisions": [ - {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "reason": "Critic revised, midpoint applied"}, - {"ruleId": "X", "decision": "rejected", "reason": "Critic rejection compelling — insufficient evidence"}, - {"ruleId": "X", "decision": "disabled", "reason": "Converged to zero impact across 3+ runs, all easy"} + {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "confidence": "high", "reason": "Critic revised, midpoint applied"}, + {"ruleId": "X", "decision": "rejected", "confidence": "medium", "reason": "Critic rejection compelling — insufficient evidence"}, + {"ruleId": "X", "decision": "hold", "confidence": "low", "reason": "Low confidence — accumulate more evidence before applying"}, + {"ruleId": "X", "decision": "disabled", "confidence": "high", "reason": "Converged to zero impact across 3+ runs, all easy"} ], "newRuleProposals": [] } ``` +### Field requirements + +- **confidence**: carried from Critic's review for each decision +- **stoppingReason**: why the debate ended — `"normal"` (mixed decisions), `"all-high-confidence-reject"` (all rejected with high confidence), `"low-confidence-hold"` (all held due to low confidence) + ## Rules - **Do NOT write to ANY file except `src/core/rules/rule-config.ts`.** No log files, no `new-rule-proposals.md`, no `debate.json`, no `activity.jsonl`. The orchestrator handles ALL other file I/O. @@ -56,3 +67,4 @@ Return this JSON structure: - Only modify `rule-config.ts` for approved score/severity changes. - Never force-push or amend existing commits. 
- If tests fail, revert everything and report which change caused the failure. +- **Never apply changes with `confidence: "low"`.** Hold them for future evidence accumulation. diff --git a/.claude/agents/calibration/critic.md b/.claude/agents/calibration/critic.md index e4bf2632..11a6ee2d 100644 --- a/.claude/agents/calibration/critic.md +++ b/.claude/agents/calibration/critic.md @@ -16,7 +16,17 @@ All critics follow this base protocol: --- You are the Critic agent in a calibration pipeline. -You receive the Runner's proposals and challenge each one independently. +You receive the Runner's proposals along with supporting evidence, and challenge each one independently. + +## Input Context + +You will receive: +1. **Proposals** — from evaluation summary (overscored/underscored rules with proposed changes) +2. **Converter assessment** — `ruleImpactAssessment` showing actual implementation difficulty per rule +3. **Gap analysis** — actionable pixel gaps between Figma and generated code +4. **Prior evidence** — cross-run calibration evidence for the proposed rules (accumulated from past runs) + +Use ALL inputs to form pro/con arguments. Do not rely on proposals alone. 
## Rejection Rules @@ -50,16 +60,46 @@ Return this JSON structure: "timestamp": "", "summary": "approved=1 rejected=1 revised=1", "reviews": [ - {"ruleId": "X", "decision": "APPROVE", "reason": "3 cases, high confidence"}, - {"ruleId": "X", "decision": "REJECT", "reason": "Rule 1 — only 1 case with low confidence"}, - {"ruleId": "X", "decision": "REVISE", "revised": -7, "reason": "Rule 2 — change too large, midpoint applied"} + { + "ruleId": "X", + "decision": "APPROVE", + "confidence": "high", + "pro": ["3 cases across fixtures show easy implementation", "converter rated actualImpact: easy"], + "con": ["all cases from same design system"], + "reason": "Strong cross-run evidence outweighs single-system concern" + }, + { + "ruleId": "X", + "decision": "REJECT", + "confidence": "low", + "pro": ["1 case shows overscored"], + "con": ["only 1 fixture", "no gap analysis data supports this"], + "reason": "Rule 1 — only 1 case with low confidence" + }, + { + "ruleId": "X", + "decision": "REVISE", + "revised": -7, + "confidence": "medium", + "pro": ["converter found moderate difficulty, current score implies hard"], + "con": ["gap analysis shows some pixel impact in this area"], + "reason": "Rule 2 — change too large, midpoint applied" + } ] } ``` +### Field requirements + +- **confidence**: `"high"` | `"medium"` | `"low"` — your assessment of the proposal's reliability +- **pro**: array of evidence points supporting the proposed change +- **con**: array of evidence points against the proposed change +- **reason**: final verdict synthesizing pro/con + ## Rules - **Do NOT write any files.** The orchestrator handles all file I/O. - Do NOT modify `src/rules/rule-config.ts`. - Be strict. When in doubt, REJECT or REVISE. - Return your full critique so the Arbitrator can decide. +- **Every review MUST include pro, con, and confidence fields.** No exceptions. 
diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index 18386512..4ef9bc5b 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -135,8 +135,17 @@ If zero proposals, write `$RUN_DIR/debate.json` with skip reason and jump to Ste ### Step 5 — Critic +Before spawning the Critic, gather supporting evidence: + +1. Read `$RUN_DIR/conversion.json` → extract `ruleImpactAssessment` and `uncoveredStruggles` +2. Read `$RUN_DIR/gaps.json` (if exists) → extract actionable gaps +3. Read `data/calibration-evidence.json` (if exists) → extract prior evidence for proposed rules + Spawn the `calibration-critic` subagent. In the prompt: -- Include only the proposal list (NOT the Converter's reasoning) +- Include the proposal list from summary.md +- Include the Converter's `ruleImpactAssessment` (actual implementation difficulty per rule) +- Include actionable gaps from Gap Analysis (if available) +- Include prior cross-run evidence for the proposed rules - **Tell the agent: "Return your reviews as JSON. Do NOT write any files."** After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: @@ -145,7 +154,16 @@ After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: "critic": { "timestamp": "", "summary": "approved= rejected= revised=", - "reviews": [ ... ] + "reviews": [ + { + "ruleId": "X", + "decision": "APPROVE|REJECT|REVISE", + "confidence": "high|medium|low", + "pro": ["evidence supporting change"], + "con": ["evidence against change"], + "reason": "..." + } + ] } } ``` @@ -155,6 +173,26 @@ Append to `$RUN_DIR/activity.jsonl`: {"step":"Critic","timestamp":"","result":"approved= rejected= revised=","durationMs":} ``` +#### Early-stop check + +After the Critic returns, check for early termination: + +- If **all reviews** have `decision: "REJECT"` AND `confidence: "high"` → skip Arbitrator. Write debate.json with: + ```json + { + "critic": { ... 
}, + "arbitrator": null, + "stoppingReason": "all-high-confidence-reject" + } + ``` + Append to activity.jsonl: + ```json + {"step":"Arbitrator","timestamp":"","result":"SKIPPED — early-stop: all proposals rejected with high confidence","durationMs":0} + ``` + Jump to Step 6.5. + +Otherwise, proceed to Step 6. + ### Step 6 — Arbitrator Spawn the `calibration-arbitrator` subagent. In the prompt: @@ -167,15 +205,23 @@ After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the "critic": { ... }, "arbitrator": { "timestamp": "", - "summary": "applied= rejected= revised=", - "decisions": [ ... ] + "summary": "applied= rejected= hold=", + "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", + "decisions": [ + { + "ruleId": "X", + "decision": "applied|rejected|hold|disabled", + "confidence": "high|medium|low", + "reason": "..." + } + ] } } ``` Append to `$RUN_DIR/activity.jsonl`: ```json -{"step":"Arbitrator","timestamp":"","result":"applied= rejected=","durationMs":} +{"step":"Arbitrator","timestamp":"","result":"applied= rejected= hold=","durationMs":} ``` ### Step 6.5 — Prune evidence @@ -209,7 +255,7 @@ Report the final summary: similarity, proposals, decisions, and path to `logs/ca - Each agent must be a SEPARATE subagent call (isolated context). - Pass only structured data between agents — never raw reasoning. -- The Critic must NOT see the Runner's or Converter's reasoning, only the proposal list. +- The Critic receives proposals + converter's ruleImpactAssessment + gaps + prior evidence (structured data, not free-form reasoning). - Only the Arbitrator may edit `rule-config.ts`. - Steps 1, 4, 7 are CLI commands — run them directly with Bash. - **CRITICAL: YOU write all files to $RUN_DIR. Subagents (Gap Analyzer, Critic, Arbitrator) MUST return JSON as text — tell them "Do NOT write any files." 
You are the only one who writes to $RUN_DIR.** diff --git a/CLAUDE.md b/CLAUDE.md index 285f79ec..6d1eb561 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -322,10 +322,22 @@ Process: 3. Run `canicode visual-compare` — pixel-level comparison against Figma screenshot 4. Analyze the diff image to categorize pixel gaps (`Gap Analyzer`) 5. Compare conversion difficulty vs rule scores (`canicode calibrate-evaluate`) -6. 6-agent debate loop (`/calibrate-loop`): Analysis → Converter → Gap Analyzer → Evaluation → Critic → Arbitrator +6. Debate loop (`/calibrate-loop`): Analysis → Converter → Gap Analyzer → Evaluation → Critic → Arbitrator + +**Critic receives structured evidence** (#144): +- Proposals from evaluation +- Converter's `ruleImpactAssessment` (actual implementation difficulty per rule) +- Gap analysis (actionable pixel gaps) +- Prior cross-run evidence for proposed rules +- Outputs structured pro/con arguments + confidence level per proposal + +**Early-stop and self-consistency** (#144): +- All proposals rejected with high confidence → Arbitrator skipped (early-stop) +- Low-confidence decisions → held (not applied), evidence accumulates for future runs (self-consistency) +- `stoppingReason` recorded in debate.json for traceability **Cross-run evidence** accumulates across sessions in `data/`: -- `calibration-evidence.json` — overscored/underscored rules (fed to Runner for stronger proposals) +- `calibration-evidence.json` — overscored/underscored rules with confidence, pro/con, decision (fed to Critic for informed review) - `discovery-evidence.json` — uncovered gaps not covered by existing rules (fed to `/add-rule` Researcher) - Discovery evidence is filtered to exclude environment/tooling noise (font CDN, retina/DPI, network, CI constraints) - Evidence is pruned after rules are applied (calibration) or new rules are created (discovery) diff --git a/src/agents/contracts/evidence.ts b/src/agents/contracts/evidence.ts index b97625fb..01035a43 100644 --- 
a/src/agents/contracts/evidence.ts +++ b/src/agents/contracts/evidence.ts @@ -8,6 +8,11 @@ export const CalibrationEvidenceEntrySchema = z.object({ actualDifficulty: z.string(), fixture: z.string(), timestamp: z.string(), + // Phase 1 fields (#144) — optional for backward compatibility with existing evidence + confidence: z.enum(["high", "medium", "low"]).optional(), + pro: z.array(z.string()).optional(), + con: z.array(z.string()).optional(), + decision: z.enum(["APPROVE", "REJECT", "REVISE"]).optional(), }); export type CalibrationEvidenceEntry = z.infer; @@ -17,6 +22,11 @@ export const CrossRunEvidenceGroupSchema = z.object({ underscoredCount: z.number(), overscoredDifficulties: z.array(z.string()), underscoredDifficulties: z.array(z.string()), + // Aggregated pro/con from all entries for this rule + allPro: z.array(z.string()).optional(), + allCon: z.array(z.string()).optional(), + lastConfidence: z.enum(["high", "medium", "low"]).optional(), + lastDecision: z.enum(["APPROVE", "REJECT", "REVISE"]).optional(), }); export type CrossRunEvidenceGroup = z.infer; diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index 25258793..3f9c6dfd 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -62,12 +62,16 @@ describe("evidence-collector", () => { underscoredCount: 1, overscoredDifficulties: ["easy", "moderate"], underscoredDifficulties: ["hard"], + allPro: [], + allCon: [], }); expect(result["rule-b"]).toEqual({ overscoredCount: 0, underscoredCount: 1, overscoredDifficulties: [], underscoredDifficulties: ["hard"], + allPro: [], + allCon: [], }); }); diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 83391ed9..8372ce5a 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -63,6 +63,8 @@ export function loadCalibrationEvidence( underscoredCount: 0, overscoredDifficulties: [], underscoredDifficulties: [], + allPro: [], 
+ allCon: [], }; result[entry.ruleId] = group; } @@ -74,6 +76,19 @@ export function loadCalibrationEvidence( group.underscoredCount++; group.underscoredDifficulties.push(entry.actualDifficulty); } + + // Aggregate pro/con from enriched entries + if (entry.pro) { + group.allPro ??= []; + group.allPro.push(...entry.pro); + } + if (entry.con) { + group.allCon ??= []; + group.allCon.push(...entry.con); + } + // Keep last confidence/decision (most recent entry wins) + if (entry.confidence) group.lastConfidence = entry.confidence; + if (entry.decision) group.lastDecision = entry.decision; } return result; From a298d33b982f9260342402f2607021ca4e607a25 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:22:14 +0900 Subject: [PATCH 02/12] fix: evidence enrichment, hold decisions, and convergence handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review fixes: 1. Evidence enrichment gap: add calibrate-enrich-evidence CLI command and enrichCalibrationEvidence() — writes Critic's pro/con/confidence back to data/calibration-evidence.json (was only in ephemeral logs/) 2. Restore "revised" decision type in Arbitrator (was accidentally dropped) 3. Convergence: "hold" decisions count as not-converged (need more evidence) 4. 
Early-stop convergence: stoppingReason in debate.json → converged=true Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/agents/calibration/arbitrator.md | 11 +++--- .claude/commands/calibrate-loop.md | 18 +++++++-- src/agents/evidence-collector.ts | 36 ++++++++++++++++++ src/agents/run-directory.ts | 28 +++++++++----- .../commands/internal/fixture-management.ts | 37 +++++++++++++++++++ src/cli/index.ts | 3 +- 6 files changed, 114 insertions(+), 19 deletions(-) diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index 4dd3b0fa..b0106e5c 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -10,9 +10,9 @@ You receive the Runner's proposals and the Critic's reviews, and make final deci ## Decision Rules -- **Both APPROVE** → apply Runner's proposed value -- **Critic REJECT** → keep current score (no change) -- **Critic REVISE** → apply the Critic's revised value +- **Both APPROVE** → apply Runner's proposed value (decision: `"applied"`) +- **Critic REJECT** → keep current score (decision: `"rejected"`) +- **Critic REVISE** → apply the Critic's revised value (decision: `"revised"`) - **proposedDisable: true** → if both Runner and Critic agree, set `enabled: false` in `rule-config.ts`. Decision type: `"disabled"`. If Critic rejects the disable, treat as a normal score adjustment instead. 
- **New rule proposals** → record in `$RUN_DIR/debate.json` only, do NOT add to `rule-config.ts` @@ -43,10 +43,11 @@ Return this JSON structure: ```json { "timestamp": "", - "summary": "applied=2 rejected=1 hold=1 newProposals=0", + "summary": "applied=1 revised=1 rejected=1 hold=1 newProposals=0", "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", "decisions": [ - {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "confidence": "high", "reason": "Critic revised, midpoint applied"}, + {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "confidence": "high", "reason": "Strong evidence, applying Runner's value"}, + {"ruleId": "X", "decision": "revised", "before": -10, "after": -8, "confidence": "medium", "reason": "Critic revised, midpoint applied"}, {"ruleId": "X", "decision": "rejected", "confidence": "medium", "reason": "Critic rejection compelling — insufficient evidence"}, {"ruleId": "X", "decision": "hold", "confidence": "low", "reason": "Low confidence — accumulate more evidence before applying"}, {"ruleId": "X", "decision": "disabled", "confidence": "high", "reason": "Converged to zero impact across 3+ runs, all easy"} diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index 4ef9bc5b..ffa1a859 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -205,13 +205,15 @@ After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the "critic": { ... }, "arbitrator": { "timestamp": "", - "summary": "applied= rejected= hold=", + "summary": "applied= revised= rejected= hold=", "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", "decisions": [ { "ruleId": "X", - "decision": "applied|rejected|hold|disabled", + "decision": "applied|revised|rejected|hold|disabled", "confidence": "high|medium|low", + "before": -10, + "after": -7, "reason": "..." 
} ] @@ -224,9 +226,17 @@ Append to `$RUN_DIR/activity.jsonl`: {"step":"Arbitrator","timestamp":"","result":"applied= rejected= hold=","durationMs":} ``` -### Step 6.5 — Prune evidence +### Step 6.5 — Enrich and prune evidence -After the Arbitrator applies changes, prune calibration evidence for the applied rules: +After the debate (or early-stop), enrich `data/calibration-evidence.json` with the Critic's structured pro/con/confidence. This ensures cross-run evidence persists beyond the ephemeral `logs/` directory. + +```bash +npx canicode calibrate-enrich-evidence $RUN_DIR +``` + +This reads `debate.json`, extracts the Critic's reviews (pro, con, confidence, decision), and updates matching entries in `data/calibration-evidence.json`. Runs for both normal and early-stop paths. + +Then prune calibration evidence for the applied rules: ```bash npx canicode calibrate-prune-evidence $RUN_DIR diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 8372ce5a..c8cca5c2 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -134,6 +134,42 @@ export function pruneCalibrationEvidence( writeJsonArray(evidencePath, pruned); } +/** + * Enrich existing calibration evidence entries with Critic's structured review data. + * Matches by ruleId and updates confidence/pro/con/decision fields. + * Entries without a matching review are left unchanged. 
+ */ +export function enrichCalibrationEvidence( + reviews: Array<{ + ruleId: string; + confidence?: "high" | "medium" | "low"; + pro?: string[]; + con?: string[]; + decision?: "APPROVE" | "REJECT" | "REVISE"; + }>, + evidencePath: string = DEFAULT_CALIBRATION_PATH +): void { + if (reviews.length === 0) return; + const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); + if (existing.length === 0) return; + + const reviewByRule = new Map(reviews.map((r) => [r.ruleId.trim(), r])); + + const enriched = existing.map((entry) => { + const review = reviewByRule.get(entry.ruleId.trim()); + if (!review) return entry; + return { + ...entry, + ...(review.confidence && { confidence: review.confidence }), + ...(review.pro && { pro: review.pro }), + ...(review.con && { con: review.con }), + ...(review.decision && { decision: review.decision }), + }; + }); + + writeJsonArray(evidencePath, enriched); +} + // --- Discovery evidence --- const DEFAULT_DISCOVERY_PATH = resolve("data/discovery-evidence.json"); diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index f44133ce..f30a5c18 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -253,6 +253,7 @@ export interface ConvergenceSummary { applied: number; revised: number; rejected: number; + hold: number; kept: number; total: number; reason: string; @@ -266,36 +267,44 @@ export function checkConvergence(runDir: string, options?: ConvergenceOptions): const debate = parseDebateResult(runDir); if (!debate) { - return { converged: false, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: "no debate.json found" }; + return { converged: false, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: "no debate.json found" }; } if (debate.skipped) { - return { converged: true, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: debate.skipped }; + return { converged: true, mode, applied: 0, revised: 0, rejected: 
0, hold: 0, kept: 0, total: 0, reason: debate.skipped }; } if (!debate.arbitrator) { - return { converged: false, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: "no arbitrator result" }; + // Early-stop: Arbitrator skipped because all proposals rejected with high confidence + const stoppingReason = (debate as Record)["stoppingReason"]; + if (typeof stoppingReason === "string" && stoppingReason.length > 0) { + return { converged: true, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: `early-stop: ${stoppingReason}` }; + } + return { converged: false, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: "no arbitrator result" }; } const decisions = debate.arbitrator.decisions; const applied = decisions.filter((d) => d.decision.trim().toLowerCase() === "applied").length; const revised = decisions.filter((d) => d.decision.trim().toLowerCase() === "revised").length; const rejected = decisions.filter((d) => d.decision.trim().toLowerCase() === "rejected").length; - const kept = decisions.length - applied - revised - rejected; + const hold = decisions.filter((d) => d.decision.trim().toLowerCase() === "hold").length; + const kept = decisions.length - applied - revised - rejected - hold; const total = decisions.length; + // hold = "not enough confidence to decide" → not converged (need more evidence) const converged = options?.lenient - ? (applied + revised) === 0 - : (applied + revised) === 0 && rejected === 0; + ? (applied + revised + hold) === 0 + : (applied + revised + hold) === 0 && rejected === 0; const parts: string[] = []; if (applied > 0) parts.push(`${applied} applied`); if (revised > 0) parts.push(`${revised} revised`); if (rejected > 0) parts.push(`${rejected} rejected`); + if (hold > 0) parts.push(`${hold} hold`); if (kept > 0) parts.push(`${kept} kept`); const countsStr = parts.length > 0 ? parts.join(", ") : "no decisions"; const verdict = converged ? 
"converged" : "not converged"; const reason = `${verdict} (${mode}) — ${countsStr} (${total} total)`; - return { converged, mode, applied, revised, rejected, kept, total, reason }; + return { converged, mode, applied, revised, rejected, hold, kept, total, reason }; } /** Options for convergence checking. */ @@ -322,9 +331,10 @@ export function isConverged(runDir: string, options?: ConvergenceOptions): boole const dec = d.decision.trim().toLowerCase(); return dec === "applied" || dec === "revised"; }).length; + const hold = decisions.filter((d) => d.decision.trim().toLowerCase() === "hold").length; const rejected = decisions.filter((d) => d.decision.trim().toLowerCase() === "rejected").length; if (options?.lenient) { - return applied === 0; + return applied === 0 && hold === 0; } - return applied === 0 && rejected === 0; + return applied === 0 && hold === 0 && rejected === 0; } diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index 35bf5161..ff6c8e87 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -15,6 +15,7 @@ import { import { pruneCalibrationEvidence, pruneDiscoveryEvidence, + enrichCalibrationEvidence, } from "../../../agents/evidence-collector.js"; export function registerFixtureManagement(cli: CAC): void { @@ -112,6 +113,42 @@ export function registerFixtureManagement(cli: CAC): void { }); } +export function registerEvidenceEnrich(cli: CAC): void { + cli + .command( + "calibrate-enrich-evidence ", + "Enrich evidence with Critic's pro/con/confidence from debate.json" + ) + .action((runDir: string) => { + if (!existsSync(resolve(runDir))) { + console.log(`Run directory not found: ${runDir}`); + return; + } + const debate = parseDebateResult(resolve(runDir)); + if (!debate?.critic) { + console.log("No critic reviews in debate.json — nothing to enrich."); + return; + } + + const reviews = debate.critic.reviews.map((r) => { + const 
raw = r as Record; + const entry: Parameters[0][number] = { ruleId: r.ruleId }; + const conf = raw["confidence"]; + if (conf === "high" || conf === "medium" || conf === "low") entry.confidence = conf; + const pro = raw["pro"]; + if (Array.isArray(pro)) entry.pro = pro as string[]; + const con = raw["con"]; + if (Array.isArray(con)) entry.con = con as string[]; + const dec = r.decision; + if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE") entry.decision = dec; + return entry; + }); + + enrichCalibrationEvidence(reviews); + console.log(`Enriched calibration evidence with ${reviews.length} review(s)`); + }); +} + export function registerEvidencePrune(cli: CAC): void { cli .command( diff --git a/src/cli/index.ts b/src/cli/index.ts index 9ee3caef..3ecb00ab 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -32,7 +32,7 @@ import { registerCalibrateAnalyze } from "./commands/internal/calibrate-analyze. import { registerCalibrateEvaluate } from "./commands/internal/calibrate-evaluate.js"; import { registerCalibrateGapReport } from "./commands/internal/calibrate-gap-report.js"; import { registerCalibrateRun } from "./commands/internal/calibrate-run.js"; -import { registerFixtureManagement, registerEvidencePrune } from "./commands/internal/fixture-management.js"; +import { registerFixtureManagement, registerEvidenceEnrich, registerEvidencePrune } from "./commands/internal/fixture-management.js"; const require = createRequire(import.meta.url); const pkg = require("../../package.json") as { version: string }; @@ -79,6 +79,7 @@ registerCalibrateEvaluate(cli); registerCalibrateGapReport(cli); registerCalibrateRun(cli); registerFixtureManagement(cli); +registerEvidenceEnrich(cli); registerEvidencePrune(cli); // ============================================ From 534daaf178a1488ff9cb51488b7ac312de9fc8c1 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:32:01 +0900 Subject: [PATCH 03/12] =?UTF-8?q?fix:=20review=20feedback=20=E2=80=94=20co?= 
=?UTF-8?q?nvergence=20sync,=20fixture-scoped=20enrich,=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses code review feedback: 1. isConverged now delegates to checkConvergence (single source of truth for early-stop / hold logic) 2. enrichCalibrationEvidence scoped by (ruleId, fixture) — no longer overwrites entries from other fixtures 3. stoppingReason canonical location: debate.json top level only 4. CLI calibrate-enrich-evidence uses process.exitCode = 1 on errors 5. Tests added: enrichCalibrationEvidence (3), checkConvergence early-stop, hold convergence (strict + lenient), isConverged delegation Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/agents/calibration/arbitrator.md | 3 +- .claude/commands/calibrate-loop.md | 7 ++- src/agents/evidence-collector.test.ts | 43 +++++++++++++- src/agents/evidence-collector.ts | 5 +- src/agents/run-directory.test.ts | 57 +++++++++++++++++++ src/agents/run-directory.ts | 19 +------ .../commands/internal/fixture-management.ts | 23 ++++++-- 7 files changed, 127 insertions(+), 30 deletions(-) diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index b0106e5c..90026041 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -44,7 +44,6 @@ Return this JSON structure: { "timestamp": "", "summary": "applied=1 revised=1 rejected=1 hold=1 newProposals=0", - "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", "decisions": [ {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "confidence": "high", "reason": "Strong evidence, applying Runner's value"}, {"ruleId": "X", "decision": "revised", "before": -10, "after": -8, "confidence": "medium", "reason": "Critic revised, midpoint applied"}, @@ -59,7 +58,7 @@ Return this JSON structure: ### Field requirements - **confidence**: carried from Critic's review for each decision -- **stoppingReason**: 
why the debate ended — `"normal"` (mixed decisions), `"all-high-confidence-reject"` (all rejected with high confidence), `"low-confidence-hold"` (all held due to low confidence) +- **Note**: `stoppingReason` is written by the orchestrator at the debate.json top level, not inside the arbitrator object ## Rules diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index ffa1a859..f40c8fb4 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -199,14 +199,14 @@ Spawn the `calibration-arbitrator` subagent. In the prompt: - Include proposals and the Critic's reviews from `$RUN_DIR/debate.json` - **Tell the agent: "Return your decisions as JSON. Only edit rule-config.ts if applying changes. Do NOT write to logs."** -After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field: +After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field. Also set `stoppingReason` at the **top level** (canonical location — never inside arbitrator object): + ```json { "critic": { ... }, "arbitrator": { "timestamp": "", "summary": "applied= revised= rejected= hold=", - "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", "decisions": [ { "ruleId": "X", @@ -217,7 +217,8 @@ After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the "reason": "..." 
} ] - } + }, + "stoppingReason": "normal|low-confidence-hold" } ``` diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index 3f9c6dfd..1841c26c 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -1,9 +1,10 @@ -import { mkdirSync, rmSync, writeFileSync, readFileSync } from "node:fs"; +import { existsSync, mkdirSync, rmSync, writeFileSync, readFileSync } from "node:fs"; import { join } from "node:path"; import { tmpdir } from "node:os"; import { loadCalibrationEvidence, appendCalibrationEvidence, + enrichCalibrationEvidence, pruneCalibrationEvidence, loadDiscoveryEvidence, appendDiscoveryEvidence, @@ -140,6 +141,46 @@ describe("evidence-collector", () => { }); }); + describe("enrichCalibrationEvidence", () => { + it("enriches entries matching (ruleId, fixture)", () => { + const entries: CalibrationEvidenceEntry[] = [ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + { ruleId: "rule-a", type: "overscored", actualDifficulty: "moderate", fixture: "fx2", timestamp: "t2" }, + { ruleId: "rule-b", type: "underscored", actualDifficulty: "hard", fixture: "fx1", timestamp: "t3" }, + ]; + writeFileSync(calPath, JSON.stringify(entries), "utf-8"); + + enrichCalibrationEvidence( + [{ ruleId: "rule-a", confidence: "high", pro: ["easy in practice"], con: ["only 1 case"], decision: "APPROVE" }], + "fx1", + calPath, + ); + + const result = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + // Only fx1 entry for rule-a is enriched + expect(result[0]!.confidence).toBe("high"); + expect(result[0]!.pro).toEqual(["easy in practice"]); + // fx2 entry for rule-a is NOT enriched (different fixture) + expect(result[1]!.confidence).toBeUndefined(); + // rule-b is NOT enriched (different ruleId) + expect(result[2]!.confidence).toBeUndefined(); + }); + + it("does nothing when evidence file is empty", () => { + 
enrichCalibrationEvidence([{ ruleId: "rule-a" }], "fx1", calPath); + expect(existsSync(calPath)).toBe(false); + }); + + it("does nothing when reviews array is empty", () => { + writeFileSync(calPath, JSON.stringify([ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + ]), "utf-8"); + enrichCalibrationEvidence([], "fx1", calPath); + const result = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + expect(result[0]!.confidence).toBeUndefined(); + }); + }); + describe("pruneCalibrationEvidence", () => { it("removes entries for specified ruleIds", () => { const entries: CalibrationEvidenceEntry[] = [ diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index c8cca5c2..8a102ff6 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -136,7 +136,7 @@ export function pruneCalibrationEvidence( /** * Enrich existing calibration evidence entries with Critic's structured review data. - * Matches by ruleId and updates confidence/pro/con/decision fields. + * Matches by (ruleId, fixture) to avoid overwriting entries from other fixtures. * Entries without a matching review are left unchanged. 
*/ export function enrichCalibrationEvidence( @@ -147,6 +147,7 @@ export function enrichCalibrationEvidence( con?: string[]; decision?: "APPROVE" | "REJECT" | "REVISE"; }>, + fixture: string, evidencePath: string = DEFAULT_CALIBRATION_PATH ): void { if (reviews.length === 0) return; @@ -154,8 +155,10 @@ export function enrichCalibrationEvidence( if (existing.length === 0) return; const reviewByRule = new Map(reviews.map((r) => [r.ruleId.trim(), r])); + const fixtureTrimmed = fixture.trim(); const enriched = existing.map((entry) => { + if (entry.fixture.trim() !== fixtureTrimmed) return entry; const review = reviewByRule.get(entry.ruleId.trim()); if (!review) return entry; return { diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts index d56d74a3..06253dfb 100644 --- a/src/agents/run-directory.test.ts +++ b/src/agents/run-directory.test.ts @@ -333,6 +333,63 @@ describe("checkConvergence", () => { expect(summary.converged).toBe(false); expect(summary.reason).toBe("no arbitrator result"); }); + + it("converged on early-stop (stoppingReason + no arbitrator)", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + critic: { summary: "rejected=2", reviews: [] }, + arbitrator: null, + stoppingReason: "all-high-confidence-reject", + }), + ); + const summary = checkConvergence(tempDir); + expect(summary.converged).toBe(true); + expect(summary.reason).toContain("early-stop"); + expect(summary.reason).toContain("all-high-confidence-reject"); + }); + + it("hold decisions prevent convergence", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "hold=1", + decisions: [{ ruleId: "a", decision: "hold" }], + }, + }), + ); + const summary = checkConvergence(tempDir); + expect(summary.converged).toBe(false); + expect(summary.hold).toBe(1); + }); + + it("hold prevents convergence even in lenient mode", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + 
arbitrator: { + summary: "hold=1", + decisions: [{ ruleId: "a", decision: "hold" }], + }, + }), + ); + const summary = checkConvergence(tempDir, { lenient: true }); + expect(summary.converged).toBe(false); + }); + + it("isConverged delegates to checkConvergence", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + critic: { summary: "rejected=1", reviews: [] }, + arbitrator: null, + stoppingReason: "all-high-confidence-reject", + }), + ); + // isConverged should also return true for early-stop + expect(isConverged(tempDir)).toBe(true); + }); }); describe("listCalibrationRuns", () => { diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index f30a5c18..3fed7832 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -318,23 +318,8 @@ export interface ConvergenceOptions { /** * Check if a calibration run has converged. - * Strict: no applied/revised AND no rejected decisions. - * Lenient: no applied/revised only (rejected proposals allowed). + * Delegates to checkConvergence to avoid duplicating early-stop / hold logic. 
*/ export function isConverged(runDir: string, options?: ConvergenceOptions): boolean { - const debate = parseDebateResult(runDir); - if (!debate) return false; - if (debate.skipped) return true; // zero proposals = converged - if (!debate.arbitrator) return false; - const decisions = debate.arbitrator.decisions; - const applied = decisions.filter((d) => { - const dec = d.decision.trim().toLowerCase(); - return dec === "applied" || dec === "revised"; - }).length; - const hold = decisions.filter((d) => d.decision.trim().toLowerCase() === "hold").length; - const rejected = decisions.filter((d) => d.decision.trim().toLowerCase() === "rejected").length; - if (options?.lenient) { - return applied === 0 && hold === 0; - } - return applied === 0 && hold === 0 && rejected === 0; + return checkConvergence(runDir, options).converged; } diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index ff6c8e87..a0b4fde1 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -1,5 +1,5 @@ import { existsSync } from "node:fs"; -import { resolve } from "node:path"; +import { basename, resolve } from "node:path"; import type { CAC } from "cac"; import { @@ -7,6 +7,7 @@ import { listDoneFixtures, moveFixtureToDone, parseDebateResult, + parseRunDirName, extractAppliedRuleIds, extractFixtureName, resolveLatestRunDir, @@ -120,16 +121,26 @@ export function registerEvidenceEnrich(cli: CAC): void { "Enrich evidence with Critic's pro/con/confidence from debate.json" ) .action((runDir: string) => { - if (!existsSync(resolve(runDir))) { - console.log(`Run directory not found: ${runDir}`); + const resolvedDir = resolve(runDir); + if (!existsSync(resolvedDir)) { + console.error(`Run directory not found: ${runDir}`); + process.exitCode = 1; return; } - const debate = parseDebateResult(resolve(runDir)); + const debate = parseDebateResult(resolvedDir); if (!debate?.critic) { 
console.log("No critic reviews in debate.json — nothing to enrich."); return; } + // Extract fixture name from run directory (e.g. "material3-kit--2026-03-26-0900" → "material3-kit") + const { name: fixture } = parseRunDirName(basename(resolvedDir)); + if (!fixture) { + console.error("Cannot extract fixture name from run directory"); + process.exitCode = 1; + return; + } + const reviews = debate.critic.reviews.map((r) => { const raw = r as Record; const entry: Parameters[0][number] = { ruleId: r.ruleId }; @@ -144,8 +155,8 @@ export function registerEvidenceEnrich(cli: CAC): void { return entry; }); - enrichCalibrationEvidence(reviews); - console.log(`Enriched calibration evidence with ${reviews.length} review(s)`); + enrichCalibrationEvidence(reviews, fixture); + console.log(`Enriched calibration evidence for fixture "${fixture}" with ${reviews.length} review(s)`); }); } From d241df15ba56505bdb67cc0710cc37a5ee993a8e Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:35:22 +0900 Subject: [PATCH 04/12] fix: stoppingReason in Zod schema, dedupe pro/con, CLI exit codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review round 2: A. stoppingReason added to DebateResultSchema — no more type cast B. enrichCalibrationEvidence warns when no entries match fixture C. calibrate-prune-evidence CLI now uses process.exitCode = 1 D. allPro/allCon deduplicated in loadCalibrationEvidence grouping F. stoppingReason canonical location commented in schema E (model selection) is a judgment call, not a code change — to evaluate later. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/evidence-collector.ts | 15 ++++++++++++--- src/agents/run-directory.ts | 7 ++++--- src/cli/commands/internal/fixture-management.ts | 6 ++++-- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 8a102ff6..70023add 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -77,14 +77,18 @@ export function loadCalibrationEvidence( group.underscoredDifficulties.push(entry.actualDifficulty); } - // Aggregate pro/con from enriched entries + // Aggregate pro/con from enriched entries (deduplicated) if (entry.pro) { group.allPro ??= []; - group.allPro.push(...entry.pro); + for (const p of entry.pro) { + if (!group.allPro.includes(p)) group.allPro.push(p); + } } if (entry.con) { group.allCon ??= []; - group.allCon.push(...entry.con); + for (const c of entry.con) { + if (!group.allCon.includes(c)) group.allCon.push(c); + } } // Keep last confidence/decision (most recent entry wins) if (entry.confidence) group.lastConfidence = entry.confidence; @@ -157,10 +161,12 @@ export function enrichCalibrationEvidence( const reviewByRule = new Map(reviews.map((r) => [r.ruleId.trim(), r])); const fixtureTrimmed = fixture.trim(); + let matchCount = 0; const enriched = existing.map((entry) => { if (entry.fixture.trim() !== fixtureTrimmed) return entry; const review = reviewByRule.get(entry.ruleId.trim()); if (!review) return entry; + matchCount++; return { ...entry, ...(review.confidence && { confidence: review.confidence }), @@ -170,6 +176,9 @@ export function enrichCalibrationEvidence( }; }); + if (matchCount === 0) { + console.warn(`[enrich] No entries matched fixture="${fixture}" — evidence unchanged`); + } writeJsonArray(evidencePath, enriched); } diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index 3fed7832..1d39c43f 100644 --- a/src/agents/run-directory.ts +++ 
b/src/agents/run-directory.ts @@ -184,10 +184,12 @@ const ArbitratorSchema = z.object({ newRuleProposals: z.array(z.unknown()).optional(), }).passthrough(); +/** stoppingReason canonical location: debate.json top level (not inside arbitrator) */ const DebateResultSchema = z.object({ critic: CriticSchema.nullable().default(null), arbitrator: ArbitratorSchema.nullable().default(null), skipped: z.string().optional(), + stoppingReason: z.string().optional(), }).passthrough(); /** A single decision from the Arbitrator in debate.json. */ @@ -274,9 +276,8 @@ export function checkConvergence(runDir: string, options?: ConvergenceOptions): } if (!debate.arbitrator) { // Early-stop: Arbitrator skipped because all proposals rejected with high confidence - const stoppingReason = (debate as Record)["stoppingReason"]; - if (typeof stoppingReason === "string" && stoppingReason.length > 0) { - return { converged: true, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: `early-stop: ${stoppingReason}` }; + if (debate.stoppingReason) { + return { converged: true, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: `early-stop: ${debate.stoppingReason}` }; } return { converged: false, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: "no arbitrator result" }; } diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index a0b4fde1..f3109448 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -168,12 +168,14 @@ export function registerEvidencePrune(cli: CAC): void { ) .action((runDir: string) => { if (!existsSync(resolve(runDir))) { - console.log(`Run directory not found: ${runDir}`); + console.error(`Run directory not found: ${runDir}`); + process.exitCode = 1; return; } const debate = parseDebateResult(resolve(runDir)); if (!debate) { - console.log("No debate.json found — nothing to 
prune."); + console.error("No debate.json found — nothing to prune."); + process.exitCode = 1; return; } From cc3e4ba005ed030a93e189229f4b35bfb29fd8c1 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:36:13 +0900 Subject: [PATCH 05/12] chore: upgrade Critic and Arbitrator to opus-4-6 These are the most critical judgment steps in the calibration pipeline. Structured pro/con + confidence reasoning benefits from stronger model. Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/agents/calibration/arbitrator.md | 2 +- .claude/agents/calibration/critic.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index 90026041..72d01f51 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -2,7 +2,7 @@ name: calibration-arbitrator description: Makes final calibration decisions by weighing Runner and Critic. Applies approved changes to rule-config.ts and commits. Use after calibration-critic completes. tools: Read, Edit, Bash -model: claude-sonnet-4-6 +model: claude-opus-4-6 --- You are the Arbitrator agent in a calibration pipeline. diff --git a/.claude/agents/calibration/critic.md b/.claude/agents/calibration/critic.md index 11a6ee2d..d0257438 100644 --- a/.claude/agents/calibration/critic.md +++ b/.claude/agents/calibration/critic.md @@ -2,7 +2,7 @@ name: calibration-critic description: Challenges calibration proposals from Runner. Rejects low-confidence or over-aggressive adjustments. Use after calibration-runner completes. 
tools: Read -model: claude-sonnet-4-6 +model: claude-opus-4-6 --- ## Common Review Framework From b0039c78dfde1629ab661e8e1fac9c8a613e8126 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:39:37 +0900 Subject: [PATCH 06/12] =?UTF-8?q?fix:=20final=20polish=20=E2=80=94=20Criti?= =?UTF-8?q?cSchema=20types,=20enrich=20early-return,=20dedupe=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. CriticReviewSchema: confidence/pro/con as typed optional fields, removing Record casts in fixture-management.ts 2. enrichCalibrationEvidence: early-return when no matches (skip I/O) 3. stoppingReason: omit for normal completion, only set for special cases 4. Test: pro/con deduplication in loadCalibrationEvidence Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/commands/calibrate-loop.md | 4 ++-- src/agents/evidence-collector.test.ts | 14 ++++++++++++++ src/agents/evidence-collector.ts | 1 + src/agents/run-directory.ts | 17 +++++++++++------ src/cli/commands/internal/fixture-management.ts | 10 +++------- 5 files changed, 31 insertions(+), 15 deletions(-) diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index f40c8fb4..848d3df8 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -199,7 +199,7 @@ Spawn the `calibration-arbitrator` subagent. In the prompt: - Include proposals and the Critic's reviews from `$RUN_DIR/debate.json` - **Tell the agent: "Return your decisions as JSON. Only edit rule-config.ts if applying changes. Do NOT write to logs."** -After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field. Also set `stoppingReason` at the **top level** (canonical location — never inside arbitrator object): +After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field. 
Only set `stoppingReason` at the **top level** when non-normal termination occurred (e.g. `"low-confidence-hold"`, `"all-high-confidence-reject"`). Omit the field for normal completion: ```json { @@ -218,7 +218,7 @@ After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the } ] }, - "stoppingReason": "normal|low-confidence-hold" + "stoppingReason": "low-confidence-hold" } ``` diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index 1841c26c..b2ced476 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -76,6 +76,20 @@ describe("evidence-collector", () => { }); }); + it("deduplicates pro/con across entries", () => { + const entries: CalibrationEvidenceEntry[] = [ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1", + pro: ["easy in practice", "common pattern"], con: ["small fixture"] }, + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx2", timestamp: "t2", + pro: ["easy in practice", "new evidence"], con: ["small fixture", "single run"] }, + ]; + writeFileSync(calPath, JSON.stringify(entries), "utf-8"); + + const result = loadCalibrationEvidence(calPath); + expect(result["rule-a"]!.allPro).toEqual(["easy in practice", "common pattern", "new evidence"]); + expect(result["rule-a"]!.allCon).toEqual(["small fixture", "single run"]); + }); + it("handles malformed JSON gracefully", () => { writeFileSync(calPath, "not json", "utf-8"); const result = loadCalibrationEvidence(calPath); diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 70023add..3ad2d9fe 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -178,6 +178,7 @@ export function enrichCalibrationEvidence( if (matchCount === 0) { console.warn(`[enrich] No entries matched fixture="${fixture}" — evidence unchanged`); + return; } writeJsonArray(evidencePath, enriched); } 
diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index 1d39c43f..028eb354 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -168,14 +168,19 @@ const DebateDecisionSchema = z.object({ reason: z.string().optional(), }).passthrough(); +const CriticReviewSchema = z.object({ + ruleId: z.string(), + decision: z.string(), + reason: z.string().optional(), + revised: z.number().optional(), + confidence: z.enum(["high", "medium", "low"]).optional(), + pro: z.array(z.string()).optional(), + con: z.array(z.string()).optional(), +}).passthrough(); + const CriticSchema = z.object({ summary: z.string(), - reviews: z.array(z.object({ - ruleId: z.string(), - decision: z.string(), - reason: z.string().optional(), - revised: z.number().optional(), - }).passthrough()), + reviews: z.array(CriticReviewSchema), }).passthrough(); const ArbitratorSchema = z.object({ diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index f3109448..4158a451 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -142,14 +142,10 @@ export function registerEvidenceEnrich(cli: CAC): void { } const reviews = debate.critic.reviews.map((r) => { - const raw = r as Record; const entry: Parameters[0][number] = { ruleId: r.ruleId }; - const conf = raw["confidence"]; - if (conf === "high" || conf === "medium" || conf === "low") entry.confidence = conf; - const pro = raw["pro"]; - if (Array.isArray(pro)) entry.pro = pro as string[]; - const con = raw["con"]; - if (Array.isArray(con)) entry.con = con as string[]; + if (r.confidence) entry.confidence = r.confidence; + if (r.pro) entry.pro = r.pro; + if (r.con) entry.con = r.con; const dec = r.decision; if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE") entry.decision = dec; return entry; From 4018b29b916f9eb0e3147ae3b16fc9609678f279 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 
Mar 2026 18:43:16 +0900 Subject: [PATCH 07/12] refactor: extract deterministic logic from orchestrator into CLI commands Three deterministic tasks the orchestrator (LLM) was doing are now handled by two CLI commands: 1. calibrate-gather-evidence Reads conversion.json, gaps.json, summary.md, calibration-evidence.json and writes a single critic-evidence.json for the Critic prompt. Was: orchestrator reads 3-4 files and extracts relevant parts (error-prone) 2. calibrate-finalize-debate After Critic: checks early-stop (all REJECT + high confidence) After Arbitrator: determines stoppingReason (low-confidence-hold) Was: orchestrator evaluates conditions inline (can misjudge) calibrate-loop.md updated to use CLI commands instead of inline logic. Orchestrator now only: runs CLI commands + spawns LLM agents. Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/commands/calibrate-loop.md | 49 +++-- src/cli/commands/internal/calibrate-debate.ts | 193 ++++++++++++++++++ src/cli/index.ts | 3 + 3 files changed, 223 insertions(+), 22 deletions(-) create mode 100644 src/cli/commands/internal/calibrate-debate.ts diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index 848d3df8..9fadb295 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -135,17 +135,19 @@ If zero proposals, write `$RUN_DIR/debate.json` with skip reason and jump to Ste ### Step 5 — Critic -Before spawning the Critic, gather supporting evidence: +Gather supporting evidence (deterministic CLI — no LLM): -1. Read `$RUN_DIR/conversion.json` → extract `ruleImpactAssessment` and `uncoveredStruggles` -2. Read `$RUN_DIR/gaps.json` (if exists) → extract actionable gaps -3. 
Read `data/calibration-evidence.json` (if exists) → extract prior evidence for proposed rules +```bash +npx canicode calibrate-gather-evidence $RUN_DIR +``` + +This reads `conversion.json`, `gaps.json`, `summary.md`, and `data/calibration-evidence.json`, and writes a single `$RUN_DIR/critic-evidence.json` with structured data for the Critic. + +Read `$RUN_DIR/critic-evidence.json` and include it in the Critic prompt. Spawn the `calibration-critic` subagent. In the prompt: - Include the proposal list from summary.md -- Include the Converter's `ruleImpactAssessment` (actual implementation difficulty per rule) -- Include actionable gaps from Gap Analysis (if available) -- Include prior cross-run evidence for the proposed rules +- Include the gathered evidence from `critic-evidence.json` - **Tell the agent: "Return your reviews as JSON. Do NOT write any files."** After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: @@ -173,25 +175,21 @@ Append to `$RUN_DIR/activity.jsonl`: {"step":"Critic","timestamp":"","result":"approved= rejected= revised=","durationMs":} ``` -#### Early-stop check +#### Early-stop check (deterministic CLI — no LLM) -After the Critic returns, check for early termination: +```bash +npx canicode calibrate-finalize-debate $RUN_DIR +``` -- If **all reviews** have `decision: "REJECT"` AND `confidence: "high"` → skip Arbitrator. Write debate.json with: - ```json - { - "critic": { ... }, - "arbitrator": null, - "stoppingReason": "all-high-confidence-reject" - } - ``` - Append to activity.jsonl: +This outputs JSON: `{"action": "early-stop"|"continue", ...}`. + +- If `action` is `"early-stop"`: the CLI has already written `stoppingReason` to debate.json. Append to activity.jsonl: ```json {"step":"Arbitrator","timestamp":"","result":"SKIPPED — early-stop: all proposals rejected with high confidence","durationMs":0} ``` Jump to Step 6.5. -Otherwise, proceed to Step 6. +- If `action` is `"continue"`: proceed to Step 6. 
### Step 6 — Arbitrator @@ -199,7 +197,7 @@ Spawn the `calibration-arbitrator` subagent. In the prompt: - Include proposals and the Critic's reviews from `$RUN_DIR/debate.json` - **Tell the agent: "Return your decisions as JSON. Only edit rule-config.ts if applying changes. Do NOT write to logs."** -After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field. Only set `stoppingReason` at the **top level** when non-normal termination occurred (e.g. `"low-confidence-hold"`, `"all-high-confidence-reject"`). Omit the field for normal completion: +After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field: ```json { @@ -217,11 +215,18 @@ After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the "reason": "..." } ] - }, - "stoppingReason": "low-confidence-hold" + } } ``` +Then finalize the debate (deterministic CLI — no LLM): + +```bash +npx canicode calibrate-finalize-debate $RUN_DIR +``` + +This determines `stoppingReason` (if any) and writes it to debate.json. Outputs JSON with `action: "finalized"`. 
+ Append to `$RUN_DIR/activity.jsonl`: ```json {"step":"Arbitrator","timestamp":"","result":"applied= rejected= hold=","durationMs":} diff --git a/src/cli/commands/internal/calibrate-debate.ts b/src/cli/commands/internal/calibrate-debate.ts new file mode 100644 index 00000000..8567ca88 --- /dev/null +++ b/src/cli/commands/internal/calibrate-debate.ts @@ -0,0 +1,193 @@ +import { existsSync, readFileSync, writeFileSync } from "node:fs"; +import { join, resolve } from "node:path"; +import type { CAC } from "cac"; + +import { parseDebateResult } from "../../../agents/run-directory.js"; +import { loadCalibrationEvidence } from "../../../agents/evidence-collector.js"; + +// ─── calibrate-gather-evidence ────────────────────────────────────────────── + +interface GatheredEvidence { + ruleImpactAssessment: unknown[]; + uncoveredStruggles: unknown[]; + actionableGaps: unknown[]; + priorEvidence: Record; +} + +/** + * Gather structured evidence for the Critic from run artifacts + cross-run data. + * Pure data extraction — no LLM needed. + */ +function gatherEvidence(runDir: string, proposedRuleIds: string[]): GatheredEvidence { + const result: GatheredEvidence = { + ruleImpactAssessment: [], + uncoveredStruggles: [], + actionableGaps: [], + priorEvidence: {}, + }; + + // 1. conversion.json → ruleImpactAssessment, uncoveredStruggles + const convPath = join(runDir, "conversion.json"); + if (existsSync(convPath)) { + try { + const conv = JSON.parse(readFileSync(convPath, "utf-8")) as Record; + if (Array.isArray(conv["ruleImpactAssessment"])) { + result.ruleImpactAssessment = conv["ruleImpactAssessment"]; + } + if (Array.isArray(conv["uncoveredStruggles"])) { + result.uncoveredStruggles = conv["uncoveredStruggles"]; + } + } catch { /* ignore malformed */ } + } + + // 2. 
gaps.json → actionable gaps + const gapsPath = join(runDir, "gaps.json"); + if (existsSync(gapsPath)) { + try { + const gaps = JSON.parse(readFileSync(gapsPath, "utf-8")) as Record; + const gapList = Array.isArray(gaps["gaps"]) ? gaps["gaps"] : []; + result.actionableGaps = gapList.filter( + (g): g is Record => + typeof g === "object" && g !== null && (g as Record)["actionable"] === true + ); + } catch { /* ignore malformed */ } + } + + // 3. Prior evidence filtered to proposed rules only + if (proposedRuleIds.length > 0) { + const allEvidence = loadCalibrationEvidence(); + const ruleSet = new Set(proposedRuleIds.map((id) => id.trim())); + for (const [ruleId, group] of Object.entries(allEvidence)) { + if (ruleSet.has(ruleId)) { + result.priorEvidence[ruleId] = group; + } + } + } + + return result; +} + +/** + * Extract proposed ruleIds from summary.md. + * Looks for rule IDs in markdown table rows or bullet points. + */ +function extractProposedRuleIds(runDir: string): string[] { + const summaryPath = join(runDir, "summary.md"); + if (!existsSync(summaryPath)) return []; + try { + const content = readFileSync(summaryPath, "utf-8"); + const ids = new Set(); + // Match rule IDs in backticks (common in markdown tables) + for (const match of content.matchAll(/`([a-z][\w-]*)`/g)) { + if (match[1]) ids.add(match[1]); + } + return [...ids]; + } catch { + return []; + } +} + +export function registerGatherEvidence(cli: CAC): void { + cli + .command( + "calibrate-gather-evidence ", + "Gather structured evidence for Critic from run artifacts + cross-run data" + ) + .action((runDir: string) => { + const dir = resolve(runDir); + if (!existsSync(dir)) { + console.error(`Run directory not found: ${runDir}`); + process.exitCode = 1; + return; + } + + const proposedRuleIds = extractProposedRuleIds(dir); + const evidence = gatherEvidence(dir, proposedRuleIds); + + // Write to file for orchestrator to include in Critic prompt + const outPath = join(dir, "critic-evidence.json"); + 
writeFileSync(outPath, JSON.stringify(evidence, null, 2) + "\n", "utf-8"); + console.log(`Gathered evidence: ${evidence.ruleImpactAssessment.length} impact assessments, ${evidence.actionableGaps.length} gaps, ${Object.keys(evidence.priorEvidence).length} prior rules`); + console.log(`Written to ${outPath}`); + }); +} + +// ─── calibrate-finalize-debate ────────────────────────────────────────────── + +interface FinalizeResult { + action: "early-stop" | "continue" | "finalized"; + stoppingReason?: string; +} + +export function registerFinalizeDebate(cli: CAC): void { + cli + .command( + "calibrate-finalize-debate ", + "Check early-stop or determine stoppingReason after debate" + ) + .action((runDir: string) => { + const dir = resolve(runDir); + if (!existsSync(dir)) { + console.error(`Run directory not found: ${runDir}`); + process.exitCode = 1; + return; + } + + const debate = parseDebateResult(dir); + if (!debate) { + console.error("No debate.json found"); + process.exitCode = 1; + return; + } + + const debatePath = join(dir, "debate.json"); + const raw = JSON.parse(readFileSync(debatePath, "utf-8")) as Record; + + // Case 1: Critic done, no Arbitrator yet → check early-stop + if (debate.critic && !debate.arbitrator) { + const reviews = debate.critic.reviews; + const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => { + return r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high"; + }); + + if (allHighConfidenceReject) { + raw["stoppingReason"] = "all-high-confidence-reject"; + writeFileSync(debatePath, JSON.stringify(raw, null, 2) + "\n", "utf-8"); + const result: FinalizeResult = { action: "early-stop", stoppingReason: "all-high-confidence-reject" }; + console.log(JSON.stringify(result)); + // exit 0 = early-stop, orchestrator should skip Arbitrator + return; + } + + const result: FinalizeResult = { action: "continue" }; + console.log(JSON.stringify(result)); + // exit 0 but action=continue → orchestrator proceeds to 
Arbitrator + return; + } + + // Case 2: Both Critic and Arbitrator done → determine stoppingReason + if (debate.arbitrator) { + const decisions = debate.arbitrator.decisions; + const allHold = decisions.length > 0 && decisions.every((d) => + d.decision.trim().toLowerCase() === "hold" + ); + + if (allHold) { + raw["stoppingReason"] = "low-confidence-hold"; + writeFileSync(debatePath, JSON.stringify(raw, null, 2) + "\n", "utf-8"); + const result: FinalizeResult = { action: "finalized", stoppingReason: "low-confidence-hold" }; + console.log(JSON.stringify(result)); + return; + } + + // Normal completion — no stoppingReason needed + const result: FinalizeResult = { action: "finalized" }; + console.log(JSON.stringify(result)); + return; + } + + // Fallback + const result: FinalizeResult = { action: "continue" }; + console.log(JSON.stringify(result)); + }); +} diff --git a/src/cli/index.ts b/src/cli/index.ts index 3ecb00ab..3074df24 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -32,6 +32,7 @@ import { registerCalibrateAnalyze } from "./commands/internal/calibrate-analyze. 
import { registerCalibrateEvaluate } from "./commands/internal/calibrate-evaluate.js"; import { registerCalibrateGapReport } from "./commands/internal/calibrate-gap-report.js"; import { registerCalibrateRun } from "./commands/internal/calibrate-run.js"; +import { registerGatherEvidence, registerFinalizeDebate } from "./commands/internal/calibrate-debate.js"; import { registerFixtureManagement, registerEvidenceEnrich, registerEvidencePrune } from "./commands/internal/fixture-management.js"; const require = createRequire(import.meta.url); @@ -78,6 +79,8 @@ registerCalibrateAnalyze(cli); registerCalibrateEvaluate(cli); registerCalibrateGapReport(cli); registerCalibrateRun(cli); +registerGatherEvidence(cli); +registerFinalizeDebate(cli); registerFixtureManagement(cli); registerEvidenceEnrich(cli); registerEvidencePrune(cli); From 06321d9ce3b1241e975febccb54896f30d62321a Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:45:14 +0900 Subject: [PATCH 08/12] test: add unit tests for calibrate-debate CLI logic 8 tests covering: - gather-evidence: conversion.json parsing, gap filtering, ruleId extraction - finalize-debate: early-stop detection, mixed reviews, hold detection, normal completion, missing debate.json Co-Authored-By: Claude Opus 4.6 (1M context) --- .../internal/calibrate-debate.test.ts | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 src/cli/commands/internal/calibrate-debate.test.ts diff --git a/src/cli/commands/internal/calibrate-debate.test.ts b/src/cli/commands/internal/calibrate-debate.test.ts new file mode 100644 index 00000000..5cec4845 --- /dev/null +++ b/src/cli/commands/internal/calibrate-debate.test.ts @@ -0,0 +1,169 @@ +import { mkdtempSync, writeFileSync, readFileSync, existsSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { rm } from "node:fs/promises"; + +/** + * Import the functions directly to test as units. 
+ * These are the same functions the CLI commands call. + */ + +// We can't import the CLI registration functions directly (they register on CAC), +// so we test the underlying logic by importing from the modules they depend on. +import { parseDebateResult } from "../../../agents/run-directory.js"; +import { loadCalibrationEvidence } from "../../../agents/evidence-collector.js"; + +describe("calibrate-gather-evidence logic", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "gather-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + it("conversion.json ruleImpactAssessment is parseable", () => { + writeFileSync(join(runDir, "conversion.json"), JSON.stringify({ + ruleImpactAssessment: [ + { ruleId: "no-auto-layout", issueCount: 3, actualImpact: "easy" }, + ], + uncoveredStruggles: [ + { description: "border radius mismatch" }, + ], + })); + + const conv = JSON.parse(readFileSync(join(runDir, "conversion.json"), "utf-8")) as Record; + expect(Array.isArray(conv["ruleImpactAssessment"])).toBe(true); + expect(conv["ruleImpactAssessment"]).toHaveLength(1); + expect(Array.isArray(conv["uncoveredStruggles"])).toBe(true); + }); + + it("gaps.json actionable filtering works", () => { + writeFileSync(join(runDir, "gaps.json"), JSON.stringify({ + gaps: [ + { category: "spacing", actionable: true, description: "padding off" }, + { category: "rendering", actionable: false, description: "font fallback" }, + ], + })); + + const gaps = JSON.parse(readFileSync(join(runDir, "gaps.json"), "utf-8")) as Record; + const gapList = Array.isArray(gaps["gaps"]) ? 
gaps["gaps"] : []; + const actionable = gapList.filter( + (g): g is Record => + typeof g === "object" && g !== null && (g as Record)["actionable"] === true + ); + expect(actionable).toHaveLength(1); + expect((actionable[0] as Record)["description"]).toBe("padding off"); + }); + + it("proposed ruleIds are extracted from summary.md", () => { + writeFileSync(join(runDir, "summary.md"), "## Overscored\n| `no-auto-layout` | -10 | easy |\n| `raw-value` | -3 | moderate |"); + + const content = readFileSync(join(runDir, "summary.md"), "utf-8"); + const ids = new Set(); + for (const match of content.matchAll(/`([a-z][\w-]*)`/g)) { + if (match[1]) ids.add(match[1]); + } + expect([...ids]).toContain("no-auto-layout"); + expect([...ids]).toContain("raw-value"); + }); +}); + +describe("calibrate-finalize-debate logic", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "finalize-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + it("detects early-stop when all critic reviews are high-confidence REJECT", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { + summary: "rejected=2", + reviews: [ + { ruleId: "a", decision: "REJECT", confidence: "high", pro: [], con: ["weak"], reason: "x" }, + { ruleId: "b", decision: "REJECT", confidence: "high", pro: [], con: ["weak"], reason: "y" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + expect(debate.critic).not.toBeNull(); + expect(debate.arbitrator).toBeNull(); + + const reviews = debate.critic!.reviews; + const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => + r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high" + ); + expect(allHighConfidenceReject).toBe(true); + }); + + it("does NOT early-stop when reviews are mixed", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { + summary: "approved=1 rejected=1", + reviews: [ + { 
ruleId: "a", decision: "APPROVE", confidence: "high", reason: "x" }, + { ruleId: "b", decision: "REJECT", confidence: "medium", reason: "y" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const reviews = debate.critic!.reviews; + const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => + r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high" + ); + expect(allHighConfidenceReject).toBe(false); + }); + + it("detects low-confidence-hold when all arbitrator decisions are hold", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { summary: "revised=2", reviews: [] }, + arbitrator: { + summary: "hold=2", + decisions: [ + { ruleId: "a", decision: "hold" }, + { ruleId: "b", decision: "hold" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const decisions = debate.arbitrator!.decisions; + const allHold = decisions.length > 0 && decisions.every((d) => + d.decision.trim().toLowerCase() === "hold" + ); + expect(allHold).toBe(true); + }); + + it("no stoppingReason for normal completion", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { summary: "approved=1", reviews: [] }, + arbitrator: { + summary: "applied=1", + decisions: [ + { ruleId: "a", decision: "applied", before: -10, after: -7 }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const decisions = debate.arbitrator!.decisions; + const allHold = decisions.length > 0 && decisions.every((d) => + d.decision.trim().toLowerCase() === "hold" + ); + expect(allHold).toBe(false); + }); + + it("returns null for missing debate.json", () => { + const debate = parseDebateResult(runDir); + expect(debate).toBeNull(); + }); +}); From 4d15939f794caff33d32933431e463a963175d6a Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:50:26 +0900 Subject: [PATCH 09/12] fix: deterministic ruleId extraction + export gatherEvidence for testing - calibrate-evaluate now writes 
proposed-rules.json (deterministic ruleId list) - calibrate-gather-evidence reads proposed-rules.json first, falls back to summary.md regex (eliminates false positive risk) - Export gatherEvidence, loadProposedRuleIds, GatheredEvidence for direct unit testing - Tests rewritten to import functions directly (13 tests) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../internal/calibrate-debate.test.ts | 98 ++++++++++++------- src/cli/commands/internal/calibrate-debate.ts | 23 +++-- .../commands/internal/calibrate-evaluate.ts | 9 ++ 3 files changed, 85 insertions(+), 45 deletions(-) diff --git a/src/cli/commands/internal/calibrate-debate.test.ts b/src/cli/commands/internal/calibrate-debate.test.ts index 5cec4845..261f4e84 100644 --- a/src/cli/commands/internal/calibrate-debate.test.ts +++ b/src/cli/commands/internal/calibrate-debate.test.ts @@ -1,19 +1,12 @@ -import { mkdtempSync, writeFileSync, readFileSync, existsSync } from "node:fs"; +import { mkdtempSync, writeFileSync, existsSync } from "node:fs"; import { join } from "node:path"; import { tmpdir } from "node:os"; import { rm } from "node:fs/promises"; -/** - * Import the functions directly to test as units. - * These are the same functions the CLI commands call. - */ - -// We can't import the CLI registration functions directly (they register on CAC), -// so we test the underlying logic by importing from the modules they depend on. 
+import { gatherEvidence, loadProposedRuleIds } from "./calibrate-debate.js"; import { parseDebateResult } from "../../../agents/run-directory.js"; -import { loadCalibrationEvidence } from "../../../agents/evidence-collector.js"; -describe("calibrate-gather-evidence logic", () => { +describe("gatherEvidence", () => { let runDir: string; beforeEach(() => { @@ -24,7 +17,7 @@ describe("calibrate-gather-evidence logic", () => { await rm(runDir, { recursive: true, force: true }); }); - it("conversion.json ruleImpactAssessment is parseable", () => { + it("extracts ruleImpactAssessment and uncoveredStruggles from conversion.json", () => { writeFileSync(join(runDir, "conversion.json"), JSON.stringify({ ruleImpactAssessment: [ { ruleId: "no-auto-layout", issueCount: 3, actualImpact: "easy" }, @@ -34,40 +27,73 @@ describe("calibrate-gather-evidence logic", () => { ], })); - const conv = JSON.parse(readFileSync(join(runDir, "conversion.json"), "utf-8")) as Record; - expect(Array.isArray(conv["ruleImpactAssessment"])).toBe(true); - expect(conv["ruleImpactAssessment"]).toHaveLength(1); - expect(Array.isArray(conv["uncoveredStruggles"])).toBe(true); + const evidence = gatherEvidence(runDir, []); + expect(evidence.ruleImpactAssessment).toHaveLength(1); + expect(evidence.uncoveredStruggles).toHaveLength(1); }); - it("gaps.json actionable filtering works", () => { + it("filters gaps to actionable only", () => { writeFileSync(join(runDir, "gaps.json"), JSON.stringify({ gaps: [ { category: "spacing", actionable: true, description: "padding off" }, { category: "rendering", actionable: false, description: "font fallback" }, + { category: "color", actionable: true, description: "wrong shade" }, ], })); - const gaps = JSON.parse(readFileSync(join(runDir, "gaps.json"), "utf-8")) as Record; - const gapList = Array.isArray(gaps["gaps"]) ? 
gaps["gaps"] : []; - const actionable = gapList.filter( - (g): g is Record => - typeof g === "object" && g !== null && (g as Record)["actionable"] === true - ); - expect(actionable).toHaveLength(1); - expect((actionable[0] as Record)["description"]).toBe("padding off"); + const evidence = gatherEvidence(runDir, []); + expect(evidence.actionableGaps).toHaveLength(2); + }); + + it("handles missing files gracefully", () => { + const evidence = gatherEvidence(runDir, []); + expect(evidence.ruleImpactAssessment).toHaveLength(0); + expect(evidence.uncoveredStruggles).toHaveLength(0); + expect(evidence.actionableGaps).toHaveLength(0); + expect(evidence.priorEvidence).toEqual({}); + }); + + it("returns empty priorEvidence when no ruleIds proposed", () => { + const evidence = gatherEvidence(runDir, []); + expect(evidence.priorEvidence).toEqual({}); + }); +}); + +describe("loadProposedRuleIds", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "proposed-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); }); - it("proposed ruleIds are extracted from summary.md", () => { - writeFileSync(join(runDir, "summary.md"), "## Overscored\n| `no-auto-layout` | -10 | easy |\n| `raw-value` | -3 | moderate |"); + it("loads from proposed-rules.json when available", () => { + writeFileSync(join(runDir, "proposed-rules.json"), JSON.stringify(["no-auto-layout", "raw-value"])); + const ids = loadProposedRuleIds(runDir); + expect(ids).toEqual(["no-auto-layout", "raw-value"]); + }); - const content = readFileSync(join(runDir, "summary.md"), "utf-8"); - const ids = new Set(); - for (const match of content.matchAll(/`([a-z][\w-]*)`/g)) { - if (match[1]) ids.add(match[1]); - } - expect([...ids]).toContain("no-auto-layout"); - expect([...ids]).toContain("raw-value"); + it("falls back to summary.md regex when no proposed-rules.json", () => { + writeFileSync(join(runDir, "summary.md"), "## Overscored\n| 
`no-auto-layout` | -10 | easy |\n| `raw-value` | -3 |"); + const ids = loadProposedRuleIds(runDir); + expect(ids).toContain("no-auto-layout"); + expect(ids).toContain("raw-value"); + }); + + it("returns empty for missing files", () => { + const ids = loadProposedRuleIds(runDir); + expect(ids).toEqual([]); + }); + + it("prefers proposed-rules.json over summary.md", () => { + writeFileSync(join(runDir, "proposed-rules.json"), JSON.stringify(["rule-a"])); + writeFileSync(join(runDir, "summary.md"), "| `rule-a` | | |\n| `rule-b` | | |"); + const ids = loadProposedRuleIds(runDir); + // Should only have rule-a from proposed-rules.json, not rule-b from summary.md + expect(ids).toEqual(["rule-a"]); }); }); @@ -94,9 +120,6 @@ describe("calibrate-finalize-debate logic", () => { })); const debate = parseDebateResult(runDir)!; - expect(debate.critic).not.toBeNull(); - expect(debate.arbitrator).toBeNull(); - const reviews = debate.critic!.reviews; const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high" @@ -163,7 +186,6 @@ describe("calibrate-finalize-debate logic", () => { }); it("returns null for missing debate.json", () => { - const debate = parseDebateResult(runDir); - expect(debate).toBeNull(); + expect(parseDebateResult(runDir)).toBeNull(); }); }); diff --git a/src/cli/commands/internal/calibrate-debate.ts b/src/cli/commands/internal/calibrate-debate.ts index 8567ca88..9031eb77 100644 --- a/src/cli/commands/internal/calibrate-debate.ts +++ b/src/cli/commands/internal/calibrate-debate.ts @@ -7,7 +7,7 @@ import { loadCalibrationEvidence } from "../../../agents/evidence-collector.js"; // ─── calibrate-gather-evidence ────────────────────────────────────────────── -interface GatheredEvidence { +export interface GatheredEvidence { ruleImpactAssessment: unknown[]; uncoveredStruggles: unknown[]; actionableGaps: unknown[]; @@ -18,7 +18,7 @@ interface GatheredEvidence { * Gather structured 
evidence for the Critic from run artifacts + cross-run data. * Pure data extraction — no LLM needed. */ -function gatherEvidence(runDir: string, proposedRuleIds: string[]): GatheredEvidence { +export function gatherEvidence(runDir: string, proposedRuleIds: string[]): GatheredEvidence { const result: GatheredEvidence = { ruleImpactAssessment: [], uncoveredStruggles: [], @@ -68,16 +68,25 @@ function gatherEvidence(runDir: string, proposedRuleIds: string[]): GatheredEvid } /** - * Extract proposed ruleIds from summary.md. - * Looks for rule IDs in markdown table rows or bullet points. + * Load proposed ruleIds from proposed-rules.json (written by calibrate-evaluate). + * Falls back to regex extraction from summary.md if file doesn't exist. */ -function extractProposedRuleIds(runDir: string): string[] { +export function loadProposedRuleIds(runDir: string): string[] { + // Preferred: deterministic list from calibrate-evaluate + const proposedPath = join(runDir, "proposed-rules.json"); + if (existsSync(proposedPath)) { + try { + const raw: unknown = JSON.parse(readFileSync(proposedPath, "utf-8")); + if (Array.isArray(raw)) return raw.filter((id): id is string => typeof id === "string"); + } catch { /* fall through to regex */ } + } + + // Fallback: extract from summary.md (may have false positives) const summaryPath = join(runDir, "summary.md"); if (!existsSync(summaryPath)) return []; try { const content = readFileSync(summaryPath, "utf-8"); const ids = new Set(); - // Match rule IDs in backticks (common in markdown tables) for (const match of content.matchAll(/`([a-z][\w-]*)`/g)) { if (match[1]) ids.add(match[1]); } @@ -101,7 +110,7 @@ export function registerGatherEvidence(cli: CAC): void { return; } - const proposedRuleIds = extractProposedRuleIds(dir); + const proposedRuleIds = loadProposedRuleIds(dir); const evidence = gatherEvidence(dir, proposedRuleIds); // Write to file for orchestrator to include in Critic prompt diff --git 
a/src/cli/commands/internal/calibrate-evaluate.ts b/src/cli/commands/internal/calibrate-evaluate.ts index 02e9d637..45665042 100644 --- a/src/cli/commands/internal/calibrate-evaluate.ts +++ b/src/cli/commands/internal/calibrate-evaluate.ts @@ -82,6 +82,15 @@ export function registerCalibrateEvaluate(cli: CAC): void { mismatchCounts[key]++; } + // Write proposed ruleIds for deterministic evidence gathering + if (options.runDir && tuningOutput.adjustments.length > 0) { + const proposedIds = tuningOutput.adjustments.map( + (a: { ruleId: string }) => a.ruleId + ); + const proposedPath = resolve(options.runDir, "proposed-rules.json"); + await writeFile(proposedPath, JSON.stringify(proposedIds) + "\n", "utf-8"); + } + console.log(`\nEvaluation complete.`); console.log(` Validated: ${mismatchCounts.validated}`); console.log(` Overscored: ${mismatchCounts.overscored}`); From 0e4c16be7e2a9ec1b400a4a9ec3c84f755805d38 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:58:38 +0900 Subject: [PATCH 10/12] fix: revert internal CLI error handling to stdout + exit 0 Internal calibration commands are consumed by subagents that parse stdout. console.error + process.exitCode = 1 breaks this pattern. Reverted all internal commands to console.log + exit 0 for subagent compatibility. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cli/commands/internal/calibrate-debate.ts | 9 +++------ src/cli/commands/internal/fixture-management.ts | 12 ++++-------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/cli/commands/internal/calibrate-debate.ts b/src/cli/commands/internal/calibrate-debate.ts index 9031eb77..bc8f2d36 100644 --- a/src/cli/commands/internal/calibrate-debate.ts +++ b/src/cli/commands/internal/calibrate-debate.ts @@ -105,8 +105,7 @@ export function registerGatherEvidence(cli: CAC): void { .action((runDir: string) => { const dir = resolve(runDir); if (!existsSync(dir)) { - console.error(`Run directory not found: ${runDir}`); - process.exitCode = 1; + console.log(`Run directory not found: ${runDir}`); return; } @@ -137,15 +136,13 @@ export function registerFinalizeDebate(cli: CAC): void { .action((runDir: string) => { const dir = resolve(runDir); if (!existsSync(dir)) { - console.error(`Run directory not found: ${runDir}`); - process.exitCode = 1; + console.log(`Run directory not found: ${runDir}`); return; } const debate = parseDebateResult(dir); if (!debate) { - console.error("No debate.json found"); - process.exitCode = 1; + console.log("No debate.json found"); return; } diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index 4158a451..7664ea45 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -123,8 +123,7 @@ export function registerEvidenceEnrich(cli: CAC): void { .action((runDir: string) => { const resolvedDir = resolve(runDir); if (!existsSync(resolvedDir)) { - console.error(`Run directory not found: ${runDir}`); - process.exitCode = 1; + console.log(`Run directory not found: ${runDir}`); return; } const debate = parseDebateResult(resolvedDir); @@ -136,8 +135,7 @@ export function registerEvidenceEnrich(cli: CAC): void { // Extract fixture name from run directory (e.g. 
"material3-kit--2026-03-26-0900" → "material3-kit") const { name: fixture } = parseRunDirName(basename(resolvedDir)); if (!fixture) { - console.error("Cannot extract fixture name from run directory"); - process.exitCode = 1; + console.log("Cannot extract fixture name from run directory"); return; } @@ -164,14 +162,12 @@ export function registerEvidencePrune(cli: CAC): void { ) .action((runDir: string) => { if (!existsSync(resolve(runDir))) { - console.error(`Run directory not found: ${runDir}`); - process.exitCode = 1; + console.log(`Run directory not found: ${runDir}`); return; } const debate = parseDebateResult(resolve(runDir)); if (!debate) { - console.error("No debate.json found — nothing to prune."); - process.exitCode = 1; + console.log("No debate.json found — nothing to prune."); return; } From 3cdf5825efd78b3d7fe852da80df1fb2d411ef3f Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 19:06:22 +0900 Subject: [PATCH 11/12] fix: guard debate.json re-read + include HOLD in decision enum - finalize-debate: try/catch on second debate.json read, fallback to {action: "continue"} on parse failure (subagent safety) - HOLD added to decision enum in evidence schema, enrichCalibrationEvidence signature, and CLI enrich command (was silently dropped) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/contracts/evidence.ts | 4 ++-- src/agents/evidence-collector.ts | 2 +- src/cli/commands/internal/calibrate-debate.ts | 8 +++++++- src/cli/commands/internal/fixture-management.ts | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/agents/contracts/evidence.ts b/src/agents/contracts/evidence.ts index 01035a43..ec65078c 100644 --- a/src/agents/contracts/evidence.ts +++ b/src/agents/contracts/evidence.ts @@ -12,7 +12,7 @@ export const CalibrationEvidenceEntrySchema = z.object({ confidence: z.enum(["high", "medium", "low"]).optional(), pro: z.array(z.string()).optional(), con: z.array(z.string()).optional(), - decision: 
z.enum(["APPROVE", "REJECT", "REVISE"]).optional(), + decision: z.enum(["APPROVE", "REJECT", "REVISE", "HOLD"]).optional(), }); export type CalibrationEvidenceEntry = z.infer<typeof CalibrationEvidenceEntrySchema>; @@ -26,7 +26,7 @@ export const CrossRunEvidenceGroupSchema = z.object({ allPro: z.array(z.string()).optional(), allCon: z.array(z.string()).optional(), lastConfidence: z.enum(["high", "medium", "low"]).optional(), - lastDecision: z.enum(["APPROVE", "REJECT", "REVISE"]).optional(), + lastDecision: z.enum(["APPROVE", "REJECT", "REVISE", "HOLD"]).optional(), }); export type CrossRunEvidenceGroup = z.infer<typeof CrossRunEvidenceGroupSchema>; diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 3ad2d9fe..5f9e2d2f 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -149,7 +149,7 @@ export function enrichCalibrationEvidence( confidence?: "high" | "medium" | "low"; pro?: string[]; con?: string[]; - decision?: "APPROVE" | "REJECT" | "REVISE"; + decision?: "APPROVE" | "REJECT" | "REVISE" | "HOLD"; }>, fixture: string, evidencePath: string = DEFAULT_CALIBRATION_PATH diff --git a/src/cli/commands/internal/calibrate-debate.ts b/src/cli/commands/internal/calibrate-debate.ts index bc8f2d36..7a228e29 100644 --- a/src/cli/commands/internal/calibrate-debate.ts +++ b/src/cli/commands/internal/calibrate-debate.ts @@ -147,7 +147,13 @@ export function registerFinalizeDebate(cli: CAC): void { } const debatePath = join(dir, "debate.json"); - const raw = JSON.parse(readFileSync(debatePath, "utf-8")) as Record<string, unknown>; + let raw: Record<string, unknown>; + try { + raw = JSON.parse(readFileSync(debatePath, "utf-8")) as Record<string, unknown>; + } catch { + console.log(JSON.stringify({ action: "continue" })); + return; + } // Case 1: Critic done, no Arbitrator yet → check early-stop if (debate.critic && !debate.arbitrator) { diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index 7664ea45..8ed8433e 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ 
b/src/cli/commands/internal/fixture-management.ts @@ -145,7 +145,7 @@ export function registerEvidenceEnrich(cli: CAC): void { if (r.pro) entry.pro = r.pro; if (r.con) entry.con = r.con; const dec = r.decision; - if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE") entry.decision = dec; + if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE" || dec === "HOLD") entry.decision = dec; return entry; }); From 16112aaa7f77d742715254d77785fe4fd7a5597f Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 19:20:52 +0900 Subject: [PATCH 12/12] fix: validate run dir format + normalize decision casing - enrich-evidence: check timestamp suffix instead of dead !fixture check (parseRunDirName never returns falsy name) - enrich-evidence: normalize decision to uppercase before persisting (handles mixed-case "reject"/"Reject" from Critic) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cli/commands/internal/fixture-management.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index 8ed8433e..85dfea03 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -133,9 +133,9 @@ export function registerEvidenceEnrich(cli: CAC): void { } // Extract fixture name from run directory (e.g. 
"material3-kit--2026-03-26-0900" → "material3-kit") - const { name: fixture } = parseRunDirName(basename(resolvedDir)); - if (!fixture) { - console.log("Cannot extract fixture name from run directory"); + const { name: fixture, timestamp } = parseRunDirName(basename(resolvedDir)); + if (!timestamp) { + console.log(`Run directory "${basename(resolvedDir)}" does not match expected <name>--<timestamp> format`); return; } @@ -144,7 +144,7 @@ export function registerEvidenceEnrich(cli: CAC): void { if (r.confidence) entry.confidence = r.confidence; if (r.pro) entry.pro = r.pro; if (r.con) entry.con = r.con; - const dec = r.decision; + const dec = r.decision.trim().toUpperCase(); if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE" || dec === "HOLD") entry.decision = dec; return entry; });