diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index bff26849..72d01f51 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -2,7 +2,7 @@ name: calibration-arbitrator description: Makes final calibration decisions by weighing Runner and Critic. Applies approved changes to rule-config.ts and commits. Use after calibration-critic completes. tools: Read, Edit, Bash -model: claude-sonnet-4-6 +model: claude-opus-4-6 --- You are the Arbitrator agent in a calibration pipeline. @@ -10,12 +10,16 @@ You receive the Runner's proposals and the Critic's reviews, and make final deci ## Decision Rules -- **Both APPROVE** → apply Runner's proposed value -- **Critic REJECT** → keep current score (no change) -- **Critic REVISE** → apply the Critic's revised value +- **Both APPROVE** → apply Runner's proposed value (decision: `"applied"`) +- **Critic REJECT** → keep current score (decision: `"rejected"`) +- **Critic REVISE** → apply the Critic's revised value (decision: `"revised"`) - **proposedDisable: true** → if both Runner and Critic agree, set `enabled: false` in `rule-config.ts`. Decision type: `"disabled"`. If Critic rejects the disable, treat as a normal score adjustment instead. - **New rule proposals** → record in `$RUN_DIR/debate.json` only, do NOT add to `rule-config.ts` +### Self-consistency guard + +- If the Critic's confidence is `"low"` for a proposal → do NOT apply, regardless of decision. Set decision to `"hold"` with reason explaining insufficient confidence. The evidence will accumulate for future runs. + ## After Deciding 1. 
Apply approved changes to `src/core/rules/rule-config.ts` @@ -39,16 +43,23 @@ Return this JSON structure: ```json { "timestamp": "", - "summary": "applied=2 rejected=1 revised=1 newProposals=0", + "summary": "applied=1 revised=1 rejected=1 hold=1 newProposals=0", "decisions": [ - {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "reason": "Critic revised, midpoint applied"}, - {"ruleId": "X", "decision": "rejected", "reason": "Critic rejection compelling — insufficient evidence"}, - {"ruleId": "X", "decision": "disabled", "reason": "Converged to zero impact across 3+ runs, all easy"} + {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "confidence": "high", "reason": "Strong evidence, applying Runner's value"}, + {"ruleId": "X", "decision": "revised", "before": -10, "after": -8, "confidence": "medium", "reason": "Critic revised, midpoint applied"}, + {"ruleId": "X", "decision": "rejected", "confidence": "medium", "reason": "Critic rejection compelling — insufficient evidence"}, + {"ruleId": "X", "decision": "hold", "confidence": "low", "reason": "Low confidence — accumulate more evidence before applying"}, + {"ruleId": "X", "decision": "disabled", "confidence": "high", "reason": "Converged to zero impact across 3+ runs, all easy"} ], "newRuleProposals": [] } ``` +### Field requirements + +- **confidence**: carried from Critic's review for each decision +- **Note**: `stoppingReason` is written by the orchestrator at the debate.json top level, not inside the arbitrator object + ## Rules - **Do NOT write to ANY file except `src/core/rules/rule-config.ts`.** No log files, no `new-rule-proposals.md`, no `debate.json`, no `activity.jsonl`. The orchestrator handles ALL other file I/O. @@ -56,3 +67,4 @@ Return this JSON structure: - Only modify `rule-config.ts` for approved score/severity changes. - Never force-push or amend existing commits. - If tests fail, revert everything and report which change caused the failure. 
+- **Never apply changes with `confidence: "low"`.** Hold them for future evidence accumulation. diff --git a/.claude/agents/calibration/critic.md b/.claude/agents/calibration/critic.md index e4bf2632..d0257438 100644 --- a/.claude/agents/calibration/critic.md +++ b/.claude/agents/calibration/critic.md @@ -2,7 +2,7 @@ name: calibration-critic description: Challenges calibration proposals from Runner. Rejects low-confidence or over-aggressive adjustments. Use after calibration-runner completes. tools: Read -model: claude-sonnet-4-6 +model: claude-opus-4-6 --- ## Common Review Framework @@ -16,7 +16,17 @@ All critics follow this base protocol: --- You are the Critic agent in a calibration pipeline. -You receive the Runner's proposals and challenge each one independently. +You receive the Runner's proposals along with supporting evidence, and challenge each one independently. + +## Input Context + +You will receive: +1. **Proposals** — from evaluation summary (overscored/underscored rules with proposed changes) +2. **Converter assessment** — `ruleImpactAssessment` showing actual implementation difficulty per rule +3. **Gap analysis** — actionable pixel gaps between Figma and generated code +4. **Prior evidence** — cross-run calibration evidence for the proposed rules (accumulated from past runs) + +Use ALL inputs to form pro/con arguments. Do not rely on proposals alone. 
## Rejection Rules @@ -50,16 +60,46 @@ Return this JSON structure: "timestamp": "", "summary": "approved=1 rejected=1 revised=1", "reviews": [ - {"ruleId": "X", "decision": "APPROVE", "reason": "3 cases, high confidence"}, - {"ruleId": "X", "decision": "REJECT", "reason": "Rule 1 — only 1 case with low confidence"}, - {"ruleId": "X", "decision": "REVISE", "revised": -7, "reason": "Rule 2 — change too large, midpoint applied"} + { + "ruleId": "X", + "decision": "APPROVE", + "confidence": "high", + "pro": ["3 cases across fixtures show easy implementation", "converter rated actualImpact: easy"], + "con": ["all cases from same design system"], + "reason": "Strong cross-run evidence outweighs single-system concern" + }, + { + "ruleId": "X", + "decision": "REJECT", + "confidence": "low", + "pro": ["1 case shows overscored"], + "con": ["only 1 fixture", "no gap analysis data supports this"], + "reason": "Rule 1 — only 1 case with low confidence" + }, + { + "ruleId": "X", + "decision": "REVISE", + "revised": -7, + "confidence": "medium", + "pro": ["converter found moderate difficulty, current score implies hard"], + "con": ["gap analysis shows some pixel impact in this area"], + "reason": "Rule 2 — change too large, midpoint applied" + } ] } ``` +### Field requirements + +- **confidence**: `"high"` | `"medium"` | `"low"` — your assessment of the proposal's reliability +- **pro**: array of evidence points supporting the proposed change +- **con**: array of evidence points against the proposed change +- **reason**: final verdict synthesizing pro/con + ## Rules - **Do NOT write any files.** The orchestrator handles all file I/O. - Do NOT modify `src/rules/rule-config.ts`. - Be strict. When in doubt, REJECT or REVISE. - Return your full critique so the Arbitrator can decide. +- **Every review MUST include pro, con, and confidence fields.** No exceptions. 
diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index 18386512..9fadb295 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -135,8 +135,19 @@ If zero proposals, write `$RUN_DIR/debate.json` with skip reason and jump to Ste ### Step 5 — Critic +Gather supporting evidence (deterministic CLI — no LLM): + +```bash +npx canicode calibrate-gather-evidence $RUN_DIR +``` + +This reads `conversion.json`, `gaps.json`, `summary.md`, and `data/calibration-evidence.json`, and writes a single `$RUN_DIR/critic-evidence.json` with structured data for the Critic. + +Read `$RUN_DIR/critic-evidence.json` and include it in the Critic prompt. + Spawn the `calibration-critic` subagent. In the prompt: -- Include only the proposal list (NOT the Converter's reasoning) +- Include the proposal list from summary.md +- Include the gathered evidence from `critic-evidence.json` - **Tell the agent: "Return your reviews as JSON. Do NOT write any files."** After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: @@ -145,7 +156,16 @@ After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: "critic": { "timestamp": "", "summary": "approved= rejected= revised=", - "reviews": [ ... ] + "reviews": [ + { + "ruleId": "X", + "decision": "APPROVE|REJECT|REVISE", + "confidence": "high|medium|low", + "pro": ["evidence supporting change"], + "con": ["evidence against change"], + "reason": "..." + } + ] } } ``` @@ -155,6 +175,22 @@ Append to `$RUN_DIR/activity.jsonl`: {"step":"Critic","timestamp":"","result":"approved= rejected= revised=","durationMs":} ``` +#### Early-stop check (deterministic CLI — no LLM) + +```bash +npx canicode calibrate-finalize-debate $RUN_DIR +``` + +This outputs JSON: `{"action": "early-stop"|"continue", ...}`. + +- If `action` is `"early-stop"`: the CLI has already written `stoppingReason` to debate.json. 
Append to activity.jsonl: + ```json + {"step":"Arbitrator","timestamp":"","result":"SKIPPED — early-stop: all proposals rejected with high confidence","durationMs":0} + ``` + Jump to Step 6.5. + +- If `action` is `"continue"`: proceed to Step 6. + ### Step 6 — Arbitrator Spawn the `calibration-arbitrator` subagent. In the prompt: @@ -162,25 +198,51 @@ Spawn the `calibration-arbitrator` subagent. In the prompt: - **Tell the agent: "Return your decisions as JSON. Only edit rule-config.ts if applying changes. Do NOT write to logs."** After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field: + ```json { "critic": { ... }, "arbitrator": { "timestamp": "", - "summary": "applied= rejected= revised=", - "decisions": [ ... ] + "summary": "applied= revised= rejected= hold=", + "decisions": [ + { + "ruleId": "X", + "decision": "applied|revised|rejected|hold|disabled", + "confidence": "high|medium|low", + "before": -10, + "after": -7, + "reason": "..." + } + ] } } ``` +Then finalize the debate (deterministic CLI — no LLM): + +```bash +npx canicode calibrate-finalize-debate $RUN_DIR +``` + +This determines `stoppingReason` (if any) and writes it to debate.json. Outputs JSON with `action: "finalized"`. + Append to `$RUN_DIR/activity.jsonl`: ```json -{"step":"Arbitrator","timestamp":"","result":"applied= rejected=","durationMs":} +{"step":"Arbitrator","timestamp":"","result":"applied= rejected= hold=","durationMs":} +``` + +### Step 6.5 — Enrich and prune evidence + +After the debate (or early-stop), enrich `data/calibration-evidence.json` with the Critic's structured pro/con/confidence. This ensures cross-run evidence persists beyond the ephemeral `logs/` directory. 
+ +```bash +npx canicode calibrate-enrich-evidence $RUN_DIR ``` -### Step 6.5 — Prune evidence +This reads `debate.json`, extracts the Critic's reviews (pro, con, confidence, decision), and updates matching entries in `data/calibration-evidence.json`. Runs for both normal and early-stop paths. -After the Arbitrator applies changes, prune calibration evidence for the applied rules: +Then prune calibration evidence for the applied rules: ```bash npx canicode calibrate-prune-evidence $RUN_DIR @@ -209,7 +271,7 @@ Report the final summary: similarity, proposals, decisions, and path to `logs/ca - Each agent must be a SEPARATE subagent call (isolated context). - Pass only structured data between agents — never raw reasoning. -- The Critic must NOT see the Runner's or Converter's reasoning, only the proposal list. +- The Critic receives proposals + converter's ruleImpactAssessment + gaps + prior evidence (structured data, not free-form reasoning). - Only the Arbitrator may edit `rule-config.ts`. - Steps 1, 4, 7 are CLI commands — run them directly with Bash. - **CRITICAL: YOU write all files to $RUN_DIR. Subagents (Gap Analyzer, Critic, Arbitrator) MUST return JSON as text — tell them "Do NOT write any files." You are the only one who writes to $RUN_DIR.** diff --git a/CLAUDE.md b/CLAUDE.md index 285f79ec..6d1eb561 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -322,10 +322,22 @@ Process: 3. Run `canicode visual-compare` — pixel-level comparison against Figma screenshot 4. Analyze the diff image to categorize pixel gaps (`Gap Analyzer`) 5. Compare conversion difficulty vs rule scores (`canicode calibrate-evaluate`) -6. 6-agent debate loop (`/calibrate-loop`): Analysis → Converter → Gap Analyzer → Evaluation → Critic → Arbitrator +6. 
Debate loop (`/calibrate-loop`): Analysis → Converter → Gap Analyzer → Evaluation → Critic → Arbitrator + +**Critic receives structured evidence** (#144): +- Proposals from evaluation +- Converter's `ruleImpactAssessment` (actual implementation difficulty per rule) +- Gap analysis (actionable pixel gaps) +- Prior cross-run evidence for proposed rules +- Outputs structured pro/con arguments + confidence level per proposal + +**Early-stop and self-consistency** (#144): +- All proposals rejected with high confidence → Arbitrator skipped (early-stop) +- Low-confidence decisions → held (not applied), evidence accumulates for future runs (self-consistency) +- `stoppingReason` recorded in debate.json for traceability **Cross-run evidence** accumulates across sessions in `data/`: -- `calibration-evidence.json` — overscored/underscored rules (fed to Runner for stronger proposals) +- `calibration-evidence.json` — overscored/underscored rules with confidence, pro/con, decision (fed to Critic for informed review) - `discovery-evidence.json` — uncovered gaps not covered by existing rules (fed to `/add-rule` Researcher) - Discovery evidence is filtered to exclude environment/tooling noise (font CDN, retina/DPI, network, CI constraints) - Evidence is pruned after rules are applied (calibration) or new rules are created (discovery) diff --git a/src/agents/contracts/evidence.ts b/src/agents/contracts/evidence.ts index b97625fb..ec65078c 100644 --- a/src/agents/contracts/evidence.ts +++ b/src/agents/contracts/evidence.ts @@ -8,6 +8,11 @@ export const CalibrationEvidenceEntrySchema = z.object({ actualDifficulty: z.string(), fixture: z.string(), timestamp: z.string(), + // Phase 1 fields (#144) — optional for backward compatibility with existing evidence + confidence: z.enum(["high", "medium", "low"]).optional(), + pro: z.array(z.string()).optional(), + con: z.array(z.string()).optional(), + decision: z.enum(["APPROVE", "REJECT", "REVISE", "HOLD"]).optional(), }); export type 
CalibrationEvidenceEntry = z.infer; @@ -17,6 +22,11 @@ export const CrossRunEvidenceGroupSchema = z.object({ underscoredCount: z.number(), overscoredDifficulties: z.array(z.string()), underscoredDifficulties: z.array(z.string()), + // Aggregated pro/con from all entries for this rule + allPro: z.array(z.string()).optional(), + allCon: z.array(z.string()).optional(), + lastConfidence: z.enum(["high", "medium", "low"]).optional(), + lastDecision: z.enum(["APPROVE", "REJECT", "REVISE", "HOLD"]).optional(), }); export type CrossRunEvidenceGroup = z.infer; diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index 25258793..b2ced476 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -1,9 +1,10 @@ -import { mkdirSync, rmSync, writeFileSync, readFileSync } from "node:fs"; +import { existsSync, mkdirSync, rmSync, writeFileSync, readFileSync } from "node:fs"; import { join } from "node:path"; import { tmpdir } from "node:os"; import { loadCalibrationEvidence, appendCalibrationEvidence, + enrichCalibrationEvidence, pruneCalibrationEvidence, loadDiscoveryEvidence, appendDiscoveryEvidence, @@ -62,15 +63,33 @@ describe("evidence-collector", () => { underscoredCount: 1, overscoredDifficulties: ["easy", "moderate"], underscoredDifficulties: ["hard"], + allPro: [], + allCon: [], }); expect(result["rule-b"]).toEqual({ overscoredCount: 0, underscoredCount: 1, overscoredDifficulties: [], underscoredDifficulties: ["hard"], + allPro: [], + allCon: [], }); }); + it("deduplicates pro/con across entries", () => { + const entries: CalibrationEvidenceEntry[] = [ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1", + pro: ["easy in practice", "common pattern"], con: ["small fixture"] }, + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx2", timestamp: "t2", + pro: ["easy in practice", "new evidence"], con: ["small fixture", "single 
run"] }, + ]; + writeFileSync(calPath, JSON.stringify(entries), "utf-8"); + + const result = loadCalibrationEvidence(calPath); + expect(result["rule-a"]!.allPro).toEqual(["easy in practice", "common pattern", "new evidence"]); + expect(result["rule-a"]!.allCon).toEqual(["small fixture", "single run"]); + }); + it("handles malformed JSON gracefully", () => { writeFileSync(calPath, "not json", "utf-8"); const result = loadCalibrationEvidence(calPath); @@ -136,6 +155,46 @@ describe("evidence-collector", () => { }); }); + describe("enrichCalibrationEvidence", () => { + it("enriches entries matching (ruleId, fixture)", () => { + const entries: CalibrationEvidenceEntry[] = [ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + { ruleId: "rule-a", type: "overscored", actualDifficulty: "moderate", fixture: "fx2", timestamp: "t2" }, + { ruleId: "rule-b", type: "underscored", actualDifficulty: "hard", fixture: "fx1", timestamp: "t3" }, + ]; + writeFileSync(calPath, JSON.stringify(entries), "utf-8"); + + enrichCalibrationEvidence( + [{ ruleId: "rule-a", confidence: "high", pro: ["easy in practice"], con: ["only 1 case"], decision: "APPROVE" }], + "fx1", + calPath, + ); + + const result = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + // Only fx1 entry for rule-a is enriched + expect(result[0]!.confidence).toBe("high"); + expect(result[0]!.pro).toEqual(["easy in practice"]); + // fx2 entry for rule-a is NOT enriched (different fixture) + expect(result[1]!.confidence).toBeUndefined(); + // rule-b is NOT enriched (different ruleId) + expect(result[2]!.confidence).toBeUndefined(); + }); + + it("does nothing when evidence file is empty", () => { + enrichCalibrationEvidence([{ ruleId: "rule-a" }], "fx1", calPath); + expect(existsSync(calPath)).toBe(false); + }); + + it("does nothing when reviews array is empty", () => { + writeFileSync(calPath, JSON.stringify([ + { ruleId: "rule-a", type: "overscored", 
actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + ]), "utf-8"); + enrichCalibrationEvidence([], "fx1", calPath); + const result = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + expect(result[0]!.confidence).toBeUndefined(); + }); + }); + describe("pruneCalibrationEvidence", () => { it("removes entries for specified ruleIds", () => { const entries: CalibrationEvidenceEntry[] = [ diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 83391ed9..5f9e2d2f 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -63,6 +63,8 @@ export function loadCalibrationEvidence( underscoredCount: 0, overscoredDifficulties: [], underscoredDifficulties: [], + allPro: [], + allCon: [], }; result[entry.ruleId] = group; } @@ -74,6 +76,23 @@ export function loadCalibrationEvidence( group.underscoredCount++; group.underscoredDifficulties.push(entry.actualDifficulty); } + + // Aggregate pro/con from enriched entries (deduplicated) + if (entry.pro) { + group.allPro ??= []; + for (const p of entry.pro) { + if (!group.allPro.includes(p)) group.allPro.push(p); + } + } + if (entry.con) { + group.allCon ??= []; + for (const c of entry.con) { + if (!group.allCon.includes(c)) group.allCon.push(c); + } + } + // Keep last confidence/decision (most recent entry wins) + if (entry.confidence) group.lastConfidence = entry.confidence; + if (entry.decision) group.lastDecision = entry.decision; } return result; @@ -119,6 +138,51 @@ export function pruneCalibrationEvidence( writeJsonArray(evidencePath, pruned); } +/** + * Enrich existing calibration evidence entries with Critic's structured review data. + * Matches by (ruleId, fixture) to avoid overwriting entries from other fixtures. + * Entries without a matching review are left unchanged. 
+ */ +export function enrichCalibrationEvidence( + reviews: Array<{ + ruleId: string; + confidence?: "high" | "medium" | "low"; + pro?: string[]; + con?: string[]; + decision?: "APPROVE" | "REJECT" | "REVISE" | "HOLD"; + }>, + fixture: string, + evidencePath: string = DEFAULT_CALIBRATION_PATH +): void { + if (reviews.length === 0) return; + const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); + if (existing.length === 0) return; + + const reviewByRule = new Map(reviews.map((r) => [r.ruleId.trim(), r])); + const fixtureTrimmed = fixture.trim(); + + let matchCount = 0; + const enriched = existing.map((entry) => { + if (entry.fixture.trim() !== fixtureTrimmed) return entry; + const review = reviewByRule.get(entry.ruleId.trim()); + if (!review) return entry; + matchCount++; + return { + ...entry, + ...(review.confidence && { confidence: review.confidence }), + ...(review.pro && { pro: review.pro }), + ...(review.con && { con: review.con }), + ...(review.decision && { decision: review.decision }), + }; + }); + + if (matchCount === 0) { + console.warn(`[enrich] No entries matched fixture="${fixture}" — evidence unchanged`); + return; + } + writeJsonArray(evidencePath, enriched); +} + // --- Discovery evidence --- const DEFAULT_DISCOVERY_PATH = resolve("data/discovery-evidence.json"); diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts index d56d74a3..06253dfb 100644 --- a/src/agents/run-directory.test.ts +++ b/src/agents/run-directory.test.ts @@ -333,6 +333,63 @@ describe("checkConvergence", () => { expect(summary.converged).toBe(false); expect(summary.reason).toBe("no arbitrator result"); }); + + it("converged on early-stop (stoppingReason + no arbitrator)", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + critic: { summary: "rejected=2", reviews: [] }, + arbitrator: null, + stoppingReason: "all-high-confidence-reject", + }), + ); + const summary = checkConvergence(tempDir); + 
expect(summary.converged).toBe(true); + expect(summary.reason).toContain("early-stop"); + expect(summary.reason).toContain("all-high-confidence-reject"); + }); + + it("hold decisions prevent convergence", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "hold=1", + decisions: [{ ruleId: "a", decision: "hold" }], + }, + }), + ); + const summary = checkConvergence(tempDir); + expect(summary.converged).toBe(false); + expect(summary.hold).toBe(1); + }); + + it("hold prevents convergence even in lenient mode", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "hold=1", + decisions: [{ ruleId: "a", decision: "hold" }], + }, + }), + ); + const summary = checkConvergence(tempDir, { lenient: true }); + expect(summary.converged).toBe(false); + }); + + it("isConverged delegates to checkConvergence", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + critic: { summary: "rejected=1", reviews: [] }, + arbitrator: null, + stoppingReason: "all-high-confidence-reject", + }), + ); + // isConverged should also return true for early-stop + expect(isConverged(tempDir)).toBe(true); + }); }); describe("listCalibrationRuns", () => { diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index f44133ce..028eb354 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -168,14 +168,19 @@ const DebateDecisionSchema = z.object({ reason: z.string().optional(), }).passthrough(); +const CriticReviewSchema = z.object({ + ruleId: z.string(), + decision: z.string(), + reason: z.string().optional(), + revised: z.number().optional(), + confidence: z.enum(["high", "medium", "low"]).optional(), + pro: z.array(z.string()).optional(), + con: z.array(z.string()).optional(), +}).passthrough(); + const CriticSchema = z.object({ summary: z.string(), - reviews: z.array(z.object({ - ruleId: z.string(), - decision: z.string(), - reason: 
z.string().optional(), - revised: z.number().optional(), - }).passthrough()), + reviews: z.array(CriticReviewSchema), }).passthrough(); const ArbitratorSchema = z.object({ @@ -184,10 +189,12 @@ const ArbitratorSchema = z.object({ newRuleProposals: z.array(z.unknown()).optional(), }).passthrough(); +/** stoppingReason canonical location: debate.json top level (not inside arbitrator) */ const DebateResultSchema = z.object({ critic: CriticSchema.nullable().default(null), arbitrator: ArbitratorSchema.nullable().default(null), skipped: z.string().optional(), + stoppingReason: z.string().optional(), }).passthrough(); /** A single decision from the Arbitrator in debate.json. */ @@ -253,6 +260,7 @@ export interface ConvergenceSummary { applied: number; revised: number; rejected: number; + hold: number; kept: number; total: number; reason: string; @@ -266,36 +274,43 @@ export function checkConvergence(runDir: string, options?: ConvergenceOptions): const debate = parseDebateResult(runDir); if (!debate) { - return { converged: false, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: "no debate.json found" }; + return { converged: false, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: "no debate.json found" }; } if (debate.skipped) { - return { converged: true, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: debate.skipped }; + return { converged: true, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: debate.skipped }; } if (!debate.arbitrator) { - return { converged: false, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: "no arbitrator result" }; + // Early-stop: Arbitrator skipped because all proposals rejected with high confidence + if (debate.stoppingReason) { + return { converged: true, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: `early-stop: ${debate.stoppingReason}` }; + } + return { converged: false, mode, 
applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: "no arbitrator result" }; } const decisions = debate.arbitrator.decisions; const applied = decisions.filter((d) => d.decision.trim().toLowerCase() === "applied").length; const revised = decisions.filter((d) => d.decision.trim().toLowerCase() === "revised").length; const rejected = decisions.filter((d) => d.decision.trim().toLowerCase() === "rejected").length; - const kept = decisions.length - applied - revised - rejected; + const hold = decisions.filter((d) => d.decision.trim().toLowerCase() === "hold").length; + const kept = decisions.length - applied - revised - rejected - hold; const total = decisions.length; + // hold = "not enough confidence to decide" → not converged (need more evidence) const converged = options?.lenient - ? (applied + revised) === 0 - : (applied + revised) === 0 && rejected === 0; + ? (applied + revised + hold) === 0 + : (applied + revised + hold) === 0 && rejected === 0; const parts: string[] = []; if (applied > 0) parts.push(`${applied} applied`); if (revised > 0) parts.push(`${revised} revised`); if (rejected > 0) parts.push(`${rejected} rejected`); + if (hold > 0) parts.push(`${hold} hold`); if (kept > 0) parts.push(`${kept} kept`); const countsStr = parts.length > 0 ? parts.join(", ") : "no decisions"; const verdict = converged ? "converged" : "not converged"; const reason = `${verdict} (${mode}) — ${countsStr} (${total} total)`; - return { converged, mode, applied, revised, rejected, kept, total, reason }; + return { converged, mode, applied, revised, rejected, hold, kept, total, reason }; } /** Options for convergence checking. */ @@ -309,22 +324,8 @@ export interface ConvergenceOptions { /** * Check if a calibration run has converged. - * Strict: no applied/revised AND no rejected decisions. - * Lenient: no applied/revised only (rejected proposals allowed). + * Delegates to checkConvergence to avoid duplicating early-stop / hold logic. 
*/ export function isConverged(runDir: string, options?: ConvergenceOptions): boolean { - const debate = parseDebateResult(runDir); - if (!debate) return false; - if (debate.skipped) return true; // zero proposals = converged - if (!debate.arbitrator) return false; - const decisions = debate.arbitrator.decisions; - const applied = decisions.filter((d) => { - const dec = d.decision.trim().toLowerCase(); - return dec === "applied" || dec === "revised"; - }).length; - const rejected = decisions.filter((d) => d.decision.trim().toLowerCase() === "rejected").length; - if (options?.lenient) { - return applied === 0; - } - return applied === 0 && rejected === 0; + return checkConvergence(runDir, options).converged; } diff --git a/src/cli/commands/internal/calibrate-debate.test.ts b/src/cli/commands/internal/calibrate-debate.test.ts new file mode 100644 index 00000000..261f4e84 --- /dev/null +++ b/src/cli/commands/internal/calibrate-debate.test.ts @@ -0,0 +1,191 @@ +import { mkdtempSync, writeFileSync, existsSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { rm } from "node:fs/promises"; + +import { gatherEvidence, loadProposedRuleIds } from "./calibrate-debate.js"; +import { parseDebateResult } from "../../../agents/run-directory.js"; + +describe("gatherEvidence", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "gather-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + it("extracts ruleImpactAssessment and uncoveredStruggles from conversion.json", () => { + writeFileSync(join(runDir, "conversion.json"), JSON.stringify({ + ruleImpactAssessment: [ + { ruleId: "no-auto-layout", issueCount: 3, actualImpact: "easy" }, + ], + uncoveredStruggles: [ + { description: "border radius mismatch" }, + ], + })); + + const evidence = gatherEvidence(runDir, []); + expect(evidence.ruleImpactAssessment).toHaveLength(1); + 
expect(evidence.uncoveredStruggles).toHaveLength(1); + }); + + it("filters gaps to actionable only", () => { + writeFileSync(join(runDir, "gaps.json"), JSON.stringify({ + gaps: [ + { category: "spacing", actionable: true, description: "padding off" }, + { category: "rendering", actionable: false, description: "font fallback" }, + { category: "color", actionable: true, description: "wrong shade" }, + ], + })); + + const evidence = gatherEvidence(runDir, []); + expect(evidence.actionableGaps).toHaveLength(2); + }); + + it("handles missing files gracefully", () => { + const evidence = gatherEvidence(runDir, []); + expect(evidence.ruleImpactAssessment).toHaveLength(0); + expect(evidence.uncoveredStruggles).toHaveLength(0); + expect(evidence.actionableGaps).toHaveLength(0); + expect(evidence.priorEvidence).toEqual({}); + }); + + it("returns empty priorEvidence when no ruleIds proposed", () => { + const evidence = gatherEvidence(runDir, []); + expect(evidence.priorEvidence).toEqual({}); + }); +}); + +describe("loadProposedRuleIds", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "proposed-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + it("loads from proposed-rules.json when available", () => { + writeFileSync(join(runDir, "proposed-rules.json"), JSON.stringify(["no-auto-layout", "raw-value"])); + const ids = loadProposedRuleIds(runDir); + expect(ids).toEqual(["no-auto-layout", "raw-value"]); + }); + + it("falls back to summary.md regex when no proposed-rules.json", () => { + writeFileSync(join(runDir, "summary.md"), "## Overscored\n| `no-auto-layout` | -10 | easy |\n| `raw-value` | -3 |"); + const ids = loadProposedRuleIds(runDir); + expect(ids).toContain("no-auto-layout"); + expect(ids).toContain("raw-value"); + }); + + it("returns empty for missing files", () => { + const ids = loadProposedRuleIds(runDir); + expect(ids).toEqual([]); + }); + + it("prefers 
proposed-rules.json over summary.md", () => { + writeFileSync(join(runDir, "proposed-rules.json"), JSON.stringify(["rule-a"])); + writeFileSync(join(runDir, "summary.md"), "| `rule-a` | | |\n| `rule-b` | | |"); + const ids = loadProposedRuleIds(runDir); + // Should only have rule-a from proposed-rules.json, not rule-b from summary.md + expect(ids).toEqual(["rule-a"]); + }); +}); + +describe("calibrate-finalize-debate logic", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "finalize-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + it("detects early-stop when all critic reviews are high-confidence REJECT", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { + summary: "rejected=2", + reviews: [ + { ruleId: "a", decision: "REJECT", confidence: "high", pro: [], con: ["weak"], reason: "x" }, + { ruleId: "b", decision: "REJECT", confidence: "high", pro: [], con: ["weak"], reason: "y" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const reviews = debate.critic!.reviews; + const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => + r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high" + ); + expect(allHighConfidenceReject).toBe(true); + }); + + it("does NOT early-stop when reviews are mixed", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { + summary: "approved=1 rejected=1", + reviews: [ + { ruleId: "a", decision: "APPROVE", confidence: "high", reason: "x" }, + { ruleId: "b", decision: "REJECT", confidence: "medium", reason: "y" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const reviews = debate.critic!.reviews; + const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => + r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high" + ); + expect(allHighConfidenceReject).toBe(false); + }); + + 
it("detects low-confidence-hold when all arbitrator decisions are hold", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { summary: "revised=2", reviews: [] }, + arbitrator: { + summary: "hold=2", + decisions: [ + { ruleId: "a", decision: "hold" }, + { ruleId: "b", decision: "hold" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const decisions = debate.arbitrator!.decisions; + const allHold = decisions.length > 0 && decisions.every((d) => + d.decision.trim().toLowerCase() === "hold" + ); + expect(allHold).toBe(true); + }); + + it("no stoppingReason for normal completion", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { summary: "approved=1", reviews: [] }, + arbitrator: { + summary: "applied=1", + decisions: [ + { ruleId: "a", decision: "applied", before: -10, after: -7 }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const decisions = debate.arbitrator!.decisions; + const allHold = decisions.length > 0 && decisions.every((d) => + d.decision.trim().toLowerCase() === "hold" + ); + expect(allHold).toBe(false); + }); + + it("returns null for missing debate.json", () => { + expect(parseDebateResult(runDir)).toBeNull(); + }); +}); diff --git a/src/cli/commands/internal/calibrate-debate.ts b/src/cli/commands/internal/calibrate-debate.ts new file mode 100644 index 00000000..7a228e29 --- /dev/null +++ b/src/cli/commands/internal/calibrate-debate.ts @@ -0,0 +1,205 @@ +import { existsSync, readFileSync, writeFileSync } from "node:fs"; +import { join, resolve } from "node:path"; +import type { CAC } from "cac"; + +import { parseDebateResult } from "../../../agents/run-directory.js"; +import { loadCalibrationEvidence } from "../../../agents/evidence-collector.js"; + +// ─── calibrate-gather-evidence ────────────────────────────────────────────── + +export interface GatheredEvidence { + ruleImpactAssessment: unknown[]; + uncoveredStruggles: unknown[]; + actionableGaps: 
unknown[];
+  priorEvidence: Record<string, unknown>;
+}
+
+/**
+ * Gather structured evidence for the Critic from run artifacts + cross-run data.
+ * Pure data extraction — no LLM needed.
+ */
+export function gatherEvidence(runDir: string, proposedRuleIds: string[]): GatheredEvidence {
+  const result: GatheredEvidence = {
+    ruleImpactAssessment: [],
+    uncoveredStruggles: [],
+    actionableGaps: [],
+    priorEvidence: {},
+  };
+
+  // 1. conversion.json → ruleImpactAssessment, uncoveredStruggles
+  const convPath = join(runDir, "conversion.json");
+  if (existsSync(convPath)) {
+    try {
+      const conv = JSON.parse(readFileSync(convPath, "utf-8")) as Record<string, unknown>;
+      if (Array.isArray(conv["ruleImpactAssessment"])) {
+        result.ruleImpactAssessment = conv["ruleImpactAssessment"];
+      }
+      if (Array.isArray(conv["uncoveredStruggles"])) {
+        result.uncoveredStruggles = conv["uncoveredStruggles"];
+      }
+    } catch { /* ignore malformed */ }
+  }
+
+  // 2. gaps.json → actionable gaps
+  const gapsPath = join(runDir, "gaps.json");
+  if (existsSync(gapsPath)) {
+    try {
+      const gaps = JSON.parse(readFileSync(gapsPath, "utf-8")) as Record<string, unknown>;
+      const gapList = Array.isArray(gaps["gaps"]) ? gaps["gaps"] : [];
+      result.actionableGaps = gapList.filter(
+        (g): g is Record<string, unknown> =>
+          typeof g === "object" && g !== null && (g as Record<string, unknown>)["actionable"] === true
+      );
+    } catch { /* ignore malformed */ }
+  }
+
+  // 3. Prior evidence filtered to proposed rules only
+  if (proposedRuleIds.length > 0) {
+    const allEvidence = loadCalibrationEvidence();
+    const ruleSet = new Set(proposedRuleIds.map((id) => id.trim()));
+    for (const [ruleId, group] of Object.entries(allEvidence)) {
+      if (ruleSet.has(ruleId)) {
+        result.priorEvidence[ruleId] = group;
+      }
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Load proposed ruleIds from proposed-rules.json (written by calibrate-evaluate).
+ * Falls back to regex extraction from summary.md if file doesn't exist.
+ */
+export function loadProposedRuleIds(runDir: string): string[] {
+  // Preferred: deterministic list from calibrate-evaluate
+  const proposedPath = join(runDir, "proposed-rules.json");
+  if (existsSync(proposedPath)) {
+    try {
+      const raw: unknown = JSON.parse(readFileSync(proposedPath, "utf-8"));
+      if (Array.isArray(raw)) return raw.filter((id): id is string => typeof id === "string");
+    } catch { /* fall through to regex */ }
+  }
+
+  // Fallback: extract from summary.md (may have false positives)
+  const summaryPath = join(runDir, "summary.md");
+  if (!existsSync(summaryPath)) return [];
+  try {
+    const content = readFileSync(summaryPath, "utf-8");
+    const ids = new Set<string>();
+    for (const match of content.matchAll(/`([a-z][\w-]*)`/g)) {
+      if (match[1]) ids.add(match[1]);
+    }
+    return [...ids];
+  } catch {
+    return [];
+  }
+}
+
+export function registerGatherEvidence(cli: CAC): void {
+  cli
+    .command(
+      "calibrate-gather-evidence <runDir>",
+      "Gather structured evidence for Critic from run artifacts + cross-run data"
+    )
+    .action((runDir: string) => {
+      const dir = resolve(runDir);
+      if (!existsSync(dir)) {
+        console.log(`Run directory not found: ${runDir}`);
+        return;
+      }
+
+      const proposedRuleIds = loadProposedRuleIds(dir);
+      const evidence = gatherEvidence(dir, proposedRuleIds);
+
+      // Write to file for orchestrator to include in Critic prompt
+      const outPath = join(dir, "critic-evidence.json");
+      writeFileSync(outPath, JSON.stringify(evidence, null, 2) + "\n", "utf-8");
+      console.log(`Gathered evidence: ${evidence.ruleImpactAssessment.length} impact assessments, ${evidence.actionableGaps.length} gaps, ${Object.keys(evidence.priorEvidence).length} prior rules`);
+      console.log(`Written to ${outPath}`);
+    });
+}
+
+// ─── calibrate-finalize-debate ──────────────────────────────────────────────
+
+interface FinalizeResult {
+  action: "early-stop" | "continue" | "finalized";
+  stoppingReason?: string;
+}
+
+export function registerFinalizeDebate(cli: CAC): void {
+  cli
+    .command(
+      "calibrate-finalize-debate <runDir>",
+      "Check early-stop or determine stoppingReason after debate"
+    )
+    .action((runDir: string) => {
+      const dir = resolve(runDir);
+      if (!existsSync(dir)) {
+        console.log(`Run directory not found: ${runDir}`);
+        return;
+      }
+
+      const debate = parseDebateResult(dir);
+      if (!debate) {
+        console.log("No debate.json found");
+        return;
+      }
+
+      const debatePath = join(dir, "debate.json");
+      let raw: Record<string, unknown>;
+      try {
+        raw = JSON.parse(readFileSync(debatePath, "utf-8")) as Record<string, unknown>;
+      } catch {
+        console.log(JSON.stringify({ action: "continue" }));
+        return;
+      }
+
+      // Case 1: Critic done, no Arbitrator yet → check early-stop
+      if (debate.critic && !debate.arbitrator) {
+        const reviews = debate.critic.reviews;
+        const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => {
+          return r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high";
+        });
+
+        if (allHighConfidenceReject) {
+          raw["stoppingReason"] = "all-high-confidence-reject";
+          writeFileSync(debatePath, JSON.stringify(raw, null, 2) + "\n", "utf-8");
+          const result: FinalizeResult = { action: "early-stop", stoppingReason: "all-high-confidence-reject" };
+          console.log(JSON.stringify(result));
+          // exit 0 = early-stop, orchestrator should skip Arbitrator
+          return;
+        }
+
+        const result: FinalizeResult = { action: "continue" };
+        console.log(JSON.stringify(result));
+        // exit 0 but action=continue → orchestrator proceeds to Arbitrator
+        return;
+      }
+
+      // Case 2: Both Critic and Arbitrator done → determine stoppingReason
+      if (debate.arbitrator) {
+        const decisions = debate.arbitrator.decisions;
+        const allHold = decisions.length > 0 && decisions.every((d) =>
+          d.decision.trim().toLowerCase() === "hold"
+        );
+
+        if (allHold) {
+          raw["stoppingReason"] = "low-confidence-hold";
+          writeFileSync(debatePath, JSON.stringify(raw, null, 2) + "\n", "utf-8");
+          const result: FinalizeResult = { action: "finalized", stoppingReason: "low-confidence-hold" };
console.log(JSON.stringify(result)); + return; + } + + // Normal completion — no stoppingReason needed + const result: FinalizeResult = { action: "finalized" }; + console.log(JSON.stringify(result)); + return; + } + + // Fallback + const result: FinalizeResult = { action: "continue" }; + console.log(JSON.stringify(result)); + }); +} diff --git a/src/cli/commands/internal/calibrate-evaluate.ts b/src/cli/commands/internal/calibrate-evaluate.ts index 02e9d637..45665042 100644 --- a/src/cli/commands/internal/calibrate-evaluate.ts +++ b/src/cli/commands/internal/calibrate-evaluate.ts @@ -82,6 +82,15 @@ export function registerCalibrateEvaluate(cli: CAC): void { mismatchCounts[key]++; } + // Write proposed ruleIds for deterministic evidence gathering + if (options.runDir && tuningOutput.adjustments.length > 0) { + const proposedIds = tuningOutput.adjustments.map( + (a: { ruleId: string }) => a.ruleId + ); + const proposedPath = resolve(options.runDir, "proposed-rules.json"); + await writeFile(proposedPath, JSON.stringify(proposedIds) + "\n", "utf-8"); + } + console.log(`\nEvaluation complete.`); console.log(` Validated: ${mismatchCounts.validated}`); console.log(` Overscored: ${mismatchCounts.overscored}`); diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index 35bf5161..85dfea03 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -1,5 +1,5 @@ import { existsSync } from "node:fs"; -import { resolve } from "node:path"; +import { basename, resolve } from "node:path"; import type { CAC } from "cac"; import { @@ -7,6 +7,7 @@ import { listDoneFixtures, moveFixtureToDone, parseDebateResult, + parseRunDirName, extractAppliedRuleIds, extractFixtureName, resolveLatestRunDir, @@ -15,6 +16,7 @@ import { import { pruneCalibrationEvidence, pruneDiscoveryEvidence, + enrichCalibrationEvidence, } from "../../../agents/evidence-collector.js"; export function 
registerFixtureManagement(cli: CAC): void {
@@ -112,6 +114,46 @@ export function registerFixtureManagement(cli: CAC): void {
   });
 }
 
+export function registerEvidenceEnrich(cli: CAC): void {
+  cli
+    .command(
+      "calibrate-enrich-evidence <runDir>",
+      "Enrich evidence with Critic's pro/con/confidence from debate.json"
+    )
+    .action((runDir: string) => {
+      const resolvedDir = resolve(runDir);
+      if (!existsSync(resolvedDir)) {
+        console.log(`Run directory not found: ${runDir}`);
+        return;
+      }
+      const debate = parseDebateResult(resolvedDir);
+      if (!debate?.critic) {
+        console.log("No critic reviews in debate.json — nothing to enrich.");
+        return;
+      }
+
+      // Extract fixture name from run directory (e.g. "material3-kit--2026-03-26-0900" → "material3-kit")
+      const { name: fixture, timestamp } = parseRunDirName(basename(resolvedDir));
+      if (!timestamp) {
+        console.log(`Run directory "${basename(resolvedDir)}" does not match expected <name>--<timestamp> format`);
+        return;
+      }
+
+      const reviews = debate.critic.reviews.map((r) => {
+        const entry: Parameters<typeof enrichCalibrationEvidence>[0][number] = { ruleId: r.ruleId };
+        if (r.confidence) entry.confidence = r.confidence;
+        if (r.pro) entry.pro = r.pro;
+        if (r.con) entry.con = r.con;
+        const dec = r.decision.trim().toUpperCase();
+        if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE" || dec === "HOLD") entry.decision = dec;
+        return entry;
+      });
+
+      enrichCalibrationEvidence(reviews, fixture);
+      console.log(`Enriched calibration evidence for fixture "${fixture}" with ${reviews.length} review(s)`);
+    });
+}
+
 export function registerEvidencePrune(cli: CAC): void {
   cli
     .command(
diff --git a/src/cli/index.ts b/src/cli/index.ts
index 9ee3caef..3074df24 100644
--- a/src/cli/index.ts
+++ b/src/cli/index.ts
@@ -32,7 +32,8 @@ import { registerCalibrateAnalyze } from "./commands/internal/calibrate-analyze.
import { registerCalibrateEvaluate } from "./commands/internal/calibrate-evaluate.js"; import { registerCalibrateGapReport } from "./commands/internal/calibrate-gap-report.js"; import { registerCalibrateRun } from "./commands/internal/calibrate-run.js"; -import { registerFixtureManagement, registerEvidencePrune } from "./commands/internal/fixture-management.js"; +import { registerGatherEvidence, registerFinalizeDebate } from "./commands/internal/calibrate-debate.js"; +import { registerFixtureManagement, registerEvidenceEnrich, registerEvidencePrune } from "./commands/internal/fixture-management.js"; const require = createRequire(import.meta.url); const pkg = require("../../package.json") as { version: string }; @@ -78,7 +79,10 @@ registerCalibrateAnalyze(cli); registerCalibrateEvaluate(cli); registerCalibrateGapReport(cli); registerCalibrateRun(cli); +registerGatherEvidence(cli); +registerFinalizeDebate(cli); registerFixtureManagement(cli); +registerEvidenceEnrich(cli); registerEvidencePrune(cli); // ============================================