From 8196e7966d449c628ab67002c9aac3d2ee0b6c0b Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:14:11 +0900 Subject: [PATCH 01/12] feat: structured debate with pro/con, confidence, and early-stop (#144) Phase 1-3 of calibration pipeline improvements: - Evidence schema: add confidence, pro, con, decision fields - Critic: receives converter assessment + gaps + prior evidence, outputs structured pro/con arguments per proposal - Arbitrator: holds low-confidence decisions instead of applying - Early-stop: skip Arbitrator when all proposals rejected with high confidence - Cross-run evidence enriched with pro/con for better future reviews - CLAUDE.md calibration section updated Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/agents/calibration/arbitrator.md | 20 ++++++-- .claude/agents/calibration/critic.md | 48 ++++++++++++++++++-- .claude/commands/calibrate-loop.md | 58 +++++++++++++++++++++--- CLAUDE.md | 16 ++++++- src/agents/contracts/evidence.ts | 10 ++++ src/agents/evidence-collector.test.ts | 4 ++ src/agents/evidence-collector.ts | 15 ++++++ 7 files changed, 155 insertions(+), 16 deletions(-) diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index bff26849..4dd3b0fa 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -16,6 +16,10 @@ You receive the Runner's proposals and the Critic's reviews, and make final deci - **proposedDisable: true** → if both Runner and Critic agree, set `enabled: false` in `rule-config.ts`. Decision type: `"disabled"`. If Critic rejects the disable, treat as a normal score adjustment instead. - **New rule proposals** → record in `$RUN_DIR/debate.json` only, do NOT add to `rule-config.ts` +### Self-consistency guard + +- If the Critic's confidence is `"low"` for a proposal → do NOT apply, regardless of decision. Set decision to `"hold"` with reason explaining insufficient confidence. 
The evidence will accumulate for future runs. + ## After Deciding 1. Apply approved changes to `src/core/rules/rule-config.ts` @@ -39,16 +43,23 @@ Return this JSON structure: ```json { "timestamp": "", - "summary": "applied=2 rejected=1 revised=1 newProposals=0", + "summary": "applied=2 rejected=1 hold=1 newProposals=0", + "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", "decisions": [ - {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "reason": "Critic revised, midpoint applied"}, - {"ruleId": "X", "decision": "rejected", "reason": "Critic rejection compelling — insufficient evidence"}, - {"ruleId": "X", "decision": "disabled", "reason": "Converged to zero impact across 3+ runs, all easy"} + {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "confidence": "high", "reason": "Critic revised, midpoint applied"}, + {"ruleId": "X", "decision": "rejected", "confidence": "medium", "reason": "Critic rejection compelling — insufficient evidence"}, + {"ruleId": "X", "decision": "hold", "confidence": "low", "reason": "Low confidence — accumulate more evidence before applying"}, + {"ruleId": "X", "decision": "disabled", "confidence": "high", "reason": "Converged to zero impact across 3+ runs, all easy"} ], "newRuleProposals": [] } ``` +### Field requirements + +- **confidence**: carried from Critic's review for each decision +- **stoppingReason**: why the debate ended — `"normal"` (mixed decisions), `"all-high-confidence-reject"` (all rejected with high confidence), `"low-confidence-hold"` (all held due to low confidence) + ## Rules - **Do NOT write to ANY file except `src/core/rules/rule-config.ts`.** No log files, no `new-rule-proposals.md`, no `debate.json`, no `activity.jsonl`. The orchestrator handles ALL other file I/O. @@ -56,3 +67,4 @@ Return this JSON structure: - Only modify `rule-config.ts` for approved score/severity changes. - Never force-push or amend existing commits. 
- If tests fail, revert everything and report which change caused the failure. +- **Never apply changes with `confidence: "low"`.** Hold them for future evidence accumulation. diff --git a/.claude/agents/calibration/critic.md b/.claude/agents/calibration/critic.md index e4bf2632..11a6ee2d 100644 --- a/.claude/agents/calibration/critic.md +++ b/.claude/agents/calibration/critic.md @@ -16,7 +16,17 @@ All critics follow this base protocol: --- You are the Critic agent in a calibration pipeline. -You receive the Runner's proposals and challenge each one independently. +You receive the Runner's proposals along with supporting evidence, and challenge each one independently. + +## Input Context + +You will receive: +1. **Proposals** — from evaluation summary (overscored/underscored rules with proposed changes) +2. **Converter assessment** — `ruleImpactAssessment` showing actual implementation difficulty per rule +3. **Gap analysis** — actionable pixel gaps between Figma and generated code +4. **Prior evidence** — cross-run calibration evidence for the proposed rules (accumulated from past runs) + +Use ALL inputs to form pro/con arguments. Do not rely on proposals alone. 
## Rejection Rules @@ -50,16 +60,46 @@ Return this JSON structure: "timestamp": "", "summary": "approved=1 rejected=1 revised=1", "reviews": [ - {"ruleId": "X", "decision": "APPROVE", "reason": "3 cases, high confidence"}, - {"ruleId": "X", "decision": "REJECT", "reason": "Rule 1 — only 1 case with low confidence"}, - {"ruleId": "X", "decision": "REVISE", "revised": -7, "reason": "Rule 2 — change too large, midpoint applied"} + { + "ruleId": "X", + "decision": "APPROVE", + "confidence": "high", + "pro": ["3 cases across fixtures show easy implementation", "converter rated actualImpact: easy"], + "con": ["all cases from same design system"], + "reason": "Strong cross-run evidence outweighs single-system concern" + }, + { + "ruleId": "X", + "decision": "REJECT", + "confidence": "low", + "pro": ["1 case shows overscored"], + "con": ["only 1 fixture", "no gap analysis data supports this"], + "reason": "Rule 1 — only 1 case with low confidence" + }, + { + "ruleId": "X", + "decision": "REVISE", + "revised": -7, + "confidence": "medium", + "pro": ["converter found moderate difficulty, current score implies hard"], + "con": ["gap analysis shows some pixel impact in this area"], + "reason": "Rule 2 — change too large, midpoint applied" + } ] } ``` +### Field requirements + +- **confidence**: `"high"` | `"medium"` | `"low"` — your assessment of the proposal's reliability +- **pro**: array of evidence points supporting the proposed change +- **con**: array of evidence points against the proposed change +- **reason**: final verdict synthesizing pro/con + ## Rules - **Do NOT write any files.** The orchestrator handles all file I/O. - Do NOT modify `src/rules/rule-config.ts`. - Be strict. When in doubt, REJECT or REVISE. - Return your full critique so the Arbitrator can decide. +- **Every review MUST include pro, con, and confidence fields.** No exceptions. 
diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index 18386512..4ef9bc5b 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -135,8 +135,17 @@ If zero proposals, write `$RUN_DIR/debate.json` with skip reason and jump to Ste ### Step 5 — Critic +Before spawning the Critic, gather supporting evidence: + +1. Read `$RUN_DIR/conversion.json` → extract `ruleImpactAssessment` and `uncoveredStruggles` +2. Read `$RUN_DIR/gaps.json` (if exists) → extract actionable gaps +3. Read `data/calibration-evidence.json` (if exists) → extract prior evidence for proposed rules + Spawn the `calibration-critic` subagent. In the prompt: -- Include only the proposal list (NOT the Converter's reasoning) +- Include the proposal list from summary.md +- Include the Converter's `ruleImpactAssessment` (actual implementation difficulty per rule) +- Include actionable gaps from Gap Analysis (if available) +- Include prior cross-run evidence for the proposed rules - **Tell the agent: "Return your reviews as JSON. Do NOT write any files."** After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: @@ -145,7 +154,16 @@ After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: "critic": { "timestamp": "", "summary": "approved= rejected= revised=", - "reviews": [ ... ] + "reviews": [ + { + "ruleId": "X", + "decision": "APPROVE|REJECT|REVISE", + "confidence": "high|medium|low", + "pro": ["evidence supporting change"], + "con": ["evidence against change"], + "reason": "..." + } + ] } } ``` @@ -155,6 +173,26 @@ Append to `$RUN_DIR/activity.jsonl`: {"step":"Critic","timestamp":"","result":"approved= rejected= revised=","durationMs":} ``` +#### Early-stop check + +After the Critic returns, check for early termination: + +- If **all reviews** have `decision: "REJECT"` AND `confidence: "high"` → skip Arbitrator. Write debate.json with: + ```json + { + "critic": { ... 
}, + "arbitrator": null, + "stoppingReason": "all-high-confidence-reject" + } + ``` + Append to activity.jsonl: + ```json + {"step":"Arbitrator","timestamp":"","result":"SKIPPED — early-stop: all proposals rejected with high confidence","durationMs":0} + ``` + Jump to Step 6.5. + +Otherwise, proceed to Step 6. + ### Step 6 — Arbitrator Spawn the `calibration-arbitrator` subagent. In the prompt: @@ -167,15 +205,23 @@ After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the "critic": { ... }, "arbitrator": { "timestamp": "", - "summary": "applied= rejected= revised=", - "decisions": [ ... ] + "summary": "applied= rejected= hold=", + "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", + "decisions": [ + { + "ruleId": "X", + "decision": "applied|rejected|hold|disabled", + "confidence": "high|medium|low", + "reason": "..." + } + ] } } ``` Append to `$RUN_DIR/activity.jsonl`: ```json -{"step":"Arbitrator","timestamp":"","result":"applied= rejected=","durationMs":} +{"step":"Arbitrator","timestamp":"","result":"applied= rejected= hold=","durationMs":} ``` ### Step 6.5 — Prune evidence @@ -209,7 +255,7 @@ Report the final summary: similarity, proposals, decisions, and path to `logs/ca - Each agent must be a SEPARATE subagent call (isolated context). - Pass only structured data between agents — never raw reasoning. -- The Critic must NOT see the Runner's or Converter's reasoning, only the proposal list. +- The Critic receives proposals + converter's ruleImpactAssessment + gaps + prior evidence (structured data, not free-form reasoning). - Only the Arbitrator may edit `rule-config.ts`. - Steps 1, 4, 7 are CLI commands — run them directly with Bash. - **CRITICAL: YOU write all files to $RUN_DIR. Subagents (Gap Analyzer, Critic, Arbitrator) MUST return JSON as text — tell them "Do NOT write any files." 
You are the only one who writes to $RUN_DIR.** diff --git a/CLAUDE.md b/CLAUDE.md index 285f79ec..6d1eb561 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -322,10 +322,22 @@ Process: 3. Run `canicode visual-compare` — pixel-level comparison against Figma screenshot 4. Analyze the diff image to categorize pixel gaps (`Gap Analyzer`) 5. Compare conversion difficulty vs rule scores (`canicode calibrate-evaluate`) -6. 6-agent debate loop (`/calibrate-loop`): Analysis → Converter → Gap Analyzer → Evaluation → Critic → Arbitrator +6. Debate loop (`/calibrate-loop`): Analysis → Converter → Gap Analyzer → Evaluation → Critic → Arbitrator + +**Critic receives structured evidence** (#144): +- Proposals from evaluation +- Converter's `ruleImpactAssessment` (actual implementation difficulty per rule) +- Gap analysis (actionable pixel gaps) +- Prior cross-run evidence for proposed rules +- Outputs structured pro/con arguments + confidence level per proposal + +**Early-stop and self-consistency** (#144): +- All proposals rejected with high confidence → Arbitrator skipped (early-stop) +- Low-confidence decisions → held (not applied), evidence accumulates for future runs (self-consistency) +- `stoppingReason` recorded in debate.json for traceability **Cross-run evidence** accumulates across sessions in `data/`: -- `calibration-evidence.json` — overscored/underscored rules (fed to Runner for stronger proposals) +- `calibration-evidence.json` — overscored/underscored rules with confidence, pro/con, decision (fed to Critic for informed review) - `discovery-evidence.json` — uncovered gaps not covered by existing rules (fed to `/add-rule` Researcher) - Discovery evidence is filtered to exclude environment/tooling noise (font CDN, retina/DPI, network, CI constraints) - Evidence is pruned after rules are applied (calibration) or new rules are created (discovery) diff --git a/src/agents/contracts/evidence.ts b/src/agents/contracts/evidence.ts index b97625fb..01035a43 100644 --- 
a/src/agents/contracts/evidence.ts +++ b/src/agents/contracts/evidence.ts @@ -8,6 +8,11 @@ export const CalibrationEvidenceEntrySchema = z.object({ actualDifficulty: z.string(), fixture: z.string(), timestamp: z.string(), + // Phase 1 fields (#144) — optional for backward compatibility with existing evidence + confidence: z.enum(["high", "medium", "low"]).optional(), + pro: z.array(z.string()).optional(), + con: z.array(z.string()).optional(), + decision: z.enum(["APPROVE", "REJECT", "REVISE"]).optional(), }); export type CalibrationEvidenceEntry = z.infer; @@ -17,6 +22,11 @@ export const CrossRunEvidenceGroupSchema = z.object({ underscoredCount: z.number(), overscoredDifficulties: z.array(z.string()), underscoredDifficulties: z.array(z.string()), + // Aggregated pro/con from all entries for this rule + allPro: z.array(z.string()).optional(), + allCon: z.array(z.string()).optional(), + lastConfidence: z.enum(["high", "medium", "low"]).optional(), + lastDecision: z.enum(["APPROVE", "REJECT", "REVISE"]).optional(), }); export type CrossRunEvidenceGroup = z.infer; diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index 25258793..3f9c6dfd 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -62,12 +62,16 @@ describe("evidence-collector", () => { underscoredCount: 1, overscoredDifficulties: ["easy", "moderate"], underscoredDifficulties: ["hard"], + allPro: [], + allCon: [], }); expect(result["rule-b"]).toEqual({ overscoredCount: 0, underscoredCount: 1, overscoredDifficulties: [], underscoredDifficulties: ["hard"], + allPro: [], + allCon: [], }); }); diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 83391ed9..8372ce5a 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -63,6 +63,8 @@ export function loadCalibrationEvidence( underscoredCount: 0, overscoredDifficulties: [], underscoredDifficulties: [], + allPro: [], 
+ allCon: [], }; result[entry.ruleId] = group; } @@ -74,6 +76,19 @@ export function loadCalibrationEvidence( group.underscoredCount++; group.underscoredDifficulties.push(entry.actualDifficulty); } + + // Aggregate pro/con from enriched entries + if (entry.pro) { + group.allPro ??= []; + group.allPro.push(...entry.pro); + } + if (entry.con) { + group.allCon ??= []; + group.allCon.push(...entry.con); + } + // Keep last confidence/decision (most recent entry wins) + if (entry.confidence) group.lastConfidence = entry.confidence; + if (entry.decision) group.lastDecision = entry.decision; } return result; From a298d33b982f9260342402f2607021ca4e607a25 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:22:14 +0900 Subject: [PATCH 02/12] fix: evidence enrichment, hold decisions, and convergence handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review fixes: 1. Evidence enrichment gap: add calibrate-enrich-evidence CLI command and enrichCalibrationEvidence() — writes Critic's pro/con/confidence back to data/calibration-evidence.json (was only in ephemeral logs/) 2. Restore "revised" decision type in Arbitrator (was accidentally dropped) 3. Convergence: "hold" decisions count as not-converged (need more evidence) 4. 
Early-stop convergence: stoppingReason in debate.json → converged=true Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/agents/calibration/arbitrator.md | 11 +++--- .claude/commands/calibrate-loop.md | 18 +++++++-- src/agents/evidence-collector.ts | 36 ++++++++++++++++++ src/agents/run-directory.ts | 28 +++++++++----- .../commands/internal/fixture-management.ts | 37 +++++++++++++++++++ src/cli/index.ts | 3 +- 6 files changed, 114 insertions(+), 19 deletions(-) diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index 4dd3b0fa..b0106e5c 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -10,9 +10,9 @@ You receive the Runner's proposals and the Critic's reviews, and make final deci ## Decision Rules -- **Both APPROVE** → apply Runner's proposed value -- **Critic REJECT** → keep current score (no change) -- **Critic REVISE** → apply the Critic's revised value +- **Both APPROVE** → apply Runner's proposed value (decision: `"applied"`) +- **Critic REJECT** → keep current score (decision: `"rejected"`) +- **Critic REVISE** → apply the Critic's revised value (decision: `"revised"`) - **proposedDisable: true** → if both Runner and Critic agree, set `enabled: false` in `rule-config.ts`. Decision type: `"disabled"`. If Critic rejects the disable, treat as a normal score adjustment instead. 
- **New rule proposals** → record in `$RUN_DIR/debate.json` only, do NOT add to `rule-config.ts` @@ -43,10 +43,11 @@ Return this JSON structure: ```json { "timestamp": "", - "summary": "applied=2 rejected=1 hold=1 newProposals=0", + "summary": "applied=1 revised=1 rejected=1 hold=1 newProposals=0", "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", "decisions": [ - {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "confidence": "high", "reason": "Critic revised, midpoint applied"}, + {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "confidence": "high", "reason": "Strong evidence, applying Runner's value"}, + {"ruleId": "X", "decision": "revised", "before": -10, "after": -8, "confidence": "medium", "reason": "Critic revised, midpoint applied"}, {"ruleId": "X", "decision": "rejected", "confidence": "medium", "reason": "Critic rejection compelling — insufficient evidence"}, {"ruleId": "X", "decision": "hold", "confidence": "low", "reason": "Low confidence — accumulate more evidence before applying"}, {"ruleId": "X", "decision": "disabled", "confidence": "high", "reason": "Converged to zero impact across 3+ runs, all easy"} diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index 4ef9bc5b..ffa1a859 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -205,13 +205,15 @@ After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the "critic": { ... }, "arbitrator": { "timestamp": "", - "summary": "applied= rejected= hold=", + "summary": "applied= revised= rejected= hold=", "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", "decisions": [ { "ruleId": "X", - "decision": "applied|rejected|hold|disabled", + "decision": "applied|revised|rejected|hold|disabled", "confidence": "high|medium|low", + "before": -10, + "after": -7, "reason": "..." 
} ] @@ -224,9 +226,17 @@ Append to `$RUN_DIR/activity.jsonl`: {"step":"Arbitrator","timestamp":"","result":"applied= rejected= hold=","durationMs":} ``` -### Step 6.5 — Prune evidence +### Step 6.5 — Enrich and prune evidence -After the Arbitrator applies changes, prune calibration evidence for the applied rules: +After the debate (or early-stop), enrich `data/calibration-evidence.json` with the Critic's structured pro/con/confidence. This ensures cross-run evidence persists beyond the ephemeral `logs/` directory. + +```bash +npx canicode calibrate-enrich-evidence $RUN_DIR +``` + +This reads `debate.json`, extracts the Critic's reviews (pro, con, confidence, decision), and updates matching entries in `data/calibration-evidence.json`. Runs for both normal and early-stop paths. + +Then prune calibration evidence for the applied rules: ```bash npx canicode calibrate-prune-evidence $RUN_DIR diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 8372ce5a..c8cca5c2 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -134,6 +134,42 @@ export function pruneCalibrationEvidence( writeJsonArray(evidencePath, pruned); } +/** + * Enrich existing calibration evidence entries with Critic's structured review data. + * Matches by ruleId and updates confidence/pro/con/decision fields. + * Entries without a matching review are left unchanged. 
+ */ +export function enrichCalibrationEvidence( + reviews: Array<{ + ruleId: string; + confidence?: "high" | "medium" | "low"; + pro?: string[]; + con?: string[]; + decision?: "APPROVE" | "REJECT" | "REVISE"; + }>, + evidencePath: string = DEFAULT_CALIBRATION_PATH +): void { + if (reviews.length === 0) return; + const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); + if (existing.length === 0) return; + + const reviewByRule = new Map(reviews.map((r) => [r.ruleId.trim(), r])); + + const enriched = existing.map((entry) => { + const review = reviewByRule.get(entry.ruleId.trim()); + if (!review) return entry; + return { + ...entry, + ...(review.confidence && { confidence: review.confidence }), + ...(review.pro && { pro: review.pro }), + ...(review.con && { con: review.con }), + ...(review.decision && { decision: review.decision }), + }; + }); + + writeJsonArray(evidencePath, enriched); +} + // --- Discovery evidence --- const DEFAULT_DISCOVERY_PATH = resolve("data/discovery-evidence.json"); diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index f44133ce..f30a5c18 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -253,6 +253,7 @@ export interface ConvergenceSummary { applied: number; revised: number; rejected: number; + hold: number; kept: number; total: number; reason: string; @@ -266,36 +267,44 @@ export function checkConvergence(runDir: string, options?: ConvergenceOptions): const debate = parseDebateResult(runDir); if (!debate) { - return { converged: false, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: "no debate.json found" }; + return { converged: false, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: "no debate.json found" }; } if (debate.skipped) { - return { converged: true, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: debate.skipped }; + return { converged: true, mode, applied: 0, revised: 0, rejected: 
0, hold: 0, kept: 0, total: 0, reason: debate.skipped }; } if (!debate.arbitrator) { - return { converged: false, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: "no arbitrator result" }; + // Early-stop: Arbitrator skipped because all proposals rejected with high confidence + const stoppingReason = (debate as Record)["stoppingReason"]; + if (typeof stoppingReason === "string" && stoppingReason.length > 0) { + return { converged: true, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: `early-stop: ${stoppingReason}` }; + } + return { converged: false, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: "no arbitrator result" }; } const decisions = debate.arbitrator.decisions; const applied = decisions.filter((d) => d.decision.trim().toLowerCase() === "applied").length; const revised = decisions.filter((d) => d.decision.trim().toLowerCase() === "revised").length; const rejected = decisions.filter((d) => d.decision.trim().toLowerCase() === "rejected").length; - const kept = decisions.length - applied - revised - rejected; + const hold = decisions.filter((d) => d.decision.trim().toLowerCase() === "hold").length; + const kept = decisions.length - applied - revised - rejected - hold; const total = decisions.length; + // hold = "not enough confidence to decide" → not converged (need more evidence) const converged = options?.lenient - ? (applied + revised) === 0 - : (applied + revised) === 0 && rejected === 0; + ? (applied + revised + hold) === 0 + : (applied + revised + hold) === 0 && rejected === 0; const parts: string[] = []; if (applied > 0) parts.push(`${applied} applied`); if (revised > 0) parts.push(`${revised} revised`); if (rejected > 0) parts.push(`${rejected} rejected`); + if (hold > 0) parts.push(`${hold} hold`); if (kept > 0) parts.push(`${kept} kept`); const countsStr = parts.length > 0 ? parts.join(", ") : "no decisions"; const verdict = converged ? 
"converged" : "not converged"; const reason = `${verdict} (${mode}) — ${countsStr} (${total} total)`; - return { converged, mode, applied, revised, rejected, kept, total, reason }; + return { converged, mode, applied, revised, rejected, hold, kept, total, reason }; } /** Options for convergence checking. */ @@ -322,9 +331,10 @@ export function isConverged(runDir: string, options?: ConvergenceOptions): boole const dec = d.decision.trim().toLowerCase(); return dec === "applied" || dec === "revised"; }).length; + const hold = decisions.filter((d) => d.decision.trim().toLowerCase() === "hold").length; const rejected = decisions.filter((d) => d.decision.trim().toLowerCase() === "rejected").length; if (options?.lenient) { - return applied === 0; + return applied === 0 && hold === 0; } - return applied === 0 && rejected === 0; + return applied === 0 && hold === 0 && rejected === 0; } diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index 35bf5161..ff6c8e87 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -15,6 +15,7 @@ import { import { pruneCalibrationEvidence, pruneDiscoveryEvidence, + enrichCalibrationEvidence, } from "../../../agents/evidence-collector.js"; export function registerFixtureManagement(cli: CAC): void { @@ -112,6 +113,42 @@ export function registerFixtureManagement(cli: CAC): void { }); } +export function registerEvidenceEnrich(cli: CAC): void { + cli + .command( + "calibrate-enrich-evidence ", + "Enrich evidence with Critic's pro/con/confidence from debate.json" + ) + .action((runDir: string) => { + if (!existsSync(resolve(runDir))) { + console.log(`Run directory not found: ${runDir}`); + return; + } + const debate = parseDebateResult(resolve(runDir)); + if (!debate?.critic) { + console.log("No critic reviews in debate.json — nothing to enrich."); + return; + } + + const reviews = debate.critic.reviews.map((r) => { + const 
raw = r as Record; + const entry: Parameters[0][number] = { ruleId: r.ruleId }; + const conf = raw["confidence"]; + if (conf === "high" || conf === "medium" || conf === "low") entry.confidence = conf; + const pro = raw["pro"]; + if (Array.isArray(pro)) entry.pro = pro as string[]; + const con = raw["con"]; + if (Array.isArray(con)) entry.con = con as string[]; + const dec = r.decision; + if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE") entry.decision = dec; + return entry; + }); + + enrichCalibrationEvidence(reviews); + console.log(`Enriched calibration evidence with ${reviews.length} review(s)`); + }); +} + export function registerEvidencePrune(cli: CAC): void { cli .command( diff --git a/src/cli/index.ts b/src/cli/index.ts index 9ee3caef..3ecb00ab 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -32,7 +32,7 @@ import { registerCalibrateAnalyze } from "./commands/internal/calibrate-analyze. import { registerCalibrateEvaluate } from "./commands/internal/calibrate-evaluate.js"; import { registerCalibrateGapReport } from "./commands/internal/calibrate-gap-report.js"; import { registerCalibrateRun } from "./commands/internal/calibrate-run.js"; -import { registerFixtureManagement, registerEvidencePrune } from "./commands/internal/fixture-management.js"; +import { registerFixtureManagement, registerEvidenceEnrich, registerEvidencePrune } from "./commands/internal/fixture-management.js"; const require = createRequire(import.meta.url); const pkg = require("../../package.json") as { version: string }; @@ -79,6 +79,7 @@ registerCalibrateEvaluate(cli); registerCalibrateGapReport(cli); registerCalibrateRun(cli); registerFixtureManagement(cli); +registerEvidenceEnrich(cli); registerEvidencePrune(cli); // ============================================ From 534daaf178a1488ff9cb51488b7ac312de9fc8c1 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:32:01 +0900 Subject: [PATCH 03/12] =?UTF-8?q?fix:=20review=20feedback=20=E2=80=94=20co?= 
=?UTF-8?q?nvergence=20sync,=20fixture-scoped=20enrich,=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses code review feedback: 1. isConverged now delegates to checkConvergence (single source of truth for early-stop / hold logic) 2. enrichCalibrationEvidence scoped by (ruleId, fixture) — no longer overwrites entries from other fixtures 3. stoppingReason canonical location: debate.json top level only 4. CLI calibrate-enrich-evidence uses process.exitCode = 1 on errors 5. Tests added: enrichCalibrationEvidence (3), checkConvergence early-stop, hold convergence (strict + lenient), isConverged delegation Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/agents/calibration/arbitrator.md | 3 +- .claude/commands/calibrate-loop.md | 7 ++- src/agents/evidence-collector.test.ts | 43 +++++++++++++- src/agents/evidence-collector.ts | 5 +- src/agents/run-directory.test.ts | 57 +++++++++++++++++++ src/agents/run-directory.ts | 19 +------ .../commands/internal/fixture-management.ts | 23 ++++++-- 7 files changed, 127 insertions(+), 30 deletions(-) diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index b0106e5c..90026041 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -44,7 +44,6 @@ Return this JSON structure: { "timestamp": "", "summary": "applied=1 revised=1 rejected=1 hold=1 newProposals=0", - "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", "decisions": [ {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "confidence": "high", "reason": "Strong evidence, applying Runner's value"}, {"ruleId": "X", "decision": "revised", "before": -10, "after": -8, "confidence": "medium", "reason": "Critic revised, midpoint applied"}, @@ -59,7 +58,7 @@ Return this JSON structure: ### Field requirements - **confidence**: carried from Critic's review for each decision -- **stoppingReason**: 
why the debate ended — `"normal"` (mixed decisions), `"all-high-confidence-reject"` (all rejected with high confidence), `"low-confidence-hold"` (all held due to low confidence) +- **Note**: `stoppingReason` is written by the orchestrator at the debate.json top level, not inside the arbitrator object ## Rules diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index ffa1a859..f40c8fb4 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -199,14 +199,14 @@ Spawn the `calibration-arbitrator` subagent. In the prompt: - Include proposals and the Critic's reviews from `$RUN_DIR/debate.json` - **Tell the agent: "Return your decisions as JSON. Only edit rule-config.ts if applying changes. Do NOT write to logs."** -After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field: +After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field. Also set `stoppingReason` at the **top level** (canonical location — never inside arbitrator object): + ```json { "critic": { ... }, "arbitrator": { "timestamp": "", "summary": "applied= revised= rejected= hold=", - "stoppingReason": "normal|all-high-confidence-reject|low-confidence-hold", "decisions": [ { "ruleId": "X", @@ -217,7 +217,8 @@ After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the "reason": "..." 
} ] - } + }, + "stoppingReason": "normal|low-confidence-hold" } ``` diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index 3f9c6dfd..1841c26c 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -1,9 +1,10 @@ -import { mkdirSync, rmSync, writeFileSync, readFileSync } from "node:fs"; +import { existsSync, mkdirSync, rmSync, writeFileSync, readFileSync } from "node:fs"; import { join } from "node:path"; import { tmpdir } from "node:os"; import { loadCalibrationEvidence, appendCalibrationEvidence, + enrichCalibrationEvidence, pruneCalibrationEvidence, loadDiscoveryEvidence, appendDiscoveryEvidence, @@ -140,6 +141,46 @@ describe("evidence-collector", () => { }); }); + describe("enrichCalibrationEvidence", () => { + it("enriches entries matching (ruleId, fixture)", () => { + const entries: CalibrationEvidenceEntry[] = [ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + { ruleId: "rule-a", type: "overscored", actualDifficulty: "moderate", fixture: "fx2", timestamp: "t2" }, + { ruleId: "rule-b", type: "underscored", actualDifficulty: "hard", fixture: "fx1", timestamp: "t3" }, + ]; + writeFileSync(calPath, JSON.stringify(entries), "utf-8"); + + enrichCalibrationEvidence( + [{ ruleId: "rule-a", confidence: "high", pro: ["easy in practice"], con: ["only 1 case"], decision: "APPROVE" }], + "fx1", + calPath, + ); + + const result = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + // Only fx1 entry for rule-a is enriched + expect(result[0]!.confidence).toBe("high"); + expect(result[0]!.pro).toEqual(["easy in practice"]); + // fx2 entry for rule-a is NOT enriched (different fixture) + expect(result[1]!.confidence).toBeUndefined(); + // rule-b is NOT enriched (different ruleId) + expect(result[2]!.confidence).toBeUndefined(); + }); + + it("does nothing when evidence file is empty", () => { + 
enrichCalibrationEvidence([{ ruleId: "rule-a" }], "fx1", calPath); + expect(existsSync(calPath)).toBe(false); + }); + + it("does nothing when reviews array is empty", () => { + writeFileSync(calPath, JSON.stringify([ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + ]), "utf-8"); + enrichCalibrationEvidence([], "fx1", calPath); + const result = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + expect(result[0]!.confidence).toBeUndefined(); + }); + }); + describe("pruneCalibrationEvidence", () => { it("removes entries for specified ruleIds", () => { const entries: CalibrationEvidenceEntry[] = [ diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index c8cca5c2..8a102ff6 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -136,7 +136,7 @@ export function pruneCalibrationEvidence( /** * Enrich existing calibration evidence entries with Critic's structured review data. - * Matches by ruleId and updates confidence/pro/con/decision fields. + * Matches by (ruleId, fixture) to avoid overwriting entries from other fixtures. * Entries without a matching review are left unchanged. 
*/ export function enrichCalibrationEvidence( @@ -147,6 +147,7 @@ export function enrichCalibrationEvidence( con?: string[]; decision?: "APPROVE" | "REJECT" | "REVISE"; }>, + fixture: string, evidencePath: string = DEFAULT_CALIBRATION_PATH ): void { if (reviews.length === 0) return; @@ -154,8 +155,10 @@ export function enrichCalibrationEvidence( if (existing.length === 0) return; const reviewByRule = new Map(reviews.map((r) => [r.ruleId.trim(), r])); + const fixtureTrimmed = fixture.trim(); const enriched = existing.map((entry) => { + if (entry.fixture.trim() !== fixtureTrimmed) return entry; const review = reviewByRule.get(entry.ruleId.trim()); if (!review) return entry; return { diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts index d56d74a3..06253dfb 100644 --- a/src/agents/run-directory.test.ts +++ b/src/agents/run-directory.test.ts @@ -333,6 +333,63 @@ describe("checkConvergence", () => { expect(summary.converged).toBe(false); expect(summary.reason).toBe("no arbitrator result"); }); + + it("converged on early-stop (stoppingReason + no arbitrator)", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + critic: { summary: "rejected=2", reviews: [] }, + arbitrator: null, + stoppingReason: "all-high-confidence-reject", + }), + ); + const summary = checkConvergence(tempDir); + expect(summary.converged).toBe(true); + expect(summary.reason).toContain("early-stop"); + expect(summary.reason).toContain("all-high-confidence-reject"); + }); + + it("hold decisions prevent convergence", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "hold=1", + decisions: [{ ruleId: "a", decision: "hold" }], + }, + }), + ); + const summary = checkConvergence(tempDir); + expect(summary.converged).toBe(false); + expect(summary.hold).toBe(1); + }); + + it("hold prevents convergence even in lenient mode", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + 
arbitrator: { + summary: "hold=1", + decisions: [{ ruleId: "a", decision: "hold" }], + }, + }), + ); + const summary = checkConvergence(tempDir, { lenient: true }); + expect(summary.converged).toBe(false); + }); + + it("isConverged delegates to checkConvergence", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + critic: { summary: "rejected=1", reviews: [] }, + arbitrator: null, + stoppingReason: "all-high-confidence-reject", + }), + ); + // isConverged should also return true for early-stop + expect(isConverged(tempDir)).toBe(true); + }); }); describe("listCalibrationRuns", () => { diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index f30a5c18..3fed7832 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -318,23 +318,8 @@ export interface ConvergenceOptions { /** * Check if a calibration run has converged. - * Strict: no applied/revised AND no rejected decisions. - * Lenient: no applied/revised only (rejected proposals allowed). + * Delegates to checkConvergence to avoid duplicating early-stop / hold logic. 
*/ export function isConverged(runDir: string, options?: ConvergenceOptions): boolean { - const debate = parseDebateResult(runDir); - if (!debate) return false; - if (debate.skipped) return true; // zero proposals = converged - if (!debate.arbitrator) return false; - const decisions = debate.arbitrator.decisions; - const applied = decisions.filter((d) => { - const dec = d.decision.trim().toLowerCase(); - return dec === "applied" || dec === "revised"; - }).length; - const hold = decisions.filter((d) => d.decision.trim().toLowerCase() === "hold").length; - const rejected = decisions.filter((d) => d.decision.trim().toLowerCase() === "rejected").length; - if (options?.lenient) { - return applied === 0 && hold === 0; - } - return applied === 0 && hold === 0 && rejected === 0; + return checkConvergence(runDir, options).converged; } diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index ff6c8e87..a0b4fde1 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -1,5 +1,5 @@ import { existsSync } from "node:fs"; -import { resolve } from "node:path"; +import { basename, resolve } from "node:path"; import type { CAC } from "cac"; import { @@ -7,6 +7,7 @@ import { listDoneFixtures, moveFixtureToDone, parseDebateResult, + parseRunDirName, extractAppliedRuleIds, extractFixtureName, resolveLatestRunDir, @@ -120,16 +121,26 @@ export function registerEvidenceEnrich(cli: CAC): void { "Enrich evidence with Critic's pro/con/confidence from debate.json" ) .action((runDir: string) => { - if (!existsSync(resolve(runDir))) { - console.log(`Run directory not found: ${runDir}`); + const resolvedDir = resolve(runDir); + if (!existsSync(resolvedDir)) { + console.error(`Run directory not found: ${runDir}`); + process.exitCode = 1; return; } - const debate = parseDebateResult(resolve(runDir)); + const debate = parseDebateResult(resolvedDir); if (!debate?.critic) { 
console.log("No critic reviews in debate.json — nothing to enrich."); return; } + // Extract fixture name from run directory (e.g. "material3-kit--2026-03-26-0900" → "material3-kit") + const { name: fixture } = parseRunDirName(basename(resolvedDir)); + if (!fixture) { + console.error("Cannot extract fixture name from run directory"); + process.exitCode = 1; + return; + } + const reviews = debate.critic.reviews.map((r) => { const raw = r as Record; const entry: Parameters[0][number] = { ruleId: r.ruleId }; @@ -144,8 +155,8 @@ export function registerEvidenceEnrich(cli: CAC): void { return entry; }); - enrichCalibrationEvidence(reviews); - console.log(`Enriched calibration evidence with ${reviews.length} review(s)`); + enrichCalibrationEvidence(reviews, fixture); + console.log(`Enriched calibration evidence for fixture "${fixture}" with ${reviews.length} review(s)`); }); } From d241df15ba56505bdb67cc0710cc37a5ee993a8e Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:35:22 +0900 Subject: [PATCH 04/12] fix: stoppingReason in Zod schema, dedupe pro/con, CLI exit codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review round 2: A. stoppingReason added to DebateResultSchema — no more type cast B. enrichCalibrationEvidence warns when no entries match fixture C. calibrate-prune-evidence CLI now uses process.exitCode = 1 D. allPro/allCon deduplicated in loadCalibrationEvidence grouping F. stoppingReason canonical location commented in schema E (model selection) is a judgment call, not a code change — to evaluate later. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/evidence-collector.ts | 15 ++++++++++++--- src/agents/run-directory.ts | 7 ++++--- src/cli/commands/internal/fixture-management.ts | 6 ++++-- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 8a102ff6..70023add 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -77,14 +77,18 @@ export function loadCalibrationEvidence( group.underscoredDifficulties.push(entry.actualDifficulty); } - // Aggregate pro/con from enriched entries + // Aggregate pro/con from enriched entries (deduplicated) if (entry.pro) { group.allPro ??= []; - group.allPro.push(...entry.pro); + for (const p of entry.pro) { + if (!group.allPro.includes(p)) group.allPro.push(p); + } } if (entry.con) { group.allCon ??= []; - group.allCon.push(...entry.con); + for (const c of entry.con) { + if (!group.allCon.includes(c)) group.allCon.push(c); + } } // Keep last confidence/decision (most recent entry wins) if (entry.confidence) group.lastConfidence = entry.confidence; @@ -157,10 +161,12 @@ export function enrichCalibrationEvidence( const reviewByRule = new Map(reviews.map((r) => [r.ruleId.trim(), r])); const fixtureTrimmed = fixture.trim(); + let matchCount = 0; const enriched = existing.map((entry) => { if (entry.fixture.trim() !== fixtureTrimmed) return entry; const review = reviewByRule.get(entry.ruleId.trim()); if (!review) return entry; + matchCount++; return { ...entry, ...(review.confidence && { confidence: review.confidence }), @@ -170,6 +176,9 @@ export function enrichCalibrationEvidence( }; }); + if (matchCount === 0) { + console.warn(`[enrich] No entries matched fixture="${fixture}" — evidence unchanged`); + } writeJsonArray(evidencePath, enriched); } diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index 3fed7832..1d39c43f 100644 --- a/src/agents/run-directory.ts +++ 
b/src/agents/run-directory.ts @@ -184,10 +184,12 @@ const ArbitratorSchema = z.object({ newRuleProposals: z.array(z.unknown()).optional(), }).passthrough(); +/** stoppingReason canonical location: debate.json top level (not inside arbitrator) */ const DebateResultSchema = z.object({ critic: CriticSchema.nullable().default(null), arbitrator: ArbitratorSchema.nullable().default(null), skipped: z.string().optional(), + stoppingReason: z.string().optional(), }).passthrough(); /** A single decision from the Arbitrator in debate.json. */ @@ -274,9 +276,8 @@ export function checkConvergence(runDir: string, options?: ConvergenceOptions): } if (!debate.arbitrator) { // Early-stop: Arbitrator skipped because all proposals rejected with high confidence - const stoppingReason = (debate as Record)["stoppingReason"]; - if (typeof stoppingReason === "string" && stoppingReason.length > 0) { - return { converged: true, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: `early-stop: ${stoppingReason}` }; + if (debate.stoppingReason) { + return { converged: true, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: `early-stop: ${debate.stoppingReason}` }; } return { converged: false, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: "no arbitrator result" }; } diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index a0b4fde1..f3109448 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -168,12 +168,14 @@ export function registerEvidencePrune(cli: CAC): void { ) .action((runDir: string) => { if (!existsSync(resolve(runDir))) { - console.log(`Run directory not found: ${runDir}`); + console.error(`Run directory not found: ${runDir}`); + process.exitCode = 1; return; } const debate = parseDebateResult(resolve(runDir)); if (!debate) { - console.log("No debate.json found — nothing to 
prune."); + console.error("No debate.json found — nothing to prune."); + process.exitCode = 1; return; } From cc3e4ba005ed030a93e189229f4b35bfb29fd8c1 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:36:13 +0900 Subject: [PATCH 05/12] chore: upgrade Critic and Arbitrator to opus-4-6 These are the most critical judgment steps in the calibration pipeline. Structured pro/con + confidence reasoning benefits from stronger model. Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/agents/calibration/arbitrator.md | 2 +- .claude/agents/calibration/critic.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index 90026041..72d01f51 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -2,7 +2,7 @@ name: calibration-arbitrator description: Makes final calibration decisions by weighing Runner and Critic. Applies approved changes to rule-config.ts and commits. Use after calibration-critic completes. tools: Read, Edit, Bash -model: claude-sonnet-4-6 +model: claude-opus-4-6 --- You are the Arbitrator agent in a calibration pipeline. diff --git a/.claude/agents/calibration/critic.md b/.claude/agents/calibration/critic.md index 11a6ee2d..d0257438 100644 --- a/.claude/agents/calibration/critic.md +++ b/.claude/agents/calibration/critic.md @@ -2,7 +2,7 @@ name: calibration-critic description: Challenges calibration proposals from Runner. Rejects low-confidence or over-aggressive adjustments. Use after calibration-runner completes. 
tools: Read -model: claude-sonnet-4-6 +model: claude-opus-4-6 --- ## Common Review Framework From b0039c78dfde1629ab661e8e1fac9c8a613e8126 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:39:37 +0900 Subject: [PATCH 06/12] =?UTF-8?q?fix:=20final=20polish=20=E2=80=94=20Criti?= =?UTF-8?q?cSchema=20types,=20enrich=20early-return,=20dedupe=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. CriticReviewSchema: confidence/pro/con as typed optional fields, removing Record casts in fixture-management.ts 2. enrichCalibrationEvidence: early-return when no matches (skip I/O) 3. stoppingReason: omit for normal completion, only set for special cases 4. Test: pro/con deduplication in loadCalibrationEvidence Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/commands/calibrate-loop.md | 4 ++-- src/agents/evidence-collector.test.ts | 14 ++++++++++++++ src/agents/evidence-collector.ts | 1 + src/agents/run-directory.ts | 17 +++++++++++------ src/cli/commands/internal/fixture-management.ts | 10 +++------- 5 files changed, 31 insertions(+), 15 deletions(-) diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index f40c8fb4..848d3df8 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -199,7 +199,7 @@ Spawn the `calibration-arbitrator` subagent. In the prompt: - Include proposals and the Critic's reviews from `$RUN_DIR/debate.json` - **Tell the agent: "Return your decisions as JSON. Only edit rule-config.ts if applying changes. Do NOT write to logs."** -After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field. Also set `stoppingReason` at the **top level** (canonical location — never inside arbitrator object): +After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field. 
Only set `stoppingReason` at the **top level** when non-normal termination occurred (e.g. `"low-confidence-hold"`, `"all-high-confidence-reject"`). Omit the field for normal completion: ```json { @@ -218,7 +218,7 @@ After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the } ] }, - "stoppingReason": "normal|low-confidence-hold" + "stoppingReason": "low-confidence-hold" } ``` diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index 1841c26c..b2ced476 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -76,6 +76,20 @@ describe("evidence-collector", () => { }); }); + it("deduplicates pro/con across entries", () => { + const entries: CalibrationEvidenceEntry[] = [ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1", + pro: ["easy in practice", "common pattern"], con: ["small fixture"] }, + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx2", timestamp: "t2", + pro: ["easy in practice", "new evidence"], con: ["small fixture", "single run"] }, + ]; + writeFileSync(calPath, JSON.stringify(entries), "utf-8"); + + const result = loadCalibrationEvidence(calPath); + expect(result["rule-a"]!.allPro).toEqual(["easy in practice", "common pattern", "new evidence"]); + expect(result["rule-a"]!.allCon).toEqual(["small fixture", "single run"]); + }); + it("handles malformed JSON gracefully", () => { writeFileSync(calPath, "not json", "utf-8"); const result = loadCalibrationEvidence(calPath); diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 70023add..3ad2d9fe 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -178,6 +178,7 @@ export function enrichCalibrationEvidence( if (matchCount === 0) { console.warn(`[enrich] No entries matched fixture="${fixture}" — evidence unchanged`); + return; } writeJsonArray(evidencePath, enriched); } 
diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index 1d39c43f..028eb354 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -168,14 +168,19 @@ const DebateDecisionSchema = z.object({ reason: z.string().optional(), }).passthrough(); +const CriticReviewSchema = z.object({ + ruleId: z.string(), + decision: z.string(), + reason: z.string().optional(), + revised: z.number().optional(), + confidence: z.enum(["high", "medium", "low"]).optional(), + pro: z.array(z.string()).optional(), + con: z.array(z.string()).optional(), +}).passthrough(); + const CriticSchema = z.object({ summary: z.string(), - reviews: z.array(z.object({ - ruleId: z.string(), - decision: z.string(), - reason: z.string().optional(), - revised: z.number().optional(), - }).passthrough()), + reviews: z.array(CriticReviewSchema), }).passthrough(); const ArbitratorSchema = z.object({ diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index f3109448..4158a451 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -142,14 +142,10 @@ export function registerEvidenceEnrich(cli: CAC): void { } const reviews = debate.critic.reviews.map((r) => { - const raw = r as Record; const entry: Parameters[0][number] = { ruleId: r.ruleId }; - const conf = raw["confidence"]; - if (conf === "high" || conf === "medium" || conf === "low") entry.confidence = conf; - const pro = raw["pro"]; - if (Array.isArray(pro)) entry.pro = pro as string[]; - const con = raw["con"]; - if (Array.isArray(con)) entry.con = con as string[]; + if (r.confidence) entry.confidence = r.confidence; + if (r.pro) entry.pro = r.pro; + if (r.con) entry.con = r.con; const dec = r.decision; if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE") entry.decision = dec; return entry; From 4018b29b916f9eb0e3147ae3b16fc9609678f279 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 
Mar 2026 18:43:16 +0900 Subject: [PATCH 07/12] refactor: extract deterministic logic from orchestrator into CLI commands Three deterministic tasks the orchestrator (LLM) was doing are now handled by two CLI commands: 1. calibrate-gather-evidence Reads conversion.json, gaps.json, summary.md, calibration-evidence.json and writes a single critic-evidence.json for the Critic prompt. Was: orchestrator reads 3-4 files and extracts relevant parts (error-prone) 2. calibrate-finalize-debate After Critic: checks early-stop (all REJECT + high confidence) After Arbitrator: determines stoppingReason (low-confidence-hold) Was: orchestrator evaluates conditions inline (can misjudge) calibrate-loop.md updated to use CLI commands instead of inline logic. Orchestrator now only: runs CLI commands + spawns LLM agents. Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/commands/calibrate-loop.md | 49 +++-- src/cli/commands/internal/calibrate-debate.ts | 193 ++++++++++++++++++ src/cli/index.ts | 3 + 3 files changed, 223 insertions(+), 22 deletions(-) create mode 100644 src/cli/commands/internal/calibrate-debate.ts diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index 848d3df8..9fadb295 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -135,17 +135,19 @@ If zero proposals, write `$RUN_DIR/debate.json` with skip reason and jump to Ste ### Step 5 — Critic -Before spawning the Critic, gather supporting evidence: +Gather supporting evidence (deterministic CLI — no LLM): -1. Read `$RUN_DIR/conversion.json` → extract `ruleImpactAssessment` and `uncoveredStruggles` -2. Read `$RUN_DIR/gaps.json` (if exists) → extract actionable gaps -3. 
Read `data/calibration-evidence.json` (if exists) → extract prior evidence for proposed rules +```bash +npx canicode calibrate-gather-evidence $RUN_DIR +``` + +This reads `conversion.json`, `gaps.json`, `summary.md`, and `data/calibration-evidence.json`, and writes a single `$RUN_DIR/critic-evidence.json` with structured data for the Critic. + +Read `$RUN_DIR/critic-evidence.json` and include it in the Critic prompt. Spawn the `calibration-critic` subagent. In the prompt: - Include the proposal list from summary.md -- Include the Converter's `ruleImpactAssessment` (actual implementation difficulty per rule) -- Include actionable gaps from Gap Analysis (if available) -- Include prior cross-run evidence for the proposed rules +- Include the gathered evidence from `critic-evidence.json` - **Tell the agent: "Return your reviews as JSON. Do NOT write any files."** After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: @@ -173,25 +175,21 @@ Append to `$RUN_DIR/activity.jsonl`: {"step":"Critic","timestamp":"","result":"approved= rejected= revised=","durationMs":} ``` -#### Early-stop check +#### Early-stop check (deterministic CLI — no LLM) -After the Critic returns, check for early termination: +```bash +npx canicode calibrate-finalize-debate $RUN_DIR +``` -- If **all reviews** have `decision: "REJECT"` AND `confidence: "high"` → skip Arbitrator. Write debate.json with: - ```json - { - "critic": { ... }, - "arbitrator": null, - "stoppingReason": "all-high-confidence-reject" - } - ``` - Append to activity.jsonl: +This outputs JSON: `{"action": "early-stop"|"continue", ...}`. + +- If `action` is `"early-stop"`: the CLI has already written `stoppingReason` to debate.json. Append to activity.jsonl: ```json {"step":"Arbitrator","timestamp":"","result":"SKIPPED — early-stop: all proposals rejected with high confidence","durationMs":0} ``` Jump to Step 6.5. -Otherwise, proceed to Step 6. +- If `action` is `"continue"`: proceed to Step 6. 
### Step 6 — Arbitrator @@ -199,7 +197,7 @@ Spawn the `calibration-arbitrator` subagent. In the prompt: - Include proposals and the Critic's reviews from `$RUN_DIR/debate.json` - **Tell the agent: "Return your decisions as JSON. Only edit rule-config.ts if applying changes. Do NOT write to logs."** -After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field. Only set `stoppingReason` at the **top level** when non-normal termination occurred (e.g. `"low-confidence-hold"`, `"all-high-confidence-reject"`). Omit the field for normal completion: +After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field: ```json { @@ -217,11 +215,18 @@ After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the "reason": "..." } ] - }, - "stoppingReason": "low-confidence-hold" + } } ``` +Then finalize the debate (deterministic CLI — no LLM): + +```bash +npx canicode calibrate-finalize-debate $RUN_DIR +``` + +This determines `stoppingReason` (if any) and writes it to debate.json. Outputs JSON with `action: "finalized"`. 
+ Append to `$RUN_DIR/activity.jsonl`: ```json {"step":"Arbitrator","timestamp":"","result":"applied= rejected= hold=","durationMs":} diff --git a/src/cli/commands/internal/calibrate-debate.ts b/src/cli/commands/internal/calibrate-debate.ts new file mode 100644 index 00000000..8567ca88 --- /dev/null +++ b/src/cli/commands/internal/calibrate-debate.ts @@ -0,0 +1,193 @@ +import { existsSync, readFileSync, writeFileSync } from "node:fs"; +import { join, resolve } from "node:path"; +import type { CAC } from "cac"; + +import { parseDebateResult } from "../../../agents/run-directory.js"; +import { loadCalibrationEvidence } from "../../../agents/evidence-collector.js"; + +// ─── calibrate-gather-evidence ────────────────────────────────────────────── + +interface GatheredEvidence { + ruleImpactAssessment: unknown[]; + uncoveredStruggles: unknown[]; + actionableGaps: unknown[]; + priorEvidence: Record; +} + +/** + * Gather structured evidence for the Critic from run artifacts + cross-run data. + * Pure data extraction — no LLM needed. + */ +function gatherEvidence(runDir: string, proposedRuleIds: string[]): GatheredEvidence { + const result: GatheredEvidence = { + ruleImpactAssessment: [], + uncoveredStruggles: [], + actionableGaps: [], + priorEvidence: {}, + }; + + // 1. conversion.json → ruleImpactAssessment, uncoveredStruggles + const convPath = join(runDir, "conversion.json"); + if (existsSync(convPath)) { + try { + const conv = JSON.parse(readFileSync(convPath, "utf-8")) as Record; + if (Array.isArray(conv["ruleImpactAssessment"])) { + result.ruleImpactAssessment = conv["ruleImpactAssessment"]; + } + if (Array.isArray(conv["uncoveredStruggles"])) { + result.uncoveredStruggles = conv["uncoveredStruggles"]; + } + } catch { /* ignore malformed */ } + } + + // 2. 
gaps.json → actionable gaps + const gapsPath = join(runDir, "gaps.json"); + if (existsSync(gapsPath)) { + try { + const gaps = JSON.parse(readFileSync(gapsPath, "utf-8")) as Record; + const gapList = Array.isArray(gaps["gaps"]) ? gaps["gaps"] : []; + result.actionableGaps = gapList.filter( + (g): g is Record => + typeof g === "object" && g !== null && (g as Record)["actionable"] === true + ); + } catch { /* ignore malformed */ } + } + + // 3. Prior evidence filtered to proposed rules only + if (proposedRuleIds.length > 0) { + const allEvidence = loadCalibrationEvidence(); + const ruleSet = new Set(proposedRuleIds.map((id) => id.trim())); + for (const [ruleId, group] of Object.entries(allEvidence)) { + if (ruleSet.has(ruleId)) { + result.priorEvidence[ruleId] = group; + } + } + } + + return result; +} + +/** + * Extract proposed ruleIds from summary.md. + * Looks for rule IDs in markdown table rows or bullet points. + */ +function extractProposedRuleIds(runDir: string): string[] { + const summaryPath = join(runDir, "summary.md"); + if (!existsSync(summaryPath)) return []; + try { + const content = readFileSync(summaryPath, "utf-8"); + const ids = new Set(); + // Match rule IDs in backticks (common in markdown tables) + for (const match of content.matchAll(/`([a-z][\w-]*)`/g)) { + if (match[1]) ids.add(match[1]); + } + return [...ids]; + } catch { + return []; + } +} + +export function registerGatherEvidence(cli: CAC): void { + cli + .command( + "calibrate-gather-evidence ", + "Gather structured evidence for Critic from run artifacts + cross-run data" + ) + .action((runDir: string) => { + const dir = resolve(runDir); + if (!existsSync(dir)) { + console.error(`Run directory not found: ${runDir}`); + process.exitCode = 1; + return; + } + + const proposedRuleIds = extractProposedRuleIds(dir); + const evidence = gatherEvidence(dir, proposedRuleIds); + + // Write to file for orchestrator to include in Critic prompt + const outPath = join(dir, "critic-evidence.json"); + 
writeFileSync(outPath, JSON.stringify(evidence, null, 2) + "\n", "utf-8"); + console.log(`Gathered evidence: ${evidence.ruleImpactAssessment.length} impact assessments, ${evidence.actionableGaps.length} gaps, ${Object.keys(evidence.priorEvidence).length} prior rules`); + console.log(`Written to ${outPath}`); + }); +} + +// ─── calibrate-finalize-debate ────────────────────────────────────────────── + +interface FinalizeResult { + action: "early-stop" | "continue" | "finalized"; + stoppingReason?: string; +} + +export function registerFinalizeDebate(cli: CAC): void { + cli + .command( + "calibrate-finalize-debate ", + "Check early-stop or determine stoppingReason after debate" + ) + .action((runDir: string) => { + const dir = resolve(runDir); + if (!existsSync(dir)) { + console.error(`Run directory not found: ${runDir}`); + process.exitCode = 1; + return; + } + + const debate = parseDebateResult(dir); + if (!debate) { + console.error("No debate.json found"); + process.exitCode = 1; + return; + } + + const debatePath = join(dir, "debate.json"); + const raw = JSON.parse(readFileSync(debatePath, "utf-8")) as Record; + + // Case 1: Critic done, no Arbitrator yet → check early-stop + if (debate.critic && !debate.arbitrator) { + const reviews = debate.critic.reviews; + const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => { + return r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high"; + }); + + if (allHighConfidenceReject) { + raw["stoppingReason"] = "all-high-confidence-reject"; + writeFileSync(debatePath, JSON.stringify(raw, null, 2) + "\n", "utf-8"); + const result: FinalizeResult = { action: "early-stop", stoppingReason: "all-high-confidence-reject" }; + console.log(JSON.stringify(result)); + // exit 0 = early-stop, orchestrator should skip Arbitrator + return; + } + + const result: FinalizeResult = { action: "continue" }; + console.log(JSON.stringify(result)); + // exit 0 but action=continue → orchestrator proceeds to 
Arbitrator + return; + } + + // Case 2: Both Critic and Arbitrator done → determine stoppingReason + if (debate.arbitrator) { + const decisions = debate.arbitrator.decisions; + const allHold = decisions.length > 0 && decisions.every((d) => + d.decision.trim().toLowerCase() === "hold" + ); + + if (allHold) { + raw["stoppingReason"] = "low-confidence-hold"; + writeFileSync(debatePath, JSON.stringify(raw, null, 2) + "\n", "utf-8"); + const result: FinalizeResult = { action: "finalized", stoppingReason: "low-confidence-hold" }; + console.log(JSON.stringify(result)); + return; + } + + // Normal completion — no stoppingReason needed + const result: FinalizeResult = { action: "finalized" }; + console.log(JSON.stringify(result)); + return; + } + + // Fallback + const result: FinalizeResult = { action: "continue" }; + console.log(JSON.stringify(result)); + }); +} diff --git a/src/cli/index.ts b/src/cli/index.ts index 3ecb00ab..3074df24 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -32,6 +32,7 @@ import { registerCalibrateAnalyze } from "./commands/internal/calibrate-analyze. 
import { registerCalibrateEvaluate } from "./commands/internal/calibrate-evaluate.js"; import { registerCalibrateGapReport } from "./commands/internal/calibrate-gap-report.js"; import { registerCalibrateRun } from "./commands/internal/calibrate-run.js"; +import { registerGatherEvidence, registerFinalizeDebate } from "./commands/internal/calibrate-debate.js"; import { registerFixtureManagement, registerEvidenceEnrich, registerEvidencePrune } from "./commands/internal/fixture-management.js"; const require = createRequire(import.meta.url); @@ -78,6 +79,8 @@ registerCalibrateAnalyze(cli); registerCalibrateEvaluate(cli); registerCalibrateGapReport(cli); registerCalibrateRun(cli); +registerGatherEvidence(cli); +registerFinalizeDebate(cli); registerFixtureManagement(cli); registerEvidenceEnrich(cli); registerEvidencePrune(cli); From 06321d9ce3b1241e975febccb54896f30d62321a Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:45:14 +0900 Subject: [PATCH 08/12] test: add unit tests for calibrate-debate CLI logic 8 tests covering: - gather-evidence: conversion.json parsing, gap filtering, ruleId extraction - finalize-debate: early-stop detection, mixed reviews, hold detection, normal completion, missing debate.json Co-Authored-By: Claude Opus 4.6 (1M context) --- .../internal/calibrate-debate.test.ts | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 src/cli/commands/internal/calibrate-debate.test.ts diff --git a/src/cli/commands/internal/calibrate-debate.test.ts b/src/cli/commands/internal/calibrate-debate.test.ts new file mode 100644 index 00000000..5cec4845 --- /dev/null +++ b/src/cli/commands/internal/calibrate-debate.test.ts @@ -0,0 +1,169 @@ +import { mkdtempSync, writeFileSync, readFileSync, existsSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { rm } from "node:fs/promises"; + +/** + * Import the functions directly to test as units. 
+ * These are the same functions the CLI commands call. + */ + +// We can't import the CLI registration functions directly (they register on CAC), +// so we test the underlying logic by importing from the modules they depend on. +import { parseDebateResult } from "../../../agents/run-directory.js"; +import { loadCalibrationEvidence } from "../../../agents/evidence-collector.js"; + +describe("calibrate-gather-evidence logic", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "gather-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + it("conversion.json ruleImpactAssessment is parseable", () => { + writeFileSync(join(runDir, "conversion.json"), JSON.stringify({ + ruleImpactAssessment: [ + { ruleId: "no-auto-layout", issueCount: 3, actualImpact: "easy" }, + ], + uncoveredStruggles: [ + { description: "border radius mismatch" }, + ], + })); + + const conv = JSON.parse(readFileSync(join(runDir, "conversion.json"), "utf-8")) as Record; + expect(Array.isArray(conv["ruleImpactAssessment"])).toBe(true); + expect(conv["ruleImpactAssessment"]).toHaveLength(1); + expect(Array.isArray(conv["uncoveredStruggles"])).toBe(true); + }); + + it("gaps.json actionable filtering works", () => { + writeFileSync(join(runDir, "gaps.json"), JSON.stringify({ + gaps: [ + { category: "spacing", actionable: true, description: "padding off" }, + { category: "rendering", actionable: false, description: "font fallback" }, + ], + })); + + const gaps = JSON.parse(readFileSync(join(runDir, "gaps.json"), "utf-8")) as Record; + const gapList = Array.isArray(gaps["gaps"]) ? 
gaps["gaps"] : []; + const actionable = gapList.filter( + (g): g is Record => + typeof g === "object" && g !== null && (g as Record)["actionable"] === true + ); + expect(actionable).toHaveLength(1); + expect((actionable[0] as Record)["description"]).toBe("padding off"); + }); + + it("proposed ruleIds are extracted from summary.md", () => { + writeFileSync(join(runDir, "summary.md"), "## Overscored\n| `no-auto-layout` | -10 | easy |\n| `raw-value` | -3 | moderate |"); + + const content = readFileSync(join(runDir, "summary.md"), "utf-8"); + const ids = new Set(); + for (const match of content.matchAll(/`([a-z][\w-]*)`/g)) { + if (match[1]) ids.add(match[1]); + } + expect([...ids]).toContain("no-auto-layout"); + expect([...ids]).toContain("raw-value"); + }); +}); + +describe("calibrate-finalize-debate logic", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "finalize-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + it("detects early-stop when all critic reviews are high-confidence REJECT", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { + summary: "rejected=2", + reviews: [ + { ruleId: "a", decision: "REJECT", confidence: "high", pro: [], con: ["weak"], reason: "x" }, + { ruleId: "b", decision: "REJECT", confidence: "high", pro: [], con: ["weak"], reason: "y" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + expect(debate.critic).not.toBeNull(); + expect(debate.arbitrator).toBeNull(); + + const reviews = debate.critic!.reviews; + const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => + r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high" + ); + expect(allHighConfidenceReject).toBe(true); + }); + + it("does NOT early-stop when reviews are mixed", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { + summary: "approved=1 rejected=1", + reviews: [ + { 
ruleId: "a", decision: "APPROVE", confidence: "high", reason: "x" }, + { ruleId: "b", decision: "REJECT", confidence: "medium", reason: "y" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const reviews = debate.critic!.reviews; + const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => + r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high" + ); + expect(allHighConfidenceReject).toBe(false); + }); + + it("detects low-confidence-hold when all arbitrator decisions are hold", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { summary: "revised=2", reviews: [] }, + arbitrator: { + summary: "hold=2", + decisions: [ + { ruleId: "a", decision: "hold" }, + { ruleId: "b", decision: "hold" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const decisions = debate.arbitrator!.decisions; + const allHold = decisions.length > 0 && decisions.every((d) => + d.decision.trim().toLowerCase() === "hold" + ); + expect(allHold).toBe(true); + }); + + it("no stoppingReason for normal completion", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { summary: "approved=1", reviews: [] }, + arbitrator: { + summary: "applied=1", + decisions: [ + { ruleId: "a", decision: "applied", before: -10, after: -7 }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const decisions = debate.arbitrator!.decisions; + const allHold = decisions.length > 0 && decisions.every((d) => + d.decision.trim().toLowerCase() === "hold" + ); + expect(allHold).toBe(false); + }); + + it("returns null for missing debate.json", () => { + const debate = parseDebateResult(runDir); + expect(debate).toBeNull(); + }); +}); From 4d15939f794caff33d32933431e463a963175d6a Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:50:26 +0900 Subject: [PATCH 09/12] fix: deterministic ruleId extraction + export gatherEvidence for testing - calibrate-evaluate now writes 
proposed-rules.json (deterministic ruleId list) - calibrate-gather-evidence reads proposed-rules.json first, falls back to summary.md regex (eliminates false positive risk) - Export gatherEvidence, loadProposedRuleIds, GatheredEvidence for direct unit testing - Tests rewritten to import functions directly (13 tests) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../internal/calibrate-debate.test.ts | 98 ++++++++++++------- src/cli/commands/internal/calibrate-debate.ts | 23 +++-- .../commands/internal/calibrate-evaluate.ts | 9 ++ 3 files changed, 85 insertions(+), 45 deletions(-) diff --git a/src/cli/commands/internal/calibrate-debate.test.ts b/src/cli/commands/internal/calibrate-debate.test.ts index 5cec4845..261f4e84 100644 --- a/src/cli/commands/internal/calibrate-debate.test.ts +++ b/src/cli/commands/internal/calibrate-debate.test.ts @@ -1,19 +1,12 @@ -import { mkdtempSync, writeFileSync, readFileSync, existsSync } from "node:fs"; +import { mkdtempSync, writeFileSync, existsSync } from "node:fs"; import { join } from "node:path"; import { tmpdir } from "node:os"; import { rm } from "node:fs/promises"; -/** - * Import the functions directly to test as units. - * These are the same functions the CLI commands call. - */ - -// We can't import the CLI registration functions directly (they register on CAC), -// so we test the underlying logic by importing from the modules they depend on. 
+import { gatherEvidence, loadProposedRuleIds } from "./calibrate-debate.js"; import { parseDebateResult } from "../../../agents/run-directory.js"; -import { loadCalibrationEvidence } from "../../../agents/evidence-collector.js"; -describe("calibrate-gather-evidence logic", () => { +describe("gatherEvidence", () => { let runDir: string; beforeEach(() => { @@ -24,7 +17,7 @@ describe("calibrate-gather-evidence logic", () => { await rm(runDir, { recursive: true, force: true }); }); - it("conversion.json ruleImpactAssessment is parseable", () => { + it("extracts ruleImpactAssessment and uncoveredStruggles from conversion.json", () => { writeFileSync(join(runDir, "conversion.json"), JSON.stringify({ ruleImpactAssessment: [ { ruleId: "no-auto-layout", issueCount: 3, actualImpact: "easy" }, @@ -34,40 +27,73 @@ describe("calibrate-gather-evidence logic", () => { ], })); - const conv = JSON.parse(readFileSync(join(runDir, "conversion.json"), "utf-8")) as Record; - expect(Array.isArray(conv["ruleImpactAssessment"])).toBe(true); - expect(conv["ruleImpactAssessment"]).toHaveLength(1); - expect(Array.isArray(conv["uncoveredStruggles"])).toBe(true); + const evidence = gatherEvidence(runDir, []); + expect(evidence.ruleImpactAssessment).toHaveLength(1); + expect(evidence.uncoveredStruggles).toHaveLength(1); }); - it("gaps.json actionable filtering works", () => { + it("filters gaps to actionable only", () => { writeFileSync(join(runDir, "gaps.json"), JSON.stringify({ gaps: [ { category: "spacing", actionable: true, description: "padding off" }, { category: "rendering", actionable: false, description: "font fallback" }, + { category: "color", actionable: true, description: "wrong shade" }, ], })); - const gaps = JSON.parse(readFileSync(join(runDir, "gaps.json"), "utf-8")) as Record; - const gapList = Array.isArray(gaps["gaps"]) ? 
gaps["gaps"] : []; - const actionable = gapList.filter( - (g): g is Record => - typeof g === "object" && g !== null && (g as Record)["actionable"] === true - ); - expect(actionable).toHaveLength(1); - expect((actionable[0] as Record)["description"]).toBe("padding off"); + const evidence = gatherEvidence(runDir, []); + expect(evidence.actionableGaps).toHaveLength(2); + }); + + it("handles missing files gracefully", () => { + const evidence = gatherEvidence(runDir, []); + expect(evidence.ruleImpactAssessment).toHaveLength(0); + expect(evidence.uncoveredStruggles).toHaveLength(0); + expect(evidence.actionableGaps).toHaveLength(0); + expect(evidence.priorEvidence).toEqual({}); + }); + + it("returns empty priorEvidence when no ruleIds proposed", () => { + const evidence = gatherEvidence(runDir, []); + expect(evidence.priorEvidence).toEqual({}); + }); +}); + +describe("loadProposedRuleIds", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "proposed-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); }); - it("proposed ruleIds are extracted from summary.md", () => { - writeFileSync(join(runDir, "summary.md"), "## Overscored\n| `no-auto-layout` | -10 | easy |\n| `raw-value` | -3 | moderate |"); + it("loads from proposed-rules.json when available", () => { + writeFileSync(join(runDir, "proposed-rules.json"), JSON.stringify(["no-auto-layout", "raw-value"])); + const ids = loadProposedRuleIds(runDir); + expect(ids).toEqual(["no-auto-layout", "raw-value"]); + }); - const content = readFileSync(join(runDir, "summary.md"), "utf-8"); - const ids = new Set(); - for (const match of content.matchAll(/`([a-z][\w-]*)`/g)) { - if (match[1]) ids.add(match[1]); - } - expect([...ids]).toContain("no-auto-layout"); - expect([...ids]).toContain("raw-value"); + it("falls back to summary.md regex when no proposed-rules.json", () => { + writeFileSync(join(runDir, "summary.md"), "## Overscored\n| 
`no-auto-layout` | -10 | easy |\n| `raw-value` | -3 |"); + const ids = loadProposedRuleIds(runDir); + expect(ids).toContain("no-auto-layout"); + expect(ids).toContain("raw-value"); + }); + + it("returns empty for missing files", () => { + const ids = loadProposedRuleIds(runDir); + expect(ids).toEqual([]); + }); + + it("prefers proposed-rules.json over summary.md", () => { + writeFileSync(join(runDir, "proposed-rules.json"), JSON.stringify(["rule-a"])); + writeFileSync(join(runDir, "summary.md"), "| `rule-a` | | |\n| `rule-b` | | |"); + const ids = loadProposedRuleIds(runDir); + // Should only have rule-a from proposed-rules.json, not rule-b from summary.md + expect(ids).toEqual(["rule-a"]); }); }); @@ -94,9 +120,6 @@ describe("calibrate-finalize-debate logic", () => { })); const debate = parseDebateResult(runDir)!; - expect(debate.critic).not.toBeNull(); - expect(debate.arbitrator).toBeNull(); - const reviews = debate.critic!.reviews; const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high" @@ -163,7 +186,6 @@ describe("calibrate-finalize-debate logic", () => { }); it("returns null for missing debate.json", () => { - const debate = parseDebateResult(runDir); - expect(debate).toBeNull(); + expect(parseDebateResult(runDir)).toBeNull(); }); }); diff --git a/src/cli/commands/internal/calibrate-debate.ts b/src/cli/commands/internal/calibrate-debate.ts index 8567ca88..9031eb77 100644 --- a/src/cli/commands/internal/calibrate-debate.ts +++ b/src/cli/commands/internal/calibrate-debate.ts @@ -7,7 +7,7 @@ import { loadCalibrationEvidence } from "../../../agents/evidence-collector.js"; // ─── calibrate-gather-evidence ────────────────────────────────────────────── -interface GatheredEvidence { +export interface GatheredEvidence { ruleImpactAssessment: unknown[]; uncoveredStruggles: unknown[]; actionableGaps: unknown[]; @@ -18,7 +18,7 @@ interface GatheredEvidence { * Gather structured 
evidence for the Critic from run artifacts + cross-run data. * Pure data extraction — no LLM needed. */ -function gatherEvidence(runDir: string, proposedRuleIds: string[]): GatheredEvidence { +export function gatherEvidence(runDir: string, proposedRuleIds: string[]): GatheredEvidence { const result: GatheredEvidence = { ruleImpactAssessment: [], uncoveredStruggles: [], @@ -68,16 +68,25 @@ function gatherEvidence(runDir: string, proposedRuleIds: string[]): GatheredEvid } /** - * Extract proposed ruleIds from summary.md. - * Looks for rule IDs in markdown table rows or bullet points. + * Load proposed ruleIds from proposed-rules.json (written by calibrate-evaluate). + * Falls back to regex extraction from summary.md if file doesn't exist. */ -function extractProposedRuleIds(runDir: string): string[] { +export function loadProposedRuleIds(runDir: string): string[] { + // Preferred: deterministic list from calibrate-evaluate + const proposedPath = join(runDir, "proposed-rules.json"); + if (existsSync(proposedPath)) { + try { + const raw: unknown = JSON.parse(readFileSync(proposedPath, "utf-8")); + if (Array.isArray(raw)) return raw.filter((id): id is string => typeof id === "string"); + } catch { /* fall through to regex */ } + } + + // Fallback: extract from summary.md (may have false positives) const summaryPath = join(runDir, "summary.md"); if (!existsSync(summaryPath)) return []; try { const content = readFileSync(summaryPath, "utf-8"); const ids = new Set(); - // Match rule IDs in backticks (common in markdown tables) for (const match of content.matchAll(/`([a-z][\w-]*)`/g)) { if (match[1]) ids.add(match[1]); } @@ -101,7 +110,7 @@ export function registerGatherEvidence(cli: CAC): void { return; } - const proposedRuleIds = extractProposedRuleIds(dir); + const proposedRuleIds = loadProposedRuleIds(dir); const evidence = gatherEvidence(dir, proposedRuleIds); // Write to file for orchestrator to include in Critic prompt diff --git 
a/src/cli/commands/internal/calibrate-evaluate.ts b/src/cli/commands/internal/calibrate-evaluate.ts index 02e9d637..45665042 100644 --- a/src/cli/commands/internal/calibrate-evaluate.ts +++ b/src/cli/commands/internal/calibrate-evaluate.ts @@ -82,6 +82,15 @@ export function registerCalibrateEvaluate(cli: CAC): void { mismatchCounts[key]++; } + // Write proposed ruleIds for deterministic evidence gathering + if (options.runDir && tuningOutput.adjustments.length > 0) { + const proposedIds = tuningOutput.adjustments.map( + (a: { ruleId: string }) => a.ruleId + ); + const proposedPath = resolve(options.runDir, "proposed-rules.json"); + await writeFile(proposedPath, JSON.stringify(proposedIds) + "\n", "utf-8"); + } + console.log(`\nEvaluation complete.`); console.log(` Validated: ${mismatchCounts.validated}`); console.log(` Overscored: ${mismatchCounts.overscored}`); From 0e4c16be7e2a9ec1b400a4a9ec3c84f755805d38 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 18:58:38 +0900 Subject: [PATCH 10/12] fix: revert internal CLI error handling to stdout + exit 0 Internal calibration commands are consumed by subagents that parse stdout. console.error + process.exitCode = 1 breaks this pattern. Reverted all internal commands to console.log + exit 0 for subagent compatibility. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cli/commands/internal/calibrate-debate.ts | 9 +++------ src/cli/commands/internal/fixture-management.ts | 12 ++++-------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/cli/commands/internal/calibrate-debate.ts b/src/cli/commands/internal/calibrate-debate.ts index 9031eb77..bc8f2d36 100644 --- a/src/cli/commands/internal/calibrate-debate.ts +++ b/src/cli/commands/internal/calibrate-debate.ts @@ -105,8 +105,7 @@ export function registerGatherEvidence(cli: CAC): void { .action((runDir: string) => { const dir = resolve(runDir); if (!existsSync(dir)) { - console.error(`Run directory not found: ${runDir}`); - process.exitCode = 1; + console.log(`Run directory not found: ${runDir}`); return; } @@ -137,15 +136,13 @@ export function registerFinalizeDebate(cli: CAC): void { .action((runDir: string) => { const dir = resolve(runDir); if (!existsSync(dir)) { - console.error(`Run directory not found: ${runDir}`); - process.exitCode = 1; + console.log(`Run directory not found: ${runDir}`); return; } const debate = parseDebateResult(dir); if (!debate) { - console.error("No debate.json found"); - process.exitCode = 1; + console.log("No debate.json found"); return; } diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index 4158a451..7664ea45 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -123,8 +123,7 @@ export function registerEvidenceEnrich(cli: CAC): void { .action((runDir: string) => { const resolvedDir = resolve(runDir); if (!existsSync(resolvedDir)) { - console.error(`Run directory not found: ${runDir}`); - process.exitCode = 1; + console.log(`Run directory not found: ${runDir}`); return; } const debate = parseDebateResult(resolvedDir); @@ -136,8 +135,7 @@ export function registerEvidenceEnrich(cli: CAC): void { // Extract fixture name from run directory (e.g. 
"material3-kit--2026-03-26-0900" → "material3-kit") const { name: fixture } = parseRunDirName(basename(resolvedDir)); if (!fixture) { - console.error("Cannot extract fixture name from run directory"); - process.exitCode = 1; + console.log("Cannot extract fixture name from run directory"); return; } @@ -164,14 +162,12 @@ export function registerEvidencePrune(cli: CAC): void { ) .action((runDir: string) => { if (!existsSync(resolve(runDir))) { - console.error(`Run directory not found: ${runDir}`); - process.exitCode = 1; + console.log(`Run directory not found: ${runDir}`); return; } const debate = parseDebateResult(resolve(runDir)); if (!debate) { - console.error("No debate.json found — nothing to prune."); - process.exitCode = 1; + console.log("No debate.json found — nothing to prune."); return; } From 3cdf5825efd78b3d7fe852da80df1fb2d411ef3f Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 19:06:22 +0900 Subject: [PATCH 11/12] fix: guard debate.json re-read + include HOLD in decision enum - finalize-debate: try/catch on second debate.json read, fallback to {action: "continue"} on parse failure (subagent safety) - HOLD added to decision enum in evidence schema, enrichCalibrationEvidence signature, and CLI enrich command (was silently dropped) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/contracts/evidence.ts | 4 ++-- src/agents/evidence-collector.ts | 2 +- src/cli/commands/internal/calibrate-debate.ts | 8 +++++++- src/cli/commands/internal/fixture-management.ts | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/agents/contracts/evidence.ts b/src/agents/contracts/evidence.ts index 01035a43..ec65078c 100644 --- a/src/agents/contracts/evidence.ts +++ b/src/agents/contracts/evidence.ts @@ -12,7 +12,7 @@ export const CalibrationEvidenceEntrySchema = z.object({ confidence: z.enum(["high", "medium", "low"]).optional(), pro: z.array(z.string()).optional(), con: z.array(z.string()).optional(), - decision: 
z.enum(["APPROVE", "REJECT", "REVISE"]).optional(), + decision: z.enum(["APPROVE", "REJECT", "REVISE", "HOLD"]).optional(), }); export type CalibrationEvidenceEntry = z.infer<typeof CalibrationEvidenceEntrySchema>; @@ -26,7 +26,7 @@ export const CrossRunEvidenceGroupSchema = z.object({ allPro: z.array(z.string()).optional(), allCon: z.array(z.string()).optional(), lastConfidence: z.enum(["high", "medium", "low"]).optional(), - lastDecision: z.enum(["APPROVE", "REJECT", "REVISE"]).optional(), + lastDecision: z.enum(["APPROVE", "REJECT", "REVISE", "HOLD"]).optional(), }); export type CrossRunEvidenceGroup = z.infer<typeof CrossRunEvidenceGroupSchema>; diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 3ad2d9fe..5f9e2d2f 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -149,7 +149,7 @@ export function enrichCalibrationEvidence( confidence?: "high" | "medium" | "low"; pro?: string[]; con?: string[]; - decision?: "APPROVE" | "REJECT" | "REVISE"; + decision?: "APPROVE" | "REJECT" | "REVISE" | "HOLD"; }>, fixture: string, evidencePath: string = DEFAULT_CALIBRATION_PATH diff --git a/src/cli/commands/internal/calibrate-debate.ts b/src/cli/commands/internal/calibrate-debate.ts index bc8f2d36..7a228e29 100644 --- a/src/cli/commands/internal/calibrate-debate.ts +++ b/src/cli/commands/internal/calibrate-debate.ts @@ -147,7 +147,13 @@ export function registerFinalizeDebate(cli: CAC): void { } const debatePath = join(dir, "debate.json"); - const raw = JSON.parse(readFileSync(debatePath, "utf-8")) as Record<string, unknown>; + let raw: Record<string, unknown>; + try { + raw = JSON.parse(readFileSync(debatePath, "utf-8")) as Record<string, unknown>; + } catch { + console.log(JSON.stringify({ action: "continue" })); + return; + } // Case 1: Critic done, no Arbitrator yet → check early-stop if (debate.critic && !debate.arbitrator) { diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index 7664ea45..8ed8433e 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ 
b/src/cli/commands/internal/fixture-management.ts @@ -145,7 +145,7 @@ export function registerEvidenceEnrich(cli: CAC): void { if (r.pro) entry.pro = r.pro; if (r.con) entry.con = r.con; const dec = r.decision; - if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE") entry.decision = dec; + if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE" || dec === "HOLD") entry.decision = dec; return entry; }); From 16112aaa7f77d742715254d77785fe4fd7a5597f Mon Sep 17 00:00:00 2001 From: let-sunny Date: Sun, 29 Mar 2026 19:20:52 +0900 Subject: [PATCH 12/12] fix: validate run dir format + normalize decision casing - enrich-evidence: check timestamp suffix instead of dead !fixture check (parseRunDirName never returns falsy name) - enrich-evidence: normalize decision to uppercase before persisting (handles mixed-case "reject"/"Reject" from Critic) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cli/commands/internal/fixture-management.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index 8ed8433e..85dfea03 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -133,9 +133,9 @@ export function registerEvidenceEnrich(cli: CAC): void { } // Extract fixture name from run directory (e.g. 
"material3-kit--2026-03-26-0900" → "material3-kit") - const { name: fixture } = parseRunDirName(basename(resolvedDir)); - if (!fixture) { - console.log("Cannot extract fixture name from run directory"); + const { name: fixture, timestamp } = parseRunDirName(basename(resolvedDir)); + if (!timestamp) { + console.log(`Run directory "${basename(resolvedDir)}" does not match expected <name>--<timestamp> format`); return; } @@ -144,7 +144,7 @@ export function registerEvidenceEnrich(cli: CAC): void { if (r.confidence) entry.confidence = r.confidence; if (r.pro) entry.pro = r.pro; if (r.con) entry.con = r.con; - const dec = r.decision; + const dec = r.decision.trim().toUpperCase(); if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE" || dec === "HOLD") entry.decision = dec; return entry; });