diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index bff26849..72d01f51 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -2,7 +2,7 @@ name: calibration-arbitrator description: Makes final calibration decisions by weighing Runner and Critic. Applies approved changes to rule-config.ts and commits. Use after calibration-critic completes. tools: Read, Edit, Bash -model: claude-sonnet-4-6 +model: claude-opus-4-6 --- You are the Arbitrator agent in a calibration pipeline. @@ -10,12 +10,16 @@ You receive the Runner's proposals and the Critic's reviews, and make final deci ## Decision Rules -- **Both APPROVE** → apply Runner's proposed value -- **Critic REJECT** → keep current score (no change) -- **Critic REVISE** → apply the Critic's revised value +- **Both APPROVE** → apply Runner's proposed value (decision: `"applied"`) +- **Critic REJECT** → keep current score (decision: `"rejected"`) +- **Critic REVISE** → apply the Critic's revised value (decision: `"revised"`) - **proposedDisable: true** → if both Runner and Critic agree, set `enabled: false` in `rule-config.ts`. Decision type: `"disabled"`. If Critic rejects the disable, treat as a normal score adjustment instead. - **New rule proposals** → record in `$RUN_DIR/debate.json` only, do NOT add to `rule-config.ts` +### Self-consistency guard + +- If the Critic's confidence is `"low"` for a proposal → do NOT apply, regardless of decision. Set decision to `"hold"` with reason explaining insufficient confidence. The evidence will accumulate for future runs. + ## After Deciding 1. 
Apply approved changes to `src/core/rules/rule-config.ts` @@ -39,16 +43,23 @@ Return this JSON structure: ```json { "timestamp": "", - "summary": "applied=2 rejected=1 revised=1 newProposals=0", + "summary": "applied=1 revised=1 rejected=1 hold=1 newProposals=0", "decisions": [ - {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "reason": "Critic revised, midpoint applied"}, - {"ruleId": "X", "decision": "rejected", "reason": "Critic rejection compelling — insufficient evidence"}, - {"ruleId": "X", "decision": "disabled", "reason": "Converged to zero impact across 3+ runs, all easy"} + {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "confidence": "high", "reason": "Strong evidence, applying Runner's value"}, + {"ruleId": "X", "decision": "revised", "before": -10, "after": -8, "confidence": "medium", "reason": "Critic revised, midpoint applied"}, + {"ruleId": "X", "decision": "rejected", "confidence": "medium", "reason": "Critic rejection compelling — insufficient evidence"}, + {"ruleId": "X", "decision": "hold", "confidence": "low", "reason": "Low confidence — accumulate more evidence before applying"}, + {"ruleId": "X", "decision": "disabled", "confidence": "high", "reason": "Converged to zero impact across 3+ runs, all easy"} ], "newRuleProposals": [] } ``` +### Field requirements + +- **confidence**: carried from Critic's review for each decision +- **Note**: `stoppingReason` is written by the orchestrator at the debate.json top level, not inside the arbitrator object + ## Rules - **Do NOT write to ANY file except `src/core/rules/rule-config.ts`.** No log files, no `new-rule-proposals.md`, no `debate.json`, no `activity.jsonl`. The orchestrator handles ALL other file I/O. @@ -56,3 +67,4 @@ Return this JSON structure: - Only modify `rule-config.ts` for approved score/severity changes. - Never force-push or amend existing commits. - If tests fail, revert everything and report which change caused the failure. 
+- **Never apply changes with `confidence: "low"`.** Hold them for future evidence accumulation. diff --git a/.claude/agents/calibration/critic.md b/.claude/agents/calibration/critic.md index e4bf2632..d0257438 100644 --- a/.claude/agents/calibration/critic.md +++ b/.claude/agents/calibration/critic.md @@ -2,7 +2,7 @@ name: calibration-critic description: Challenges calibration proposals from Runner. Rejects low-confidence or over-aggressive adjustments. Use after calibration-runner completes. tools: Read -model: claude-sonnet-4-6 +model: claude-opus-4-6 --- ## Common Review Framework @@ -16,7 +16,17 @@ All critics follow this base protocol: --- You are the Critic agent in a calibration pipeline. -You receive the Runner's proposals and challenge each one independently. +You receive the Runner's proposals along with supporting evidence, and challenge each one independently. + +## Input Context + +You will receive: +1. **Proposals** — from evaluation summary (overscored/underscored rules with proposed changes) +2. **Converter assessment** — `ruleImpactAssessment` showing actual implementation difficulty per rule +3. **Gap analysis** — actionable pixel gaps between Figma and generated code +4. **Prior evidence** — cross-run calibration evidence for the proposed rules (accumulated from past runs) + +Use ALL inputs to form pro/con arguments. Do not rely on proposals alone. 
## Rejection Rules @@ -50,16 +60,46 @@ Return this JSON structure: "timestamp": "", "summary": "approved=1 rejected=1 revised=1", "reviews": [ - {"ruleId": "X", "decision": "APPROVE", "reason": "3 cases, high confidence"}, - {"ruleId": "X", "decision": "REJECT", "reason": "Rule 1 — only 1 case with low confidence"}, - {"ruleId": "X", "decision": "REVISE", "revised": -7, "reason": "Rule 2 — change too large, midpoint applied"} + { + "ruleId": "X", + "decision": "APPROVE", + "confidence": "high", + "pro": ["3 cases across fixtures show easy implementation", "converter rated actualImpact: easy"], + "con": ["all cases from same design system"], + "reason": "Strong cross-run evidence outweighs single-system concern" + }, + { + "ruleId": "X", + "decision": "REJECT", + "confidence": "low", + "pro": ["1 case shows overscored"], + "con": ["only 1 fixture", "no gap analysis data supports this"], + "reason": "Rule 1 — only 1 case with low confidence" + }, + { + "ruleId": "X", + "decision": "REVISE", + "revised": -7, + "confidence": "medium", + "pro": ["converter found moderate difficulty, current score implies hard"], + "con": ["gap analysis shows some pixel impact in this area"], + "reason": "Rule 2 — change too large, midpoint applied" + } ] } ``` +### Field requirements + +- **confidence**: `"high"` | `"medium"` | `"low"` — your assessment of the proposal's reliability +- **pro**: array of evidence points supporting the proposed change +- **con**: array of evidence points against the proposed change +- **reason**: final verdict synthesizing pro/con + ## Rules - **Do NOT write any files.** The orchestrator handles all file I/O. - Do NOT modify `src/rules/rule-config.ts`. - Be strict. When in doubt, REJECT or REVISE. - Return your full critique so the Arbitrator can decide. +- **Every review MUST include pro, con, and confidence fields.** No exceptions. 
diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index 18386512..9fadb295 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -135,8 +135,19 @@ If zero proposals, write `$RUN_DIR/debate.json` with skip reason and jump to Ste ### Step 5 — Critic +Gather supporting evidence (deterministic CLI — no LLM): + +```bash +npx canicode calibrate-gather-evidence $RUN_DIR +``` + +This reads `conversion.json`, `gaps.json`, `summary.md`, and `data/calibration-evidence.json`, and writes a single `$RUN_DIR/critic-evidence.json` with structured data for the Critic. + +Read `$RUN_DIR/critic-evidence.json` and include it in the Critic prompt. + Spawn the `calibration-critic` subagent. In the prompt: -- Include only the proposal list (NOT the Converter's reasoning) +- Include the proposal list from summary.md +- Include the gathered evidence from `critic-evidence.json` - **Tell the agent: "Return your reviews as JSON. Do NOT write any files."** After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: @@ -145,7 +156,16 @@ After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: "critic": { "timestamp": "", "summary": "approved= rejected= revised=", - "reviews": [ ... ] + "reviews": [ + { + "ruleId": "X", + "decision": "APPROVE|REJECT|REVISE", + "confidence": "high|medium|low", + "pro": ["evidence supporting change"], + "con": ["evidence against change"], + "reason": "..." + } + ] } } ``` @@ -155,6 +175,22 @@ Append to `$RUN_DIR/activity.jsonl`: {"step":"Critic","timestamp":"","result":"approved= rejected= revised=","durationMs":} ``` +#### Early-stop check (deterministic CLI — no LLM) + +```bash +npx canicode calibrate-finalize-debate $RUN_DIR +``` + +This outputs JSON: `{"action": "early-stop"|"continue", ...}`. + +- If `action` is `"early-stop"`: the CLI has already written `stoppingReason` to debate.json. 
Append to activity.jsonl: + ```json + {"step":"Arbitrator","timestamp":"","result":"SKIPPED — early-stop: all proposals rejected with high confidence","durationMs":0} + ``` + Jump to Step 6.5. + +- If `action` is `"continue"`: proceed to Step 6. + ### Step 6 — Arbitrator Spawn the `calibration-arbitrator` subagent. In the prompt: @@ -162,25 +198,51 @@ Spawn the `calibration-arbitrator` subagent. In the prompt: - **Tell the agent: "Return your decisions as JSON. Only edit rule-config.ts if applying changes. Do NOT write to logs."** After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field: + ```json { "critic": { ... }, "arbitrator": { "timestamp": "", - "summary": "applied= rejected= revised=", - "decisions": [ ... ] + "summary": "applied= revised= rejected= hold=", + "decisions": [ + { + "ruleId": "X", + "decision": "applied|revised|rejected|hold|disabled", + "confidence": "high|medium|low", + "before": -10, + "after": -7, + "reason": "..." + } + ] } } ``` +Then finalize the debate (deterministic CLI — no LLM): + +```bash +npx canicode calibrate-finalize-debate $RUN_DIR +``` + +This determines `stoppingReason` (if any) and writes it to debate.json. Outputs JSON with `action: "finalized"`. + Append to `$RUN_DIR/activity.jsonl`: ```json -{"step":"Arbitrator","timestamp":"","result":"applied= rejected=","durationMs":} +{"step":"Arbitrator","timestamp":"","result":"applied= rejected= hold=","durationMs":} +``` + +### Step 6.5 — Enrich and prune evidence + +After the debate (or early-stop), enrich `data/calibration-evidence.json` with the Critic's structured pro/con/confidence. This ensures cross-run evidence persists beyond the ephemeral `logs/` directory. 
+ +```bash +npx canicode calibrate-enrich-evidence $RUN_DIR ``` -### Step 6.5 — Prune evidence +This reads `debate.json`, extracts the Critic's reviews (pro, con, confidence, decision), and updates matching entries in `data/calibration-evidence.json`. Runs for both normal and early-stop paths. -After the Arbitrator applies changes, prune calibration evidence for the applied rules: +Then prune calibration evidence for the applied rules: ```bash npx canicode calibrate-prune-evidence $RUN_DIR @@ -209,7 +271,7 @@ Report the final summary: similarity, proposals, decisions, and path to `logs/ca - Each agent must be a SEPARATE subagent call (isolated context). - Pass only structured data between agents — never raw reasoning. -- The Critic must NOT see the Runner's or Converter's reasoning, only the proposal list. +- The Critic receives proposals + converter's ruleImpactAssessment + gaps + prior evidence (structured data, not free-form reasoning). - Only the Arbitrator may edit `rule-config.ts`. - Steps 1, 4, 7 are CLI commands — run them directly with Bash. - **CRITICAL: YOU write all files to $RUN_DIR. Subagents (Gap Analyzer, Critic, Arbitrator) MUST return JSON as text — tell them "Do NOT write any files." You are the only one who writes to $RUN_DIR.** diff --git a/CLAUDE.md b/CLAUDE.md index 285f79ec..6d1eb561 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -322,10 +322,22 @@ Process: 3. Run `canicode visual-compare` — pixel-level comparison against Figma screenshot 4. Analyze the diff image to categorize pixel gaps (`Gap Analyzer`) 5. Compare conversion difficulty vs rule scores (`canicode calibrate-evaluate`) -6. 6-agent debate loop (`/calibrate-loop`): Analysis → Converter → Gap Analyzer → Evaluation → Critic → Arbitrator +6. 
Debate loop (`/calibrate-loop`): Analysis → Converter → Gap Analyzer → Evaluation → Critic → Arbitrator + +**Critic receives structured evidence** (#144): +- Proposals from evaluation +- Converter's `ruleImpactAssessment` (actual implementation difficulty per rule) +- Gap analysis (actionable pixel gaps) +- Prior cross-run evidence for proposed rules +- Outputs structured pro/con arguments + confidence level per proposal + +**Early-stop and self-consistency** (#144): +- All proposals rejected with high confidence → Arbitrator skipped (early-stop) +- Low-confidence decisions → held (not applied), evidence accumulates for future runs (self-consistency) +- `stoppingReason` recorded in debate.json for traceability **Cross-run evidence** accumulates across sessions in `data/`: -- `calibration-evidence.json` — overscored/underscored rules (fed to Runner for stronger proposals) +- `calibration-evidence.json` — overscored/underscored rules with confidence, pro/con, decision (fed to Critic for informed review) - `discovery-evidence.json` — uncovered gaps not covered by existing rules (fed to `/add-rule` Researcher) - Discovery evidence is filtered to exclude environment/tooling noise (font CDN, retina/DPI, network, CI constraints) - Evidence is pruned after rules are applied (calibration) or new rules are created (discovery) diff --git a/src/agents/contracts/evidence.ts b/src/agents/contracts/evidence.ts index b97625fb..ec65078c 100644 --- a/src/agents/contracts/evidence.ts +++ b/src/agents/contracts/evidence.ts @@ -8,6 +8,11 @@ export const CalibrationEvidenceEntrySchema = z.object({ actualDifficulty: z.string(), fixture: z.string(), timestamp: z.string(), + // Phase 1 fields (#144) — optional for backward compatibility with existing evidence + confidence: z.enum(["high", "medium", "low"]).optional(), + pro: z.array(z.string()).optional(), + con: z.array(z.string()).optional(), + decision: z.enum(["APPROVE", "REJECT", "REVISE", "HOLD"]).optional(), }); export type 
CalibrationEvidenceEntry = z.infer; @@ -17,6 +22,11 @@ export const CrossRunEvidenceGroupSchema = z.object({ underscoredCount: z.number(), overscoredDifficulties: z.array(z.string()), underscoredDifficulties: z.array(z.string()), + // Aggregated pro/con from all entries for this rule + allPro: z.array(z.string()).optional(), + allCon: z.array(z.string()).optional(), + lastConfidence: z.enum(["high", "medium", "low"]).optional(), + lastDecision: z.enum(["APPROVE", "REJECT", "REVISE", "HOLD"]).optional(), }); export type CrossRunEvidenceGroup = z.infer; diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index 25258793..b2ced476 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -1,9 +1,10 @@ -import { mkdirSync, rmSync, writeFileSync, readFileSync } from "node:fs"; +import { existsSync, mkdirSync, rmSync, writeFileSync, readFileSync } from "node:fs"; import { join } from "node:path"; import { tmpdir } from "node:os"; import { loadCalibrationEvidence, appendCalibrationEvidence, + enrichCalibrationEvidence, pruneCalibrationEvidence, loadDiscoveryEvidence, appendDiscoveryEvidence, @@ -62,15 +63,33 @@ describe("evidence-collector", () => { underscoredCount: 1, overscoredDifficulties: ["easy", "moderate"], underscoredDifficulties: ["hard"], + allPro: [], + allCon: [], }); expect(result["rule-b"]).toEqual({ overscoredCount: 0, underscoredCount: 1, overscoredDifficulties: [], underscoredDifficulties: ["hard"], + allPro: [], + allCon: [], }); }); + it("deduplicates pro/con across entries", () => { + const entries: CalibrationEvidenceEntry[] = [ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1", + pro: ["easy in practice", "common pattern"], con: ["small fixture"] }, + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx2", timestamp: "t2", + pro: ["easy in practice", "new evidence"], con: ["small fixture", "single 
run"] }, + ]; + writeFileSync(calPath, JSON.stringify(entries), "utf-8"); + + const result = loadCalibrationEvidence(calPath); + expect(result["rule-a"]!.allPro).toEqual(["easy in practice", "common pattern", "new evidence"]); + expect(result["rule-a"]!.allCon).toEqual(["small fixture", "single run"]); + }); + it("handles malformed JSON gracefully", () => { writeFileSync(calPath, "not json", "utf-8"); const result = loadCalibrationEvidence(calPath); @@ -136,6 +155,46 @@ describe("evidence-collector", () => { }); }); + describe("enrichCalibrationEvidence", () => { + it("enriches entries matching (ruleId, fixture)", () => { + const entries: CalibrationEvidenceEntry[] = [ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + { ruleId: "rule-a", type: "overscored", actualDifficulty: "moderate", fixture: "fx2", timestamp: "t2" }, + { ruleId: "rule-b", type: "underscored", actualDifficulty: "hard", fixture: "fx1", timestamp: "t3" }, + ]; + writeFileSync(calPath, JSON.stringify(entries), "utf-8"); + + enrichCalibrationEvidence( + [{ ruleId: "rule-a", confidence: "high", pro: ["easy in practice"], con: ["only 1 case"], decision: "APPROVE" }], + "fx1", + calPath, + ); + + const result = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + // Only fx1 entry for rule-a is enriched + expect(result[0]!.confidence).toBe("high"); + expect(result[0]!.pro).toEqual(["easy in practice"]); + // fx2 entry for rule-a is NOT enriched (different fixture) + expect(result[1]!.confidence).toBeUndefined(); + // rule-b is NOT enriched (different ruleId) + expect(result[2]!.confidence).toBeUndefined(); + }); + + it("does nothing when evidence file is empty", () => { + enrichCalibrationEvidence([{ ruleId: "rule-a" }], "fx1", calPath); + expect(existsSync(calPath)).toBe(false); + }); + + it("does nothing when reviews array is empty", () => { + writeFileSync(calPath, JSON.stringify([ + { ruleId: "rule-a", type: "overscored", 
actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + ]), "utf-8"); + enrichCalibrationEvidence([], "fx1", calPath); + const result = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + expect(result[0]!.confidence).toBeUndefined(); + }); + }); + describe("pruneCalibrationEvidence", () => { it("removes entries for specified ruleIds", () => { const entries: CalibrationEvidenceEntry[] = [ diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 83391ed9..5f9e2d2f 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -63,6 +63,8 @@ export function loadCalibrationEvidence( underscoredCount: 0, overscoredDifficulties: [], underscoredDifficulties: [], + allPro: [], + allCon: [], }; result[entry.ruleId] = group; } @@ -74,6 +76,23 @@ export function loadCalibrationEvidence( group.underscoredCount++; group.underscoredDifficulties.push(entry.actualDifficulty); } + + // Aggregate pro/con from enriched entries (deduplicated) + if (entry.pro) { + group.allPro ??= []; + for (const p of entry.pro) { + if (!group.allPro.includes(p)) group.allPro.push(p); + } + } + if (entry.con) { + group.allCon ??= []; + for (const c of entry.con) { + if (!group.allCon.includes(c)) group.allCon.push(c); + } + } + // Keep last confidence/decision (most recent entry wins) + if (entry.confidence) group.lastConfidence = entry.confidence; + if (entry.decision) group.lastDecision = entry.decision; } return result; @@ -119,6 +138,51 @@ export function pruneCalibrationEvidence( writeJsonArray(evidencePath, pruned); } +/** + * Enrich existing calibration evidence entries with Critic's structured review data. + * Matches by (ruleId, fixture) to avoid overwriting entries from other fixtures. + * Entries without a matching review are left unchanged. 
+ */ +export function enrichCalibrationEvidence( + reviews: Array<{ + ruleId: string; + confidence?: "high" | "medium" | "low"; + pro?: string[]; + con?: string[]; + decision?: "APPROVE" | "REJECT" | "REVISE" | "HOLD"; + }>, + fixture: string, + evidencePath: string = DEFAULT_CALIBRATION_PATH +): void { + if (reviews.length === 0) return; + const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); + if (existing.length === 0) return; + + const reviewByRule = new Map(reviews.map((r) => [r.ruleId.trim(), r])); + const fixtureTrimmed = fixture.trim(); + + let matchCount = 0; + const enriched = existing.map((entry) => { + if (entry.fixture.trim() !== fixtureTrimmed) return entry; + const review = reviewByRule.get(entry.ruleId.trim()); + if (!review) return entry; + matchCount++; + return { + ...entry, + ...(review.confidence && { confidence: review.confidence }), + ...(review.pro && { pro: review.pro }), + ...(review.con && { con: review.con }), + ...(review.decision && { decision: review.decision }), + }; + }); + + if (matchCount === 0) { + console.warn(`[enrich] No entries matched fixture="${fixture}" — evidence unchanged`); + return; + } + writeJsonArray(evidencePath, enriched); +} + // --- Discovery evidence --- const DEFAULT_DISCOVERY_PATH = resolve("data/discovery-evidence.json"); diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts index d56d74a3..06253dfb 100644 --- a/src/agents/run-directory.test.ts +++ b/src/agents/run-directory.test.ts @@ -333,6 +333,63 @@ describe("checkConvergence", () => { expect(summary.converged).toBe(false); expect(summary.reason).toBe("no arbitrator result"); }); + + it("converged on early-stop (stoppingReason + no arbitrator)", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + critic: { summary: "rejected=2", reviews: [] }, + arbitrator: null, + stoppingReason: "all-high-confidence-reject", + }), + ); + const summary = checkConvergence(tempDir); + 
expect(summary.converged).toBe(true); + expect(summary.reason).toContain("early-stop"); + expect(summary.reason).toContain("all-high-confidence-reject"); + }); + + it("hold decisions prevent convergence", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "hold=1", + decisions: [{ ruleId: "a", decision: "hold" }], + }, + }), + ); + const summary = checkConvergence(tempDir); + expect(summary.converged).toBe(false); + expect(summary.hold).toBe(1); + }); + + it("hold prevents convergence even in lenient mode", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "hold=1", + decisions: [{ ruleId: "a", decision: "hold" }], + }, + }), + ); + const summary = checkConvergence(tempDir, { lenient: true }); + expect(summary.converged).toBe(false); + }); + + it("isConverged delegates to checkConvergence", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + critic: { summary: "rejected=1", reviews: [] }, + arbitrator: null, + stoppingReason: "all-high-confidence-reject", + }), + ); + // isConverged should also return true for early-stop + expect(isConverged(tempDir)).toBe(true); + }); }); describe("listCalibrationRuns", () => { diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index f44133ce..028eb354 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -168,14 +168,19 @@ const DebateDecisionSchema = z.object({ reason: z.string().optional(), }).passthrough(); +const CriticReviewSchema = z.object({ + ruleId: z.string(), + decision: z.string(), + reason: z.string().optional(), + revised: z.number().optional(), + confidence: z.enum(["high", "medium", "low"]).optional(), + pro: z.array(z.string()).optional(), + con: z.array(z.string()).optional(), +}).passthrough(); + const CriticSchema = z.object({ summary: z.string(), - reviews: z.array(z.object({ - ruleId: z.string(), - decision: z.string(), - reason: 
z.string().optional(), - revised: z.number().optional(), - }).passthrough()), + reviews: z.array(CriticReviewSchema), }).passthrough(); const ArbitratorSchema = z.object({ @@ -184,10 +189,12 @@ const ArbitratorSchema = z.object({ newRuleProposals: z.array(z.unknown()).optional(), }).passthrough(); +/** stoppingReason canonical location: debate.json top level (not inside arbitrator) */ const DebateResultSchema = z.object({ critic: CriticSchema.nullable().default(null), arbitrator: ArbitratorSchema.nullable().default(null), skipped: z.string().optional(), + stoppingReason: z.string().optional(), }).passthrough(); /** A single decision from the Arbitrator in debate.json. */ @@ -253,6 +260,7 @@ export interface ConvergenceSummary { applied: number; revised: number; rejected: number; + hold: number; kept: number; total: number; reason: string; @@ -266,36 +274,43 @@ export function checkConvergence(runDir: string, options?: ConvergenceOptions): const debate = parseDebateResult(runDir); if (!debate) { - return { converged: false, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: "no debate.json found" }; + return { converged: false, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: "no debate.json found" }; } if (debate.skipped) { - return { converged: true, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: debate.skipped }; + return { converged: true, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: debate.skipped }; } if (!debate.arbitrator) { - return { converged: false, mode, applied: 0, revised: 0, rejected: 0, kept: 0, total: 0, reason: "no arbitrator result" }; + // Early-stop: Arbitrator skipped because all proposals rejected with high confidence + if (debate.stoppingReason) { + return { converged: true, mode, applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: `early-stop: ${debate.stoppingReason}` }; + } + return { converged: false, mode, 
applied: 0, revised: 0, rejected: 0, hold: 0, kept: 0, total: 0, reason: "no arbitrator result" }; } const decisions = debate.arbitrator.decisions; const applied = decisions.filter((d) => d.decision.trim().toLowerCase() === "applied").length; const revised = decisions.filter((d) => d.decision.trim().toLowerCase() === "revised").length; const rejected = decisions.filter((d) => d.decision.trim().toLowerCase() === "rejected").length; - const kept = decisions.length - applied - revised - rejected; + const hold = decisions.filter((d) => d.decision.trim().toLowerCase() === "hold").length; + const kept = decisions.length - applied - revised - rejected - hold; const total = decisions.length; + // hold = "not enough confidence to decide" → not converged (need more evidence) const converged = options?.lenient - ? (applied + revised) === 0 - : (applied + revised) === 0 && rejected === 0; + ? (applied + revised + hold) === 0 + : (applied + revised + hold) === 0 && rejected === 0; const parts: string[] = []; if (applied > 0) parts.push(`${applied} applied`); if (revised > 0) parts.push(`${revised} revised`); if (rejected > 0) parts.push(`${rejected} rejected`); + if (hold > 0) parts.push(`${hold} hold`); if (kept > 0) parts.push(`${kept} kept`); const countsStr = parts.length > 0 ? parts.join(", ") : "no decisions"; const verdict = converged ? "converged" : "not converged"; const reason = `${verdict} (${mode}) — ${countsStr} (${total} total)`; - return { converged, mode, applied, revised, rejected, kept, total, reason }; + return { converged, mode, applied, revised, rejected, hold, kept, total, reason }; } /** Options for convergence checking. */ @@ -309,22 +324,8 @@ export interface ConvergenceOptions { /** * Check if a calibration run has converged. - * Strict: no applied/revised AND no rejected decisions. - * Lenient: no applied/revised only (rejected proposals allowed). + * Delegates to checkConvergence to avoid duplicating early-stop / hold logic. 
*/ export function isConverged(runDir: string, options?: ConvergenceOptions): boolean { - const debate = parseDebateResult(runDir); - if (!debate) return false; - if (debate.skipped) return true; // zero proposals = converged - if (!debate.arbitrator) return false; - const decisions = debate.arbitrator.decisions; - const applied = decisions.filter((d) => { - const dec = d.decision.trim().toLowerCase(); - return dec === "applied" || dec === "revised"; - }).length; - const rejected = decisions.filter((d) => d.decision.trim().toLowerCase() === "rejected").length; - if (options?.lenient) { - return applied === 0; - } - return applied === 0 && rejected === 0; + return checkConvergence(runDir, options).converged; } diff --git a/src/cli/commands/internal/calibrate-debate.test.ts b/src/cli/commands/internal/calibrate-debate.test.ts new file mode 100644 index 00000000..261f4e84 --- /dev/null +++ b/src/cli/commands/internal/calibrate-debate.test.ts @@ -0,0 +1,191 @@ +import { mkdtempSync, writeFileSync, existsSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { rm } from "node:fs/promises"; + +import { gatherEvidence, loadProposedRuleIds } from "./calibrate-debate.js"; +import { parseDebateResult } from "../../../agents/run-directory.js"; + +describe("gatherEvidence", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "gather-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + it("extracts ruleImpactAssessment and uncoveredStruggles from conversion.json", () => { + writeFileSync(join(runDir, "conversion.json"), JSON.stringify({ + ruleImpactAssessment: [ + { ruleId: "no-auto-layout", issueCount: 3, actualImpact: "easy" }, + ], + uncoveredStruggles: [ + { description: "border radius mismatch" }, + ], + })); + + const evidence = gatherEvidence(runDir, []); + expect(evidence.ruleImpactAssessment).toHaveLength(1); + 
expect(evidence.uncoveredStruggles).toHaveLength(1); + }); + + it("filters gaps to actionable only", () => { + writeFileSync(join(runDir, "gaps.json"), JSON.stringify({ + gaps: [ + { category: "spacing", actionable: true, description: "padding off" }, + { category: "rendering", actionable: false, description: "font fallback" }, + { category: "color", actionable: true, description: "wrong shade" }, + ], + })); + + const evidence = gatherEvidence(runDir, []); + expect(evidence.actionableGaps).toHaveLength(2); + }); + + it("handles missing files gracefully", () => { + const evidence = gatherEvidence(runDir, []); + expect(evidence.ruleImpactAssessment).toHaveLength(0); + expect(evidence.uncoveredStruggles).toHaveLength(0); + expect(evidence.actionableGaps).toHaveLength(0); + expect(evidence.priorEvidence).toEqual({}); + }); + + it("returns empty priorEvidence when no ruleIds proposed", () => { + const evidence = gatherEvidence(runDir, []); + expect(evidence.priorEvidence).toEqual({}); + }); +}); + +describe("loadProposedRuleIds", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "proposed-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + it("loads from proposed-rules.json when available", () => { + writeFileSync(join(runDir, "proposed-rules.json"), JSON.stringify(["no-auto-layout", "raw-value"])); + const ids = loadProposedRuleIds(runDir); + expect(ids).toEqual(["no-auto-layout", "raw-value"]); + }); + + it("falls back to summary.md regex when no proposed-rules.json", () => { + writeFileSync(join(runDir, "summary.md"), "## Overscored\n| `no-auto-layout` | -10 | easy |\n| `raw-value` | -3 |"); + const ids = loadProposedRuleIds(runDir); + expect(ids).toContain("no-auto-layout"); + expect(ids).toContain("raw-value"); + }); + + it("returns empty for missing files", () => { + const ids = loadProposedRuleIds(runDir); + expect(ids).toEqual([]); + }); + + it("prefers 
proposed-rules.json over summary.md", () => { + writeFileSync(join(runDir, "proposed-rules.json"), JSON.stringify(["rule-a"])); + writeFileSync(join(runDir, "summary.md"), "| `rule-a` | | |\n| `rule-b` | | |"); + const ids = loadProposedRuleIds(runDir); + // Should only have rule-a from proposed-rules.json, not rule-b from summary.md + expect(ids).toEqual(["rule-a"]); + }); +}); + +describe("calibrate-finalize-debate logic", () => { + let runDir: string; + + beforeEach(() => { + runDir = mkdtempSync(join(tmpdir(), "finalize-test-")); + }); + + afterEach(async () => { + await rm(runDir, { recursive: true, force: true }); + }); + + it("detects early-stop when all critic reviews are high-confidence REJECT", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { + summary: "rejected=2", + reviews: [ + { ruleId: "a", decision: "REJECT", confidence: "high", pro: [], con: ["weak"], reason: "x" }, + { ruleId: "b", decision: "REJECT", confidence: "high", pro: [], con: ["weak"], reason: "y" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const reviews = debate.critic!.reviews; + const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => + r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high" + ); + expect(allHighConfidenceReject).toBe(true); + }); + + it("does NOT early-stop when reviews are mixed", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { + summary: "approved=1 rejected=1", + reviews: [ + { ruleId: "a", decision: "APPROVE", confidence: "high", reason: "x" }, + { ruleId: "b", decision: "REJECT", confidence: "medium", reason: "y" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const reviews = debate.critic!.reviews; + const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => + r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high" + ); + expect(allHighConfidenceReject).toBe(false); + }); + + 
it("detects low-confidence-hold when all arbitrator decisions are hold", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { summary: "revised=2", reviews: [] }, + arbitrator: { + summary: "hold=2", + decisions: [ + { ruleId: "a", decision: "hold" }, + { ruleId: "b", decision: "hold" }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const decisions = debate.arbitrator!.decisions; + const allHold = decisions.length > 0 && decisions.every((d) => + d.decision.trim().toLowerCase() === "hold" + ); + expect(allHold).toBe(true); + }); + + it("no stoppingReason for normal completion", () => { + writeFileSync(join(runDir, "debate.json"), JSON.stringify({ + critic: { summary: "approved=1", reviews: [] }, + arbitrator: { + summary: "applied=1", + decisions: [ + { ruleId: "a", decision: "applied", before: -10, after: -7 }, + ], + }, + })); + + const debate = parseDebateResult(runDir)!; + const decisions = debate.arbitrator!.decisions; + const allHold = decisions.length > 0 && decisions.every((d) => + d.decision.trim().toLowerCase() === "hold" + ); + expect(allHold).toBe(false); + }); + + it("returns null for missing debate.json", () => { + expect(parseDebateResult(runDir)).toBeNull(); + }); +}); diff --git a/src/cli/commands/internal/calibrate-debate.ts b/src/cli/commands/internal/calibrate-debate.ts new file mode 100644 index 00000000..7a228e29 --- /dev/null +++ b/src/cli/commands/internal/calibrate-debate.ts @@ -0,0 +1,205 @@ +import { existsSync, readFileSync, writeFileSync } from "node:fs"; +import { join, resolve } from "node:path"; +import type { CAC } from "cac"; + +import { parseDebateResult } from "../../../agents/run-directory.js"; +import { loadCalibrationEvidence } from "../../../agents/evidence-collector.js"; + +// ─── calibrate-gather-evidence ────────────────────────────────────────────── + +export interface GatheredEvidence { + ruleImpactAssessment: unknown[]; + uncoveredStruggles: unknown[]; + actionableGaps: 
unknown[];
+  priorEvidence: Record<string, unknown>;
+}
+
+/**
+ * Gather structured evidence for the Critic from run artifacts + cross-run data.
+ * Pure data extraction — no LLM needed.
+ */
+export function gatherEvidence(runDir: string, proposedRuleIds: string[]): GatheredEvidence {
+  const result: GatheredEvidence = {
+    ruleImpactAssessment: [],
+    uncoveredStruggles: [],
+    actionableGaps: [],
+    priorEvidence: {},
+  };
+
+  // 1. conversion.json → ruleImpactAssessment, uncoveredStruggles
+  const convPath = join(runDir, "conversion.json");
+  if (existsSync(convPath)) {
+    try {
+      const conv = JSON.parse(readFileSync(convPath, "utf-8")) as Record<string, unknown>;
+      if (Array.isArray(conv["ruleImpactAssessment"])) {
+        result.ruleImpactAssessment = conv["ruleImpactAssessment"];
+      }
+      if (Array.isArray(conv["uncoveredStruggles"])) {
+        result.uncoveredStruggles = conv["uncoveredStruggles"];
+      }
+    } catch { /* ignore malformed */ }
+  }
+
+  // 2. gaps.json → actionable gaps
+  const gapsPath = join(runDir, "gaps.json");
+  if (existsSync(gapsPath)) {
+    try {
+      const gaps = JSON.parse(readFileSync(gapsPath, "utf-8")) as Record<string, unknown>;
+      const gapList = Array.isArray(gaps["gaps"]) ? gaps["gaps"] : [];
+      result.actionableGaps = gapList.filter(
+        (g): g is Record<string, unknown> =>
+          typeof g === "object" && g !== null && (g as Record<string, unknown>)["actionable"] === true
+      );
+    } catch { /* ignore malformed */ }
+  }
+
+  // 3. Prior evidence filtered to proposed rules only
+  if (proposedRuleIds.length > 0) {
+    const allEvidence = loadCalibrationEvidence();
+    const ruleSet = new Set(proposedRuleIds.map((id) => id.trim()));
+    for (const [ruleId, group] of Object.entries(allEvidence)) {
+      if (ruleSet.has(ruleId)) {
+        result.priorEvidence[ruleId] = group;
+      }
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Load proposed ruleIds from proposed-rules.json (written by calibrate-evaluate).
+ * Falls back to regex extraction from summary.md if file doesn't exist.
+ */
+export function loadProposedRuleIds(runDir: string): string[] {
+  // Preferred: deterministic list from calibrate-evaluate
+  const proposedPath = join(runDir, "proposed-rules.json");
+  if (existsSync(proposedPath)) {
+    try {
+      const raw: unknown = JSON.parse(readFileSync(proposedPath, "utf-8"));
+      if (Array.isArray(raw)) return raw.filter((id): id is string => typeof id === "string");
+    } catch { /* fall through to regex */ }
+  }
+
+  // Fallback: extract from summary.md (may have false positives)
+  const summaryPath = join(runDir, "summary.md");
+  if (!existsSync(summaryPath)) return [];
+  try {
+    const content = readFileSync(summaryPath, "utf-8");
+    const ids = new Set<string>();
+    for (const match of content.matchAll(/`([a-z][\w-]*)`/g)) {
+      if (match[1]) ids.add(match[1]);
+    }
+    return [...ids];
+  } catch {
+    return [];
+  }
+}
+
+export function registerGatherEvidence(cli: CAC): void {
+  cli
+    .command(
+      "calibrate-gather-evidence <runDir>",
+      "Gather structured evidence for Critic from run artifacts + cross-run data"
+    )
+    .action((runDir: string) => {
+      const dir = resolve(runDir);
+      if (!existsSync(dir)) {
+        console.log(`Run directory not found: ${runDir}`);
+        return;
+      }
+
+      const proposedRuleIds = loadProposedRuleIds(dir);
+      const evidence = gatherEvidence(dir, proposedRuleIds);
+
+      // Write to file for orchestrator to include in Critic prompt
+      const outPath = join(dir, "critic-evidence.json");
+      writeFileSync(outPath, JSON.stringify(evidence, null, 2) + "\n", "utf-8");
+      console.log(`Gathered evidence: ${evidence.ruleImpactAssessment.length} impact assessments, ${evidence.actionableGaps.length} gaps, ${Object.keys(evidence.priorEvidence).length} prior rules`);
+      console.log(`Written to ${outPath}`);
+    });
+}
+
+// ─── calibrate-finalize-debate ──────────────────────────────────────────────
+
+interface FinalizeResult {
+  action: "early-stop" | "continue" | "finalized";
+  stoppingReason?: string;
+}
+
+export function registerFinalizeDebate(cli: CAC): void {
+  cli
+    .command(
+      "calibrate-finalize-debate <runDir>",
+      "Check early-stop or determine stoppingReason after debate"
+    )
+    .action((runDir: string) => {
+      const dir = resolve(runDir);
+      if (!existsSync(dir)) {
+        console.log(`Run directory not found: ${runDir}`);
+        return;
+      }
+
+      const debate = parseDebateResult(dir);
+      if (!debate) {
+        console.log("No debate.json found");
+        return;
+      }
+
+      const debatePath = join(dir, "debate.json");
+      let raw: Record<string, unknown>;
+      try {
+        raw = JSON.parse(readFileSync(debatePath, "utf-8")) as Record<string, unknown>;
+      } catch {
+        console.log(JSON.stringify({ action: "continue" }));
+        return;
+      }
+
+      // Case 1: Critic done, no Arbitrator yet → check early-stop
+      if (debate.critic && !debate.arbitrator) {
+        const reviews = debate.critic.reviews;
+        const allHighConfidenceReject = reviews.length > 0 && reviews.every((r) => {
+          return r.decision.trim().toUpperCase() === "REJECT" && r.confidence === "high";
+        });
+
+        if (allHighConfidenceReject) {
+          raw["stoppingReason"] = "all-high-confidence-reject";
+          writeFileSync(debatePath, JSON.stringify(raw, null, 2) + "\n", "utf-8");
+          const result: FinalizeResult = { action: "early-stop", stoppingReason: "all-high-confidence-reject" };
+          console.log(JSON.stringify(result));
+          // exit 0 = early-stop, orchestrator should skip Arbitrator
+          return;
+        }
+
+        const result: FinalizeResult = { action: "continue" };
+        console.log(JSON.stringify(result));
+        // exit 0 but action=continue → orchestrator proceeds to Arbitrator
+        return;
+      }
+
+      // Case 2: Both Critic and Arbitrator done → determine stoppingReason
+      if (debate.arbitrator) {
+        const decisions = debate.arbitrator.decisions;
+        const allHold = decisions.length > 0 && decisions.every((d) =>
+          d.decision.trim().toLowerCase() === "hold"
+        );
+
+        if (allHold) {
+          raw["stoppingReason"] = "low-confidence-hold";
+          writeFileSync(debatePath, JSON.stringify(raw, null, 2) + "\n", "utf-8");
+          const result: FinalizeResult = { action: "finalized", stoppingReason: "low-confidence-hold" };
console.log(JSON.stringify(result)); + return; + } + + // Normal completion — no stoppingReason needed + const result: FinalizeResult = { action: "finalized" }; + console.log(JSON.stringify(result)); + return; + } + + // Fallback + const result: FinalizeResult = { action: "continue" }; + console.log(JSON.stringify(result)); + }); +} diff --git a/src/cli/commands/internal/calibrate-evaluate.ts b/src/cli/commands/internal/calibrate-evaluate.ts index 02e9d637..45665042 100644 --- a/src/cli/commands/internal/calibrate-evaluate.ts +++ b/src/cli/commands/internal/calibrate-evaluate.ts @@ -82,6 +82,15 @@ export function registerCalibrateEvaluate(cli: CAC): void { mismatchCounts[key]++; } + // Write proposed ruleIds for deterministic evidence gathering + if (options.runDir && tuningOutput.adjustments.length > 0) { + const proposedIds = tuningOutput.adjustments.map( + (a: { ruleId: string }) => a.ruleId + ); + const proposedPath = resolve(options.runDir, "proposed-rules.json"); + await writeFile(proposedPath, JSON.stringify(proposedIds) + "\n", "utf-8"); + } + console.log(`\nEvaluation complete.`); console.log(` Validated: ${mismatchCounts.validated}`); console.log(` Overscored: ${mismatchCounts.overscored}`); diff --git a/src/cli/commands/internal/fixture-management.ts b/src/cli/commands/internal/fixture-management.ts index 35bf5161..85dfea03 100644 --- a/src/cli/commands/internal/fixture-management.ts +++ b/src/cli/commands/internal/fixture-management.ts @@ -1,5 +1,5 @@ import { existsSync } from "node:fs"; -import { resolve } from "node:path"; +import { basename, resolve } from "node:path"; import type { CAC } from "cac"; import { @@ -7,6 +7,7 @@ import { listDoneFixtures, moveFixtureToDone, parseDebateResult, + parseRunDirName, extractAppliedRuleIds, extractFixtureName, resolveLatestRunDir, @@ -15,6 +16,7 @@ import { import { pruneCalibrationEvidence, pruneDiscoveryEvidence, + enrichCalibrationEvidence, } from "../../../agents/evidence-collector.js"; export function 
registerFixtureManagement(cli: CAC): void {
@@ -112,6 +114,46 @@ export function registerFixtureManagement(cli: CAC): void {
   });
 }
 
+export function registerEvidenceEnrich(cli: CAC): void {
+  cli
+    .command(
+      "calibrate-enrich-evidence <runDir>",
+      "Enrich evidence with Critic's pro/con/confidence from debate.json"
+    )
+    .action((runDir: string) => {
+      const resolvedDir = resolve(runDir);
+      if (!existsSync(resolvedDir)) {
+        console.log(`Run directory not found: ${runDir}`);
+        return;
+      }
+      const debate = parseDebateResult(resolvedDir);
+      if (!debate?.critic) {
+        console.log("No critic reviews in debate.json — nothing to enrich.");
+        return;
+      }
+
+      // Extract fixture name from run directory (e.g. "material3-kit--2026-03-26-0900" → "material3-kit")
+      const { name: fixture, timestamp } = parseRunDirName(basename(resolvedDir));
+      if (!timestamp) {
+        console.log(`Run directory "${basename(resolvedDir)}" does not match expected <name>--<timestamp> format`);
+        return;
+      }
+
+      const reviews = debate.critic.reviews.map((r) => {
+        const entry: Parameters<typeof enrichCalibrationEvidence>[0][number] = { ruleId: r.ruleId };
+        if (r.confidence) entry.confidence = r.confidence;
+        if (r.pro) entry.pro = r.pro;
+        if (r.con) entry.con = r.con;
+        const dec = r.decision.trim().toUpperCase();
+        if (dec === "APPROVE" || dec === "REJECT" || dec === "REVISE" || dec === "HOLD") entry.decision = dec;
+        return entry;
+      });
+
+      enrichCalibrationEvidence(reviews, fixture);
+      console.log(`Enriched calibration evidence for fixture "${fixture}" with ${reviews.length} review(s)`);
+    });
+}
+
 export function registerEvidencePrune(cli: CAC): void {
   cli
     .command(
diff --git a/src/cli/index.ts b/src/cli/index.ts
index 9ee3caef..3074df24 100644
--- a/src/cli/index.ts
+++ b/src/cli/index.ts
@@ -32,7 +32,8 @@ import { registerCalibrateAnalyze } from "./commands/internal/calibrate-analyze.
import { registerCalibrateEvaluate } from "./commands/internal/calibrate-evaluate.js"; import { registerCalibrateGapReport } from "./commands/internal/calibrate-gap-report.js"; import { registerCalibrateRun } from "./commands/internal/calibrate-run.js"; -import { registerFixtureManagement, registerEvidencePrune } from "./commands/internal/fixture-management.js"; +import { registerGatherEvidence, registerFinalizeDebate } from "./commands/internal/calibrate-debate.js"; +import { registerFixtureManagement, registerEvidenceEnrich, registerEvidencePrune } from "./commands/internal/fixture-management.js"; const require = createRequire(import.meta.url); const pkg = require("../../package.json") as { version: string }; @@ -78,7 +79,10 @@ registerCalibrateAnalyze(cli); registerCalibrateEvaluate(cli); registerCalibrateGapReport(cli); registerCalibrateRun(cli); +registerGatherEvidence(cli); +registerFinalizeDebate(cli); registerFixtureManagement(cli); +registerEvidenceEnrich(cli); registerEvidencePrune(cli); // ============================================