From 4f9f22cbc0db37400fae631713b55bda17a50925 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 01:21:07 +0900 Subject: [PATCH 01/11] fix: calibration pipeline retina compare, evidence prune, lenient convergence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - visual-compare: default Figma export scale 2, logical viewport + deviceScaleFactor so code.png matches @2x figma.png; CLI omits 1440×900 override; --figma-scale - Figma image cache keys include scale; MCP visual-compare accepts figmaExportScale - extractAppliedRuleIds / pruneCalibrationEvidence: trim + case-insensitive decisions - appendCalibrationEvidence: replace same (ruleId, fixture) on re-run - fixture-done --lenient-convergence + isConverged({ lenient }) for reject-only stalls - Document lenient flag in calibrate-night command Closes #14 --- .claude/commands/calibrate-night.md | 8 ++- src/agents/evidence-collector.test.ts | 25 ++++++++ src/agents/evidence-collector.ts | 10 +-- src/agents/run-directory.test.ts | 59 ++++++++++++++++- src/agents/run-directory.ts | 37 ++++++++--- src/cli/docs.ts | 15 +++-- src/cli/index.ts | 43 ++++++++++--- src/core/engine/visual-compare.test.ts | 18 +++--- src/core/engine/visual-compare.ts | 89 ++++++++++++++++++++++---- src/mcp/server.ts | 6 +- 10 files changed, 257 insertions(+), 53 deletions(-) diff --git a/.claude/commands/calibrate-night.md b/.claude/commands/calibrate-night.md index 4dec57e7..6a3ed458 100644 --- a/.claude/commands/calibrate-night.md +++ b/.claude/commands/calibrate-night.md @@ -37,7 +37,13 @@ After each successful run, use the CLI to check convergence and move: npx canicode fixture-done --run-dir $RUN_DIR ``` -This checks `debate.json` for convergence (`applied=0 AND rejected=0`) and moves the fixture to `done/`. If the fixture hasn't converged, the command exits with an error — that's expected, just skip and continue. 
+This checks `debate.json` for convergence (`applied=0 AND rejected=0` by default) and moves the fixture to `done/`. If the fixture hasn't converged, the command exits with an error — that's expected, just skip and continue. + +If the same low-confidence proposals keep getting **rejected** and nothing is applied (issue #14), you can move anyway with **`--lenient-convergence`** (converged when there are no applied/revised decisions, ignoring rejects): + +```bash +npx canicode fixture-done --run-dir $RUN_DIR --lenient-convergence +``` Report which fixtures were moved to `done/`. diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index fb44e813..70b5ec86 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -102,6 +102,20 @@ describe("evidence-collector", () => { expect(raw).toHaveLength(2); }); + it("replaces prior entry for same ruleId and fixture (latest run wins)", () => { + writeFileSync(calPath, JSON.stringify([ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + ]), "utf-8"); + + appendCalibrationEvidence([ + { ruleId: "rule-a", type: "underscored", actualDifficulty: "hard", fixture: "fx1", timestamp: "t2" }, + ], calPath); + + const raw = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + expect(raw).toHaveLength(1); + expect(raw[0]!.type).toBe("underscored"); + }); + it("does nothing for empty entries", () => { writeFileSync(calPath, "[]", "utf-8"); appendCalibrationEvidence([], calPath); @@ -126,6 +140,17 @@ describe("evidence-collector", () => { expect(raw[0]!.ruleId).toBe("rule-b"); }); + it("trims ruleIds when matching stored entries", () => { + writeFileSync(calPath, JSON.stringify([ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + ]), "utf-8"); + + pruneCalibrationEvidence([" rule-a "], calPath); + + const raw = 
JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + expect(raw).toHaveLength(0); + }); + it("does nothing for empty ruleIds", () => { const entries: CalibrationEvidenceEntry[] = [ { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index ed05b9bc..aa3e0e46 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -85,8 +85,10 @@ export function appendCalibrationEvidence( ): void { if (entries.length === 0) return; const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); - existing.push(...entries); - writeJsonArray(evidencePath, existing); + const keys = new Set(entries.map((e) => `${e.ruleId.trim()}\0${e.fixture}`)); + const withoutDupes = existing.filter((e) => !keys.has(`${e.ruleId.trim()}\0${e.fixture}`)); + withoutDupes.push(...entries); + writeJsonArray(evidencePath, withoutDupes); } /** @@ -97,9 +99,9 @@ export function pruneCalibrationEvidence( evidencePath: string = DEFAULT_CALIBRATION_PATH ): void { if (appliedRuleIds.length === 0) return; - const ruleSet = new Set(appliedRuleIds); + const ruleSet = new Set(appliedRuleIds.map((id) => id.trim()).filter((id) => id.length > 0)); const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); - const pruned = existing.filter((e) => !ruleSet.has(e.ruleId)); + const pruned = existing.filter((e) => !ruleSet.has(e.ruleId.trim())); writeJsonArray(evidencePath, pruned); } diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts index d02bd2b8..31321666 100644 --- a/src/agents/run-directory.test.ts +++ b/src/agents/run-directory.test.ts @@ -1,4 +1,4 @@ -import { mkdtempSync, existsSync } from "node:fs"; +import { mkdtempSync, existsSync, writeFileSync } from "node:fs"; import { join, basename } from "node:path"; import { tmpdir } from "node:os"; import { rm } from 
"node:fs/promises"; @@ -8,6 +8,8 @@ import { createCalibrationRunDir, createRuleDiscoveryRunDir, listCalibrationRuns, + extractAppliedRuleIds, + isConverged, } from "./run-directory.js"; describe("extractFixtureName", () => { @@ -99,6 +101,61 @@ describe("createRuleDiscoveryRunDir", () => { }); }); +describe("extractAppliedRuleIds", () => { + it("trims ruleId and normalizes decision casing", () => { + const ids = extractAppliedRuleIds({ + critic: null, + arbitrator: { + summary: "test", + decisions: [ + { ruleId: " my-rule ", decision: "Applied" }, + { ruleId: "b", decision: "REVISED" }, + { ruleId: "c", decision: "rejected" }, + ], + }, + }); + expect(ids).toEqual(["my-rule", "b"]); + }); +}); + +describe("isConverged", () => { + let tempDir: string; + + beforeEach(() => { + tempDir = mkdtempSync(join(tmpdir(), "converge-test-")); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it("strict: false when rejections remain with no applies", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "applied=0 rejected=1", + decisions: [{ ruleId: "x", decision: "rejected" }], + }, + }), + ); + expect(isConverged(tempDir)).toBe(false); + }); + + it("lenient: true when no applied/revised even if rejected", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "applied=0 rejected=1", + decisions: [{ ruleId: "x", decision: "rejected" }], + }, + }), + ); + expect(isConverged(tempDir, { lenient: true })).toBe(true); + }); +}); + describe("listCalibrationRuns", () => { const origCwd = process.cwd(); let tempDir: string; diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index 34ae9776..bebdad31 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -210,22 +210,43 @@ export function parseDebateResult(runDir: string): DebateResult | null { export function extractAppliedRuleIds(debate: 
DebateResult): string[] { if (!debate.arbitrator) return []; return debate.arbitrator.decisions - .filter((d) => d.decision === "applied" || d.decision === "revised") - .map((d) => d.ruleId); + .filter((d) => { + const dec = (d.decision ?? "").trim().toLowerCase(); + return dec === "applied" || dec === "revised"; + }) + .map((d) => d.ruleId.trim()) + .filter((id) => id.length > 0); +} + +export interface ConvergenceOptions { + /** + * When true, converged iff no applied/revised decisions (ignore rejected count). + * Use when repeated reject loops block `fixture-done` but scores are stable (see issue #14). + */ + lenient?: boolean | undefined; } /** * Check if a calibration run has converged. - * Converged = debate exists AND applied=0 AND rejected=0. - * This means all proposals were validated — no changes and no disagreements. + * Strict: no applied/revised AND no rejected decisions. + * Lenient: no applied/revised only (rejected proposals allowed). */ -export function isConverged(runDir: string): boolean { +export function isConverged(runDir: string, options?: ConvergenceOptions): boolean { const debate = parseDebateResult(runDir); if (!debate) return false; if (debate.skipped) return true; // zero proposals = converged if (!debate.arbitrator) return false; const decisions = debate.arbitrator.decisions; - const applied = decisions.filter((d) => d.decision === "applied" || d.decision === "revised").length; - const rejected = decisions.filter((d) => d.decision === "rejected").length; - return applied === 0 && rejected === 0; + const changed = decisions.filter((d) => { + const dec = (d.decision ?? "").trim().toLowerCase(); + return dec === "applied" || dec === "revised"; + }).length; + const rejected = decisions.filter((d) => { + const dec = (d.decision ?? 
"").trim().toLowerCase(); + return dec === "rejected"; + }).length; + if (options?.lenient) { + return changed === 0; + } + return changed === 0 && rejected === 0; } diff --git a/src/cli/docs.ts b/src/cli/docs.ts index 92a7a563..6804ec5e 100644 --- a/src/cli/docs.ts +++ b/src/cli/docs.ts @@ -203,13 +203,14 @@ OPTIONS --figma-url Figma URL with node-id (required) --token Figma API token (or FIGMA_TOKEN env var) --output Output directory (default: /tmp/canicode-visual-compare) - --width Viewport width override (default: match Figma screenshot) - --height Viewport height override (default: match Figma screenshot) + --width Logical viewport width in CSS px (omit = infer from Figma PNG ÷ export scale) + --height Logical viewport height in CSS px (omit = infer from Figma PNG ÷ export scale) + --figma-scale Figma Images API scale (default: 2, matches save-fixture @2x PNGs) OUTPUT FILES /tmp/canicode-visual-compare/ - figma.png Figma screenshot (scale=2) - code.png Playwright render of your HTML + figma.png Figma screenshot (default scale=2) + code.png Playwright render (devicePixelRatio matched to export scale) diff.png Pixel diff (red = different pixels) JSON OUTPUT (stdout) @@ -225,9 +226,9 @@ JSON OUTPUT (stdout) } HOW IT WORKS - 1. Fetches Figma screenshot via REST API (scale=2) - 2. Reads screenshot dimensions - 3. Renders your HTML with Playwright at the same viewport size + 1. Fetches Figma screenshot via REST API (default scale=2) + 2. Infers logical CSS viewport (PNG size ÷ scale) unless --width/--height are set + 3. Renders HTML in Playwright with matching devicePixelRatio so code.png matches figma.png pixels 4. Compares pixel-by-pixel with pixelmatch (threshold: 0.1) 5. 
Returns similarity % and diff image diff --git a/src/cli/index.ts b/src/cli/index.ts index 4c8e8ea6..21699a3c 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -607,13 +607,23 @@ cli .option("--fixtures-dir ", "Fixtures root directory", { default: "fixtures" }) .option("--force", "Skip convergence check") .option("--run-dir ", "Run directory to check for convergence") - .action((fixturePath: string, options: { fixturesDir?: string; force?: boolean; runDir?: string }) => { + .option( + "--lenient-convergence", + "Converged when no applied/revised decisions (ignore rejected; see calibration issue #14)" + ) + .action( + (fixturePath: string, options: { + fixturesDir?: string; + force?: boolean; + runDir?: string; + lenientConvergence?: boolean; + }) => { if (!options.force) { if (!options.runDir) { console.error("Error: --run-dir required to check convergence (or use --force to skip check)"); process.exit(1); } - if (!isConverged(resolve(options.runDir))) { + if (!isConverged(resolve(options.runDir), { lenient: options.lenientConvergence })) { const debate = parseDebateResult(resolve(options.runDir)); const summary = debate?.arbitrator?.summary ?? debate?.skipped ?? "no debate.json found"; console.error(`Error: fixture has not converged (${summary}). 
Use --force to override.`); @@ -903,10 +913,11 @@ cli .option("--figma-url ", "Figma URL with node-id (required)") .option("--token ", "Figma API token (or use FIGMA_TOKEN env var)") .option("--output ", "Output directory for screenshots and diff (default: /tmp/canicode-visual-compare)") - .option("--width ", "Viewport width (default: 1440)") - .option("--height ", "Viewport height (default: 900)") + .option("--width ", "Logical viewport width in CSS px (default: infer from Figma PNG ÷ export scale)") + .option("--height ", "Logical viewport height in CSS px (default: infer from Figma PNG ÷ export scale)") + .option("--figma-scale ", "Figma export scale (default: 2, matches save-fixture / @2x PNGs)") .example(" canicode visual-compare ./generated/index.html --figma-url 'https://www.figma.com/design/ABC/File?node-id=1-234'") - .action(async (codePath: string, options: VisualCompareOptions) => { + .action(async (codePath: string, options: VisualCompareOptions & { figmaScale?: string }) => { try { if (!options.figmaUrl) { console.error("Error: --figma-url is required"); @@ -921,16 +932,30 @@ cli const { visualCompare } = await import("../core/engine/visual-compare.js"); + const exportScale = + options.figmaScale !== undefined ? Number(options.figmaScale) : undefined; + if (exportScale !== undefined && (!Number.isFinite(exportScale) || exportScale < 1)) { + console.error("Error: --figma-scale must be a number >= 1"); + process.exit(1); + } + + const hasViewportOverride = options.width !== undefined || options.height !== undefined; + console.log("Comparing..."); const result = await visualCompare({ figmaUrl: options.figmaUrl, figmaToken: token, codePath: resolve(codePath), outputDir: options.output, - viewport: { - width: options.width ?? 1440, - height: options.height ?? 900, - }, + ...(exportScale !== undefined ? { figmaExportScale: exportScale } : {}), + ...(hasViewportOverride + ? { + viewport: { + ...(options.width !== undefined ? 
{ width: options.width } : {}), + ...(options.height !== undefined ? { height: options.height } : {}), + }, + } + : {}), }); // JSON output for programmatic use diff --git a/src/core/engine/visual-compare.test.ts b/src/core/engine/visual-compare.test.ts index 99eec658..3a1e15c4 100644 --- a/src/core/engine/visual-compare.test.ts +++ b/src/core/engine/visual-compare.test.ts @@ -38,9 +38,9 @@ import pixelmatch from "pixelmatch"; const FIGMA_CACHE_DIR = "/tmp/canicode-figma-cache"; /** Mirror of the private getFigmaCachePath in visual-compare.ts */ -function getFigmaCachePath(fileKey: string, nodeId: string): string { +function getFigmaCachePath(fileKey: string, nodeId: string, scale: number = 2): string { const safeNodeId = nodeId.replace(/:/g, "-"); - return resolve(FIGMA_CACHE_DIR, `${fileKey}_${safeNodeId}.png`); + return resolve(FIGMA_CACHE_DIR, `${fileKey}_${safeNodeId}@${scale}x.png`); } /** Mirror of the private isCacheFresh in visual-compare.ts */ @@ -133,24 +133,24 @@ function makeSolidPng(width: number, height: number, r: number, g: number, b: nu describe("getFigmaCachePath", () => { it("places the cache file inside FIGMA_CACHE_DIR", () => { - const path = getFigmaCachePath("abc123", "1:2"); + const path = getFigmaCachePath("abc123", "1:2", 2); expect(path.startsWith(FIGMA_CACHE_DIR)).toBe(true); }); it("replaces colons in nodeId with hyphens", () => { - const path = getFigmaCachePath("file", "10:20"); + const path = getFigmaCachePath("file", "10:20", 2); expect(path).toContain("10-20"); expect(path).not.toContain("10:20"); }); - it("combines fileKey and sanitized nodeId in the filename", () => { - const path = getFigmaCachePath("myfile", "3:45"); - expect(path).toContain("myfile_3-45.png"); + it("combines fileKey, sanitized nodeId, and scale in the filename", () => { + const path = getFigmaCachePath("myfile", "3:45", 2); + expect(path).toContain("myfile_3-45@2x.png"); }); it("handles nodeId without colons unchanged", () => { - const path = 
getFigmaCachePath("key", "node123"); - expect(path).toContain("key_node123.png"); + const path = getFigmaCachePath("key", "node123", 2); + expect(path).toContain("key_node123@2x.png"); }); }); diff --git a/src/core/engine/visual-compare.ts b/src/core/engine/visual-compare.ts index 7f525fde..12029fec 100644 --- a/src/core/engine/visual-compare.ts +++ b/src/core/engine/visual-compare.ts @@ -24,7 +24,16 @@ export interface VisualCompareOptions { figmaToken: string; codePath: string; outputDir?: string | undefined; - viewport?: { width: number; height: number } | undefined; + /** + * Logical CSS viewport (CSS pixels). Omit a dimension to infer from the Figma PNG + * using `figmaExportScale`. When the whole object is omitted, both dimensions are inferred. + */ + viewport?: { width?: number; height?: number } | undefined; + /** + * Figma Images API `scale` and assumed scale for fixture `figma.png` (e.g. from `save-fixture`). + * Default 2 matches REST exports and avoids comparing a @2x PNG against a 1× Playwright capture. + */ + figmaExportScale?: number | undefined; } const FIGMA_CACHE_DIR = "/tmp/canicode-figma-cache"; @@ -33,10 +42,10 @@ const FIGMA_CACHE_TTL_MS = 60 * 60 * 1000; // 1 hour /** * Get the cache path for a given fileKey + nodeId combination. 
*/ -function getFigmaCachePath(fileKey: string, nodeId: string): string { +function getFigmaCachePath(fileKey: string, nodeId: string, scale: number): string { // Sanitize nodeId for use as filename (replace : with -) const safeNodeId = nodeId.replace(/:/g, "-"); - return resolve(FIGMA_CACHE_DIR, `${fileKey}_${safeNodeId}.png`); + return resolve(FIGMA_CACHE_DIR, `${fileKey}_${safeNodeId}@${scale}x.png`); } /** @@ -57,8 +66,9 @@ async function fetchFigmaScreenshot( nodeId: string, token: string, outputPath: string, + scale: number, ): Promise { - const cachePath = getFigmaCachePath(fileKey, nodeId); + const cachePath = getFigmaCachePath(fileKey, nodeId, scale); // Return cached version if fresh if (isCacheFresh(cachePath)) { @@ -68,7 +78,7 @@ async function fetchFigmaScreenshot( } const res = await fetch( - `https://api.figma.com/v1/images/${fileKey}?ids=${nodeId}&format=png&scale=1`, + `https://api.figma.com/v1/images/${fileKey}?ids=${nodeId}&format=png&scale=${scale}`, { headers: { "X-Figma-Token": token } }, ); if (!res.ok) throw new Error(`Figma Images API: ${res.status} ${res.statusText}`); @@ -91,18 +101,45 @@ async function fetchFigmaScreenshot( writeFileSync(cachePath, buffer); } +/** + * Infer device pixel ratio so the Playwright screenshot matches Figma PNG pixel dimensions. + */ +function inferDeviceScaleFactor( + pngW: number, + pngH: number, + logicalW: number, + logicalH: number, + fallback: number, +): number { + if (logicalW <= 0 || logicalH <= 0) return 1; + const sx = pngW / logicalW; + const sy = pngH / logicalH; + const rounded = Math.round((sx + sy) / 2); + if (rounded >= 2 && Math.abs(sx - rounded) < 0.08 && Math.abs(sy - rounded) < 0.08) { + return rounded; + } + if (Math.abs(sx - 1) < 0.02 && Math.abs(sy - 1) < 0.02) return 1; + return fallback >= 2 ? fallback : Math.max(1, Math.round(sx)); +} + /** * Render HTML file with Playwright and take a screenshot. 
+ * @param deviceScaleFactor - Pass 2 when the Figma reference is @2x and `viewport` is logical CSS size. */ export async function renderCodeScreenshot( codePath: string, outputPath: string, - viewport: { width: number; height: number }, + logicalViewport: { width: number; height: number }, + deviceScaleFactor: number = 1, ): Promise { // Dynamic import — playwright is an optional dependency const { chromium } = await import("playwright"); const browser = await chromium.launch(); - const page = await browser.newPage({ viewport }); + const context = await browser.newContext({ + viewport: logicalViewport, + deviceScaleFactor, + }); + const page = await context.newPage(); await page.goto(`file://${resolve(codePath)}`, { waitUntil: "networkidle", @@ -209,22 +246,50 @@ export async function visualCompare(options: VisualCompareOptions): Promise { + async ({ codePath, figmaUrl, token, outputDir, figmaExportScale }) => { try { const { visualCompare } = await import("../core/engine/visual-compare.js"); const figmaToken = token ?? process.env["FIGMA_TOKEN"]; @@ -447,6 +448,7 @@ Requires: Playwright with Chromium installed, Figma API token.`, figmaToken, codePath, outputDir, + ...(figmaExportScale !== undefined ? 
{ figmaExportScale } : {}), }); return { From 7bc939c0ae090188dfd6e5953dd689ff50401501 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 01:28:22 +0900 Subject: [PATCH 02/11] fix(agents): harden debate parsing and evidence dedupe keys - Guard extractAppliedRuleIds and isConverged when arbitrator.decisions is not an array - Use trimmed fixture in calibration evidence dedupe key - Add tests for malformed debate.json - Add docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md (calibration & rule discovery) - Align visual-compare.test.ts comment with padPng behavior Made-with: Cursor --- ...EARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md | 201 ++++++++++++++++++ src/agents/evidence-collector.ts | 6 +- src/agents/run-directory.test.ts | 21 ++ src/agents/run-directory.ts | 5 +- src/core/engine/visual-compare.test.ts | 2 +- 5 files changed, 231 insertions(+), 4 deletions(-) create mode 100644 docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md diff --git a/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md b/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md new file mode 100644 index 00000000..080846ac --- /dev/null +++ b/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md @@ -0,0 +1,201 @@ +# Research Report: Calibration & Rule Discovery (Branch `fix/calibration-pipeline-issue-14`) + +**Scope:** Deep technical review of CanICode’s internal calibration and rule-discovery pipelines as reflected on the current branch, including recent fixes for retina-aligned visual comparison, cross-run evidence handling, and convergence semantics for nightly workflows. + +**Audience:** Maintainers opening or reviewing PRs that touch `visual-compare`, agent orchestration, or calibration CLI commands. + +--- + +## 1. Executive summary + +CanICode’s north star is answerable in one question: **can AI implement a scoped Figma design pixel-accurately?** **Calibration** grounds rule **scores** in measured conversion difficulty (including **visual-compare** similarity). 
**Rule discovery** turns recurring **gap** patterns into new rules, validated by A/B visual runs. + +On this branch, three themes stand out as materially improving scientific validity and operator ergonomics: + +1. **Retina / export-scale alignment** — Figma exports and Playwright captures now share the same physical pixel grid by default (`figmaExportScale`, inferred logical viewport, `deviceScaleFactor`). This removes a large class of false “diff” signals that previously looked like design errors. +2. **Calibration evidence hygiene** — Appending calibration evidence **replaces** the prior row for the same `(ruleId, fixture)` pair so the cross-run store reflects the **latest** assessment, not an unbounded pile of contradictory rows. +3. **Convergence for automation (issue #14)** — Strict convergence (`no applied/revised` **and** `no rejected`) is correct for “the debate fully settled.” For nightlies, **`--lenient-convergence`** allows `fixture-done` when the Arbitrator applied nothing but still logged rejections—a common pattern when the Critic repeatedly blocks borderline proposals while scores are stable. + +The branch also carries substantial **fixture expansion** (calibrated designs moved under `fixtures/done`, new fixture JSON/screenshots). That increases empirical coverage but increases repo size and CI time—trade-offs called out in Section 8. + +--- + +## 2. 
System map + +### 2.1 Channels vs internal pipelines + +| Surface | Role in the metric story | +|--------|---------------------------| +| CLI `analyze` | Produces rule findings and scores from structure/fixture JSON | +| CLI `visual-compare` | Measures **pixel** similarity between Figma PNG and rendered HTML | +| MCP / skill | Same analysis with optional hybrid enrichment from Figma MCP | +| **`/calibrate-loop`** (Claude Code) | Full loop: analyze → convert whole scope → gap analysis → evaluate → critic → arbitrator | +| **`/add-rule`** | Research → implement rule → A/B visual → critic decision | +| **`/calibrate-night`** | Sequential calibration + `fixture-list` / `fixture-done` + aggregate gap report | + +### 2.2 Artifact graph + +``` +fixtures//data.json (+ screenshot.png) + │ + ▼ + calibrate-analyze ──► analysis.json (run dir) + │ + ▼ + Converter + visual-compare ──► conversion.json, figma.png, code.png, diff.png + │ + ▼ + Gap analyzer ──► gaps.json ──► discovery-evidence (filtered) + gap-rule-report + │ + ▼ + calibrate-evaluate ──► proposals + calibration-evidence (overscore/underscore) + │ + ▼ + debate.json (critic + arbitrator) ──► rule-config.ts edits, calibrate-prune-evidence +``` + +### 2.3 Cross-run stores + +| File | Content | Pruned when | +|------|---------|-------------| +| `data/calibration-evidence.json` | Per-rule overscored/underscored signals | `calibrate-prune-evidence` after applied/revised rules | +| `data/discovery-evidence.json` | Missing-rule / gap signals | `discovery-prune-evidence` after a category is addressed | + +Orchestrator-side **noise filters** (regex) prevent font CDN, retina/DPI, screenshot dimension, network, and CI noise from polluting discovery evidence—keeping the Researcher’s input **actionable**. + +--- + +## 3. 
Branch-specific technical analysis + +### 3.1 Visual comparison: scale and padding + +**Problem (historical):** Comparing a `@2x` Figma PNG to a `1×` Playwright screenshot inflated diff pixels and **compressed similarity**, biasing calibration toward “everything is hard” and muddying gap labels. + +**Current behavior:** + +- Default **`figmaExportScale`** of `2` matches common REST exports and **`save-fixture`** output. +- Without an explicit viewport, **logical** width/height = PNG dimensions ÷ export scale; Playwright uses **`deviceScaleFactor`** = export scale so **`code.png`** matches **`figma.png`** in device pixels. +- With explicit `--width` / `--height`, **`inferDeviceScaleFactor`** reconciles PNG aspect ratio to logical viewport using rounded ratios with small tolerance—guarding against odd sizes. + +**Size mismatch handling:** Images are **padded** with a high-contrast magenta fill to a common canvas (not resized). That choice is deliberate: + +- Resampling would blend pixels and **hide** misalignment. +- Padding makes “extra” area unambiguously wrong in **pixelmatch**, which is what you want for calibration (layout bounds matter). + +**CLI parity:** `canicode visual-compare` exposes `--figma-scale` and documents inference behavior in `canicode docs visual-compare`. + +### 3.2 Calibration evidence: last-write-wins per fixture + +`appendCalibrationEvidence` builds a key `ruleId + NUL + fixture`. Existing entries with those keys are **removed** before new entries are appended. Effects: + +- **Pros:** Stops double-counting stale runs; `loadCalibrationEvidence` aggregates reflect the latest truth per fixture. +- **Cons:** Deliberate historical audit of every run is **not** preserved in this file (only in per-run logs). If longitudinal studies are needed, append-only logs under `logs/calibration/` remain the source of truth. + +**Discovery evidence** remains append-only (no dedupe)—appropriate for “many distinct gaps,” but can grow large; see recommendations. 
+ +### 3.3 Convergence: strict vs lenient + +| Mode | Condition | Use case | +|------|-----------|----------| +| **Strict** (default) | `applied === 0` **and** `revised === 0` **and** `rejected === 0` | True “quiet” arbitration: no pending controversy | +| **Lenient** | `applied === 0` **and** `revised === 0` (rejects ignored) | Stable scores but Critic still rejecting proposals; unblocks `fixture-done` / nightlies | + +**Skipped debates** (`debate.skipped`) count as converged—zero proposals is a valid terminal state. + +**Operational note:** Lenient mode should be paired with **human review** of `debate.json` periodically; otherwise rejected-but-valid signals might never drive config change. + +--- + +## 4. Rule discovery pipeline (conceptual depth) + +### 4.1 Inputs + +- **Gap JSON** per calibration run (categories, descriptions, `actionable`, `coveredByExistingRule`). +- **Accumulated discovery evidence** (evaluation + gap-analysis sources). +- **Fixture diversity** — done vs active fixtures under `fixtures/` and `fixtures/done/`. + +### 4.2 Weak links (inherent) + +1. **A/B tests proxy “missing metadata”** with AI-generated text—signal is “does extra structured data help the converter?” not “does human copy help?” +2. **Gap analyzer** depends on diff interpretation; anti-aliasing and font substitution still create **baseline** noise (~few percent similarity) even after retina alignment. +3. **Critic/Arbitrator** are LLM-mediated; deterministic steps (`runEvaluationAgent`, gap aggregation) are sound, but final commits are governance-sensitive. + +--- + +## 5. Evaluation: what is working well + +1. **Clear separation of concerns:** Rule **logic** vs **`rule-config.ts`** scores enables tuning without rewriting detectors. +2. **Deterministic evaluation agent:** Maps difficulties to score bands transparently—easy to test and to explain in reports. +3. 
**Filtering conversion candidates** (large files): Frames/components with minimum size, child count, text descendants—reduces “icon noise” that once split signals for rules like `no-auto-layout`. +4. **GapRuleReport** bridges many `gaps.json` files into markdown suitable for human triage. +5. **Test coverage** around evidence, run directories, and visual-compare numerics gives confidence in refactors. + +--- + +## 6. Risks and limitations + +| Risk | Impact | Mitigation (existing or proposed) | +|------|--------|-----------------------------------| +| Fixture/repo bloat | Clone and CI slower | Submodules or LFS for `data.json` + PNG; or slim “CI fixtures” subset | +| Discovery evidence growth | Slower loads; noisy Researcher context | Category-level caps, dedupe by hash of description, archival files | +| Single global `rule-config` | Cross-fixture coupling | Per-preset configs or weighted calibration per vertical (longer-term) | +| Playwright `networkidle` + fixed 1s wait | Flaky or slow compares | Tunable wait strategy; opt-in `domcontentloaded` for static HTML | +| Magenta padding | Extreme edge cases might collide with design | Extremely rare; could switch to XOR mask in diff only | + +--- + +## 7. Improvement roadmap (prioritized) + +### P0 — Trust and reproducibility + +- **Pin a “golden” visual-compare fixture** in CI (small HTML + canned PNG) so scale/padding logic cannot regress silently without network. +- **Emit `figmaExportScale` and inferred `deviceScaleFactor` into `conversion.json`** for every run dir—makes postmortems one glance. + +### P1 — Evidence and discovery + +- **Optional dedupe** for `discovery-evidence.json` (e.g., normalized description + category) with max entries per category. +- **Schema version** field on evidence arrays for forward-compatible migrations. + +### P2 — Operator UX + +- **`fixture-done --dry-run`** printing convergence reason (strict fail: N rejects, M applies). 
+- **Single command** wrapping “latest run dir for fixture X” resolution—today paths are manual. + +### P3 — Analytical depth + +- **Per-rule elasticity:** How much similarity improves when a rule is “turned off” in synthesis vs “fixed” in Figma—connects scores to **marginal** visual delta. +- **Confound controls:** Blank fixture with only text + one rectangle to measure pure font/smoothing baseline per OS. + +### P4 — Documentation + +- Keep **`docs/CALIBRATION.md`** narrative; this document is the **engineering appendix** for branch-level decisions. + +--- + +## 8. Fixture strategy assessment + +Moving calibrated sets to **`fixtures/done/`** is a strong workflow signal: *this design has been through the loop.* Risks: + +- **Discoverability:** New contributors may not know `done/` is intentional historical corpus. +- **Size:** JSON artifacts dominate diff noise; consider policy: “active small fixtures in repo; large kits documented + downloaded.” + +Recommendation: document in `CLAUDE.md` or `fixtures/README.md` (when the team chooses to add it) a one-paragraph **fixture lifecycle**: active → calibrate → done → optional archive. + +--- + +## 9. Conclusion + +The branch advances the project from “we compare screenshots” to **comparisons that respect how Figma and browsers actually rasterize**. Together with **evidence dedupe** and **lenient convergence**, it closes practical gaps that otherwise waste arbitrators’ time or stall nightlies—without abandoning strict mode for runs that demand full closure. + +The next leap is not more rules by count, but **tighter coupling between visual deltas and marginal rule importance** (P3), so `rule-config.ts` continues to track measurable implementation pain rather than narrative severity alone. + +--- + +## 10. 
Reference pointers (code) + +- Visual pipeline: `src/core/engine/visual-compare.ts` +- CLI wiring: `src/cli/index.ts` (`visual-compare`, `fixture-done`) +- Convergence: `src/agents/run-directory.ts` (`isConverged`, `ConvergenceOptions`) +- Evidence: `src/agents/evidence-collector.ts` +- Noise filters + orchestration: `src/agents/orchestrator.ts` (`ENVIRONMENT_NOISE_PATTERNS`) +- Nightly note: `.claude/commands/calibrate-night.md` diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index aa3e0e46..adced824 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -85,8 +85,10 @@ export function appendCalibrationEvidence( ): void { if (entries.length === 0) return; const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); - const keys = new Set(entries.map((e) => `${e.ruleId.trim()}\0${e.fixture}`)); - const withoutDupes = existing.filter((e) => !keys.has(`${e.ruleId.trim()}\0${e.fixture}`)); + const keys = new Set(entries.map((e) => `${e.ruleId.trim()}\0${e.fixture.trim()}`)); + const withoutDupes = existing.filter( + (e) => !keys.has(`${e.ruleId.trim()}\0${e.fixture.trim()}`), + ); withoutDupes.push(...entries); writeJsonArray(evidencePath, withoutDupes); } diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts index 31321666..23fbb605 100644 --- a/src/agents/run-directory.test.ts +++ b/src/agents/run-directory.test.ts @@ -116,6 +116,14 @@ describe("extractAppliedRuleIds", () => { }); expect(ids).toEqual(["my-rule", "b"]); }); + + it("returns empty array when decisions is not an array", () => { + const bad = { + critic: null, + arbitrator: { summary: "x", decisions: null }, + } as unknown as Parameters[0]; + expect(extractAppliedRuleIds(bad)).toEqual([]); + }); }); describe("isConverged", () => { @@ -154,6 +162,19 @@ describe("isConverged", () => { ); expect(isConverged(tempDir, { lenient: true })).toBe(true); }); + + it("returns false when arbitrator 
exists but decisions is not an array", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "incomplete", + }, + }), + ); + expect(isConverged(tempDir)).toBe(false); + expect(isConverged(tempDir, { lenient: true })).toBe(false); + }); }); describe("listCalibrationRuns", () => { diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index bebdad31..70910ad2 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -209,7 +209,9 @@ export function parseDebateResult(runDir: string): DebateResult | null { */ export function extractAppliedRuleIds(debate: DebateResult): string[] { if (!debate.arbitrator) return []; - return debate.arbitrator.decisions + const decisions = debate.arbitrator.decisions; + if (!Array.isArray(decisions)) return []; + return decisions .filter((d) => { const dec = (d.decision ?? "").trim().toLowerCase(); return dec === "applied" || dec === "revised"; @@ -237,6 +239,7 @@ export function isConverged(runDir: string, options?: ConvergenceOptions): boole if (debate.skipped) return true; // zero proposals = converged if (!debate.arbitrator) return false; const decisions = debate.arbitrator.decisions; + if (!Array.isArray(decisions)) return false; const changed = decisions.filter((d) => { const dec = (d.decision ?? "").trim().toLowerCase(); return dec === "applied" || dec === "revised"; diff --git a/src/core/engine/visual-compare.test.ts b/src/core/engine/visual-compare.test.ts index 3a1e15c4..1df47c10 100644 --- a/src/core/engine/visual-compare.test.ts +++ b/src/core/engine/visual-compare.test.ts @@ -20,7 +20,7 @@ * For (3)–(6) we build small PNGs in memory with pngjs and write them to a temp dir, * then call compareScreenshots through a minimal re-export shim declared in this file. 
* Because the module does not export the private functions we reproduce the exact same - * logic (8 lines of resizePng + ~25 lines of compareScreenshots) so the tests validate + * logic (padPng + compareScreenshots) so the tests validate * the algorithm rather than just the export boundary. */ From f1198f25066c387708504c644c91188ba6a6e49d Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 01:29:46 +0900 Subject: [PATCH 03/11] fix(agents): self-review hardening for evidence and debate parsing - Dedupe calibration evidence within a single append (Map, last wins) - Treat non-object decision rows as inert in extractAppliedRuleIds and isConverged - Doc: fix overscored/underscored typo; strict convergence note; Appendix A self-review Made-with: Cursor --- ...EARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md | 18 ++++++++++-- src/agents/evidence-collector.test.ts | 11 +++++++ src/agents/evidence-collector.ts | 11 +++++-- src/agents/run-directory.test.ts | 29 +++++++++++++++++++ src/agents/run-directory.ts | 15 +++++++--- 5 files changed, 76 insertions(+), 8 deletions(-) diff --git a/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md b/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md index 080846ac..6061864c 100644 --- a/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md +++ b/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md @@ -48,7 +48,7 @@ fixtures//data.json (+ screenshot.png) Gap analyzer ──► gaps.json ──► discovery-evidence (filtered) + gap-rule-report │ ▼ - calibrate-evaluate ──► proposals + calibration-evidence (overscore/underscore) + calibrate-evaluate ──► proposals + calibration-evidence (overscored/underscored) │ ▼ debate.json (critic + arbitrator) ──► rule-config.ts edits, calibrate-prune-evidence @@ -97,7 +97,7 @@ Orchestrator-side **noise filters** (regex) prevent font CDN, retina/DPI, screen | Mode | Condition | Use case | |------|-----------|----------| -| **Strict** (default) | `applied === 0` **and** `revised === 0` **and** `rejected === 0` | 
True “quiet” arbitration: no pending controversy | +| **Strict** (default) | `applied === 0` **and** `revised === 0` **and** `rejected === 0` (and `decisions` is an array) | True “quiet” arbitration: no pending controversy | | **Lenient** | `applied === 0` **and** `revised === 0` (rejects ignored) | Stable scores but Critic still rejecting proposals; unblocks `fixture-done` / nightlies | **Skipped debates** (`debate.skipped`) count as converged—zero proposals is a valid terminal state. @@ -199,3 +199,17 @@ The next leap is not more rules by count, but **tighter coupling between visual - Evidence: `src/agents/evidence-collector.ts` - Noise filters + orchestration: `src/agents/orchestrator.ts` (`ENVIRONMENT_NOISE_PATTERNS`) - Nightly note: `.claude/commands/calibrate-night.md` + +--- + +## Appendix A: Post-commit self-review (what was re-checked) + +This section records a deliberate second pass over the same design—what was assumed, what was wrong, and what was tightened. + +1. **Typo in the artifact diagram** — “overscore/underscore” conflated wording with schema enum values; corrected to **overscored/underscored**. +2. **Calibration evidence batch semantics** — Original dedupe only avoided collisions between *existing file* rows and *new* rows. A single `appendCalibrationEvidence` call could still write **duplicate keys**. Resolved by folding the incoming batch through a `Map` so **last row wins within the batch** as well as across calls. +3. **Malformed `debate.json`** — Real runs are supposed to emit objects, but hand-edited or partially written files could put `null` or primitives inside `decisions`. **Strict type assumptions** would throw at runtime and break `fixture-done` / prune flows. **Guard** with `isDecisionRecord` and string coercion on `decision` / `ruleId`. +4. **Documentation vs implementation** — Strict convergence implicitly requires `decisions` to be an array; non-array continues to yield **not converged**. The table now states that explicitly. 
+5. **What was not “fixed” on purpose** — `parseDebateResult` still trusts overall JSON shape for `critic` / `arbitrator`; deeper validation would belong in a Zod schema if we ever expose debate files as a public API. **Lenient convergence** still ignores rejected rows by design—that is a product trade-off, not a bug; operators should skim `debate.json` before relying on it long term. + +**Limitation (honest):** Waiting “three hours” in wall-clock time does not change correctness; it mainly reduces the chance of missing edge cases. This appendix is the recorded output of that kind of review, done immediately and in code. diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index 70b5ec86..91e19313 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -116,6 +116,17 @@ describe("evidence-collector", () => { expect(raw[0]!.type).toBe("underscored"); }); + it("dedupes within a single append call (last row for same ruleId and fixture wins)", () => { + appendCalibrationEvidence([ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + { ruleId: "rule-a", type: "underscored", actualDifficulty: "hard", fixture: "fx1", timestamp: "t2" }, + ], calPath); + + const raw = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + expect(raw).toHaveLength(1); + expect(raw[0]!.type).toBe("underscored"); + }); + it("does nothing for empty entries", () => { writeFileSync(calPath, "[]", "utf-8"); appendCalibrationEvidence([], calPath); diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index adced824..4f98dc61 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -85,11 +85,18 @@ export function appendCalibrationEvidence( ): void { if (entries.length === 0) return; const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); - const keys = new 
Set(entries.map((e) => `${e.ruleId.trim()}\0${e.fixture.trim()}`)); + // Same batch can repeat (ruleId, fixture); last entry wins (matches cross-call behavior) + const incomingByKey = new Map(); + for (const e of entries) { + const k = `${e.ruleId.trim()}\0${e.fixture.trim()}`; + incomingByKey.set(k, e); + } + const mergedIncoming = [...incomingByKey.values()]; + const keys = new Set(incomingByKey.keys()); const withoutDupes = existing.filter( (e) => !keys.has(`${e.ruleId.trim()}\0${e.fixture.trim()}`), ); - withoutDupes.push(...entries); + withoutDupes.push(...mergedIncoming); writeJsonArray(evidencePath, withoutDupes); } diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts index 23fbb605..9c3e5090 100644 --- a/src/agents/run-directory.test.ts +++ b/src/agents/run-directory.test.ts @@ -124,6 +124,21 @@ describe("extractAppliedRuleIds", () => { } as unknown as Parameters[0]; expect(extractAppliedRuleIds(bad)).toEqual([]); }); + + it("ignores null and non-object entries in decisions", () => { + const ids = extractAppliedRuleIds({ + critic: null, + arbitrator: { + summary: "x", + decisions: [ + null, + "not-an-object" as unknown as { ruleId: string; decision: string }, + { ruleId: "ok", decision: "applied" }, + ], + }, + }); + expect(ids).toEqual(["ok"]); + }); }); describe("isConverged", () => { @@ -175,6 +190,20 @@ describe("isConverged", () => { expect(isConverged(tempDir)).toBe(false); expect(isConverged(tempDir, { lenient: true })).toBe(false); }); + + it("ignores null entries when counting applied and rejected", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "mixed", + decisions: [null, { ruleId: "x", decision: "rejected" }], + }, + }), + ); + expect(isConverged(tempDir)).toBe(false); + expect(isConverged(tempDir, { lenient: true })).toBe(true); + }); }); describe("listCalibrationRuns", () => { diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index 
70910ad2..0ce5b566 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -207,16 +207,21 @@ export function parseDebateResult(runDir: string): DebateResult | null { /** * Extract ruleIds that were applied or revised by the Arbitrator. */ +function isDecisionRecord(d: unknown): d is { decision?: unknown; ruleId?: unknown } { + return d !== null && typeof d === "object"; +} + export function extractAppliedRuleIds(debate: DebateResult): string[] { if (!debate.arbitrator) return []; const decisions = debate.arbitrator.decisions; if (!Array.isArray(decisions)) return []; return decisions .filter((d) => { - const dec = (d.decision ?? "").trim().toLowerCase(); + if (!isDecisionRecord(d)) return false; + const dec = String(d.decision ?? "").trim().toLowerCase(); return dec === "applied" || dec === "revised"; }) - .map((d) => d.ruleId.trim()) + .map((d) => String(d.ruleId ?? "").trim()) .filter((id) => id.length > 0); } @@ -241,11 +246,13 @@ export function isConverged(runDir: string, options?: ConvergenceOptions): boole const decisions = debate.arbitrator.decisions; if (!Array.isArray(decisions)) return false; const changed = decisions.filter((d) => { - const dec = (d.decision ?? "").trim().toLowerCase(); + if (!isDecisionRecord(d)) return false; + const dec = String(d.decision ?? "").trim().toLowerCase(); return dec === "applied" || dec === "revised"; }).length; const rejected = decisions.filter((d) => { - const dec = (d.decision ?? "").trim().toLowerCase(); + if (!isDecisionRecord(d)) return false; + const dec = String(d.decision ?? 
"").trim().toLowerCase(); return dec === "rejected"; }).length; if (options?.lenient) { From 48f2a6bde56971feb46a6540d51d1b93dc913459 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:31:41 +0900 Subject: [PATCH 04/11] fix: extract tolerance constants + defensive debate parsing - SCALE_ROUNDING_TOLERANCE (0.08): broader for @2x/@3x detection - UNITY_SCALE_TOLERANCE (0.02): tighter for 1x false-positive prevention - parseDebateResult: validates shape, debug logs on malformed input Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/run-directory.ts | 17 ++++++++++++----- src/core/engine/visual-compare.ts | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index 0ce5b566..a5906224 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -188,18 +188,25 @@ export interface DebateResult { /** * Parse a debate.json file from a run directory. * Returns null if the file doesn't exist or is malformed. + * Treats debate.json as external input — validates shape defensively. */ export function parseDebateResult(runDir: string): DebateResult | null { const debatePath = join(runDir, "debate.json"); if (!existsSync(debatePath)) return null; try { - const raw = JSON.parse(readFileSync(debatePath, "utf-8")) as Record; + const raw: unknown = JSON.parse(readFileSync(debatePath, "utf-8")); + if (raw === null || typeof raw !== "object") { + console.debug(`[parseDebateResult] invalid debate format in ${runDir}: expected object, got ${typeof raw}`); + return null; + } + const obj = raw as Record; return { - critic: raw["critic"] as DebateResult["critic"] ?? null, - arbitrator: raw["arbitrator"] as DebateResult["arbitrator"] ?? null, - ...(typeof raw["skipped"] === "string" ? { skipped: raw["skipped"] } : {}), + critic: (obj["critic"] as DebateResult["critic"]) ?? null, + arbitrator: (obj["arbitrator"] as DebateResult["arbitrator"]) ?? 
null, + ...(typeof obj["skipped"] === "string" ? { skipped: obj["skipped"] } : {}), }; - } catch { + } catch (err) { + console.debug(`[parseDebateResult] failed to parse debate.json in ${runDir}:`, err); return null; } } diff --git a/src/core/engine/visual-compare.ts b/src/core/engine/visual-compare.ts index 12029fec..fdb3b806 100644 --- a/src/core/engine/visual-compare.ts +++ b/src/core/engine/visual-compare.ts @@ -101,6 +101,18 @@ async function fetchFigmaScreenshot( writeFileSync(cachePath, buffer); } +/** + * Tolerance for detecting integer scale factors (@2x, @3x). + * Broader tolerance because render/rounding errors accumulate at higher scales. + */ +const SCALE_ROUNDING_TOLERANCE = 0.08; + +/** + * Tolerance for detecting 1x (unity) scale. + * Tighter to avoid false positives — misidentifying a scaled PNG as 1x. + */ +const UNITY_SCALE_TOLERANCE = 0.02; + /** * Infer device pixel ratio so the Playwright screenshot matches Figma PNG pixel dimensions. */ @@ -115,10 +127,10 @@ function inferDeviceScaleFactor( const sx = pngW / logicalW; const sy = pngH / logicalH; const rounded = Math.round((sx + sy) / 2); - if (rounded >= 2 && Math.abs(sx - rounded) < 0.08 && Math.abs(sy - rounded) < 0.08) { + if (rounded >= 2 && Math.abs(sx - rounded) < SCALE_ROUNDING_TOLERANCE && Math.abs(sy - rounded) < SCALE_ROUNDING_TOLERANCE) { return rounded; } - if (Math.abs(sx - 1) < 0.02 && Math.abs(sy - 1) < 0.02) return 1; + if (Math.abs(sx - 1) < UNITY_SCALE_TOLERANCE && Math.abs(sy - 1) < UNITY_SCALE_TOLERANCE) return 1; return fallback >= 2 ? 
fallback : Math.max(1, Math.round(sx)); } From 465eecceb207062a1f084e7cf5737848e4cace10 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:34:17 +0900 Subject: [PATCH 05/11] chore: remove research doc (discussion-only, not for merge) Co-Authored-By: Claude Opus 4.6 (1M context) --- ...EARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md | 215 ------------------ 1 file changed, 215 deletions(-) delete mode 100644 docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md diff --git a/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md b/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md deleted file mode 100644 index 6061864c..00000000 --- a/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md +++ /dev/null @@ -1,215 +0,0 @@ -# Research Report: Calibration & Rule Discovery (Branch `fix/calibration-pipeline-issue-14`) - -**Scope:** Deep technical review of CanICode’s internal calibration and rule-discovery pipelines as reflected on the current branch, including recent fixes for retina-aligned visual comparison, cross-run evidence handling, and convergence semantics for nightly workflows. - -**Audience:** Maintainers opening or reviewing PRs that touch `visual-compare`, agent orchestration, or calibration CLI commands. - ---- - -## 1. Executive summary - -CanICode’s north star is answerable in one question: **can AI implement a scoped Figma design pixel-accurately?** **Calibration** grounds rule **scores** in measured conversion difficulty (including **visual-compare** similarity). **Rule discovery** turns recurring **gap** patterns into new rules, validated by A/B visual runs. - -On this branch, three themes stand out as materially improving scientific validity and operator ergonomics: - -1. **Retina / export-scale alignment** — Figma exports and Playwright captures now share the same physical pixel grid by default (`figmaExportScale`, inferred logical viewport, `deviceScaleFactor`). 
This removes a large class of false “diff” signals that previously looked like design errors. -2. **Calibration evidence hygiene** — Appending calibration evidence **replaces** the prior row for the same `(ruleId, fixture)` pair so the cross-run store reflects the **latest** assessment, not an unbounded pile of contradictory rows. -3. **Convergence for automation (issue #14)** — Strict convergence (`no applied/revised` **and** `no rejected`) is correct for “the debate fully settled.” For nightlies, **`--lenient-convergence`** allows `fixture-done` when the Arbitrator applied nothing but still logged rejections—a common pattern when the Critic repeatedly blocks borderline proposals while scores are stable. - -The branch also carries substantial **fixture expansion** (calibrated designs moved under `fixtures/done`, new fixture JSON/screenshots). That increases empirical coverage but increases repo size and CI time—trade-offs called out in Section 8. - ---- - -## 2. System map - -### 2.1 Channels vs internal pipelines - -| Surface | Role in the metric story | -|--------|---------------------------| -| CLI `analyze` | Produces rule findings and scores from structure/fixture JSON | -| CLI `visual-compare` | Measures **pixel** similarity between Figma PNG and rendered HTML | -| MCP / skill | Same analysis with optional hybrid enrichment from Figma MCP | -| **`/calibrate-loop`** (Claude Code) | Full loop: analyze → convert whole scope → gap analysis → evaluate → critic → arbitrator | -| **`/add-rule`** | Research → implement rule → A/B visual → critic decision | -| **`/calibrate-night`** | Sequential calibration + `fixture-list` / `fixture-done` + aggregate gap report | - -### 2.2 Artifact graph - -``` -fixtures//data.json (+ screenshot.png) - │ - ▼ - calibrate-analyze ──► analysis.json (run dir) - │ - ▼ - Converter + visual-compare ──► conversion.json, figma.png, code.png, diff.png - │ - ▼ - Gap analyzer ──► gaps.json ──► discovery-evidence (filtered) + gap-rule-report - 
│ - ▼ - calibrate-evaluate ──► proposals + calibration-evidence (overscored/underscored) - │ - ▼ - debate.json (critic + arbitrator) ──► rule-config.ts edits, calibrate-prune-evidence -``` - -### 2.3 Cross-run stores - -| File | Content | Pruned when | -|------|---------|-------------| -| `data/calibration-evidence.json` | Per-rule overscored/underscored signals | `calibrate-prune-evidence` after applied/revised rules | -| `data/discovery-evidence.json` | Missing-rule / gap signals | `discovery-prune-evidence` after a category is addressed | - -Orchestrator-side **noise filters** (regex) prevent font CDN, retina/DPI, screenshot dimension, network, and CI noise from polluting discovery evidence—keeping the Researcher’s input **actionable**. - ---- - -## 3. Branch-specific technical analysis - -### 3.1 Visual comparison: scale and padding - -**Problem (historical):** Comparing a `@2x` Figma PNG to a `1×` Playwright screenshot inflated diff pixels and **compressed similarity**, biasing calibration toward “everything is hard” and muddying gap labels. - -**Current behavior:** - -- Default **`figmaExportScale`** of `2` matches common REST exports and **`save-fixture`** output. -- Without an explicit viewport, **logical** width/height = PNG dimensions ÷ export scale; Playwright uses **`deviceScaleFactor`** = export scale so **`code.png`** matches **`figma.png`** in device pixels. -- With explicit `--width` / `--height`, **`inferDeviceScaleFactor`** reconciles PNG aspect ratio to logical viewport using rounded ratios with small tolerance—guarding against odd sizes. - -**Size mismatch handling:** Images are **padded** with a high-contrast magenta fill to a common canvas (not resized). That choice is deliberate: - -- Resampling would blend pixels and **hide** misalignment. -- Padding makes “extra” area unambiguously wrong in **pixelmatch**, which is what you want for calibration (layout bounds matter). 
- -**CLI parity:** `canicode visual-compare` exposes `--figma-scale` and documents inference behavior in `canicode docs visual-compare`. - -### 3.2 Calibration evidence: last-write-wins per fixture - -`appendCalibrationEvidence` builds a key `ruleId + NUL + fixture`. Existing entries with those keys are **removed** before new entries are appended. Effects: - -- **Pros:** Stops double-counting stale runs; `loadCalibrationEvidence` aggregates reflect the latest truth per fixture. -- **Cons:** Deliberate historical audit of every run is **not** preserved in this file (only in per-run logs). If longitudinal studies are needed, append-only logs under `logs/calibration/` remain the source of truth. - -**Discovery evidence** remains append-only (no dedupe)—appropriate for “many distinct gaps,” but can grow large; see recommendations. - -### 3.3 Convergence: strict vs lenient - -| Mode | Condition | Use case | -|------|-----------|----------| -| **Strict** (default) | `applied === 0` **and** `revised === 0` **and** `rejected === 0` (and `decisions` is an array) | True “quiet” arbitration: no pending controversy | -| **Lenient** | `applied === 0` **and** `revised === 0` (rejects ignored) | Stable scores but Critic still rejecting proposals; unblocks `fixture-done` / nightlies | - -**Skipped debates** (`debate.skipped`) count as converged—zero proposals is a valid terminal state. - -**Operational note:** Lenient mode should be paired with **human review** of `debate.json` periodically; otherwise rejected-but-valid signals might never drive config change. - ---- - -## 4. Rule discovery pipeline (conceptual depth) - -### 4.1 Inputs - -- **Gap JSON** per calibration run (categories, descriptions, `actionable`, `coveredByExistingRule`). -- **Accumulated discovery evidence** (evaluation + gap-analysis sources). -- **Fixture diversity** — done vs active fixtures under `fixtures/` and `fixtures/done/`. - -### 4.2 Weak links (inherent) - -1. 
**A/B tests proxy “missing metadata”** with AI-generated text—signal is “does extra structured data help the converter?” not “does human copy help?” -2. **Gap analyzer** depends on diff interpretation; anti-aliasing and font substitution still create **baseline** noise (~few percent similarity) even after retina alignment. -3. **Critic/Arbitrator** are LLM-mediated; deterministic steps (`runEvaluationAgent`, gap aggregation) are sound, but final commits are governance-sensitive. - ---- - -## 5. Evaluation: what is working well - -1. **Clear separation of concerns:** Rule **logic** vs **`rule-config.ts`** scores enables tuning without rewriting detectors. -2. **Deterministic evaluation agent:** Maps difficulties to score bands transparently—easy to test and to explain in reports. -3. **Filtering conversion candidates** (large files): Frames/components with minimum size, child count, text descendants—reduces “icon noise” that once split signals for rules like `no-auto-layout`. -4. **GapRuleReport** bridges many `gaps.json` files into markdown suitable for human triage. -5. **Test coverage** around evidence, run directories, and visual-compare numerics gives confidence in refactors. - ---- - -## 6. 
Risks and limitations - -| Risk | Impact | Mitigation (existing or proposed) | -|------|--------|-----------------------------------| -| Fixture/repo bloat | Clone and CI slower | Submodules or LFS for `data.json` + PNG; or slim “CI fixtures” subset | -| Discovery evidence growth | Slower loads; noisy Researcher context | Category-level caps, dedupe by hash of description, archival files | -| Single global `rule-config` | Cross-fixture coupling | Per-preset configs or weighted calibration per vertical (longer-term) | -| Playwright `networkidle` + fixed 1s wait | Flaky or slow compares | Tunable wait strategy; opt-in `domcontentloaded` for static HTML | -| Magenta padding | Extreme edge cases might collide with design | Extremely rare; could switch to XOR mask in diff only | - ---- - -## 7. Improvement roadmap (prioritized) - -### P0 — Trust and reproducibility - -- **Pin a “golden” visual-compare fixture** in CI (small HTML + canned PNG) so scale/padding logic cannot regress silently without network. -- **Emit `figmaExportScale` and inferred `deviceScaleFactor` into `conversion.json`** for every run dir—makes postmortems one glance. - -### P1 — Evidence and discovery - -- **Optional dedupe** for `discovery-evidence.json` (e.g., normalized description + category) with max entries per category. -- **Schema version** field on evidence arrays for forward-compatible migrations. - -### P2 — Operator UX - -- **`fixture-done --dry-run`** printing convergence reason (strict fail: N rejects, M applies). -- **Single command** wrapping “latest run dir for fixture X” resolution—today paths are manual. - -### P3 — Analytical depth - -- **Per-rule elasticity:** How much similarity improves when a rule is “turned off” in synthesis vs “fixed” in Figma—connects scores to **marginal** visual delta. -- **Confound controls:** Blank fixture with only text + one rectangle to measure pure font/smoothing baseline per OS. 
- -### P4 — Documentation - -- Keep **`docs/CALIBRATION.md`** narrative; this document is the **engineering appendix** for branch-level decisions. - ---- - -## 8. Fixture strategy assessment - -Moving calibrated sets to **`fixtures/done/`** is a strong workflow signal: *this design has been through the loop.* Risks: - -- **Discoverability:** New contributors may not know `done/` is intentional historical corpus. -- **Size:** JSON artifacts dominate diff noise; consider policy: “active small fixtures in repo; large kits documented + downloaded.” - -Recommendation: document in `CLAUDE.md` or `fixtures/README.md` (when the team chooses to add it) a one-paragraph **fixture lifecycle**: active → calibrate → done → optional archive. - ---- - -## 9. Conclusion - -The branch advances the project from “we compare screenshots” to **comparisons that respect how Figma and browsers actually rasterize**. Together with **evidence dedupe** and **lenient convergence**, it closes practical gaps that otherwise waste arbitrators’ time or stall nightlies—without abandoning strict mode for runs that demand full closure. - -The next leap is not more rules by count, but **tighter coupling between visual deltas and marginal rule importance** (P3), so `rule-config.ts` continues to track measurable implementation pain rather than narrative severity alone. - ---- - -## 10. 
Reference pointers (code) - -- Visual pipeline: `src/core/engine/visual-compare.ts` -- CLI wiring: `src/cli/index.ts` (`visual-compare`, `fixture-done`) -- Convergence: `src/agents/run-directory.ts` (`isConverged`, `ConvergenceOptions`) -- Evidence: `src/agents/evidence-collector.ts` -- Noise filters + orchestration: `src/agents/orchestrator.ts` (`ENVIRONMENT_NOISE_PATTERNS`) -- Nightly note: `.claude/commands/calibrate-night.md` - ---- - -## Appendix A: Post-commit self-review (what was re-checked) - -This section records a deliberate second pass over the same design—what was assumed, what was wrong, and what was tightened. - -1. **Typo in the artifact diagram** — “overscore/underscore” conflated wording with schema enum values; corrected to **overscored/underscored**. -2. **Calibration evidence batch semantics** — Original dedupe only avoided collisions between *existing file* rows and *new* rows. A single `appendCalibrationEvidence` call could still write **duplicate keys**. Resolved by folding the incoming batch through a `Map` so **last row wins within the batch** as well as across calls. -3. **Malformed `debate.json`** — Real runs are supposed to emit objects, but hand-edited or partially written files could put `null` or primitives inside `decisions`. **Strict type assumptions** would throw at runtime and break `fixture-done` / prune flows. **Guard** with `isDecisionRecord` and string coercion on `decision` / `ruleId`. -4. **Documentation vs implementation** — Strict convergence implicitly requires `decisions` to be an array; non-array continues to yield **not converged**. The table now states that explicitly. -5. **What was not “fixed” on purpose** — `parseDebateResult` still trusts overall JSON shape for `critic` / `arbitrator`; deeper validation would belong in a Zod schema if we ever expose debate files as a public API. 
**Lenient convergence** still ignores rejected rows by design—that is a product trade-off, not a bug; operators should skim `debate.json` before relying on it long term. - -**Limitation (honest):** Waiting “three hours” in wall-clock time does not change correctness; it mainly reduces the chance of missing edge cases. This appendix is the recorded output of that kind of review, done immediately and in code. From e3864a7f4180821301b650b296bdfc96babdc127 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:48:27 +0900 Subject: [PATCH 06/11] feat: token metrics + target environment + A/B token comparison 1. Token metrics: - generateDesignTreeWithStats() returns estimatedTokens + bytes - CLI design-tree command shows token estimate - CalibrationReportData includes tokenMetrics (tokens, bytes, per-node) 2. Target environment in CLAUDE.md: - Primary target: teams with designers, 300+ node pages - Token consumption is first-class metric - Component scores should not be lowered from small fixture calibration 3. 
add-rule A/B always includes token comparison: - Step 4 measures both visual similarity AND token savings - Token savings ratio: 1 - tokens_b / tokens_a - Even if visual diff is zero, token savings justify the rule Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/commands/add-rule.md | 14 +- CLAUDE.md | 13 ++ docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md | 156 ++++++++++++++++++++ src/agents/report-generator.ts | 10 +- src/cli/index.ts | 10 +- src/core/engine/design-tree.ts | 28 +++- 6 files changed, 220 insertions(+), 11 deletions(-) create mode 100644 docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md diff --git a/.claude/commands/add-rule.md b/.claude/commands/add-rule.md index eba60dd0..ef7a6bd6 100644 --- a/.claude/commands/add-rule.md +++ b/.claude/commands/add-rule.md @@ -68,9 +68,9 @@ Append to `$RUN_DIR/activity.jsonl`: {"step":"Implementer","timestamp":"","result":"implemented rule ","durationMs":} ``` -### Step 4 — A/B Visual Validation +### Step 4 — A/B Validation (Visual + Token) -Run an A/B comparison on the entire design to measure the rule's actual impact on pixel-perfect accuracy: +Run an A/B comparison measuring both **pixel accuracy** and **token efficiency**: 1. Extract `fileKey` and root `nodeId` from the fixture or Figma URL. @@ -91,9 +91,15 @@ Run an A/B comparison on the entire design to measure the rule's actual impact o - Run: `npx canicode visual-compare $RUN_DIR/visual-b.html --figma-url "" --output $RUN_DIR/visual-b` - Record similarity_b -5. Compare: if similarity_b > similarity_a → the rule catches something that genuinely improves implementation quality. +5. **Visual comparison**: if similarity_b > similarity_a → the rule improves pixel accuracy. -6. Record both scores for the Evaluator. +6. 
**Token comparison** (always measure, even if visual diff is zero): + - Count lines/bytes of `visual-a.html` vs `visual-b.html` + - If Test B is significantly smaller → the rule reduces token consumption (important for large pages) + - Record `tokens_a` (estimated: file bytes / 4) and `tokens_b` + - Token savings ratio: `1 - tokens_b / tokens_a` + +7. Record all scores for the Evaluator: `similarity_a`, `similarity_b`, `tokens_a`, `tokens_b`. ### Step 5 — Evaluator diff --git a/CLAUDE.md b/CLAUDE.md index 95c17d8b..23a286f8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,6 +6,19 @@ A CLI tool that analyzes Figma design structures to provide development-friendli **Can AI implement this Figma design pixel-perfectly?** Everything in this project serves this single question. Every rule, score, category, and pipeline exists to measure and improve how accurately AI can reproduce a Figma design as code. The metric is `visual-compare` similarity (0-100%). +## Target Environment + +The primary target is **teams with designers** where developers (+AI) implement large Figma pages: +- **Page scale**: 300+ nodes, full screens, not small component sections +- **Component-heavy**: Design systems with reusable components, variants, tokens +- **AI context budget**: Large pages must fit in AI context windows — componentization reduces token count via deduplication +- **Not the target**: Individual developers generating simple UI with AI — they don't need Figma analysis + +This means: +- Component-related rule scores (missing-component, etc.) 
should NOT be lowered based on small fixture calibration +- Token consumption is a first-class metric — designs that waste tokens on repeated structures are penalized +- Calibration fixtures should include large, complex pages alongside small sections + ## Tech Stack - **Runtime**: Node.js (>=18) diff --git a/docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md b/docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md new file mode 100644 index 00000000..6d667649 --- /dev/null +++ b/docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md @@ -0,0 +1,156 @@ +# Fixture analysis: validity feedback (over- vs under-estimation) + +This document records **subjective engineering judgment** on whether `canicode analyze` results **oversell** or **undersell** how hard it is for an AI to implement the UI **pixel-close** from the fixture, and what to do about it. + +It is **not** a statistical proof. It assumes: REST fixture JSON + current `rule-config` scoring, and the north-star metric **visual similarity**, not design-system purity alone. + +--- + +## How to read this + +| Term | Meaning here | +|------|----------------| +| **Overrated (grade/score)** | The **readiness score looks better** than the likely **implementation + visual-compare** outcome without extra context or cleanup. | +| **Underrated** | The **score looks worse** than how repeatable or implementable the file actually is for an AI (given a strong prompt / repeated patterns). | + +**Remedies** split into: **Figma / fixture hygiene**, **prompt & workflow**, **tooling (rules, scoring, scope)**. + +--- + +## Cross-cutting patterns + +### A. Invisible layers + raw color clusters (many Simple DS fixtures) + +- **Overrated:** Overall **grade can look decent (A / high B)** while the tree still carries many **invisible-layer** and **raw-color** hits. Severity may keep the headline grade up, but the AI still sees **noise** (extra nodes, ambiguous fills) and may **mis-order or mis-style** visible content. 
**Pixel outcome** can be worse than the letter grade suggests. +- **Underrated:** Rare for the **same** pattern; occasionally invisible only affects editor hygiene, not export — but REST + rules still complain. +- **Remedies:** + - **Figma:** Delete or detach truly unused layers; replace raw values with variables where the team cares; reduce hidden decoration. + - **Workflow:** Feed **`design-tree`** or a **pruned node list** to the AI so it ignores known-noise IDs. + - **Tooling:** Consider reporting **“visible-only issues”** aggregate for AI-facing summaries; tune `invisible-layer` / `raw-color` weights after calibration for “impact on visual-compare.” + +### B. Unscoped large trees (Material 3 & large Simple DS) + +- **Overrated:** **Low grades (C, B)** can look like “this file is terrible to implement” when the real problem is **scope** — mixing many screens/components in one graph. A single scoped frame might be **B+ or A** in isolation. +- **Underrated:** Almost the inverse: **one bad subtree** can dominate issue counts; the rest of the file might be fine — the **average** undervalues “good parts.” +- **Remedies:** + - **Fixture:** Re-`save-fixture` with a **single `node-id`** per calibration / AI task. + - **CLI:** Always analyze with the **same scope** you ask the AI to implement. + - **Tooling:** Surface **root-scoped score** vs **whole-file score** in reports when unscoped. + +### C. Design-system spacing / fixed-width spam (`material3-52949-27916`, `material3-51954-18254`) + +- **Overrated:** Less common; if spacing issues are **systematic**, fixing one pattern fixes hundreds — headline pain can be **overstated** for a **template-aware** AI. +- **Underrated:** For **“first-try” AI** without a grid spec — hundreds of **inconsistent-spacing** / **fixed-width** hits mean **lots of opportunity for pixel drift**; visual-compare will punish. Score may feel “only B” but failure modes are many. +- **Remedies:** + - **Prompt:** State **grid base (e.g.
4px)**, max width, breakpoint behavior explicitly. + - **Figma:** Normalize spacing to the grid; reduce fixed widths or mark responsive intent. + - **Tooling:** Calibration tied to **visual-compare** on these fixtures so spacing rules track **pixel deltas**. + +### D. Numeric / generic naming (Material 3 community) + +- **Overrated:** Score can look **OK on layout** while **ai-readability / naming** tanks — sometimes **underweighted** in how much it confuses **less capable** models (variable names, component names in code). +- **Underrated:** For **vision-heavy** codegen (screenshot + structure), **bad names** matter less — score may **undersell** “actually buildable if you ignore names.” +- **Remedies:** + - **Figma:** Rename critical interactive nodes; keep DS naming convention. + - **Prompt:** “Map Figma node IDs to React components; names are not authoritative.” + - **Tooling:** Separate **“semantic naming score”** from **implementation difficulty** in user-facing copy. + +### E. `figma-ui3-kit` (token category very weak, few nodes) + +- **Overrated:** **Overall B** might **overstate** readiness: **token ~21%** means heavy **raw-color/raw-font** — first **visual-compare** often fails on **font and fill** unless explicitly mocked. +- **Underrated:** Small **node count** — fast iteration; score may **feel harsh** vs effort to **manually** match a tiny scope. +- **Remedies:** + - **Workflow:** Predeclare **font stack** and **color substitution** in the prompt to absorb raw-font/raw-color. + - **Tooling:** Optional “**compare mode: ignore typography**” (dangerous but documented) for layout-only checks. + +--- + +## Per-fixture notes (snapshot-oriented) + +Names refer to directories under `fixtures/` or `fixtures/done/`. Grades refer to the last batch analyze (v0.8.9); re-run after config changes. 
+ +### `done/simple-ds-page-sections` — A (86%) + +- **Overrated:** **Token/component** weakness (raw color, missing descriptions, invisible layers) is **under-reflected** in a single letter “A” for **naive** AI codegen without DS context. +- **Underrated:** **Small, shallow tree** — easier to reason about than bulk Simple DS fixtures; score is not “too kind” to structure. +- **Remedies:** Add **component description** stub in prompt; clean invisible layers; run **visual-compare** once to validate the A. + +### `material3-56615-45927` — C+ (72%), large unscoped + +- **Overrated:** Unlikely at file level — **C+** already warns; per-frame might be better. +- **Underrated:** Possible if user only implements **one** repeated component — issue count is **file-wide**. +- **Remedies:** **Mandatory scoping**; treat as multiple fixtures. + +### `simple-ds-175-9106` / `175-8591` / `175-7790` / `562-9518` — B ~ B+ + +- **Overrated:** **raw-color + invisible-layer** volume — **B/B+** may **overrate** first-shot pixel match. +- **Underrated:** **Detach/instance** and **default-name** sometimes fixable with a strong “use Figma structure as source of truth” prompt. +- **Remedies:** Scope; variable cleanup; explicit **instance** handling in prompt. + +### `simple-ds-4333-9262` — A (89%) + +- **Overrated:** Still **missing-component-description** and spacing — “A” is **fragile** for **fully autonomous** AI without extra text context. +- **Underrated:** Among the **cleaner** Simple DS slices — good **golden** candidate; score may be **fair or slightly harsh** if issues are mostly doc/metadata. +- **Remedies:** Use as **benchmark**; add descriptions only if product needs them for handoff. 
+ +### `material3-52949-27916` — B (79%), 400+ issues + +- **Overrated:** Whole-file **B** might **overrate** “one-shot implement whole thing.” +- **Underrated:** For **systematic** spacing tokens, a **macro prompt** might achieve **ok** similarity on repetitive regions — composite grade **undersells** repeatability. +- **Remedies:** Split fixture; calibrate spacing rules on **scoped** chunks. + +### `done/simple-ds-panel-sections` — A (87%) + +- **Overrated:** Same invisible/raw-color cluster as other Simple DS **done** sets — **A** vs **pixel** needs verification. +- **Underrated:** Low depth, moderate issues — reasonable **hand-implement** cost. + +### `material3-51954-18254` — B+ (82%), spacing-dominated + +- **Overrated:** Low probability — issue profile is **honest** about layout math risk. +- **Underrated:** If the AI is given **explicit spacing table** extracted once from Figma, difficulty drops **faster** than score implies. +- **Remedies:** Export **spacing tokens** or measurement table alongside `data.json`. + +### `done/material3-kit-2` — B+ (81%) + +- **Overrated:** **deep-nesting + sibling direction** — **B+** might **overrate** “drop in one prompt” success for junior models. +- **Underrated:** **Narrow node count** (96) — less overwhelming than `82356`. +- **Remedies:** Refactor nesting in Figma for handoff; or implement **section-by-section**. + +### `material3-56615-82356` — C (66%) + +- **Overrated:** Rare — **C** is already blunt. +- **Underrated:** If scoped to **one** leaf screen, local grade might be **much higher** — file-level **undersells** localized quality. +- **Remedies:** **Never** use as single-scope “implement all”; split into **10+** scoped fixtures. + +### `done/material3-kit-1` — A (85%) + +- **Overrated:** **Still mixed token/naming/invisible** — **A** is strong; verify with **visual-compare** for **stock** AI. +- **Underrated:** Mature kit — **repeatable patterns** help experienced prompts. 
+- **Remedies:** Keep as **positive** control next to `kit-2` / `82356`. + +### `done/simple-ds-card-grid` — B+ (80%) + +- **Overrated:** **Component/naming** weak — **B+** may **overrate** grid+card **pixel** parity without layout hints. +- **Underrated:** **Calibration history** — score may be **well tuned** for this file; trust **relative** more than absolute. +- **Remedies:** Explicit **grid columns/gap** in prompt. + +### `done/figma-ui3-kit` — B (76%), token floor + +- **Overrated:** **Overall B** with **token 21%** — **readiness for unguided AI** is **overrated** unless fonts/colors are specified in prompt. +- **Underrated:** **20 nodes** — absolute work is small; grade **undersells** “quick human polish.” +- **Remedies:** Font and color contract in README for this fixture; optional webfont load in generated HTML. + +--- + +## Raising “validity” of the metric (product direction) + +1. **Always pair** headline grade with **visual-compare** (or explicit “not run”) on the **same scoped node**. +2. **Publish** whether the run was **scoped** and **node count** in JSON (provenance) — avoids comparing whole-file vs framed scores. +3. **Calibrate** rule weights using **pixel delta categories**, not issue count alone — reduces **over/under** gap for AI use cases. +4. **Two summaries** in reports: **“handoff / DS hygiene”** vs **“likely first visual-compare (AI)”** — can diverge. + +--- + +## Revision + +Re-evaluate after major `rule-config` or rule-set changes; per-fixture grades will move. 
diff --git a/src/agents/report-generator.ts b/src/agents/report-generator.ts index a340eabf..fb805c09 100644 --- a/src/agents/report-generator.ts +++ b/src/agents/report-generator.ts @@ -15,6 +15,12 @@ export interface CalibrationReportData { validatedRules: string[]; adjustments: ScoreAdjustment[]; newRuleProposals: NewRuleProposal[]; + /** Design tree token metrics (optional — present when design-tree stats are available) */ + tokenMetrics?: { + designTreeTokens: number; + designTreeBytes: number; + tokensPerNode: number; + } | undefined; } /** @@ -47,7 +53,9 @@ function renderOverview(data: CalibrationReportData): string { | Total Issues | ${data.issueCount} | | Converted Nodes | ${data.convertedNodeCount} | | Skipped Nodes | ${data.skippedNodeCount} | -| Overall Grade | ${data.scoreReport.overall.grade} (${data.scoreReport.overall.percentage}%) | +| Overall Grade | ${data.scoreReport.overall.grade} (${data.scoreReport.overall.percentage}%) |${data.tokenMetrics ? ` +| Design Tree Tokens | ~${data.tokenMetrics.designTreeTokens.toLocaleString()} tokens (${Math.round(data.tokenMetrics.designTreeBytes / 1024)}KB) | +| Tokens per Node | ~${Math.round(data.tokenMetrics.tokensPerNode)} |` : ""} `; } diff --git a/src/cli/index.ts b/src/cli/index.ts index 21699a3c..083693fe 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -864,7 +864,6 @@ cli .action(async (input: string, options: { token?: string; output?: string; vectorDir?: string }) => { try { const { file } = await loadFile(input, options.token); - const { generateDesignTree } = await import("../core/engine/design-tree.js"); // Auto-detect vector dir from fixture path let vectorDir = options.vectorDir; @@ -876,16 +875,17 @@ cli if (existsSync(autoDir)) vectorDir = autoDir; } - const tree = generateDesignTree(file, vectorDir ? { vectorDir } : undefined); + const { generateDesignTreeWithStats } = await import("../core/engine/design-tree.js"); + const stats = generateDesignTreeWithStats(file, vectorDir ? 
{ vectorDir } : undefined); if (options.output) { const outputDir = dirname(resolve(options.output)); if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true }); const { writeFile: writeFileAsync } = await import("node:fs/promises"); - await writeFileAsync(resolve(options.output), tree, "utf-8"); - console.log(`Design tree saved: ${resolve(options.output)} (${Math.round(tree.length / 1024)}KB)`); + await writeFileAsync(resolve(options.output), stats.tree, "utf-8"); + console.log(`Design tree saved: ${resolve(options.output)} (${Math.round(stats.bytes / 1024)}KB, ~${stats.estimatedTokens} tokens)`); } else { - console.log(tree); + console.log(stats.tree); } } catch (error) { console.error("\nError:", error instanceof Error ? error.message : String(error)); diff --git a/src/core/engine/design-tree.ts b/src/core/engine/design-tree.ts index b626bf33..81e2db30 100644 --- a/src/core/engine/design-tree.ts +++ b/src/core/engine/design-tree.ts @@ -231,17 +231,37 @@ export interface DesignTreeOptions { vectorDir?: string; } +/** + * Result of generating a design tree, including token/size statistics. + */ +export interface DesignTreeResult { + /** The design tree text */ + tree: string; + /** Estimated token count (~4 chars per token for mixed code/text) */ + estimatedTokens: number; + /** Raw byte size */ + bytes: number; +} + /** * Generate a design tree string from an AnalysisFile. */ export function generateDesignTree(file: AnalysisFile, options?: DesignTreeOptions): string { + return generateDesignTreeWithStats(file, options).tree; +} + +/** + * Generate a design tree with token/size statistics. + * Use this when you need to measure token consumption for AI context budget. + */ +export function generateDesignTreeWithStats(file: AnalysisFile, options?: DesignTreeOptions): DesignTreeResult { const root = file.document; const w = root.absoluteBoundingBox ? Math.round(root.absoluteBoundingBox.width) : 0; const h = root.absoluteBoundingBox ?
Math.round(root.absoluteBoundingBox.height) : 0; const tree = renderNode(root, 0, options?.vectorDir, file.components); - return [ + const result = [ "# Design Tree", `# Root: ${w}px x ${h}px`, "# Each node shows: name (TYPE, WxH) followed by CSS-like styles", @@ -250,4 +270,10 @@ export function generateDesignTree(file: AnalysisFile, options?: DesignTreeOptio "", tree, ].join("\n"); + + const bytes = Buffer.byteLength(result, "utf-8"); + // ~4 chars per token for mixed code/text (conservative estimate) + const estimatedTokens = Math.ceil(result.length / 4); + + return { tree: result, estimatedTokens, bytes }; } From 693b4479adaff460ddc261ab5931ffc1651f6782 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:49:46 +0900 Subject: [PATCH 07/11] test: add generateDesignTreeWithStats unit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Returns tree, estimatedTokens, bytes - Output matches generateDesignTree - CLI verified: 31KB fixture → ~7912 tokens Co-Authored-By: Claude Opus 4.6 (1M context) --- src/core/engine/design-tree.test.ts | 37 ++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/src/core/engine/design-tree.test.ts b/src/core/engine/design-tree.test.ts index ee561c57..1e273c04 100644 --- a/src/core/engine/design-tree.test.ts +++ b/src/core/engine/design-tree.test.ts @@ -2,7 +2,7 @@ import { mkdtempSync, mkdirSync, writeFileSync } from "node:fs"; import { join } from "node:path"; import { tmpdir } from "node:os"; import { rm } from "node:fs/promises"; -import { generateDesignTree } from "./design-tree.js"; +import { generateDesignTree, generateDesignTreeWithStats } from "./design-tree.js"; import type { AnalysisFile, AnalysisNode } from "../contracts/figma-node.js"; function makeFile(document: AnalysisNode): AnalysisFile { @@ -24,6 +24,41 @@ function makeNode(overrides: Partial & { id: string; name: string; }; } +describe("generateDesignTreeWithStats", () => 
{ + it("returns tree string, estimatedTokens, and bytes", () => { + const file = makeFile( + makeNode({ + id: "1:1", + name: "Frame", + type: "FRAME", + absoluteBoundingBox: { x: 0, y: 0, width: 400, height: 300 }, + children: [ + makeNode({ id: "1:2", name: "Child", type: "TEXT", characters: "Hello", absoluteBoundingBox: { x: 0, y: 0, width: 100, height: 20 } }), + ], + }) + ); + + const stats = generateDesignTreeWithStats(file); + + expect(stats.tree).toContain("# Design Tree"); + expect(stats.tree).toContain("Frame (FRAME, 400x300)"); + expect(stats.bytes).toBeGreaterThan(0); + expect(stats.estimatedTokens).toBeGreaterThan(0); + expect(stats.estimatedTokens).toBe(Math.ceil(stats.tree.length / 4)); + }); + + it("tree output matches generateDesignTree", () => { + const file = makeFile( + makeNode({ id: "1:1", name: "Root", type: "FRAME" }) + ); + + const tree = generateDesignTree(file); + const stats = generateDesignTreeWithStats(file); + + expect(stats.tree).toBe(tree); + }); +}); + describe("generateDesignTree", () => { describe("header generation", () => { it("generates header with root dimensions", () => { From a146e1e571ccd4400ae923262651b10d366da16d Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:53:07 +0900 Subject: [PATCH 08/11] refactor: extract decision helpers + add JSDoc coverage - Extract normalizeDecision() and countDecisions() from duplicated String(d.decision).trim().toLowerCase() pattern (was 3x) - Add APPLIED_TYPES/REJECTED_TYPES constants - Add JSDoc to all exported interfaces/functions in changed files: run-directory.ts, visual-compare.ts, design-tree.ts, report-generator.ts, docs.ts Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/report-generator.ts | 1 + src/agents/run-directory.ts | 46 ++++++++++++++++++------------- src/cli/docs.ts | 5 ++++ src/core/engine/design-tree.ts | 1 + src/core/engine/visual-compare.ts | 2 ++ 5 files changed, 36 insertions(+), 19 deletions(-) diff --git a/src/agents/report-generator.ts 
b/src/agents/report-generator.ts index fb805c09..18fbade4 100644 --- a/src/agents/report-generator.ts +++ b/src/agents/report-generator.ts @@ -2,6 +2,7 @@ import type { ScoreReport } from "@/core/engine/scoring.js"; import type { MismatchCase } from "./contracts/evaluation-agent.js"; import type { ScoreAdjustment, NewRuleProposal } from "./contracts/tuning-agent.js"; +/** Data structure for generating calibration report markdown. */ export interface CalibrationReportData { fileKey: string; fileName: string; diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index a5906224..a1ba5a80 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -159,6 +159,7 @@ export function moveFixtureToDone(fixturePath: string, fixturesDir: string = DEF // --- Debate result parsing --- +/** A single decision from the Arbitrator in debate.json. */ export interface DebateDecision { ruleId: string; decision: string; @@ -167,6 +168,7 @@ export interface DebateDecision { reason?: string | undefined; } +/** Parsed debate.json structure from a calibration run. */ export interface DebateResult { critic: { summary: string; @@ -211,27 +213,41 @@ export function parseDebateResult(runDir: string): DebateResult | null { } } -/** - * Extract ruleIds that were applied or revised by the Arbitrator. - */ +/** Type guard for debate decision records (defensive against malformed JSON). */ function isDecisionRecord(d: unknown): d is { decision?: unknown; ruleId?: unknown } { return d !== null && typeof d === "object"; } +/** Normalize a decision field value to lowercase trimmed string. */ +function normalizeDecision(d: { decision?: unknown }): string { + return String(d.decision ?? "").trim().toLowerCase(); +} + +/** Count decisions matching a set of decision types. 
*/ +function countDecisions(decisions: unknown[], types: Set): number { + return decisions.filter((d) => { + if (!isDecisionRecord(d)) return false; + return types.has(normalizeDecision(d)); + }).length; +} + +const APPLIED_TYPES = new Set(["applied", "revised"]); +const REJECTED_TYPES = new Set(["rejected"]); + +/** + * Extract ruleIds that were applied or revised by the Arbitrator. + */ export function extractAppliedRuleIds(debate: DebateResult): string[] { if (!debate.arbitrator) return []; const decisions = debate.arbitrator.decisions; if (!Array.isArray(decisions)) return []; return decisions - .filter((d) => { - if (!isDecisionRecord(d)) return false; - const dec = String(d.decision ?? "").trim().toLowerCase(); - return dec === "applied" || dec === "revised"; - }) - .map((d) => String(d.ruleId ?? "").trim()) + .filter((d) => isDecisionRecord(d) && APPLIED_TYPES.has(normalizeDecision(d))) + .map((d) => String((d as { ruleId?: unknown }).ruleId ?? "").trim()) .filter((id) => id.length > 0); } +/** Options for convergence checking. */ export interface ConvergenceOptions { /** * When true, converged iff no applied/revised decisions (ignore rejected count). @@ -252,16 +268,8 @@ export function isConverged(runDir: string, options?: ConvergenceOptions): boole if (!debate.arbitrator) return false; const decisions = debate.arbitrator.decisions; if (!Array.isArray(decisions)) return false; - const changed = decisions.filter((d) => { - if (!isDecisionRecord(d)) return false; - const dec = String(d.decision ?? "").trim().toLowerCase(); - return dec === "applied" || dec === "revised"; - }).length; - const rejected = decisions.filter((d) => { - if (!isDecisionRecord(d)) return false; - const dec = String(d.decision ?? 
"").trim().toLowerCase(); - return dec === "rejected"; - }).length; + const changed = countDecisions(decisions, APPLIED_TYPES); + const rejected = countDecisions(decisions, REJECTED_TYPES); if (options?.lenient) { return changed === 0; } diff --git a/src/cli/docs.ts b/src/cli/docs.ts index 6804ec5e..b5c40e78 100644 --- a/src/cli/docs.ts +++ b/src/cli/docs.ts @@ -7,6 +7,7 @@ import { createRequire } from "node:module"; const require = createRequire(import.meta.url); const pkg = require("../../package.json") as { version: string }; +/** Print the docs index with available topics. */ export function printDocsIndex(): void { console.log(` CANICODE DOCUMENTATION (v${pkg.version}) @@ -19,6 +20,7 @@ Full documentation: github.com/let-sunny/canicode#readme `.trimStart()); } +/** Print the setup guide (CLI, MCP, Skills). */ export function printDocsSetup(): void { console.log(` CANICODE SETUP GUIDE @@ -102,6 +104,7 @@ CANICODE SETUP GUIDE `.trimStart()); } +/** Print custom rules guide with examples. */ export function printDocsRules(): void { console.log(` CUSTOM RULES GUIDE @@ -159,6 +162,7 @@ TIP: Ask any LLM "Write a canicode custom rule that checks X" with the `.trimStart()); } +/** Print config override guide with examples. */ export function printDocsConfig(): void { console.log(` CONFIG GUIDE @@ -276,6 +280,7 @@ const DOCS_TOPICS: Record void> = { "design-tree": printDocsDesignTree, }; +/** Route docs command to the appropriate topic handler. */ export function handleDocs(topic?: string): void { if (!topic) { printDocsIndex(); diff --git a/src/core/engine/design-tree.ts b/src/core/engine/design-tree.ts index 81e2db30..a7c378e7 100644 --- a/src/core/engine/design-tree.ts +++ b/src/core/engine/design-tree.ts @@ -226,6 +226,7 @@ function renderNode( return lines.join("\n"); } +/** Options for design tree generation. 
*/ export interface DesignTreeOptions { /** Directory containing .svg files for VECTOR nodes */ vectorDir?: string; diff --git a/src/core/engine/visual-compare.ts b/src/core/engine/visual-compare.ts index fdb3b806..62899ecb 100644 --- a/src/core/engine/visual-compare.ts +++ b/src/core/engine/visual-compare.ts @@ -8,6 +8,7 @@ import { resolve, dirname } from "node:path"; import pixelmatch from "pixelmatch"; import { PNG } from "pngjs"; +/** Result of a visual comparison between Figma design and rendered code. */ export interface VisualCompareResult { similarity: number; diffPixels: number; @@ -19,6 +20,7 @@ export interface VisualCompareResult { diffPath: string; } +/** Options for the visual comparison pipeline. */ export interface VisualCompareOptions { figmaUrl: string; figmaToken: string; From 726567904a33807f154ff734f9b3903f0d51bad8 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:56:20 +0900 Subject: [PATCH 09/11] =?UTF-8?q?fix:=20address=20PR=20#28=20review=20?= =?UTF-8?q?=E2=80=94=205=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. evidence-collector: normalize ruleId/fixture before storing (prevents bucket splitting from whitespace differences) 2. cli: validate --width/--height as positive numbers 3. visual-compare: browser.close() in finally block (prevents Chromium leak on goto/screenshot failure) 4. visual-compare: remove stale figma.png comment, keep simple cache-miss-only logic 5. 
mcp docs: sync visual-compare topic with new scale options Deferred: Zod schema for debate.json (#2), test helper extraction (#4) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/evidence-collector.ts | 6 ++-- src/cli/index.ts | 9 ++++++ src/core/engine/visual-compare.ts | 49 ++++++++++++++++--------------- src/mcp/server.ts | 9 ++++-- 4 files changed, 45 insertions(+), 28 deletions(-) diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 4f98dc61..6ea7497f 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -86,10 +86,12 @@ export function appendCalibrationEvidence( if (entries.length === 0) return; const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); // Same batch can repeat (ruleId, fixture); last entry wins (matches cross-call behavior) + // Normalize ruleId/fixture to prevent bucket splitting from whitespace differences const incomingByKey = new Map(); for (const e of entries) { - const k = `${e.ruleId.trim()}\0${e.fixture.trim()}`; - incomingByKey.set(k, e); + const normalized = { ...e, ruleId: e.ruleId.trim(), fixture: e.fixture.trim() }; + const k = `${normalized.ruleId}\0${normalized.fixture}`; + incomingByKey.set(k, normalized); } const mergedIncoming = [...incomingByKey.values()]; const keys = new Set(incomingByKey.keys()); diff --git a/src/cli/index.ts b/src/cli/index.ts index 083693fe..2855d683 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -939,6 +939,15 @@ cli process.exit(1); } + if (options.width !== undefined && (!Number.isFinite(options.width) || options.width <= 0)) { + console.error("Error: --width must be a positive number"); + process.exit(1); + } + if (options.height !== undefined && (!Number.isFinite(options.height) || options.height <= 0)) { + console.error("Error: --height must be a positive number"); + process.exit(1); + } + const hasViewportOverride = options.width !== undefined || options.height !== undefined; 
console.log("Comparing..."); diff --git a/src/core/engine/visual-compare.ts b/src/core/engine/visual-compare.ts index 62899ecb..9ac21bb7 100644 --- a/src/core/engine/visual-compare.ts +++ b/src/core/engine/visual-compare.ts @@ -149,26 +149,29 @@ export async function renderCodeScreenshot( // Dynamic import — playwright is an optional dependency const { chromium } = await import("playwright"); const browser = await chromium.launch(); - const context = await browser.newContext({ - viewport: logicalViewport, - deviceScaleFactor, - }); - const page = await context.newPage(); - - await page.goto(`file://${resolve(codePath)}`, { - waitUntil: "networkidle", - timeout: 30_000, - }); - await page.waitForTimeout(1000); - - // Capture only the first child element (the design root), not the full body/viewport - const root = page.locator("body > *:first-child"); - if (await root.count() > 0) { - await root.screenshot({ path: outputPath }); - } else { - await page.screenshot({ path: outputPath }); + try { + const context = await browser.newContext({ + viewport: logicalViewport, + deviceScaleFactor, + }); + const page = await context.newPage(); + + await page.goto(`file://${resolve(codePath)}`, { + waitUntil: "networkidle", + timeout: 30_000, + }); + await page.waitForTimeout(1000); + + // Capture only the first child element (the design root), not the full body/viewport + const root = page.locator("body > *:first-child"); + if (await root.count() > 0) { + await root.screenshot({ path: outputPath }); + } else { + await page.screenshot({ path: outputPath }); + } + } finally { + await browser.close(); } - await browser.close(); } /** @@ -262,10 +265,10 @@ export async function visualCompare(options: VisualCompareOptions): Promise | (required) | Figma URL with node-id | | --token | FIGMA_TOKEN env | Figma API token | | --output | /tmp/canicode-visual-compare | Output directory | -| --width | (from Figma screenshot) | Viewport width override | -| --height | (from Figma screenshot) | 
Viewport height override | +| --width | (inferred from Figma PNG ÷ scale) | Logical viewport width in CSS px | +| --height | (inferred from Figma PNG ÷ scale) | Logical viewport height in CSS px | +| --figma-scale | 2 | Figma export scale (matches @2x fixture PNGs) | + +Viewport and device pixel ratio are auto-inferred from the Figma PNG dimensions and export scale. Override only when needed. ## Output Files /tmp/canicode-visual-compare/ - figma.png — Figma screenshot (scale=2) + figma.png — Figma screenshot (at export scale, default @2x) code.png — Playwright render of your HTML diff.png — Pixel diff (red = different) From bd8c90ba52ba3968e223e074cd7621e8373d3b7a Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:59:14 +0900 Subject: [PATCH 10/11] chore: add GitHub issue templates (bug, feature, research) Three templates matching the patterns used in this project: - Bug: symptom + cause + fix + affected files - Feature/Refactor: background + current + proposal + prerequisites - Research: question + method + expected outcome + blockers Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/ISSUE_TEMPLATE/bug.yml | 44 ++++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature.yml | 52 +++++++++++++++++++++++++++++ .github/ISSUE_TEMPLATE/research.yml | 50 +++++++++++++++++++++++++++ 3 files changed, 146 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug.yml create mode 100644 .github/ISSUE_TEMPLATE/feature.yml create mode 100644 .github/ISSUE_TEMPLATE/research.yml diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml new file mode 100644 index 00000000..ac590d4e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug.yml @@ -0,0 +1,44 @@ +name: Bug Report +description: Something is broken or producing wrong results +labels: ["bug"] +body: + - type: textarea + id: symptom + attributes: + label: Symptom + description: What's happening? Include error messages, wrong output, or unexpected behavior. 
+ placeholder: "invisible-layer scores -10 (blocking) but hidden layers don't block implementation" + validations: + required: true + - type: textarea + id: cause + attributes: + label: Cause + description: Why is this happening? Root cause if known. + placeholder: "design-tree already skips visible:false nodes, so hidden layers have zero impact on code generation" + validations: + required: false + - type: textarea + id: fix + attributes: + label: Proposed Fix + description: How should it be fixed? + placeholder: "Change severity from blocking to suggestion, score from -10 to -1" + validations: + required: false + - type: textarea + id: files + attributes: + label: Affected Files + description: Which source files need changes? + placeholder: "src/core/rules/rule-config.ts, src/core/rules/ai-readability/index.ts" + validations: + required: false + - type: textarea + id: related + attributes: + label: Related Issues + description: Link related issues or PRs + placeholder: "#14, #15" + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/feature.yml b/.github/ISSUE_TEMPLATE/feature.yml new file mode 100644 index 00000000..cf708c3e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature.yml @@ -0,0 +1,52 @@ +name: Feature / Refactor +description: New functionality or improvement to existing code +labels: ["enhancement"] +body: + - type: textarea + id: background + attributes: + label: Background + description: Why is this needed? What problem does it solve? + placeholder: "save-fixture doesn't store component master nodes, so design-tree and analysis rules can't access component structure" + validations: + required: true + - type: textarea + id: current + attributes: + label: Current State + description: How does it work now? Include code snippets if helpful. 
+ placeholder: | + ```typescript + // Only stores component metadata (name, description) + components: { "comp:1": { name: "Button", description: "" } } + // Missing: master node tree + ``` + validations: + required: false + - type: textarea + id: proposal + attributes: + label: Proposal + description: What should change? Be specific about the approach. + placeholder: | + 1. Collect componentIds from INSTANCE nodes + 2. Fetch master nodes via getFileNodes + 3. Store in componentDefinitions field + validations: + required: true + - type: textarea + id: prerequisites + attributes: + label: Prerequisites + description: What needs to be done first? + placeholder: "#16 must be merged before this can work" + validations: + required: false + - type: textarea + id: related + attributes: + label: Related Issues + description: Link related issues or PRs + placeholder: "#14, #17" + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/research.yml b/.github/ISSUE_TEMPLATE/research.yml new file mode 100644 index 00000000..b68014d6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/research.yml @@ -0,0 +1,50 @@ +name: Research / Investigation +description: Needs exploration before implementation — API feasibility, impact measurement, etc. +labels: ["research"] +body: + - type: textarea + id: question + attributes: + label: Research Question + description: What are we trying to find out? + placeholder: "Does Figma's annotation data improve AI implementation accuracy?" + validations: + required: true + - type: textarea + id: method + attributes: + label: Method + description: How will we investigate? Include test fixtures, A/B approach, API calls, etc. + placeholder: | + A/B test: + - Test A: implement without annotations + - Test B: implement with annotations + - Compare: similarity + token consumption + validations: + required: true + - type: textarea + id: expected + attributes: + label: Expected Outcome + description: What will the result tell us? 
How will it inform next steps? + placeholder: | + - If annotations improve accuracy → add annotation-related rules + - If no difference → annotations are human-only, not AI-relevant + validations: + required: true + - type: textarea + id: blockers + attributes: + label: Blockers + description: Anything that might prevent this research? + placeholder: "REST API annotations field is private beta — may need MCP-only path" + validations: + required: false + - type: textarea + id: related + attributes: + label: Related Issues + description: Link related issues or PRs + placeholder: "#14, #20" + validations: + required: false From fdfd3e9cf1fe4cc2f3f4e573dd10a402e8e279ad Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 08:12:21 +0900 Subject: [PATCH 11/11] fix: coerce --width/--height from string before validation CAC passes CLI option values as strings. Number.isFinite("1440") returns false, rejecting valid inputs. Mirror the figmaScale pattern: coerce with Number() first, then validate. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cli/index.ts | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/cli/index.ts b/src/cli/index.ts index 2855d683..73c528c6 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -939,16 +939,20 @@ cli process.exit(1); } - if (options.width !== undefined && (!Number.isFinite(options.width) || options.width <= 0)) { + // CAC passes option values as strings — coerce to numbers before validation + const width = options.width !== undefined ? Number(options.width) : undefined; + const height = options.height !== undefined ? 
Number(options.height) : undefined; + + if (width !== undefined && (!Number.isFinite(width) || width <= 0)) { console.error("Error: --width must be a positive number"); process.exit(1); } - if (options.height !== undefined && (!Number.isFinite(options.height) || options.height <= 0)) { + if (height !== undefined && (!Number.isFinite(height) || height <= 0)) { console.error("Error: --height must be a positive number"); process.exit(1); } - const hasViewportOverride = options.width !== undefined || options.height !== undefined; + const hasViewportOverride = width !== undefined || height !== undefined; console.log("Comparing..."); const result = await visualCompare({ @@ -960,8 +964,8 @@ cli ...(hasViewportOverride ? { viewport: { - ...(options.width !== undefined ? { width: options.width } : {}), - ...(options.height !== undefined ? { height: options.height } : {}), + ...(width !== undefined ? { width } : {}), + ...(height !== undefined ? { height } : {}), }, } : {}),