let-sunny · let-sunny · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026
diff --git a/.claude/commands/add-rule.md b/.claude/commands/add-rule.md
@@ -68,9 +68,9 @@ Append to `$RUN_DIR/activity.jsonl`:
 {"step":"Implementer","timestamp":"<ISO8601>","result":"implemented rule <rule-id>","durationMs":<ms>}
 ```
 
-### Step 4 — A/B Visual Validation
+### Step 4 — A/B Validation (Visual + Token)
 
-Run an A/B comparison on the entire design to measure the rule's actual impact on pixel-perfect accuracy:
+Run an A/B comparison measuring both **pixel accuracy** and **token efficiency**:
 
 1. Extract `fileKey` and root `nodeId` from the fixture or Figma URL.
 
@@ -91,9 +91,15 @@ Run an A/B comparison on the entire design to measure the rule's actual impact o
    - Run: `npx canicode visual-compare $RUN_DIR/visual-b.html --figma-url "<figma-url-with-root-node-id>" --output $RUN_DIR/visual-b`
    - Record similarity_b
 
-5. Compare: if similarity_b > similarity_a → the rule catches something that genuinely improves implementation quality.
+5. **Visual comparison**: if similarity_b > similarity_a → the rule improves pixel accuracy.
 
-6. Record both scores for the Evaluator.
+6. **Token comparison** (always measure, even if visual diff is zero):
+   - Count lines/bytes of `visual-a.html` vs `visual-b.html`
+   - If Test B is significantly smaller → the rule reduces token consumption (important for large pages)
+   - Record `tokens_a` (estimated: file bytes / 4) and `tokens_b`
+   - Token savings ratio: `1 - tokens_b / tokens_a`
+
-6. **Token comparison** (always measure, even if visual diff is zero):
-   - Count lines/bytes of `visual-a.html` vs `visual-b.html`
-   - If Test B is significantly smaller → the rule reduces token consumption (important for large pages)
-   - Record `tokens_a` (estimated: file bytes / 4) and `tokens_b`
-   - Token savings ratio: `1 - tokens_b / tokens_a`
+6. **Token comparison** (always measure, even if visual diff is zero):
+   - Count lines/bytes of `visual-a.html` vs `visual-b.html`
+   - If Test B is significantly smaller → the rule reduces token consumption (important for large pages)
+   - Record `tokens_a` (estimated: file bytes / 4) and `tokens_b`
+   - Token savings ratio:
+     - if `tokens_a > 0`: `1 - tokens_b / tokens_a`
+     - else: `0` (or `n/a`, and mark the run as generation-failed)
-6. **Token comparison** (always measure, even if visual diff is zero):
-   - Count lines/bytes of `visual-a.html` vs `visual-b.html`
-   - If Test B is significantly smaller → the rule reduces token consumption (important for large pages)
-   - Record `tokens_a` (estimated: file bytes / 4) and `tokens_b`
-   - Token savings ratio: `1 - tokens_b / tokens_a`
+6. **Token comparison** (always measure, even if visual diff is zero):
+   - Count lines/bytes of `visual-a.html` vs `visual-b.html`
+   - If Test B is significantly smaller → the rule reduces token consumption (important for large pages)
+   - Record `tokens_a` (estimated: file bytes / 4) and `tokens_b`
+   - Token savings ratio:
+     - if `tokens_a > 0`: `1 - tokens_b / tokens_a`
+     - else: `0` (or `n/a`, and mark the run as generation-failed)
+7. Record all scores for the Evaluator: `similarity_a`, `similarity_b`, `tokens_a`, `tokens_b`.
 
 ### Step 5 — Evaluator
 

diff --git a/.claude/commands/calibrate-night.md b/.claude/commands/calibrate-night.md
@@ -37,7 +37,13 @@ After each successful run, use the CLI to check convergence and move:
 npx canicode fixture-done <fixture-path> --run-dir $RUN_DIR
 ```
 
-This checks `debate.json` for convergence (`applied=0 AND rejected=0`) and moves the fixture to `done/`. If the fixture hasn't converged, the command exits with an error — that's expected, just skip and continue.
+This checks `debate.json` for convergence (`applied=0 AND rejected=0` by default) and moves the fixture to `done/`. If the fixture hasn't converged, the command exits with an error — that's expected, just skip and continue.
+
+If the same low-confidence proposals keep getting **rejected** and nothing is applied (issue #14), you can move anyway with **`--lenient-convergence`** (converged when there are no applied/revised decisions, ignoring rejects):
+
+```bash
+npx canicode fixture-done <fixture-path> --run-dir $RUN_DIR --lenient-convergence
+```
 
 Report which fixtures were moved to `done/`.
 

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -6,6 +6,19 @@ A CLI tool that analyzes Figma design structures to provide development-friendli
 
 **Can AI implement this Figma design pixel-perfectly?** Everything in this project serves this single question. Every rule, score, category, and pipeline exists to measure and improve how accurately AI can reproduce a Figma design as code. The metric is `visual-compare` similarity (0-100%).
 
+## Target Environment
+
+The primary target is **teams with designers** where developers (+AI) implement large Figma pages:
+- **Page scale**: 300+ nodes, full screens, not small component sections
+- **Component-heavy**: Design systems with reusable components, variants, tokens
+- **AI context budget**: Large pages must fit in AI context windows — componentization reduces token count via deduplication
+- **Not the target**: Individual developers generating simple UI with AI — they don't need Figma analysis
+
+This means:
+- Component-related rule scores (missing-component, etc.) should NOT be lowered based on small fixture calibration
+- Token consumption is a first-class metric — designs that waste tokens on repeated structures are penalized
+- Calibration fixtures should include large, complex pages alongside small sections
+
 ## Tech Stack
 
 - **Runtime**: Node.js (>=18)

diff --git a/docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md b/docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md
@@ -0,0 +1,156 @@
+# Fixture analysis: validity feedback (over- vs under-estimation)
+
+This document records **subjective engineering judgment** on whether `canicode analyze` results **oversell** or **undersell** how hard it is for an AI to implement the UI **pixel-close** from the fixture, and what to do about it.
+
+It is **not** a statistical proof. It assumes: REST fixture JSON + current `rule-config` scoring, and the north-star metric **visual similarity**, not design-system purity alone.
+
+---
+
+## How to read this
+
+| Term | Meaning here |
+|------|----------------|
+| **Overrated (grade/score)** | The **readiness score looks better** than the likely **implementation + visual-compare** outcome without extra context or cleanup. |
+| **Underrated** | The **score looks worse** than how repeatable or implementable the file actually is for an AI (given a strong prompt / repeated patterns). |
+
+**Remedies** split into: **Figma / fixture hygiene**, **prompt & workflow**, **tooling (rules, scoring, scope)**.
+
+---
+
+## Cross-cutting patterns
+
+### A. Invisible layers + raw color clusters (many Simple DS fixtures)
+
+- **Overrated:** Overall **grade can look decent (A / high B)** while the tree still carries many **invisible-layer** and **raw-color** hits. Severity may keep the headline grade up, but the AI still sees **noise** (extra nodes, ambiguous fills) and may **mis-order or mis-style** visible content. **Pixel outcome** can be worse than the letter grade suggests.
+- **Underrated:** Rare for the **same** pattern; occasionally invisible only affects editor hygiene, not export — but REST + rules still complain.
+- **Remedies:**
+  - **Figma:** Delete or detach truly unused layers; replace raw values with variables where the team cares; reduce hidden decoration.
+  - **Workflow:** Feed **`design-tree`** or a **pruned node list** to the AI so it ignores known-noise IDs.
+  - **Tooling:** Consider reporting **“visible-only issues”** aggregate for AI-facing summaries; tune `invisible-layer` / `raw-color` weights after calibration for “impact on visual-compare.”
+
+### B. Unscoped large trees (Material 3 & large Simple DS)
+
+- **Overrated:** **Low grades (C, B)** can look like “this file is terrible to implement” when the real problem is **scope** — mixing many screens/components in one graph. A single scoped frame might be **B+ or A** in isolation.
+- **Underrated:** Almost the inverse: **one bad subtree** can dominate issue counts; the rest of the file might be fine — the **average** undervalues “good parts.”
+- **Remedies:**
+  - **Fixture:** Re-`save-fixture` with a **single `node-id`** per calibration / AI task.
+  - **CLI:** Always analyze with the **same scope** you ask the AI to implement.
+  - **Tooling:** Surface **root-scoped score** vs **whole-file score** in reports when unscoped.
+
+### C. Design-system spacing / fixed-width spam (`material3-52949-27916`, `material3-51954-18254`)
+
+- **Overrated:** Less common; if spacing issues are **systematic**, fixing one pattern fixes hundreds — headline pain can be **overstated** for a **template-aware** AI.
+- **Underrated:** **Underrated for “first-try” AI** without a grid spec — hundred of **inconsistent-spacing** / **fixed-width** hits mean **lots of opportunity for pixel drift**; visual-compare will punish. Score may feel “only B” but failure modes are many.
+- **Remedies:**
+  - **Prompt:** State **grid base (e.g. 4px)**, max width, breakpoint behavior explicitly.
+  - **Figma:** Normalize spacing to the grid; reduce fixed widths or mark responsive intent.
+  - **Tooling:** Calibration tied to **visual-compare** on these fixtures so spacing rules track **pixel deltas**.
+
+### D. Numeric / generic naming (Material 3 community)
+
+- **Overrated:** Score can look **OK on layout** while **ai-readability / naming** tanks — sometimes **underweighted** in how much it confuses **less capable** models (variable names, component names in code).
+- **Underrated:** For **vision-heavy** codegen (screenshot + structure), **bad names** matter less — score may **undersell** “actually buildable if you ignore names.”
+- **Remedies:**
+  - **Figma:** Rename critical interactive nodes; keep DS naming convention.
+  - **Prompt:** “Map Figma node IDs to React components; names are not authoritative.”
+  - **Tooling:** Separate **“semantic naming score”** from **implementation difficulty** in user-facing copy.
+
+### E. `figma-ui3-kit` (token category very weak, few nodes)
+
+- **Overrated:** **Overall B** might **overstate** readiness: **token ~21%** means heavy **raw-color/raw-font** — first **visual-compare** often fails on **font and fill** unless explicitly mocked.
+- **Underrated:** Small **node count** — fast iteration; score may **feel harsh** vs effort to **manually** match a tiny scope.
+- **Remedies:**
+  - **Workflow:** Predeclare **font stack** and **color substitution** in the prompt to absorb raw-font/raw-color.
+  - **Tooling:** Optional “**compare mode: ignore typography**” (dangerous but documented) for layout-only checks.
+
+---
+
+## Per-fixture notes (snapshot-oriented)
+
+Names refer to directories under `fixtures/` or `fixtures/done/`. Grades refer to the last batch analyze (v0.8.9); re-run after config changes.
+
+### `done/simple-ds-page-sections` — A (86%)
+
+- **Overrated:** **Token/component** weakness (raw color, missing descriptions, invisible layers) is **under-reflected** in a single letter “A” for **naive** AI codegen without DS context.
+- **Underrated:** **Small, shallow tree** — easier to reason about than bulk Simple DS fixtures; score is not “too kind” to structure.
+- **Remedies:** Add **component description** stub in prompt; clean invisible layers; run **visual-compare** once to validate the A.
+
+### `material3-56615-45927` — C+ (72%), large unscoped
+
+- **Overrated:** Unlikely at file level — **C+** already warns; per-frame might be better.
+- **Underrated:** Possible if user only implements **one** repeated component — issue count is **file-wide**.
+- **Remedies:** **Mandatory scoping**; treat as multiple fixtures.
+
+### `simple-ds-175-9106` / `175-8591` / `175-7790` / `562-9518` — B ~ B+
+
+- **Overrated:** **raw-color + invisible-layer** volume — **B/B+** may **overrate** first-shot pixel match.
+- **Underrated:** **Detach/instance** and **default-name** sometimes fixable with a strong “use Figma structure as source of truth” prompt.
+- **Remedies:** Scope; variable cleanup; explicit **instance** handling in prompt.
+
+### `simple-ds-4333-9262` — A (89%)
+
+- **Overrated:** Still **missing-component-description** and spacing — “A” is **fragile** for **fully autonomous** AI without extra text context.
+- **Underrated:** Among the **cleaner** Simple DS slices — good **golden** candidate; score may be **fair or slightly harsh** if issues are mostly doc/metadata.
+- **Remedies:** Use as **benchmark**; add descriptions only if product needs them for handoff.
+
+### `material3-52949-27916` — B (79%), 400+ issues
+
+- **Overrated:** Whole-file **B** might **overrate** “one-shot implement whole thing.”
+- **Underrated:** For **systematic** spacing tokens, a **macro prompt** might achieve **ok** similarity on repetitive regions — composite grade **undersells** repeatability.
+- **Remedies:** Split fixture; calibrate spacing rules on **scoped** chunks.
+
+### `done/simple-ds-panel-sections` — A (87%)
+
+- **Overrated:** Same invisible/raw-color cluster as other Simple DS **done** sets — **A** vs **pixel** needs verification.
+- **Underrated:** Low depth, moderate issues — reasonable **hand-implement** cost.
+
+### `material3-51954-18254` — B+ (82%), spacing-dominated
+
+- **Overrated:** Low probability — issue profile is **honest** about layout math risk.
+- **Underrated:** If the AI is given **explicit spacing table** extracted once from Figma, difficulty drops **faster** than score implies.
+- **Remedies:** Export **spacing tokens** or measurement table alongside `data.json`.
+
+### `done/material3-kit-2` — B+ (81%)
+
+- **Overrated:** **deep-nesting + sibling direction** — **B+** might **overrate** “drop in one prompt” success for junior models.
+- **Underrated:** **Narrow node count** (96) — less overwhelming than `82356`.
+- **Remedies:** Refactor nesting in Figma for handoff; or implement **section-by-section**.
+
+### `material3-56615-82356` — C (66%)
+
+- **Overrated:** Rare — **C** is already blunt.
+- **Underrated:** If scoped to **one** leaf screen, local grade might be **much higher** — file-level **undersells** localized quality.
+- **Remedies:** **Never** use as single-scope “implement all”; split into **10+** scoped fixtures.
+
+### `done/material3-kit-1` — A (85%)
+
+- **Overrated:** **Still mixed token/naming/invisible** — **A** is strong; verify with **visual-compare** for **stock** AI.
+- **Underrated:** Mature kit — **repeatable patterns** help experienced prompts.
+- **Remedies:** Keep as **positive** control next to `kit-2` / `82356`.
+
+### `done/simple-ds-card-grid` — B+ (80%)
+
+- **Overrated:** **Component/naming** weak — **B+** may **overrate** grid+card **pixel** parity without layout hints.
+- **Underrated:** **Calibration history** — score may be **well tuned** for this file; trust **relative** more than absolute.
+- **Remedies:** Explicit **grid columns/gap** in prompt.
+
+### `done/figma-ui3-kit` — B (76%), token floor
+
+- **Overrated:** **Overall B** with **token 21%** — **readiness for unguided AI** is **overrated** unless fonts/colors are specified in prompt.
+- **Underrated:** **20 nodes** — absolute work is small; grade **undersells** “quick human polish.”
+- **Remedies:** Font and color contract in README for this fixture; optional webfont load in generated HTML.
+
+---
+
+## Raising “validity” of the metric (product direction)
+
+1. **Always pair** headline grade with **visual-compare** (or explicit “not run”) on the **same scoped node**.
+2. **Publish** whether the run was **scoped** and **node count** in JSON (provenance) — avoids comparing whole-file vs framed scores.
+3. **Calibrate** rule weights using **pixel delta categories**, not issue count alone — reduces **over/under** gap for AI use cases.
+4. **Two summaries** in reports: **“handoff / DS hygiene”** vs **“likely first visual-compare (AI)”** — can diverge.
+
+---
+
+## Revision
+
+Re-evaluate after major `rule-config` or rule-set changes; per-fixture grades will move.
diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts
@@ -102,6 +102,31 @@ describe("evidence-collector", () => {
       expect(raw).toHaveLength(2);
     });
 
+    it("replaces prior entry for same ruleId and fixture (latest run wins)", () => {
+      writeFileSync(calPath, JSON.stringify([
+        { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" },
+      ]), "utf-8");
+
+      appendCalibrationEvidence([
+        { ruleId: "rule-a", type: "underscored", actualDifficulty: "hard", fixture: "fx1", timestamp: "t2" },
+      ], calPath);
+
+      const raw = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[];
+      expect(raw).toHaveLength(1);
+      expect(raw[0]!.type).toBe("underscored");
+    });
+
+    it("dedupes within a single append call (last row for same ruleId and fixture wins)", () => {
+      appendCalibrationEvidence([
+        { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" },
+        { ruleId: "rule-a", type: "underscored", actualDifficulty: "hard", fixture: "fx1", timestamp: "t2" },
+      ], calPath);
+
+      const raw = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[];
+      expect(raw).toHaveLength(1);
+      expect(raw[0]!.type).toBe("underscored");
+    });
+
     it("does nothing for empty entries", () => {
       writeFileSync(calPath, "[]", "utf-8");
       appendCalibrationEvidence([], calPath);
@@ -126,6 +151,17 @@ describe("evidence-collector", () => {
       expect(raw[0]!.ruleId).toBe("rule-b");
     });
 
+    it("trims ruleIds when matching stored entries", () => {
+      writeFileSync(calPath, JSON.stringify([
+        { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" },
+      ]), "utf-8");
+
+      pruneCalibrationEvidence(["  rule-a  "], calPath);
+
+      const raw = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[];
+      expect(raw).toHaveLength(0);
+    });
+
     it("does nothing for empty ruleIds", () => {
       const entries: CalibrationEvidenceEntry[] = [
         { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" },

diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts
@@ -85,8 +85,21 @@ export function appendCalibrationEvidence(
 ): void {
   if (entries.length === 0) return;
   const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema);
-  existing.push(...entries);
-  writeJsonArray(evidencePath, existing);
+  // Same batch can repeat (ruleId, fixture); last entry wins (matches cross-call behavior)
+  // Normalize ruleId/fixture to prevent bucket splitting from whitespace differences
+  const incomingByKey = new Map<string, CalibrationEvidenceEntry>();
+  for (const e of entries) {
+    const normalized = { ...e, ruleId: e.ruleId.trim(), fixture: e.fixture.trim() };
+    const k = `${normalized.ruleId}\0${normalized.fixture}`;
+    incomingByKey.set(k, normalized);
+  }
+  const mergedIncoming = [...incomingByKey.values()];
+  const keys = new Set(incomingByKey.keys());
+  const withoutDupes = existing.filter(
+    (e) => !keys.has(`${e.ruleId.trim()}\0${e.fixture.trim()}`),
+  );
+  withoutDupes.push(...mergedIncoming);
+  writeJsonArray(evidencePath, withoutDupes);
 }
 
 /**
@@ -97,9 +110,9 @@ export function pruneCalibrationEvidence(
   evidencePath: string = DEFAULT_CALIBRATION_PATH
 ): void {
   if (appliedRuleIds.length === 0) return;
-  const ruleSet = new Set(appliedRuleIds);
+  const ruleSet = new Set(appliedRuleIds.map((id) => id.trim()).filter((id) => id.length > 0));
   const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema);
-  const pruned = existing.filter((e) => !ruleSet.has(e.ruleId));
+  const pruned = existing.filter((e) => !ruleSet.has(e.ruleId.trim()));
   writeJsonArray(evidencePath, pruned);
 }
 

diff --git a/src/agents/report-generator.ts b/src/agents/report-generator.ts
@@ -2,6 +2,7 @@ import type { ScoreReport } from "@/core/engine/scoring.js";
 import type { MismatchCase } from "./contracts/evaluation-agent.js";
 import type { ScoreAdjustment, NewRuleProposal } from "./contracts/tuning-agent.js";
 
+/** Data structure for generating calibration report markdown. */
 export interface CalibrationReportData {
   fileKey: string;
   fileName: string;
@@ -15,6 +16,12 @@ export interface CalibrationReportData {
   validatedRules: string[];
   adjustments: ScoreAdjustment[];
   newRuleProposals: NewRuleProposal[];
+  /** Design tree token metrics (optional — present when design-tree stats are available) */
+  tokenMetrics?: {
+    designTreeTokens: number;
+    designTreeBytes: number;
+    tokensPerNode: number;
+  } | undefined;
-  /** Design tree token metrics (optional — present when design-tree stats are available) */
-  tokenMetrics?: {
-    designTreeTokens: number;
-    designTreeBytes: number;
-    tokensPerNode: number;
-  } | undefined;
+  /** Design tree token metrics (optional — present when design-tree stats are available) */
+  tokenMetrics?: {
+    designTreeTokens: number;
+    designTreeBytes: number;
+    tokensPerNode: number;
+  };
-  /** Design tree token metrics (optional — present when design-tree stats are available) */
-  tokenMetrics?: {
-    designTreeTokens: number;
-    designTreeBytes: number;
-    tokensPerNode: number;
-  } | undefined;
+  /** Design tree token metrics (optional — present when design-tree stats are available) */
+  tokenMetrics?: {
+    designTreeTokens: number;
+    designTreeBytes: number;
+    tokensPerNode: number;
+  };
 }
 
 /**
@@ -47,7 +54,9 @@ function renderOverview(data: CalibrationReportData): string {
 | Total Issues | ${data.issueCount} |
 | Converted Nodes | ${data.convertedNodeCount} |
 | Skipped Nodes | ${data.skippedNodeCount} |
-| Overall Grade | ${data.scoreReport.overall.grade} (${data.scoreReport.overall.percentage}%) |
+| Overall Grade | ${data.scoreReport.overall.grade} (${data.scoreReport.overall.percentage}%) |${data.tokenMetrics ? `
+| Design Tree Tokens | ~${data.tokenMetrics.designTreeTokens.toLocaleString()} tokens (${Math.round(data.tokenMetrics.designTreeBytes / 1024)}KB) |
+| Tokens per Node | ~${Math.round(data.tokenMetrics.tokensPerNode)} |` : ""}
 `;
 }