From 4f9f22cbc0db37400fae631713b55bda17a50925 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 01:21:07 +0900 Subject: [PATCH 01/11] fix: calibration pipeline retina compare, evidence prune, lenient convergence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - visual-compare: default Figma export scale 2, logical viewport + deviceScaleFactor so code.png matches @2x figma.png; CLI omits 1440×900 override; --figma-scale - Figma image cache keys include scale; MCP visual-compare accepts figmaExportScale - extractAppliedRuleIds / pruneCalibrationEvidence: trim + case-insensitive decisions - appendCalibrationEvidence: replace same (ruleId, fixture) on re-run - fixture-done --lenient-convergence + isConverged({ lenient }) for reject-only stalls - Document lenient flag in calibrate-night command Closes #14 --- .claude/commands/calibrate-night.md | 8 ++- src/agents/evidence-collector.test.ts | 25 ++++++++ src/agents/evidence-collector.ts | 10 +-- src/agents/run-directory.test.ts | 59 ++++++++++++++++- src/agents/run-directory.ts | 37 ++++++++--- src/cli/docs.ts | 15 +++-- src/cli/index.ts | 43 ++++++++++--- src/core/engine/visual-compare.test.ts | 18 +++--- src/core/engine/visual-compare.ts | 89 ++++++++++++++++++++++---- src/mcp/server.ts | 6 +- 10 files changed, 257 insertions(+), 53 deletions(-) diff --git a/.claude/commands/calibrate-night.md b/.claude/commands/calibrate-night.md index 4dec57e7..6a3ed458 100644 --- a/.claude/commands/calibrate-night.md +++ b/.claude/commands/calibrate-night.md @@ -37,7 +37,13 @@ After each successful run, use the CLI to check convergence and move: npx canicode fixture-done --run-dir $RUN_DIR ``` -This checks `debate.json` for convergence (`applied=0 AND rejected=0`) and moves the fixture to `done/`. If the fixture hasn't converged, the command exits with an error — that's expected, just skip and continue. 
+This checks `debate.json` for convergence (`applied=0 AND rejected=0` by default) and moves the fixture to `done/`. If the fixture hasn't converged, the command exits with an error — that's expected, just skip and continue. + +If the same low-confidence proposals keep getting **rejected** and nothing is applied (issue #14), you can move anyway with **`--lenient-convergence`** (converged when there are no applied/revised decisions, ignoring rejects): + +```bash +npx canicode fixture-done --run-dir $RUN_DIR --lenient-convergence +``` Report which fixtures were moved to `done/`. diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index fb44e813..70b5ec86 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -102,6 +102,20 @@ describe("evidence-collector", () => { expect(raw).toHaveLength(2); }); + it("replaces prior entry for same ruleId and fixture (latest run wins)", () => { + writeFileSync(calPath, JSON.stringify([ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + ]), "utf-8"); + + appendCalibrationEvidence([ + { ruleId: "rule-a", type: "underscored", actualDifficulty: "hard", fixture: "fx1", timestamp: "t2" }, + ], calPath); + + const raw = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + expect(raw).toHaveLength(1); + expect(raw[0]!.type).toBe("underscored"); + }); + it("does nothing for empty entries", () => { writeFileSync(calPath, "[]", "utf-8"); appendCalibrationEvidence([], calPath); @@ -126,6 +140,17 @@ describe("evidence-collector", () => { expect(raw[0]!.ruleId).toBe("rule-b"); }); + it("trims ruleIds when matching stored entries", () => { + writeFileSync(calPath, JSON.stringify([ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + ]), "utf-8"); + + pruneCalibrationEvidence([" rule-a "], calPath); + + const raw = 
JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + expect(raw).toHaveLength(0); + }); + it("does nothing for empty ruleIds", () => { const entries: CalibrationEvidenceEntry[] = [ { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index ed05b9bc..aa3e0e46 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -85,8 +85,10 @@ export function appendCalibrationEvidence( ): void { if (entries.length === 0) return; const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); - existing.push(...entries); - writeJsonArray(evidencePath, existing); + const keys = new Set(entries.map((e) => `${e.ruleId.trim()}\0${e.fixture}`)); + const withoutDupes = existing.filter((e) => !keys.has(`${e.ruleId.trim()}\0${e.fixture}`)); + withoutDupes.push(...entries); + writeJsonArray(evidencePath, withoutDupes); } /** @@ -97,9 +99,9 @@ export function pruneCalibrationEvidence( evidencePath: string = DEFAULT_CALIBRATION_PATH ): void { if (appliedRuleIds.length === 0) return; - const ruleSet = new Set(appliedRuleIds); + const ruleSet = new Set(appliedRuleIds.map((id) => id.trim()).filter((id) => id.length > 0)); const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); - const pruned = existing.filter((e) => !ruleSet.has(e.ruleId)); + const pruned = existing.filter((e) => !ruleSet.has(e.ruleId.trim())); writeJsonArray(evidencePath, pruned); } diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts index d02bd2b8..31321666 100644 --- a/src/agents/run-directory.test.ts +++ b/src/agents/run-directory.test.ts @@ -1,4 +1,4 @@ -import { mkdtempSync, existsSync } from "node:fs"; +import { mkdtempSync, existsSync, writeFileSync } from "node:fs"; import { join, basename } from "node:path"; import { tmpdir } from "node:os"; import { rm } from 
"node:fs/promises"; @@ -8,6 +8,8 @@ import { createCalibrationRunDir, createRuleDiscoveryRunDir, listCalibrationRuns, + extractAppliedRuleIds, + isConverged, } from "./run-directory.js"; describe("extractFixtureName", () => { @@ -99,6 +101,61 @@ describe("createRuleDiscoveryRunDir", () => { }); }); +describe("extractAppliedRuleIds", () => { + it("trims ruleId and normalizes decision casing", () => { + const ids = extractAppliedRuleIds({ + critic: null, + arbitrator: { + summary: "test", + decisions: [ + { ruleId: " my-rule ", decision: "Applied" }, + { ruleId: "b", decision: "REVISED" }, + { ruleId: "c", decision: "rejected" }, + ], + }, + }); + expect(ids).toEqual(["my-rule", "b"]); + }); +}); + +describe("isConverged", () => { + let tempDir: string; + + beforeEach(() => { + tempDir = mkdtempSync(join(tmpdir(), "converge-test-")); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it("strict: false when rejections remain with no applies", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "applied=0 rejected=1", + decisions: [{ ruleId: "x", decision: "rejected" }], + }, + }), + ); + expect(isConverged(tempDir)).toBe(false); + }); + + it("lenient: true when no applied/revised even if rejected", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "applied=0 rejected=1", + decisions: [{ ruleId: "x", decision: "rejected" }], + }, + }), + ); + expect(isConverged(tempDir, { lenient: true })).toBe(true); + }); +}); + describe("listCalibrationRuns", () => { const origCwd = process.cwd(); let tempDir: string; diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index 34ae9776..bebdad31 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -210,22 +210,43 @@ export function parseDebateResult(runDir: string): DebateResult | null { export function extractAppliedRuleIds(debate: 
DebateResult): string[] { if (!debate.arbitrator) return []; return debate.arbitrator.decisions - .filter((d) => d.decision === "applied" || d.decision === "revised") - .map((d) => d.ruleId); + .filter((d) => { + const dec = (d.decision ?? "").trim().toLowerCase(); + return dec === "applied" || dec === "revised"; + }) + .map((d) => d.ruleId.trim()) + .filter((id) => id.length > 0); +} + +export interface ConvergenceOptions { + /** + * When true, converged iff no applied/revised decisions (ignore rejected count). + * Use when repeated reject loops block `fixture-done` but scores are stable (see issue #14). + */ + lenient?: boolean | undefined; } /** * Check if a calibration run has converged. - * Converged = debate exists AND applied=0 AND rejected=0. - * This means all proposals were validated — no changes and no disagreements. + * Strict: no applied/revised AND no rejected decisions. + * Lenient: no applied/revised only (rejected proposals allowed). */ -export function isConverged(runDir: string): boolean { +export function isConverged(runDir: string, options?: ConvergenceOptions): boolean { const debate = parseDebateResult(runDir); if (!debate) return false; if (debate.skipped) return true; // zero proposals = converged if (!debate.arbitrator) return false; const decisions = debate.arbitrator.decisions; - const applied = decisions.filter((d) => d.decision === "applied" || d.decision === "revised").length; - const rejected = decisions.filter((d) => d.decision === "rejected").length; - return applied === 0 && rejected === 0; + const changed = decisions.filter((d) => { + const dec = (d.decision ?? "").trim().toLowerCase(); + return dec === "applied" || dec === "revised"; + }).length; + const rejected = decisions.filter((d) => { + const dec = (d.decision ?? 
"").trim().toLowerCase(); + return dec === "rejected"; + }).length; + if (options?.lenient) { + return changed === 0; + } + return changed === 0 && rejected === 0; } diff --git a/src/cli/docs.ts b/src/cli/docs.ts index 92a7a563..6804ec5e 100644 --- a/src/cli/docs.ts +++ b/src/cli/docs.ts @@ -203,13 +203,14 @@ OPTIONS --figma-url Figma URL with node-id (required) --token Figma API token (or FIGMA_TOKEN env var) --output Output directory (default: /tmp/canicode-visual-compare) - --width Viewport width override (default: match Figma screenshot) - --height Viewport height override (default: match Figma screenshot) + --width Logical viewport width in CSS px (omit = infer from Figma PNG ÷ export scale) + --height Logical viewport height in CSS px (omit = infer from Figma PNG ÷ export scale) + --figma-scale Figma Images API scale (default: 2, matches save-fixture @2x PNGs) OUTPUT FILES /tmp/canicode-visual-compare/ - figma.png Figma screenshot (scale=2) - code.png Playwright render of your HTML + figma.png Figma screenshot (default scale=2) + code.png Playwright render (devicePixelRatio matched to export scale) diff.png Pixel diff (red = different pixels) JSON OUTPUT (stdout) @@ -225,9 +226,9 @@ JSON OUTPUT (stdout) } HOW IT WORKS - 1. Fetches Figma screenshot via REST API (scale=2) - 2. Reads screenshot dimensions - 3. Renders your HTML with Playwright at the same viewport size + 1. Fetches Figma screenshot via REST API (default scale=2) + 2. Infers logical CSS viewport (PNG size ÷ scale) unless --width/--height are set + 3. Renders HTML in Playwright with matching devicePixelRatio so code.png matches figma.png pixels 4. Compares pixel-by-pixel with pixelmatch (threshold: 0.1) 5. 
Returns similarity % and diff image diff --git a/src/cli/index.ts b/src/cli/index.ts index 4c8e8ea6..21699a3c 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -607,13 +607,23 @@ cli .option("--fixtures-dir ", "Fixtures root directory", { default: "fixtures" }) .option("--force", "Skip convergence check") .option("--run-dir ", "Run directory to check for convergence") - .action((fixturePath: string, options: { fixturesDir?: string; force?: boolean; runDir?: string }) => { + .option( + "--lenient-convergence", + "Converged when no applied/revised decisions (ignore rejected; see calibration issue #14)" + ) + .action( + (fixturePath: string, options: { + fixturesDir?: string; + force?: boolean; + runDir?: string; + lenientConvergence?: boolean; + }) => { if (!options.force) { if (!options.runDir) { console.error("Error: --run-dir required to check convergence (or use --force to skip check)"); process.exit(1); } - if (!isConverged(resolve(options.runDir))) { + if (!isConverged(resolve(options.runDir), { lenient: options.lenientConvergence })) { const debate = parseDebateResult(resolve(options.runDir)); const summary = debate?.arbitrator?.summary ?? debate?.skipped ?? "no debate.json found"; console.error(`Error: fixture has not converged (${summary}). 
Use --force to override.`); @@ -903,10 +913,11 @@ cli .option("--figma-url ", "Figma URL with node-id (required)") .option("--token ", "Figma API token (or use FIGMA_TOKEN env var)") .option("--output ", "Output directory for screenshots and diff (default: /tmp/canicode-visual-compare)") - .option("--width ", "Viewport width (default: 1440)") - .option("--height ", "Viewport height (default: 900)") + .option("--width ", "Logical viewport width in CSS px (default: infer from Figma PNG ÷ export scale)") + .option("--height ", "Logical viewport height in CSS px (default: infer from Figma PNG ÷ export scale)") + .option("--figma-scale ", "Figma export scale (default: 2, matches save-fixture / @2x PNGs)") .example(" canicode visual-compare ./generated/index.html --figma-url 'https://www.figma.com/design/ABC/File?node-id=1-234'") - .action(async (codePath: string, options: VisualCompareOptions) => { + .action(async (codePath: string, options: VisualCompareOptions & { figmaScale?: string }) => { try { if (!options.figmaUrl) { console.error("Error: --figma-url is required"); @@ -921,16 +932,30 @@ cli const { visualCompare } = await import("../core/engine/visual-compare.js"); + const exportScale = + options.figmaScale !== undefined ? Number(options.figmaScale) : undefined; + if (exportScale !== undefined && (!Number.isFinite(exportScale) || exportScale < 1)) { + console.error("Error: --figma-scale must be a number >= 1"); + process.exit(1); + } + + const hasViewportOverride = options.width !== undefined || options.height !== undefined; + console.log("Comparing..."); const result = await visualCompare({ figmaUrl: options.figmaUrl, figmaToken: token, codePath: resolve(codePath), outputDir: options.output, - viewport: { - width: options.width ?? 1440, - height: options.height ?? 900, - }, + ...(exportScale !== undefined ? { figmaExportScale: exportScale } : {}), + ...(hasViewportOverride + ? { + viewport: { + ...(options.width !== undefined ? 
{ width: options.width } : {}), + ...(options.height !== undefined ? { height: options.height } : {}), + }, + } + : {}), }); // JSON output for programmatic use diff --git a/src/core/engine/visual-compare.test.ts b/src/core/engine/visual-compare.test.ts index 99eec658..3a1e15c4 100644 --- a/src/core/engine/visual-compare.test.ts +++ b/src/core/engine/visual-compare.test.ts @@ -38,9 +38,9 @@ import pixelmatch from "pixelmatch"; const FIGMA_CACHE_DIR = "/tmp/canicode-figma-cache"; /** Mirror of the private getFigmaCachePath in visual-compare.ts */ -function getFigmaCachePath(fileKey: string, nodeId: string): string { +function getFigmaCachePath(fileKey: string, nodeId: string, scale: number = 2): string { const safeNodeId = nodeId.replace(/:/g, "-"); - return resolve(FIGMA_CACHE_DIR, `${fileKey}_${safeNodeId}.png`); + return resolve(FIGMA_CACHE_DIR, `${fileKey}_${safeNodeId}@${scale}x.png`); } /** Mirror of the private isCacheFresh in visual-compare.ts */ @@ -133,24 +133,24 @@ function makeSolidPng(width: number, height: number, r: number, g: number, b: nu describe("getFigmaCachePath", () => { it("places the cache file inside FIGMA_CACHE_DIR", () => { - const path = getFigmaCachePath("abc123", "1:2"); + const path = getFigmaCachePath("abc123", "1:2", 2); expect(path.startsWith(FIGMA_CACHE_DIR)).toBe(true); }); it("replaces colons in nodeId with hyphens", () => { - const path = getFigmaCachePath("file", "10:20"); + const path = getFigmaCachePath("file", "10:20", 2); expect(path).toContain("10-20"); expect(path).not.toContain("10:20"); }); - it("combines fileKey and sanitized nodeId in the filename", () => { - const path = getFigmaCachePath("myfile", "3:45"); - expect(path).toContain("myfile_3-45.png"); + it("combines fileKey, sanitized nodeId, and scale in the filename", () => { + const path = getFigmaCachePath("myfile", "3:45", 2); + expect(path).toContain("myfile_3-45@2x.png"); }); it("handles nodeId without colons unchanged", () => { - const path = 
getFigmaCachePath("key", "node123"); - expect(path).toContain("key_node123.png"); + const path = getFigmaCachePath("key", "node123", 2); + expect(path).toContain("key_node123@2x.png"); }); }); diff --git a/src/core/engine/visual-compare.ts b/src/core/engine/visual-compare.ts index 7f525fde..12029fec 100644 --- a/src/core/engine/visual-compare.ts +++ b/src/core/engine/visual-compare.ts @@ -24,7 +24,16 @@ export interface VisualCompareOptions { figmaToken: string; codePath: string; outputDir?: string | undefined; - viewport?: { width: number; height: number } | undefined; + /** + * Logical CSS viewport (CSS pixels). Omit a dimension to infer from the Figma PNG + * using `figmaExportScale`. When the whole object is omitted, both dimensions are inferred. + */ + viewport?: { width?: number; height?: number } | undefined; + /** + * Figma Images API `scale` and assumed scale for fixture `figma.png` (e.g. from `save-fixture`). + * Default 2 matches REST exports and avoids comparing a @2x PNG against a 1× Playwright capture. + */ + figmaExportScale?: number | undefined; } const FIGMA_CACHE_DIR = "/tmp/canicode-figma-cache"; @@ -33,10 +42,10 @@ const FIGMA_CACHE_TTL_MS = 60 * 60 * 1000; // 1 hour /** * Get the cache path for a given fileKey + nodeId combination. 
*/ -function getFigmaCachePath(fileKey: string, nodeId: string): string { +function getFigmaCachePath(fileKey: string, nodeId: string, scale: number): string { // Sanitize nodeId for use as filename (replace : with -) const safeNodeId = nodeId.replace(/:/g, "-"); - return resolve(FIGMA_CACHE_DIR, `${fileKey}_${safeNodeId}.png`); + return resolve(FIGMA_CACHE_DIR, `${fileKey}_${safeNodeId}@${scale}x.png`); } /** @@ -57,8 +66,9 @@ async function fetchFigmaScreenshot( nodeId: string, token: string, outputPath: string, + scale: number, ): Promise { - const cachePath = getFigmaCachePath(fileKey, nodeId); + const cachePath = getFigmaCachePath(fileKey, nodeId, scale); // Return cached version if fresh if (isCacheFresh(cachePath)) { @@ -68,7 +78,7 @@ async function fetchFigmaScreenshot( } const res = await fetch( - `https://api.figma.com/v1/images/${fileKey}?ids=${nodeId}&format=png&scale=1`, + `https://api.figma.com/v1/images/${fileKey}?ids=${nodeId}&format=png&scale=${scale}`, { headers: { "X-Figma-Token": token } }, ); if (!res.ok) throw new Error(`Figma Images API: ${res.status} ${res.statusText}`); @@ -91,18 +101,45 @@ async function fetchFigmaScreenshot( writeFileSync(cachePath, buffer); } +/** + * Infer device pixel ratio so the Playwright screenshot matches Figma PNG pixel dimensions. + */ +function inferDeviceScaleFactor( + pngW: number, + pngH: number, + logicalW: number, + logicalH: number, + fallback: number, +): number { + if (logicalW <= 0 || logicalH <= 0) return 1; + const sx = pngW / logicalW; + const sy = pngH / logicalH; + const rounded = Math.round((sx + sy) / 2); + if (rounded >= 2 && Math.abs(sx - rounded) < 0.08 && Math.abs(sy - rounded) < 0.08) { + return rounded; + } + if (Math.abs(sx - 1) < 0.02 && Math.abs(sy - 1) < 0.02) return 1; + return fallback >= 2 ? fallback : Math.max(1, Math.round(sx)); +} + /** * Render HTML file with Playwright and take a screenshot. 
+ * @param deviceScaleFactor - Pass 2 when the Figma reference is @2x and `viewport` is logical CSS size. */ export async function renderCodeScreenshot( codePath: string, outputPath: string, - viewport: { width: number; height: number }, + logicalViewport: { width: number; height: number }, + deviceScaleFactor: number = 1, ): Promise { // Dynamic import — playwright is an optional dependency const { chromium } = await import("playwright"); const browser = await chromium.launch(); - const page = await browser.newPage({ viewport }); + const context = await browser.newContext({ + viewport: logicalViewport, + deviceScaleFactor, + }); + const page = await context.newPage(); await page.goto(`file://${resolve(codePath)}`, { waitUntil: "networkidle", @@ -209,22 +246,50 @@ export async function visualCompare(options: VisualCompareOptions): Promise { + async ({ codePath, figmaUrl, token, outputDir, figmaExportScale }) => { try { const { visualCompare } = await import("../core/engine/visual-compare.js"); const figmaToken = token ?? process.env["FIGMA_TOKEN"]; @@ -447,6 +448,7 @@ Requires: Playwright with Chromium installed, Figma API token.`, figmaToken, codePath, outputDir, + ...(figmaExportScale !== undefined ? 
{ figmaExportScale } : {}), }); return { From 7bc939c0ae090188dfd6e5953dd689ff50401501 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 01:28:22 +0900 Subject: [PATCH 02/11] fix(agents): harden debate parsing and evidence dedupe keys - Guard extractAppliedRuleIds and isConverged when arbitrator.decisions is not an array - Use trimmed fixture in calibration evidence dedupe key - Add tests for malformed debate.json - Add docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md (calibration & rule discovery) - Align visual-compare.test.ts comment with padPng behavior Made-with: Cursor --- ...EARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md | 201 ++++++++++++++++++ src/agents/evidence-collector.ts | 6 +- src/agents/run-directory.test.ts | 21 ++ src/agents/run-directory.ts | 5 +- src/core/engine/visual-compare.test.ts | 2 +- 5 files changed, 231 insertions(+), 4 deletions(-) create mode 100644 docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md diff --git a/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md b/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md new file mode 100644 index 00000000..080846ac --- /dev/null +++ b/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md @@ -0,0 +1,201 @@ +# Research Report: Calibration & Rule Discovery (Branch `fix/calibration-pipeline-issue-14`) + +**Scope:** Deep technical review of CanICode’s internal calibration and rule-discovery pipelines as reflected on the current branch, including recent fixes for retina-aligned visual comparison, cross-run evidence handling, and convergence semantics for nightly workflows. + +**Audience:** Maintainers opening or reviewing PRs that touch `visual-compare`, agent orchestration, or calibration CLI commands. + +--- + +## 1. Executive summary + +CanICode’s north star is answerable in one question: **can AI implement a scoped Figma design pixel-accurately?** **Calibration** grounds rule **scores** in measured conversion difficulty (including **visual-compare** similarity). 
**Rule discovery** turns recurring **gap** patterns into new rules, validated by A/B visual runs. + +On this branch, three themes stand out as materially improving scientific validity and operator ergonomics: + +1. **Retina / export-scale alignment** — Figma exports and Playwright captures now share the same physical pixel grid by default (`figmaExportScale`, inferred logical viewport, `deviceScaleFactor`). This removes a large class of false “diff” signals that previously looked like design errors. +2. **Calibration evidence hygiene** — Appending calibration evidence **replaces** the prior row for the same `(ruleId, fixture)` pair so the cross-run store reflects the **latest** assessment, not an unbounded pile of contradictory rows. +3. **Convergence for automation (issue #14)** — Strict convergence (`no applied/revised` **and** `no rejected`) is correct for “the debate fully settled.” For nightlies, **`--lenient-convergence`** allows `fixture-done` when the Arbitrator applied nothing but still logged rejections—a common pattern when the Critic repeatedly blocks borderline proposals while scores are stable. + +The branch also carries substantial **fixture expansion** (calibrated designs moved under `fixtures/done`, new fixture JSON/screenshots). That increases empirical coverage but increases repo size and CI time—trade-offs called out in Section 8. + +--- + +## 2. 
System map + +### 2.1 Channels vs internal pipelines + +| Surface | Role in the metric story | +|--------|---------------------------| +| CLI `analyze` | Produces rule findings and scores from structure/fixture JSON | +| CLI `visual-compare` | Measures **pixel** similarity between Figma PNG and rendered HTML | +| MCP / skill | Same analysis with optional hybrid enrichment from Figma MCP | +| **`/calibrate-loop`** (Claude Code) | Full loop: analyze → convert whole scope → gap analysis → evaluate → critic → arbitrator | +| **`/add-rule`** | Research → implement rule → A/B visual → critic decision | +| **`/calibrate-night`** | Sequential calibration + `fixture-list` / `fixture-done` + aggregate gap report | + +### 2.2 Artifact graph + +``` +fixtures//data.json (+ screenshot.png) + │ + ▼ + calibrate-analyze ──► analysis.json (run dir) + │ + ▼ + Converter + visual-compare ──► conversion.json, figma.png, code.png, diff.png + │ + ▼ + Gap analyzer ──► gaps.json ──► discovery-evidence (filtered) + gap-rule-report + │ + ▼ + calibrate-evaluate ──► proposals + calibration-evidence (overscore/underscore) + │ + ▼ + debate.json (critic + arbitrator) ──► rule-config.ts edits, calibrate-prune-evidence +``` + +### 2.3 Cross-run stores + +| File | Content | Pruned when | +|------|---------|-------------| +| `data/calibration-evidence.json` | Per-rule overscored/underscored signals | `calibrate-prune-evidence` after applied/revised rules | +| `data/discovery-evidence.json` | Missing-rule / gap signals | `discovery-prune-evidence` after a category is addressed | + +Orchestrator-side **noise filters** (regex) prevent font CDN, retina/DPI, screenshot dimension, network, and CI noise from polluting discovery evidence—keeping the Researcher’s input **actionable**. + +--- + +## 3. 
Branch-specific technical analysis + +### 3.1 Visual comparison: scale and padding + +**Problem (historical):** Comparing a `@2x` Figma PNG to a `1×` Playwright screenshot inflated diff pixels and **compressed similarity**, biasing calibration toward “everything is hard” and muddying gap labels. + +**Current behavior:** + +- Default **`figmaExportScale`** of `2` matches common REST exports and **`save-fixture`** output. +- Without an explicit viewport, **logical** width/height = PNG dimensions ÷ export scale; Playwright uses **`deviceScaleFactor`** = export scale so **`code.png`** matches **`figma.png`** in device pixels. +- With explicit `--width` / `--height`, **`inferDeviceScaleFactor`** reconciles PNG aspect ratio to logical viewport using rounded ratios with small tolerance—guarding against odd sizes. + +**Size mismatch handling:** Images are **padded** with a high-contrast magenta fill to a common canvas (not resized). That choice is deliberate: + +- Resampling would blend pixels and **hide** misalignment. +- Padding makes “extra” area unambiguously wrong in **pixelmatch**, which is what you want for calibration (layout bounds matter). + +**CLI parity:** `canicode visual-compare` exposes `--figma-scale` and documents inference behavior in `canicode docs visual-compare`. + +### 3.2 Calibration evidence: last-write-wins per fixture + +`appendCalibrationEvidence` builds a key `ruleId + NUL + fixture`. Existing entries with those keys are **removed** before new entries are appended. Effects: + +- **Pros:** Stops double-counting stale runs; `loadCalibrationEvidence` aggregates reflect the latest truth per fixture. +- **Cons:** Deliberate historical audit of every run is **not** preserved in this file (only in per-run logs). If longitudinal studies are needed, append-only logs under `logs/calibration/` remain the source of truth. + +**Discovery evidence** remains append-only (no dedupe)—appropriate for “many distinct gaps,” but can grow large; see recommendations. 
+ +### 3.3 Convergence: strict vs lenient + +| Mode | Condition | Use case | +|------|-----------|----------| +| **Strict** (default) | `applied === 0` **and** `revised === 0` **and** `rejected === 0` | True “quiet” arbitration: no pending controversy | +| **Lenient** | `applied === 0` **and** `revised === 0` (rejects ignored) | Stable scores but Critic still rejecting proposals; unblocks `fixture-done` / nightlies | + +**Skipped debates** (`debate.skipped`) count as converged—zero proposals is a valid terminal state. + +**Operational note:** Lenient mode should be paired with **human review** of `debate.json` periodically; otherwise rejected-but-valid signals might never drive config change. + +--- + +## 4. Rule discovery pipeline (conceptual depth) + +### 4.1 Inputs + +- **Gap JSON** per calibration run (categories, descriptions, `actionable`, `coveredByExistingRule`). +- **Accumulated discovery evidence** (evaluation + gap-analysis sources). +- **Fixture diversity** — done vs active fixtures under `fixtures/` and `fixtures/done/`. + +### 4.2 Weak links (inherent) + +1. **A/B tests proxy “missing metadata”** with AI-generated text—signal is “does extra structured data help the converter?” not “does human copy help?” +2. **Gap analyzer** depends on diff interpretation; anti-aliasing and font substitution still create **baseline** noise (~few percent similarity) even after retina alignment. +3. **Critic/Arbitrator** are LLM-mediated; deterministic steps (`runEvaluationAgent`, gap aggregation) are sound, but final commits are governance-sensitive. + +--- + +## 5. Evaluation: what is working well + +1. **Clear separation of concerns:** Rule **logic** vs **`rule-config.ts`** scores enables tuning without rewriting detectors. +2. **Deterministic evaluation agent:** Maps difficulties to score bands transparently—easy to test and to explain in reports. +3. 
**Filtering conversion candidates** (large files): Frames/components with minimum size, child count, text descendants—reduces “icon noise” that once split signals for rules like `no-auto-layout`. +4. **GapRuleReport** bridges many `gaps.json` files into markdown suitable for human triage. +5. **Test coverage** around evidence, run directories, and visual-compare numerics gives confidence in refactors. + +--- + +## 6. Risks and limitations + +| Risk | Impact | Mitigation (existing or proposed) | +|------|--------|-----------------------------------| +| Fixture/repo bloat | Clone and CI slower | Submodules or LFS for `data.json` + PNG; or slim “CI fixtures” subset | +| Discovery evidence growth | Slower loads; noisy Researcher context | Category-level caps, dedupe by hash of description, archival files | +| Single global `rule-config` | Cross-fixture coupling | Per-preset configs or weighted calibration per vertical (longer-term) | +| Playwright `networkidle` + fixed 1s wait | Flaky or slow compares | Tunable wait strategy; opt-in `domcontentloaded` for static HTML | +| Magenta padding | Extreme edge cases might collide with design | Extremely rare; could switch to XOR mask in diff only | + +--- + +## 7. Improvement roadmap (prioritized) + +### P0 — Trust and reproducibility + +- **Pin a “golden” visual-compare fixture** in CI (small HTML + canned PNG) so scale/padding logic cannot regress silently without network. +- **Emit `figmaExportScale` and inferred `deviceScaleFactor` into `conversion.json`** for every run dir—makes postmortems one glance. + +### P1 — Evidence and discovery + +- **Optional dedupe** for `discovery-evidence.json` (e.g., normalized description + category) with max entries per category. +- **Schema version** field on evidence arrays for forward-compatible migrations. + +### P2 — Operator UX + +- **`fixture-done --dry-run`** printing convergence reason (strict fail: N rejects, M applies). 
+- **Single command** wrapping “latest run dir for fixture X” resolution—today paths are manual. + +### P3 — Analytical depth + +- **Per-rule elasticity:** How much similarity improves when a rule is “turned off” in synthesis vs “fixed” in Figma—connects scores to **marginal** visual delta. +- **Confound controls:** Blank fixture with only text + one rectangle to measure pure font/smoothing baseline per OS. + +### P4 — Documentation + +- Keep **`docs/CALIBRATION.md`** narrative; this document is the **engineering appendix** for branch-level decisions. + +--- + +## 8. Fixture strategy assessment + +Moving calibrated sets to **`fixtures/done/`** is a strong workflow signal: *this design has been through the loop.* Risks: + +- **Discoverability:** New contributors may not know `done/` is intentional historical corpus. +- **Size:** JSON artifacts dominate diff noise; consider policy: “active small fixtures in repo; large kits documented + downloaded.” + +Recommendation: document in `CLAUDE.md` or `fixtures/README.md` (when the team chooses to add it) a one-paragraph **fixture lifecycle**: active → calibrate → done → optional archive. + +--- + +## 9. Conclusion + +The branch advances the project from “we compare screenshots” to **comparisons that respect how Figma and browsers actually rasterize**. Together with **evidence dedupe** and **lenient convergence**, it closes practical gaps that otherwise waste arbitrators’ time or stall nightlies—without abandoning strict mode for runs that demand full closure. + +The next leap is not more rules by count, but **tighter coupling between visual deltas and marginal rule importance** (P3), so `rule-config.ts` continues to track measurable implementation pain rather than narrative severity alone. + +--- + +## 10. 
Reference pointers (code) + +- Visual pipeline: `src/core/engine/visual-compare.ts` +- CLI wiring: `src/cli/index.ts` (`visual-compare`, `fixture-done`) +- Convergence: `src/agents/run-directory.ts` (`isConverged`, `ConvergenceOptions`) +- Evidence: `src/agents/evidence-collector.ts` +- Noise filters + orchestration: `src/agents/orchestrator.ts` (`ENVIRONMENT_NOISE_PATTERNS`) +- Nightly note: `.claude/commands/calibrate-night.md` diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index aa3e0e46..adced824 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -85,8 +85,10 @@ export function appendCalibrationEvidence( ): void { if (entries.length === 0) return; const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); - const keys = new Set(entries.map((e) => `${e.ruleId.trim()}\0${e.fixture}`)); - const withoutDupes = existing.filter((e) => !keys.has(`${e.ruleId.trim()}\0${e.fixture}`)); + const keys = new Set(entries.map((e) => `${e.ruleId.trim()}\0${e.fixture.trim()}`)); + const withoutDupes = existing.filter( + (e) => !keys.has(`${e.ruleId.trim()}\0${e.fixture.trim()}`), + ); withoutDupes.push(...entries); writeJsonArray(evidencePath, withoutDupes); } diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts index 31321666..23fbb605 100644 --- a/src/agents/run-directory.test.ts +++ b/src/agents/run-directory.test.ts @@ -116,6 +116,14 @@ describe("extractAppliedRuleIds", () => { }); expect(ids).toEqual(["my-rule", "b"]); }); + + it("returns empty array when decisions is not an array", () => { + const bad = { + critic: null, + arbitrator: { summary: "x", decisions: null }, + } as unknown as Parameters[0]; + expect(extractAppliedRuleIds(bad)).toEqual([]); + }); }); describe("isConverged", () => { @@ -154,6 +162,19 @@ describe("isConverged", () => { ); expect(isConverged(tempDir, { lenient: true })).toBe(true); }); + + it("returns false when arbitrator 
exists but decisions is not an array", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "incomplete", + }, + }), + ); + expect(isConverged(tempDir)).toBe(false); + expect(isConverged(tempDir, { lenient: true })).toBe(false); + }); }); describe("listCalibrationRuns", () => { diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index bebdad31..70910ad2 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -209,7 +209,9 @@ export function parseDebateResult(runDir: string): DebateResult | null { */ export function extractAppliedRuleIds(debate: DebateResult): string[] { if (!debate.arbitrator) return []; - return debate.arbitrator.decisions + const decisions = debate.arbitrator.decisions; + if (!Array.isArray(decisions)) return []; + return decisions .filter((d) => { const dec = (d.decision ?? "").trim().toLowerCase(); return dec === "applied" || dec === "revised"; @@ -237,6 +239,7 @@ export function isConverged(runDir: string, options?: ConvergenceOptions): boole if (debate.skipped) return true; // zero proposals = converged if (!debate.arbitrator) return false; const decisions = debate.arbitrator.decisions; + if (!Array.isArray(decisions)) return false; const changed = decisions.filter((d) => { const dec = (d.decision ?? "").trim().toLowerCase(); return dec === "applied" || dec === "revised"; diff --git a/src/core/engine/visual-compare.test.ts b/src/core/engine/visual-compare.test.ts index 3a1e15c4..1df47c10 100644 --- a/src/core/engine/visual-compare.test.ts +++ b/src/core/engine/visual-compare.test.ts @@ -20,7 +20,7 @@ * For (3)–(6) we build small PNGs in memory with pngjs and write them to a temp dir, * then call compareScreenshots through a minimal re-export shim declared in this file. 
* Because the module does not export the private functions we reproduce the exact same - * logic (8 lines of resizePng + ~25 lines of compareScreenshots) so the tests validate + * logic (padPng + compareScreenshots) so the tests validate * the algorithm rather than just the export boundary. */ From f1198f25066c387708504c644c91188ba6a6e49d Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 01:29:46 +0900 Subject: [PATCH 03/11] fix(agents): self-review hardening for evidence and debate parsing - Dedupe calibration evidence within a single append (Map, last wins) - Treat non-object decision rows as inert in extractAppliedRuleIds and isConverged - Doc: fix overscored/underscored typo; strict convergence note; Appendix A self-review Made-with: Cursor --- ...EARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md | 18 ++++++++++-- src/agents/evidence-collector.test.ts | 11 +++++++ src/agents/evidence-collector.ts | 11 +++++-- src/agents/run-directory.test.ts | 29 +++++++++++++++++++ src/agents/run-directory.ts | 15 +++++++--- 5 files changed, 76 insertions(+), 8 deletions(-) diff --git a/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md b/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md index 080846ac..6061864c 100644 --- a/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md +++ b/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md @@ -48,7 +48,7 @@ fixtures//data.json (+ screenshot.png) Gap analyzer ──► gaps.json ──► discovery-evidence (filtered) + gap-rule-report │ ▼ - calibrate-evaluate ──► proposals + calibration-evidence (overscore/underscore) + calibrate-evaluate ──► proposals + calibration-evidence (overscored/underscored) │ ▼ debate.json (critic + arbitrator) ──► rule-config.ts edits, calibrate-prune-evidence @@ -97,7 +97,7 @@ Orchestrator-side **noise filters** (regex) prevent font CDN, retina/DPI, screen | Mode | Condition | Use case | |------|-----------|----------| -| **Strict** (default) | `applied === 0` **and** `revised === 0` **and** `rejected === 0` | 
True “quiet” arbitration: no pending controversy | +| **Strict** (default) | `applied === 0` **and** `revised === 0` **and** `rejected === 0` (and `decisions` is an array) | True “quiet” arbitration: no pending controversy | | **Lenient** | `applied === 0` **and** `revised === 0` (rejects ignored) | Stable scores but Critic still rejecting proposals; unblocks `fixture-done` / nightlies | **Skipped debates** (`debate.skipped`) count as converged—zero proposals is a valid terminal state. @@ -199,3 +199,17 @@ The next leap is not more rules by count, but **tighter coupling between visual - Evidence: `src/agents/evidence-collector.ts` - Noise filters + orchestration: `src/agents/orchestrator.ts` (`ENVIRONMENT_NOISE_PATTERNS`) - Nightly note: `.claude/commands/calibrate-night.md` + +--- + +## Appendix A: Post-commit self-review (what was re-checked) + +This section records a deliberate second pass over the same design—what was assumed, what was wrong, and what was tightened. + +1. **Typo in the artifact diagram** — “overscore/underscore” conflated wording with schema enum values; corrected to **overscored/underscored**. +2. **Calibration evidence batch semantics** — Original dedupe only avoided collisions between *existing file* rows and *new* rows. A single `appendCalibrationEvidence` call could still write **duplicate keys**. Resolved by folding the incoming batch through a `Map` so **last row wins within the batch** as well as across calls. +3. **Malformed `debate.json`** — Real runs are supposed to emit objects, but hand-edited or partially written files could put `null` or primitives inside `decisions`. **Strict type assumptions** would throw at runtime and break `fixture-done` / prune flows. **Guard** with `isDecisionRecord` and string coercion on `decision` / `ruleId`. +4. **Documentation vs implementation** — Strict convergence implicitly requires `decisions` to be an array; non-array continues to yield **not converged**. The table now states that explicitly. 
+5. **What was not “fixed” on purpose** — `parseDebateResult` still trusts overall JSON shape for `critic` / `arbitrator`; deeper validation would belong in a Zod schema if we ever expose debate files as a public API. **Lenient convergence** still ignores rejected rows by design—that is a product trade-off, not a bug; operators should skim `debate.json` before relying on it long term. + +**Limitation (honest):** Waiting “three hours” in wall-clock time does not change correctness; it mainly reduces the chance of missing edge cases. This appendix is the recorded output of that kind of review, done immediately and in code. diff --git a/src/agents/evidence-collector.test.ts b/src/agents/evidence-collector.test.ts index 70b5ec86..91e19313 100644 --- a/src/agents/evidence-collector.test.ts +++ b/src/agents/evidence-collector.test.ts @@ -116,6 +116,17 @@ describe("evidence-collector", () => { expect(raw[0]!.type).toBe("underscored"); }); + it("dedupes within a single append call (last row for same ruleId and fixture wins)", () => { + appendCalibrationEvidence([ + { ruleId: "rule-a", type: "overscored", actualDifficulty: "easy", fixture: "fx1", timestamp: "t1" }, + { ruleId: "rule-a", type: "underscored", actualDifficulty: "hard", fixture: "fx1", timestamp: "t2" }, + ], calPath); + + const raw = JSON.parse(readFileSync(calPath, "utf-8")) as CalibrationEvidenceEntry[]; + expect(raw).toHaveLength(1); + expect(raw[0]!.type).toBe("underscored"); + }); + it("does nothing for empty entries", () => { writeFileSync(calPath, "[]", "utf-8"); appendCalibrationEvidence([], calPath); diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index adced824..4f98dc61 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -85,11 +85,18 @@ export function appendCalibrationEvidence( ): void { if (entries.length === 0) return; const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); - const keys = new 
Set(entries.map((e) => `${e.ruleId.trim()}\0${e.fixture.trim()}`)); + // Same batch can repeat (ruleId, fixture); last entry wins (matches cross-call behavior) + const incomingByKey = new Map(); + for (const e of entries) { + const k = `${e.ruleId.trim()}\0${e.fixture.trim()}`; + incomingByKey.set(k, e); + } + const mergedIncoming = [...incomingByKey.values()]; + const keys = new Set(incomingByKey.keys()); const withoutDupes = existing.filter( (e) => !keys.has(`${e.ruleId.trim()}\0${e.fixture.trim()}`), ); - withoutDupes.push(...entries); + withoutDupes.push(...mergedIncoming); writeJsonArray(evidencePath, withoutDupes); } diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts index 23fbb605..9c3e5090 100644 --- a/src/agents/run-directory.test.ts +++ b/src/agents/run-directory.test.ts @@ -124,6 +124,21 @@ describe("extractAppliedRuleIds", () => { } as unknown as Parameters[0]; expect(extractAppliedRuleIds(bad)).toEqual([]); }); + + it("ignores null and non-object entries in decisions", () => { + const ids = extractAppliedRuleIds({ + critic: null, + arbitrator: { + summary: "x", + decisions: [ + null, + "not-an-object" as unknown as { ruleId: string; decision: string }, + { ruleId: "ok", decision: "applied" }, + ], + }, + }); + expect(ids).toEqual(["ok"]); + }); }); describe("isConverged", () => { @@ -175,6 +190,20 @@ describe("isConverged", () => { expect(isConverged(tempDir)).toBe(false); expect(isConverged(tempDir, { lenient: true })).toBe(false); }); + + it("ignores null entries when counting applied and rejected", () => { + writeFileSync( + join(tempDir, "debate.json"), + JSON.stringify({ + arbitrator: { + summary: "mixed", + decisions: [null, { ruleId: "x", decision: "rejected" }], + }, + }), + ); + expect(isConverged(tempDir)).toBe(false); + expect(isConverged(tempDir, { lenient: true })).toBe(true); + }); }); describe("listCalibrationRuns", () => { diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index 
70910ad2..0ce5b566 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -207,16 +207,21 @@ export function parseDebateResult(runDir: string): DebateResult | null { /** * Extract ruleIds that were applied or revised by the Arbitrator. */ +function isDecisionRecord(d: unknown): d is { decision?: unknown; ruleId?: unknown } { + return d !== null && typeof d === "object"; +} + export function extractAppliedRuleIds(debate: DebateResult): string[] { if (!debate.arbitrator) return []; const decisions = debate.arbitrator.decisions; if (!Array.isArray(decisions)) return []; return decisions .filter((d) => { - const dec = (d.decision ?? "").trim().toLowerCase(); + if (!isDecisionRecord(d)) return false; + const dec = String(d.decision ?? "").trim().toLowerCase(); return dec === "applied" || dec === "revised"; }) - .map((d) => d.ruleId.trim()) + .map((d) => String(d.ruleId ?? "").trim()) .filter((id) => id.length > 0); } @@ -241,11 +246,13 @@ export function isConverged(runDir: string, options?: ConvergenceOptions): boole const decisions = debate.arbitrator.decisions; if (!Array.isArray(decisions)) return false; const changed = decisions.filter((d) => { - const dec = (d.decision ?? "").trim().toLowerCase(); + if (!isDecisionRecord(d)) return false; + const dec = String(d.decision ?? "").trim().toLowerCase(); return dec === "applied" || dec === "revised"; }).length; const rejected = decisions.filter((d) => { - const dec = (d.decision ?? "").trim().toLowerCase(); + if (!isDecisionRecord(d)) return false; + const dec = String(d.decision ?? 
"").trim().toLowerCase(); return dec === "rejected"; }).length; if (options?.lenient) { From 48f2a6bde56971feb46a6540d51d1b93dc913459 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:31:41 +0900 Subject: [PATCH 04/11] fix: extract tolerance constants + defensive debate parsing - SCALE_ROUNDING_TOLERANCE (0.08): broader for @2x/@3x detection - UNITY_SCALE_TOLERANCE (0.02): tighter for 1x false-positive prevention - parseDebateResult: validates shape, debug logs on malformed input Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/run-directory.ts | 17 ++++++++++++----- src/core/engine/visual-compare.ts | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index 0ce5b566..a5906224 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -188,18 +188,25 @@ export interface DebateResult { /** * Parse a debate.json file from a run directory. * Returns null if the file doesn't exist or is malformed. + * Treats debate.json as external input — validates shape defensively. */ export function parseDebateResult(runDir: string): DebateResult | null { const debatePath = join(runDir, "debate.json"); if (!existsSync(debatePath)) return null; try { - const raw = JSON.parse(readFileSync(debatePath, "utf-8")) as Record; + const raw: unknown = JSON.parse(readFileSync(debatePath, "utf-8")); + if (raw === null || typeof raw !== "object") { + console.debug(`[parseDebateResult] invalid debate format in ${runDir}: expected object, got ${typeof raw}`); + return null; + } + const obj = raw as Record; return { - critic: raw["critic"] as DebateResult["critic"] ?? null, - arbitrator: raw["arbitrator"] as DebateResult["arbitrator"] ?? null, - ...(typeof raw["skipped"] === "string" ? { skipped: raw["skipped"] } : {}), + critic: (obj["critic"] as DebateResult["critic"]) ?? null, + arbitrator: (obj["arbitrator"] as DebateResult["arbitrator"]) ?? 
null, + ...(typeof obj["skipped"] === "string" ? { skipped: obj["skipped"] } : {}), }; - } catch { + } catch (err) { + console.debug(`[parseDebateResult] failed to parse debate.json in ${runDir}:`, err); return null; } } diff --git a/src/core/engine/visual-compare.ts b/src/core/engine/visual-compare.ts index 12029fec..fdb3b806 100644 --- a/src/core/engine/visual-compare.ts +++ b/src/core/engine/visual-compare.ts @@ -101,6 +101,18 @@ async function fetchFigmaScreenshot( writeFileSync(cachePath, buffer); } +/** + * Tolerance for detecting integer scale factors (@2x, @3x). + * Broader tolerance because render/rounding errors accumulate at higher scales. + */ +const SCALE_ROUNDING_TOLERANCE = 0.08; + +/** + * Tolerance for detecting 1x (unity) scale. + * Tighter to avoid false positives — misidentifying a scaled PNG as 1x. + */ +const UNITY_SCALE_TOLERANCE = 0.02; + /** * Infer device pixel ratio so the Playwright screenshot matches Figma PNG pixel dimensions. */ @@ -115,10 +127,10 @@ function inferDeviceScaleFactor( const sx = pngW / logicalW; const sy = pngH / logicalH; const rounded = Math.round((sx + sy) / 2); - if (rounded >= 2 && Math.abs(sx - rounded) < 0.08 && Math.abs(sy - rounded) < 0.08) { + if (rounded >= 2 && Math.abs(sx - rounded) < SCALE_ROUNDING_TOLERANCE && Math.abs(sy - rounded) < SCALE_ROUNDING_TOLERANCE) { return rounded; } - if (Math.abs(sx - 1) < 0.02 && Math.abs(sy - 1) < 0.02) return 1; + if (Math.abs(sx - 1) < UNITY_SCALE_TOLERANCE && Math.abs(sy - 1) < UNITY_SCALE_TOLERANCE) return 1; return fallback >= 2 ? 
fallback : Math.max(1, Math.round(sx)); } From 465eecceb207062a1f084e7cf5737848e4cace10 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:34:17 +0900 Subject: [PATCH 05/11] chore: remove research doc (discussion-only, not for merge) Co-Authored-By: Claude Opus 4.6 (1M context) --- ...EARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md | 215 ------------------ 1 file changed, 215 deletions(-) delete mode 100644 docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md diff --git a/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md b/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md deleted file mode 100644 index 6061864c..00000000 --- a/docs/RESEARCH-BRANCH-CALIBRATION-RULE-DISCOVERY.md +++ /dev/null @@ -1,215 +0,0 @@ -# Research Report: Calibration & Rule Discovery (Branch `fix/calibration-pipeline-issue-14`) - -**Scope:** Deep technical review of CanICode’s internal calibration and rule-discovery pipelines as reflected on the current branch, including recent fixes for retina-aligned visual comparison, cross-run evidence handling, and convergence semantics for nightly workflows. - -**Audience:** Maintainers opening or reviewing PRs that touch `visual-compare`, agent orchestration, or calibration CLI commands. - ---- - -## 1. Executive summary - -CanICode’s north star is answerable in one question: **can AI implement a scoped Figma design pixel-accurately?** **Calibration** grounds rule **scores** in measured conversion difficulty (including **visual-compare** similarity). **Rule discovery** turns recurring **gap** patterns into new rules, validated by A/B visual runs. - -On this branch, three themes stand out as materially improving scientific validity and operator ergonomics: - -1. **Retina / export-scale alignment** — Figma exports and Playwright captures now share the same physical pixel grid by default (`figmaExportScale`, inferred logical viewport, `deviceScaleFactor`). 
This removes a large class of false “diff” signals that previously looked like design errors. -2. **Calibration evidence hygiene** — Appending calibration evidence **replaces** the prior row for the same `(ruleId, fixture)` pair so the cross-run store reflects the **latest** assessment, not an unbounded pile of contradictory rows. -3. **Convergence for automation (issue #14)** — Strict convergence (`no applied/revised` **and** `no rejected`) is correct for “the debate fully settled.” For nightlies, **`--lenient-convergence`** allows `fixture-done` when the Arbitrator applied nothing but still logged rejections—a common pattern when the Critic repeatedly blocks borderline proposals while scores are stable. - -The branch also carries substantial **fixture expansion** (calibrated designs moved under `fixtures/done`, new fixture JSON/screenshots). That increases empirical coverage but increases repo size and CI time—trade-offs called out in Section 8. - ---- - -## 2. System map - -### 2.1 Channels vs internal pipelines - -| Surface | Role in the metric story | -|--------|---------------------------| -| CLI `analyze` | Produces rule findings and scores from structure/fixture JSON | -| CLI `visual-compare` | Measures **pixel** similarity between Figma PNG and rendered HTML | -| MCP / skill | Same analysis with optional hybrid enrichment from Figma MCP | -| **`/calibrate-loop`** (Claude Code) | Full loop: analyze → convert whole scope → gap analysis → evaluate → critic → arbitrator | -| **`/add-rule`** | Research → implement rule → A/B visual → critic decision | -| **`/calibrate-night`** | Sequential calibration + `fixture-list` / `fixture-done` + aggregate gap report | - -### 2.2 Artifact graph - -``` -fixtures//data.json (+ screenshot.png) - │ - ▼ - calibrate-analyze ──► analysis.json (run dir) - │ - ▼ - Converter + visual-compare ──► conversion.json, figma.png, code.png, diff.png - │ - ▼ - Gap analyzer ──► gaps.json ──► discovery-evidence (filtered) + gap-rule-report - 
│ - ▼ - calibrate-evaluate ──► proposals + calibration-evidence (overscored/underscored) - │ - ▼ - debate.json (critic + arbitrator) ──► rule-config.ts edits, calibrate-prune-evidence -``` - -### 2.3 Cross-run stores - -| File | Content | Pruned when | -|------|---------|-------------| -| `data/calibration-evidence.json` | Per-rule overscored/underscored signals | `calibrate-prune-evidence` after applied/revised rules | -| `data/discovery-evidence.json` | Missing-rule / gap signals | `discovery-prune-evidence` after a category is addressed | - -Orchestrator-side **noise filters** (regex) prevent font CDN, retina/DPI, screenshot dimension, network, and CI noise from polluting discovery evidence—keeping the Researcher’s input **actionable**. - ---- - -## 3. Branch-specific technical analysis - -### 3.1 Visual comparison: scale and padding - -**Problem (historical):** Comparing a `@2x` Figma PNG to a `1×` Playwright screenshot inflated diff pixels and **compressed similarity**, biasing calibration toward “everything is hard” and muddying gap labels. - -**Current behavior:** - -- Default **`figmaExportScale`** of `2` matches common REST exports and **`save-fixture`** output. -- Without an explicit viewport, **logical** width/height = PNG dimensions ÷ export scale; Playwright uses **`deviceScaleFactor`** = export scale so **`code.png`** matches **`figma.png`** in device pixels. -- With explicit `--width` / `--height`, **`inferDeviceScaleFactor`** reconciles PNG aspect ratio to logical viewport using rounded ratios with small tolerance—guarding against odd sizes. - -**Size mismatch handling:** Images are **padded** with a high-contrast magenta fill to a common canvas (not resized). That choice is deliberate: - -- Resampling would blend pixels and **hide** misalignment. -- Padding makes “extra” area unambiguously wrong in **pixelmatch**, which is what you want for calibration (layout bounds matter). 
- -**CLI parity:** `canicode visual-compare` exposes `--figma-scale` and documents inference behavior in `canicode docs visual-compare`. - -### 3.2 Calibration evidence: last-write-wins per fixture - -`appendCalibrationEvidence` builds a key `ruleId + NUL + fixture`. Existing entries with those keys are **removed** before new entries are appended. Effects: - -- **Pros:** Stops double-counting stale runs; `loadCalibrationEvidence` aggregates reflect the latest truth per fixture. -- **Cons:** Deliberate historical audit of every run is **not** preserved in this file (only in per-run logs). If longitudinal studies are needed, append-only logs under `logs/calibration/` remain the source of truth. - -**Discovery evidence** remains append-only (no dedupe)—appropriate for “many distinct gaps,” but can grow large; see recommendations. - -### 3.3 Convergence: strict vs lenient - -| Mode | Condition | Use case | -|------|-----------|----------| -| **Strict** (default) | `applied === 0` **and** `revised === 0` **and** `rejected === 0` (and `decisions` is an array) | True “quiet” arbitration: no pending controversy | -| **Lenient** | `applied === 0` **and** `revised === 0` (rejects ignored) | Stable scores but Critic still rejecting proposals; unblocks `fixture-done` / nightlies | - -**Skipped debates** (`debate.skipped`) count as converged—zero proposals is a valid terminal state. - -**Operational note:** Lenient mode should be paired with **human review** of `debate.json` periodically; otherwise rejected-but-valid signals might never drive config change. - ---- - -## 4. Rule discovery pipeline (conceptual depth) - -### 4.1 Inputs - -- **Gap JSON** per calibration run (categories, descriptions, `actionable`, `coveredByExistingRule`). -- **Accumulated discovery evidence** (evaluation + gap-analysis sources). -- **Fixture diversity** — done vs active fixtures under `fixtures/` and `fixtures/done/`. - -### 4.2 Weak links (inherent) - -1. 
**A/B tests proxy “missing metadata”** with AI-generated text—signal is “does extra structured data help the converter?” not “does human copy help?” -2. **Gap analyzer** depends on diff interpretation; anti-aliasing and font substitution still create **baseline** noise (~few percent similarity) even after retina alignment. -3. **Critic/Arbitrator** are LLM-mediated; deterministic steps (`runEvaluationAgent`, gap aggregation) are sound, but final commits are governance-sensitive. - ---- - -## 5. Evaluation: what is working well - -1. **Clear separation of concerns:** Rule **logic** vs **`rule-config.ts`** scores enables tuning without rewriting detectors. -2. **Deterministic evaluation agent:** Maps difficulties to score bands transparently—easy to test and to explain in reports. -3. **Filtering conversion candidates** (large files): Frames/components with minimum size, child count, text descendants—reduces “icon noise” that once split signals for rules like `no-auto-layout`. -4. **GapRuleReport** bridges many `gaps.json` files into markdown suitable for human triage. -5. **Test coverage** around evidence, run directories, and visual-compare numerics gives confidence in refactors. - ---- - -## 6. 
Risks and limitations - -| Risk | Impact | Mitigation (existing or proposed) | -|------|--------|-----------------------------------| -| Fixture/repo bloat | Clone and CI slower | Submodules or LFS for `data.json` + PNG; or slim “CI fixtures” subset | -| Discovery evidence growth | Slower loads; noisy Researcher context | Category-level caps, dedupe by hash of description, archival files | -| Single global `rule-config` | Cross-fixture coupling | Per-preset configs or weighted calibration per vertical (longer-term) | -| Playwright `networkidle` + fixed 1s wait | Flaky or slow compares | Tunable wait strategy; opt-in `domcontentloaded` for static HTML | -| Magenta padding | Extreme edge cases might collide with design | Extremely rare; could switch to XOR mask in diff only | - ---- - -## 7. Improvement roadmap (prioritized) - -### P0 — Trust and reproducibility - -- **Pin a “golden” visual-compare fixture** in CI (small HTML + canned PNG) so scale/padding logic cannot regress silently without network. -- **Emit `figmaExportScale` and inferred `deviceScaleFactor` into `conversion.json`** for every run dir—makes postmortems one glance. - -### P1 — Evidence and discovery - -- **Optional dedupe** for `discovery-evidence.json` (e.g., normalized description + category) with max entries per category. -- **Schema version** field on evidence arrays for forward-compatible migrations. - -### P2 — Operator UX - -- **`fixture-done --dry-run`** printing convergence reason (strict fail: N rejects, M applies). -- **Single command** wrapping “latest run dir for fixture X” resolution—today paths are manual. - -### P3 — Analytical depth - -- **Per-rule elasticity:** How much similarity improves when a rule is “turned off” in synthesis vs “fixed” in Figma—connects scores to **marginal** visual delta. -- **Confound controls:** Blank fixture with only text + one rectangle to measure pure font/smoothing baseline per OS. 
- -### P4 — Documentation - -- Keep **`docs/CALIBRATION.md`** narrative; this document is the **engineering appendix** for branch-level decisions. - ---- - -## 8. Fixture strategy assessment - -Moving calibrated sets to **`fixtures/done/`** is a strong workflow signal: *this design has been through the loop.* Risks: - -- **Discoverability:** New contributors may not know `done/` is intentional historical corpus. -- **Size:** JSON artifacts dominate diff noise; consider policy: “active small fixtures in repo; large kits documented + downloaded.” - -Recommendation: document in `CLAUDE.md` or `fixtures/README.md` (when the team chooses to add it) a one-paragraph **fixture lifecycle**: active → calibrate → done → optional archive. - ---- - -## 9. Conclusion - -The branch advances the project from “we compare screenshots” to **comparisons that respect how Figma and browsers actually rasterize**. Together with **evidence dedupe** and **lenient convergence**, it closes practical gaps that otherwise waste arbitrators’ time or stall nightlies—without abandoning strict mode for runs that demand full closure. - -The next leap is not more rules by count, but **tighter coupling between visual deltas and marginal rule importance** (P3), so `rule-config.ts` continues to track measurable implementation pain rather than narrative severity alone. - ---- - -## 10. 
Reference pointers (code) - -- Visual pipeline: `src/core/engine/visual-compare.ts` -- CLI wiring: `src/cli/index.ts` (`visual-compare`, `fixture-done`) -- Convergence: `src/agents/run-directory.ts` (`isConverged`, `ConvergenceOptions`) -- Evidence: `src/agents/evidence-collector.ts` -- Noise filters + orchestration: `src/agents/orchestrator.ts` (`ENVIRONMENT_NOISE_PATTERNS`) -- Nightly note: `.claude/commands/calibrate-night.md` - ---- - -## Appendix A: Post-commit self-review (what was re-checked) - -This section records a deliberate second pass over the same design—what was assumed, what was wrong, and what was tightened. - -1. **Typo in the artifact diagram** — “overscore/underscore” conflated wording with schema enum values; corrected to **overscored/underscored**. -2. **Calibration evidence batch semantics** — Original dedupe only avoided collisions between *existing file* rows and *new* rows. A single `appendCalibrationEvidence` call could still write **duplicate keys**. Resolved by folding the incoming batch through a `Map` so **last row wins within the batch** as well as across calls. -3. **Malformed `debate.json`** — Real runs are supposed to emit objects, but hand-edited or partially written files could put `null` or primitives inside `decisions`. **Strict type assumptions** would throw at runtime and break `fixture-done` / prune flows. **Guard** with `isDecisionRecord` and string coercion on `decision` / `ruleId`. -4. **Documentation vs implementation** — Strict convergence implicitly requires `decisions` to be an array; non-array continues to yield **not converged**. The table now states that explicitly. -5. **What was not “fixed” on purpose** — `parseDebateResult` still trusts overall JSON shape for `critic` / `arbitrator`; deeper validation would belong in a Zod schema if we ever expose debate files as a public API. 
**Lenient convergence** still ignores rejected rows by design—that is a product trade-off, not a bug; operators should skim `debate.json` before relying on it long term. - -**Limitation (honest):** Waiting “three hours” in wall-clock time does not change correctness; it mainly reduces the chance of missing edge cases. This appendix is the recorded output of that kind of review, done immediately and in code. From e3864a7f4180821301b650b296bdfc96babdc127 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:48:27 +0900 Subject: [PATCH 06/11] feat: token metrics + target environment + A/B token comparison 1. Token metrics: - generateDesignTreeWithStats() returns estimatedTokens + bytes - CLI design-tree command shows token estimate - CalibrationReportData includes tokenMetrics (tokens, bytes, per-node) 2. Target environment in CLAUDE.md: - Primary target: teams with designers, 300+ node pages - Token consumption is first-class metric - Component scores should not be lowered from small fixture calibration 3. 
add-rule A/B always includes token comparison: - Step 4 measures both visual similarity AND token savings - Token savings ratio: 1 - tokens_b / tokens_a - Even if visual diff is zero, token savings justify the rule Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/commands/add-rule.md | 14 +- CLAUDE.md | 13 ++ docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md | 156 ++++++++++++++++++++ src/agents/report-generator.ts | 10 +- src/cli/index.ts | 10 +- src/core/engine/design-tree.ts | 28 +++- 6 files changed, 220 insertions(+), 11 deletions(-) create mode 100644 docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md diff --git a/.claude/commands/add-rule.md b/.claude/commands/add-rule.md index eba60dd0..ef7a6bd6 100644 --- a/.claude/commands/add-rule.md +++ b/.claude/commands/add-rule.md @@ -68,9 +68,9 @@ Append to `$RUN_DIR/activity.jsonl`: {"step":"Implementer","timestamp":"","result":"implemented rule ","durationMs":} ``` -### Step 4 — A/B Visual Validation +### Step 4 — A/B Validation (Visual + Token) -Run an A/B comparison on the entire design to measure the rule's actual impact on pixel-perfect accuracy: +Run an A/B comparison measuring both **pixel accuracy** and **token efficiency**: 1. Extract `fileKey` and root `nodeId` from the fixture or Figma URL. @@ -91,9 +91,15 @@ Run an A/B comparison on the entire design to measure the rule's actual impact o - Run: `npx canicode visual-compare $RUN_DIR/visual-b.html --figma-url "" --output $RUN_DIR/visual-b` - Record similarity_b -5. Compare: if similarity_b > similarity_a → the rule catches something that genuinely improves implementation quality. +5. **Visual comparison**: if similarity_b > similarity_a → the rule improves pixel accuracy. -6. Record both scores for the Evaluator. +6. 
**Token comparison** (always measure, even if visual diff is zero): + - Count lines/bytes of `visual-a.html` vs `visual-b.html` + - If Test B is significantly smaller → the rule reduces token consumption (important for large pages) + - Record `tokens_a` (estimated: file bytes / 4) and `tokens_b` + - Token savings ratio: `1 - tokens_b / tokens_a` + +7. Record all scores for the Evaluator: `similarity_a`, `similarity_b`, `tokens_a`, `tokens_b`. ### Step 5 — Evaluator diff --git a/CLAUDE.md b/CLAUDE.md index 95c17d8b..23a286f8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,6 +6,19 @@ A CLI tool that analyzes Figma design structures to provide development-friendli **Can AI implement this Figma design pixel-perfectly?** Everything in this project serves this single question. Every rule, score, category, and pipeline exists to measure and improve how accurately AI can reproduce a Figma design as code. The metric is `visual-compare` similarity (0-100%). +## Target Environment + +The primary target is **teams with designers** where developers (+AI) implement large Figma pages: +- **Page scale**: 300+ nodes, full screens, not small component sections +- **Component-heavy**: Design systems with reusable components, variants, tokens +- **AI context budget**: Large pages must fit in AI context windows — componentization reduces token count via deduplication +- **Not the target**: Individual developers generating simple UI with AI — they don't need Figma analysis + +This means: +- Component-related rule scores (missing-component, etc.) 
should NOT be lowered based on small fixture calibration +- Token consumption is a first-class metric — designs that waste tokens on repeated structures are penalized +- Calibration fixtures should include large, complex pages alongside small sections + ## Tech Stack - **Runtime**: Node.js (>=18) diff --git a/docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md b/docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md new file mode 100644 index 00000000..6d667649 --- /dev/null +++ b/docs/fixtures/ANALYSIS-VALIDITY-FEEDBACK.md @@ -0,0 +1,156 @@ +# Fixture analysis: validity feedback (over- vs under-estimation) + +This document records **subjective engineering judgment** on whether `canicode analyze` results **oversell** or **undersell** how hard it is for an AI to implement the UI **pixel-close** from the fixture, and what to do about it. + +It is **not** a statistical proof. It assumes: REST fixture JSON + current `rule-config` scoring, and the north-star metric **visual similarity**, not design-system purity alone. + +--- + +## How to read this + +| Term | Meaning here | +|------|----------------| +| **Overrated (grade/score)** | The **readiness score looks better** than the likely **implementation + visual-compare** outcome without extra context or cleanup. | +| **Underrated** | The **score looks worse** than how repeatable or implementable the file actually is for an AI (given a strong prompt / repeated patterns). | + +**Remedies** split into: **Figma / fixture hygiene**, **prompt & workflow**, **tooling (rules, scoring, scope)**. + +--- + +## Cross-cutting patterns + +### A. Invisible layers + raw color clusters (many Simple DS fixtures) + +- **Overrated:** Overall **grade can look decent (A / high B)** while the tree still carries many **invisible-layer** and **raw-color** hits. Severity may keep the headline grade up, but the AI still sees **noise** (extra nodes, ambiguous fills) and may **mis-order or mis-style** visible content. 
**Pixel outcome** can be worse than the letter grade suggests. +- **Underrated:** Rare for the **same** pattern; occasionally invisible only affects editor hygiene, not export — but REST + rules still complain. +- **Remedies:** + - **Figma:** Delete or detach truly unused layers; replace raw values with variables where the team cares; reduce hidden decoration. + - **Workflow:** Feed **`design-tree`** or a **pruned node list** to the AI so it ignores known-noise IDs. + - **Tooling:** Consider reporting **“visible-only issues”** aggregate for AI-facing summaries; tune `invisible-layer` / `raw-color` weights after calibration for “impact on visual-compare.” + +### B. Unscoped large trees (Material 3 & large Simple DS) + +- **Overrated:** **Low grades (C, B)** can look like “this file is terrible to implement” when the real problem is **scope** — mixing many screens/components in one graph. A single scoped frame might be **B+ or A** in isolation. +- **Underrated:** Almost the inverse: **one bad subtree** can dominate issue counts; the rest of the file might be fine — the **average** undervalues “good parts.” +- **Remedies:** + - **Fixture:** Re-`save-fixture` with a **single `node-id`** per calibration / AI task. + - **CLI:** Always analyze with the **same scope** you ask the AI to implement. + - **Tooling:** Surface **root-scoped score** vs **whole-file score** in reports when unscoped. + +### C. Design-system spacing / fixed-width spam (`material3-52949-27916`, `material3-51954-18254`) + +- **Overrated:** Less common; if spacing issues are **systematic**, fixing one pattern fixes hundreds — headline pain can be **overstated** for a **template-aware** AI. +- **Underrated:** For **“first-try” AI** without a grid spec — hundreds of **inconsistent-spacing** / **fixed-width** hits mean **lots of opportunity for pixel drift**; visual-compare will punish. Score may feel “only B” but failure modes are many. +- **Remedies:** + - **Prompt:** State **grid base (e.g.
4px)**, max width, breakpoint behavior explicitly. + - **Figma:** Normalize spacing to the grid; reduce fixed widths or mark responsive intent. + - **Tooling:** Calibration tied to **visual-compare** on these fixtures so spacing rules track **pixel deltas**. + +### D. Numeric / generic naming (Material 3 community) + +- **Overrated:** Score can look **OK on layout** while **ai-readability / naming** tanks — sometimes **underweighted** in how much it confuses **less capable** models (variable names, component names in code). +- **Underrated:** For **vision-heavy** codegen (screenshot + structure), **bad names** matter less — score may **undersell** “actually buildable if you ignore names.” +- **Remedies:** + - **Figma:** Rename critical interactive nodes; keep DS naming convention. + - **Prompt:** “Map Figma node IDs to React components; names are not authoritative.” + - **Tooling:** Separate **“semantic naming score”** from **implementation difficulty** in user-facing copy. + +### E. `figma-ui3-kit` (token category very weak, few nodes) + +- **Overrated:** **Overall B** might **overstate** readiness: **token ~21%** means heavy **raw-color/raw-font** — first **visual-compare** often fails on **font and fill** unless explicitly mocked. +- **Underrated:** Small **node count** — fast iteration; score may **feel harsh** vs effort to **manually** match a tiny scope. +- **Remedies:** + - **Workflow:** Predeclare **font stack** and **color substitution** in the prompt to absorb raw-font/raw-color. + - **Tooling:** Optional “**compare mode: ignore typography**” (dangerous but documented) for layout-only checks. + +--- + +## Per-fixture notes (snapshot-oriented) + +Names refer to directories under `fixtures/` or `fixtures/done/`. Grades refer to the last batch analyze (v0.8.9); re-run after config changes. 
+ +### `done/simple-ds-page-sections` — A (86%) + +- **Overrated:** **Token/component** weakness (raw color, missing descriptions, invisible layers) is **under-reflected** in a single letter “A” for **naive** AI codegen without DS context. +- **Underrated:** **Small, shallow tree** — easier to reason about than bulk Simple DS fixtures; score is not “too kind” to structure. +- **Remedies:** Add **component description** stub in prompt; clean invisible layers; run **visual-compare** once to validate the A. + +### `material3-56615-45927` — C+ (72%), large unscoped + +- **Overrated:** Unlikely at file level — **C+** already warns; per-frame might be better. +- **Underrated:** Possible if user only implements **one** repeated component — issue count is **file-wide**. +- **Remedies:** **Mandatory scoping**; treat as multiple fixtures. + +### `simple-ds-175-9106` / `175-8591` / `175-7790` / `562-9518` — B ~ B+ + +- **Overrated:** **raw-color + invisible-layer** volume — **B/B+** may **overrate** first-shot pixel match. +- **Underrated:** **Detach/instance** and **default-name** sometimes fixable with a strong “use Figma structure as source of truth” prompt. +- **Remedies:** Scope; variable cleanup; explicit **instance** handling in prompt. + +### `simple-ds-4333-9262` — A (89%) + +- **Overrated:** Still **missing-component-description** and spacing — “A” is **fragile** for **fully autonomous** AI without extra text context. +- **Underrated:** Among the **cleaner** Simple DS slices — good **golden** candidate; score may be **fair or slightly harsh** if issues are mostly doc/metadata. +- **Remedies:** Use as **benchmark**; add descriptions only if product needs them for handoff. 
+ +### `material3-52949-27916` — B (79%), 400+ issues + +- **Overrated:** Whole-file **B** might **overrate** “one-shot implement whole thing.” +- **Underrated:** For **systematic** spacing tokens, a **macro prompt** might achieve **ok** similarity on repetitive regions — composite grade **undersells** repeatability. +- **Remedies:** Split fixture; calibrate spacing rules on **scoped** chunks. + +### `done/simple-ds-panel-sections` — A (87%) + +- **Overrated:** Same invisible/raw-color cluster as other Simple DS **done** sets — **A** vs **pixel** needs verification. +- **Underrated:** Low depth, moderate issues — reasonable **hand-implement** cost. + +### `material3-51954-18254` — B+ (82%), spacing-dominated + +- **Overrated:** Low probability — issue profile is **honest** about layout math risk. +- **Underrated:** If the AI is given **explicit spacing table** extracted once from Figma, difficulty drops **faster** than score implies. +- **Remedies:** Export **spacing tokens** or measurement table alongside `data.json`. + +### `done/material3-kit-2` — B+ (81%) + +- **Overrated:** **deep-nesting + sibling direction** — **B+** might **overrate** “drop in one prompt” success for junior models. +- **Underrated:** **Narrow node count** (96) — less overwhelming than `82356`. +- **Remedies:** Refactor nesting in Figma for handoff; or implement **section-by-section**. + +### `material3-56615-82356` — C (66%) + +- **Overrated:** Rare — **C** is already blunt. +- **Underrated:** If scoped to **one** leaf screen, local grade might be **much higher** — file-level **undersells** localized quality. +- **Remedies:** **Never** use as single-scope “implement all”; split into **10+** scoped fixtures. + +### `done/material3-kit-1` — A (85%) + +- **Overrated:** **Still mixed token/naming/invisible** — **A** is strong; verify with **visual-compare** for **stock** AI. +- **Underrated:** Mature kit — **repeatable patterns** help experienced prompts. 
+- **Remedies:** Keep as **positive** control next to `kit-2` / `82356`. + +### `done/simple-ds-card-grid` — B+ (80%) + +- **Overrated:** **Component/naming** weak — **B+** may **overrate** grid+card **pixel** parity without layout hints. +- **Underrated:** **Calibration history** — score may be **well tuned** for this file; trust **relative** more than absolute. +- **Remedies:** Explicit **grid columns/gap** in prompt. + +### `done/figma-ui3-kit` — B (76%), token floor + +- **Overrated:** **Overall B** with **token 21%** — **readiness for unguided AI** is **overrated** unless fonts/colors are specified in prompt. +- **Underrated:** **20 nodes** — absolute work is small; grade **undersells** “quick human polish.” +- **Remedies:** Font and color contract in README for this fixture; optional webfont load in generated HTML. + +--- + +## Raising “validity” of the metric (product direction) + +1. **Always pair** headline grade with **visual-compare** (or explicit “not run”) on the **same scoped node**. +2. **Publish** whether the run was **scoped** and **node count** in JSON (provenance) — avoids comparing whole-file vs framed scores. +3. **Calibrate** rule weights using **pixel delta categories**, not issue count alone — reduces **over/under** gap for AI use cases. +4. **Two summaries** in reports: **“handoff / DS hygiene”** vs **“likely first visual-compare (AI)”** — can diverge. + +--- + +## Revision + +Re-evaluate after major `rule-config` or rule-set changes; per-fixture grades will move. 
diff --git a/src/agents/report-generator.ts b/src/agents/report-generator.ts index a340eabf..fb805c09 100644 --- a/src/agents/report-generator.ts +++ b/src/agents/report-generator.ts @@ -15,6 +15,12 @@ export interface CalibrationReportData { validatedRules: string[]; adjustments: ScoreAdjustment[]; newRuleProposals: NewRuleProposal[]; + /** Design tree token metrics (optional — present when design-tree stats are available) */ + tokenMetrics?: { + designTreeTokens: number; + designTreeBytes: number; + tokensPerNode: number; + } | undefined; } /** @@ -47,7 +53,9 @@ function renderOverview(data: CalibrationReportData): string { | Total Issues | ${data.issueCount} | | Converted Nodes | ${data.convertedNodeCount} | | Skipped Nodes | ${data.skippedNodeCount} | -| Overall Grade | ${data.scoreReport.overall.grade} (${data.scoreReport.overall.percentage}%) | +| Overall Grade | ${data.scoreReport.overall.grade} (${data.scoreReport.overall.percentage}%) |${data.tokenMetrics ? ` +| Design Tree Tokens | ~${data.tokenMetrics.designTreeTokens.toLocaleString()} tokens (${Math.round(data.tokenMetrics.designTreeBytes / 1024)}KB) | +| Tokens per Node | ~${Math.round(data.tokenMetrics.tokensPerNode)} |` : ""} `; } diff --git a/src/cli/index.ts b/src/cli/index.ts index 21699a3c..083693fe 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -864,7 +864,6 @@ cli .action(async (input: string, options: { token?: string; output?: string; vectorDir?: string }) => { try { const { file } = await loadFile(input, options.token); - const { generateDesignTree } = await import("../core/engine/design-tree.js"); // Auto-detect vector dir from fixture path let vectorDir = options.vectorDir; @@ -876,16 +875,17 @@ cli if (existsSync(autoDir)) vectorDir = autoDir; } - const tree = generateDesignTree(file, vectorDir ? { vectorDir } : undefined); + const { generateDesignTreeWithStats } = await import("../core/engine/design-tree.js"); + const stats = generateDesignTreeWithStats(file, vectorDir ? 
{ vectorDir } : undefined); if (options.output) { const outputDir = dirname(resolve(options.output)); if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true }); const { writeFile: writeFileAsync } = await import("node:fs/promises"); - await writeFileAsync(resolve(options.output), tree, "utf-8"); - console.log(`Design tree saved: ${resolve(options.output)} (${Math.round(tree.length / 1024)}KB)`); + await writeFileAsync(resolve(options.output), stats.tree, "utf-8"); + console.log(`Design tree saved: ${resolve(options.output)} (${Math.round(stats.bytes / 1024)}KB, ~${stats.estimatedTokens} tokens)`); } else { - console.log(tree); + console.log(stats.tree); } } catch (error) { console.error("\nError:", error instanceof Error ? error.message : String(error)); diff --git a/src/core/engine/design-tree.ts b/src/core/engine/design-tree.ts index b626bf33..81e2db30 100644 --- a/src/core/engine/design-tree.ts +++ b/src/core/engine/design-tree.ts @@ -231,17 +231,37 @@ export interface DesignTreeOptions { vectorDir?: string; } +/** + * Result of generating a design tree, including token/size statistics. + */ +export interface DesignTreeResult { + /** The design tree text */ + tree: string; + /** Estimated token count (~4 chars per token for mixed code/text) */ + estimatedTokens: number; + /** Raw byte size */ + bytes: number; +} + /** * Generate a design tree string from an AnalysisFile. */ export function generateDesignTree(file: AnalysisFile, options?: DesignTreeOptions): string { + return generateDesignTreeWithStats(file, options).tree; +} + +/** + * Generate a design tree with token/size statistics. + * Use this when you need to measure token consumption for AI context budget. + */ +export function generateDesignTreeWithStats(file: AnalysisFile, options?: DesignTreeOptions): DesignTreeResult { const root = file.document; const w = root.absoluteBoundingBox ? Math.round(root.absoluteBoundingBox.width) : 0; const h = root.absoluteBoundingBox ?
Math.round(root.absoluteBoundingBox.height) : 0; const tree = renderNode(root, 0, options?.vectorDir, file.components); - return [ + const result = [ "# Design Tree", `# Root: ${w}px x ${h}px`, "# Each node shows: name (TYPE, WxH) followed by CSS-like styles", @@ -250,4 +270,10 @@ export function generateDesignTree(file: AnalysisFile, options?: DesignTreeOptio "", tree, ].join("\n"); + + const bytes = Buffer.byteLength(result, "utf-8"); + // ~4 chars per token for mixed code/text (conservative estimate) + const estimatedTokens = Math.ceil(result.length / 4); + + return { tree: result, estimatedTokens, bytes }; } From 693b4479adaff460ddc261ab5931ffc1651f6782 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:49:46 +0900 Subject: [PATCH 07/11] test: add generateDesignTreeWithStats unit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Returns tree, estimatedTokens, bytes - Output matches generateDesignTree - CLI verified: 31KB fixture → ~7912 tokens Co-Authored-By: Claude Opus 4.6 (1M context) --- src/core/engine/design-tree.test.ts | 37 ++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/src/core/engine/design-tree.test.ts b/src/core/engine/design-tree.test.ts index ee561c57..1e273c04 100644 --- a/src/core/engine/design-tree.test.ts +++ b/src/core/engine/design-tree.test.ts @@ -2,7 +2,7 @@ import { mkdtempSync, mkdirSync, writeFileSync } from "node:fs"; import { join } from "node:path"; import { tmpdir } from "node:os"; import { rm } from "node:fs/promises"; -import { generateDesignTree } from "./design-tree.js"; +import { generateDesignTree, generateDesignTreeWithStats } from "./design-tree.js"; import type { AnalysisFile, AnalysisNode } from "../contracts/figma-node.js"; function makeFile(document: AnalysisNode): AnalysisFile { @@ -24,6 +24,41 @@ function makeNode(overrides: Partial & { id: string; name: string; }; } +describe("generateDesignTreeWithStats", () => 
{ + it("returns tree string, estimatedTokens, and bytes", () => { + const file = makeFile( + makeNode({ + id: "1:1", + name: "Frame", + type: "FRAME", + absoluteBoundingBox: { x: 0, y: 0, width: 400, height: 300 }, + children: [ + makeNode({ id: "1:2", name: "Child", type: "TEXT", characters: "Hello", absoluteBoundingBox: { x: 0, y: 0, width: 100, height: 20 } }), + ], + }) + ); + + const stats = generateDesignTreeWithStats(file); + + expect(stats.tree).toContain("# Design Tree"); + expect(stats.tree).toContain("Frame (FRAME, 400x300)"); + expect(stats.bytes).toBeGreaterThan(0); + expect(stats.estimatedTokens).toBeGreaterThan(0); + expect(stats.estimatedTokens).toBe(Math.ceil(stats.tree.length / 4)); + }); + + it("tree output matches generateDesignTree", () => { + const file = makeFile( + makeNode({ id: "1:1", name: "Root", type: "FRAME" }) + ); + + const tree = generateDesignTree(file); + const stats = generateDesignTreeWithStats(file); + + expect(stats.tree).toBe(tree); + }); +}); + describe("generateDesignTree", () => { describe("header generation", () => { it("generates header with root dimensions", () => { From a146e1e571ccd4400ae923262651b10d366da16d Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:53:07 +0900 Subject: [PATCH 08/11] refactor: extract decision helpers + add JSDoc coverage - Extract normalizeDecision() and countDecisions() from duplicated String(d.decision).trim().toLowerCase() pattern (was 3x) - Add APPLIED_TYPES/REJECTED_TYPES constants - Add JSDoc to all exported interfaces/functions in changed files: run-directory.ts, visual-compare.ts, design-tree.ts, report-generator.ts, docs.ts Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/report-generator.ts | 1 + src/agents/run-directory.ts | 46 ++++++++++++++++++------------- src/cli/docs.ts | 5 ++++ src/core/engine/design-tree.ts | 1 + src/core/engine/visual-compare.ts | 2 ++ 5 files changed, 36 insertions(+), 19 deletions(-) diff --git a/src/agents/report-generator.ts 
b/src/agents/report-generator.ts index fb805c09..18fbade4 100644 --- a/src/agents/report-generator.ts +++ b/src/agents/report-generator.ts @@ -2,6 +2,7 @@ import type { ScoreReport } from "@/core/engine/scoring.js"; import type { MismatchCase } from "./contracts/evaluation-agent.js"; import type { ScoreAdjustment, NewRuleProposal } from "./contracts/tuning-agent.js"; +/** Data structure for generating calibration report markdown. */ export interface CalibrationReportData { fileKey: string; fileName: string; diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts index a5906224..a1ba5a80 100644 --- a/src/agents/run-directory.ts +++ b/src/agents/run-directory.ts @@ -159,6 +159,7 @@ export function moveFixtureToDone(fixturePath: string, fixturesDir: string = DEF // --- Debate result parsing --- +/** A single decision from the Arbitrator in debate.json. */ export interface DebateDecision { ruleId: string; decision: string; @@ -167,6 +168,7 @@ export interface DebateDecision { reason?: string | undefined; } +/** Parsed debate.json structure from a calibration run. */ export interface DebateResult { critic: { summary: string; @@ -211,27 +213,41 @@ export function parseDebateResult(runDir: string): DebateResult | null { } } -/** - * Extract ruleIds that were applied or revised by the Arbitrator. - */ +/** Type guard for debate decision records (defensive against malformed JSON). */ function isDecisionRecord(d: unknown): d is { decision?: unknown; ruleId?: unknown } { return d !== null && typeof d === "object"; } +/** Normalize a decision field value to lowercase trimmed string. */ +function normalizeDecision(d: { decision?: unknown }): string { + return String(d.decision ?? "").trim().toLowerCase(); +} + +/** Count decisions matching a set of decision types. 
*/ +function countDecisions(decisions: unknown[], types: Set): number { + return decisions.filter((d) => { + if (!isDecisionRecord(d)) return false; + return types.has(normalizeDecision(d)); + }).length; +} + +const APPLIED_TYPES = new Set(["applied", "revised"]); +const REJECTED_TYPES = new Set(["rejected"]); + +/** + * Extract ruleIds that were applied or revised by the Arbitrator. + */ export function extractAppliedRuleIds(debate: DebateResult): string[] { if (!debate.arbitrator) return []; const decisions = debate.arbitrator.decisions; if (!Array.isArray(decisions)) return []; return decisions - .filter((d) => { - if (!isDecisionRecord(d)) return false; - const dec = String(d.decision ?? "").trim().toLowerCase(); - return dec === "applied" || dec === "revised"; - }) - .map((d) => String(d.ruleId ?? "").trim()) + .filter((d) => isDecisionRecord(d) && APPLIED_TYPES.has(normalizeDecision(d))) + .map((d) => String((d as { ruleId?: unknown }).ruleId ?? "").trim()) .filter((id) => id.length > 0); } +/** Options for convergence checking. */ export interface ConvergenceOptions { /** * When true, converged iff no applied/revised decisions (ignore rejected count). @@ -252,16 +268,8 @@ export function isConverged(runDir: string, options?: ConvergenceOptions): boole if (!debate.arbitrator) return false; const decisions = debate.arbitrator.decisions; if (!Array.isArray(decisions)) return false; - const changed = decisions.filter((d) => { - if (!isDecisionRecord(d)) return false; - const dec = String(d.decision ?? "").trim().toLowerCase(); - return dec === "applied" || dec === "revised"; - }).length; - const rejected = decisions.filter((d) => { - if (!isDecisionRecord(d)) return false; - const dec = String(d.decision ?? 
"").trim().toLowerCase(); - return dec === "rejected"; - }).length; + const changed = countDecisions(decisions, APPLIED_TYPES); + const rejected = countDecisions(decisions, REJECTED_TYPES); if (options?.lenient) { return changed === 0; } diff --git a/src/cli/docs.ts b/src/cli/docs.ts index 6804ec5e..b5c40e78 100644 --- a/src/cli/docs.ts +++ b/src/cli/docs.ts @@ -7,6 +7,7 @@ import { createRequire } from "node:module"; const require = createRequire(import.meta.url); const pkg = require("../../package.json") as { version: string }; +/** Print the docs index with available topics. */ export function printDocsIndex(): void { console.log(` CANICODE DOCUMENTATION (v${pkg.version}) @@ -19,6 +20,7 @@ Full documentation: github.com/let-sunny/canicode#readme `.trimStart()); } +/** Print the setup guide (CLI, MCP, Skills). */ export function printDocsSetup(): void { console.log(` CANICODE SETUP GUIDE @@ -102,6 +104,7 @@ CANICODE SETUP GUIDE `.trimStart()); } +/** Print custom rules guide with examples. */ export function printDocsRules(): void { console.log(` CUSTOM RULES GUIDE @@ -159,6 +162,7 @@ TIP: Ask any LLM "Write a canicode custom rule that checks X" with the `.trimStart()); } +/** Print config override guide with examples. */ export function printDocsConfig(): void { console.log(` CONFIG GUIDE @@ -276,6 +280,7 @@ const DOCS_TOPICS: Record void> = { "design-tree": printDocsDesignTree, }; +/** Route docs command to the appropriate topic handler. */ export function handleDocs(topic?: string): void { if (!topic) { printDocsIndex(); diff --git a/src/core/engine/design-tree.ts b/src/core/engine/design-tree.ts index 81e2db30..a7c378e7 100644 --- a/src/core/engine/design-tree.ts +++ b/src/core/engine/design-tree.ts @@ -226,6 +226,7 @@ function renderNode( return lines.join("\n"); } +/** Options for design tree generation. 
*/ export interface DesignTreeOptions { /** Directory containing .svg files for VECTOR nodes */ vectorDir?: string; diff --git a/src/core/engine/visual-compare.ts b/src/core/engine/visual-compare.ts index fdb3b806..62899ecb 100644 --- a/src/core/engine/visual-compare.ts +++ b/src/core/engine/visual-compare.ts @@ -8,6 +8,7 @@ import { resolve, dirname } from "node:path"; import pixelmatch from "pixelmatch"; import { PNG } from "pngjs"; +/** Result of a visual comparison between Figma design and rendered code. */ export interface VisualCompareResult { similarity: number; diffPixels: number; @@ -19,6 +20,7 @@ export interface VisualCompareResult { diffPath: string; } +/** Options for the visual comparison pipeline. */ export interface VisualCompareOptions { figmaUrl: string; figmaToken: string; From 726567904a33807f154ff734f9b3903f0d51bad8 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:56:20 +0900 Subject: [PATCH 09/11] =?UTF-8?q?fix:=20address=20PR=20#28=20review=20?= =?UTF-8?q?=E2=80=94=205=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. evidence-collector: normalize ruleId/fixture before storing (prevents bucket splitting from whitespace differences) 2. cli: validate --width/--height as positive numbers 3. visual-compare: browser.close() in finally block (prevents Chromium leak on goto/screenshot failure) 4. visual-compare: remove stale figma.png comment, keep simple cache-miss-only logic 5. 
mcp docs: sync visual-compare topic with new scale options Deferred: Zod schema for debate.json (#2), test helper extraction (#4) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/evidence-collector.ts | 6 ++-- src/cli/index.ts | 9 ++++++ src/core/engine/visual-compare.ts | 49 ++++++++++++++++--------------- src/mcp/server.ts | 9 ++++-- 4 files changed, 45 insertions(+), 28 deletions(-) diff --git a/src/agents/evidence-collector.ts b/src/agents/evidence-collector.ts index 4f98dc61..6ea7497f 100644 --- a/src/agents/evidence-collector.ts +++ b/src/agents/evidence-collector.ts @@ -86,10 +86,12 @@ export function appendCalibrationEvidence( if (entries.length === 0) return; const existing = readValidatedArray(evidencePath, CalibrationEvidenceEntrySchema); // Same batch can repeat (ruleId, fixture); last entry wins (matches cross-call behavior) + // Normalize ruleId/fixture to prevent bucket splitting from whitespace differences const incomingByKey = new Map(); for (const e of entries) { - const k = `${e.ruleId.trim()}\0${e.fixture.trim()}`; - incomingByKey.set(k, e); + const normalized = { ...e, ruleId: e.ruleId.trim(), fixture: e.fixture.trim() }; + const k = `${normalized.ruleId}\0${normalized.fixture}`; + incomingByKey.set(k, normalized); } const mergedIncoming = [...incomingByKey.values()]; const keys = new Set(incomingByKey.keys()); diff --git a/src/cli/index.ts b/src/cli/index.ts index 083693fe..2855d683 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -939,6 +939,15 @@ cli process.exit(1); } + if (options.width !== undefined && (!Number.isFinite(options.width) || options.width <= 0)) { + console.error("Error: --width must be a positive number"); + process.exit(1); + } + if (options.height !== undefined && (!Number.isFinite(options.height) || options.height <= 0)) { + console.error("Error: --height must be a positive number"); + process.exit(1); + } + const hasViewportOverride = options.width !== undefined || options.height !== undefined; 
console.log("Comparing..."); diff --git a/src/core/engine/visual-compare.ts b/src/core/engine/visual-compare.ts index 62899ecb..9ac21bb7 100644 --- a/src/core/engine/visual-compare.ts +++ b/src/core/engine/visual-compare.ts @@ -149,26 +149,29 @@ export async function renderCodeScreenshot( // Dynamic import — playwright is an optional dependency const { chromium } = await import("playwright"); const browser = await chromium.launch(); - const context = await browser.newContext({ - viewport: logicalViewport, - deviceScaleFactor, - }); - const page = await context.newPage(); - - await page.goto(`file://${resolve(codePath)}`, { - waitUntil: "networkidle", - timeout: 30_000, - }); - await page.waitForTimeout(1000); - - // Capture only the first child element (the design root), not the full body/viewport - const root = page.locator("body > *:first-child"); - if (await root.count() > 0) { - await root.screenshot({ path: outputPath }); - } else { - await page.screenshot({ path: outputPath }); + try { + const context = await browser.newContext({ + viewport: logicalViewport, + deviceScaleFactor, + }); + const page = await context.newPage(); + + await page.goto(`file://${resolve(codePath)}`, { + waitUntil: "networkidle", + timeout: 30_000, + }); + await page.waitForTimeout(1000); + + // Capture only the first child element (the design root), not the full body/viewport + const root = page.locator("body > *:first-child"); + if (await root.count() > 0) { + await root.screenshot({ path: outputPath }); + } else { + await page.screenshot({ path: outputPath }); + } + } finally { + await browser.close(); } - await browser.close(); } /** @@ -262,10 +265,10 @@ export async function visualCompare(options: VisualCompareOptions): Promise | (required) | Figma URL with node-id | | --token | FIGMA_TOKEN env | Figma API token | | --output | /tmp/canicode-visual-compare | Output directory | -| --width | (from Figma screenshot) | Viewport width override | -| --height | (from Figma screenshot) | 
Viewport height override | +| --width | (inferred from Figma PNG ÷ scale) | Logical viewport width in CSS px | +| --height | (inferred from Figma PNG ÷ scale) | Logical viewport height in CSS px | +| --figma-scale | 2 | Figma export scale (matches @2x fixture PNGs) | + +Viewport and device pixel ratio are auto-inferred from the Figma PNG dimensions and export scale. Override only when needed. ## Output Files /tmp/canicode-visual-compare/ - figma.png — Figma screenshot (scale=2) + figma.png — Figma screenshot (at export scale, default @2x) code.png — Playwright render of your HTML diff.png — Pixel diff (red = different) From bd8c90ba52ba3968e223e074cd7621e8373d3b7a Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 07:59:14 +0900 Subject: [PATCH 10/11] chore: add GitHub issue templates (bug, feature, research) Three templates matching the patterns used in this project: - Bug: symptom + cause + fix + affected files - Feature/Refactor: background + current + proposal + prerequisites - Research: question + method + expected outcome + blockers Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/ISSUE_TEMPLATE/bug.yml | 44 ++++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature.yml | 52 +++++++++++++++++++++++++++++ .github/ISSUE_TEMPLATE/research.yml | 50 +++++++++++++++++++++++++++ 3 files changed, 146 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug.yml create mode 100644 .github/ISSUE_TEMPLATE/feature.yml create mode 100644 .github/ISSUE_TEMPLATE/research.yml diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml new file mode 100644 index 00000000..ac590d4e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug.yml @@ -0,0 +1,44 @@ +name: Bug Report +description: Something is broken or producing wrong results +labels: ["bug"] +body: + - type: textarea + id: symptom + attributes: + label: Symptom + description: What's happening? Include error messages, wrong output, or unexpected behavior. 
+ placeholder: "invisible-layer scores -10 (blocking) but hidden layers don't block implementation" + validations: + required: true + - type: textarea + id: cause + attributes: + label: Cause + description: Why is this happening? Root cause if known. + placeholder: "design-tree already skips visible:false nodes, so hidden layers have zero impact on code generation" + validations: + required: false + - type: textarea + id: fix + attributes: + label: Proposed Fix + description: How should it be fixed? + placeholder: "Change severity from blocking to suggestion, score from -10 to -1" + validations: + required: false + - type: textarea + id: files + attributes: + label: Affected Files + description: Which source files need changes? + placeholder: "src/core/rules/rule-config.ts, src/core/rules/ai-readability/index.ts" + validations: + required: false + - type: textarea + id: related + attributes: + label: Related Issues + description: Link related issues or PRs + placeholder: "#14, #15" + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/feature.yml b/.github/ISSUE_TEMPLATE/feature.yml new file mode 100644 index 00000000..cf708c3e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature.yml @@ -0,0 +1,52 @@ +name: Feature / Refactor +description: New functionality or improvement to existing code +labels: ["enhancement"] +body: + - type: textarea + id: background + attributes: + label: Background + description: Why is this needed? What problem does it solve? + placeholder: "save-fixture doesn't store component master nodes, so design-tree and analysis rules can't access component structure" + validations: + required: true + - type: textarea + id: current + attributes: + label: Current State + description: How does it work now? Include code snippets if helpful. 
+ placeholder: | + ```typescript + // Only stores component metadata (name, description) + components: { "comp:1": { name: "Button", description: "" } } + // Missing: master node tree + ``` + validations: + required: false + - type: textarea + id: proposal + attributes: + label: Proposal + description: What should change? Be specific about the approach. + placeholder: | + 1. Collect componentIds from INSTANCE nodes + 2. Fetch master nodes via getFileNodes + 3. Store in componentDefinitions field + validations: + required: true + - type: textarea + id: prerequisites + attributes: + label: Prerequisites + description: What needs to be done first? + placeholder: "#16 must be merged before this can work" + validations: + required: false + - type: textarea + id: related + attributes: + label: Related Issues + description: Link related issues or PRs + placeholder: "#14, #17" + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/research.yml b/.github/ISSUE_TEMPLATE/research.yml new file mode 100644 index 00000000..b68014d6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/research.yml @@ -0,0 +1,50 @@ +name: Research / Investigation +description: Needs exploration before implementation — API feasibility, impact measurement, etc. +labels: ["research"] +body: + - type: textarea + id: question + attributes: + label: Research Question + description: What are we trying to find out? + placeholder: "Does Figma's annotation data improve AI implementation accuracy?" + validations: + required: true + - type: textarea + id: method + attributes: + label: Method + description: How will we investigate? Include test fixtures, A/B approach, API calls, etc. + placeholder: | + A/B test: + - Test A: implement without annotations + - Test B: implement with annotations + - Compare: similarity + token consumption + validations: + required: true + - type: textarea + id: expected + attributes: + label: Expected Outcome + description: What will the result tell us? 
How will it inform next steps? + placeholder: | + - If annotations improve accuracy → add annotation-related rules + - If no difference → annotations are human-only, not AI-relevant + validations: + required: true + - type: textarea + id: blockers + attributes: + label: Blockers + description: Anything that might prevent this research? + placeholder: "REST API annotations field is private beta — may need MCP-only path" + validations: + required: false + - type: textarea + id: related + attributes: + label: Related Issues + description: Link related issues or PRs + placeholder: "#14, #20" + validations: + required: false From fdfd3e9cf1fe4cc2f3f4e573dd10a402e8e279ad Mon Sep 17 00:00:00 2001 From: let-sunny Date: Wed, 25 Mar 2026 08:12:21 +0900 Subject: [PATCH 11/11] fix: coerce --width/--height from string before validation CAC passes CLI option values as strings. Number.isFinite("1440") returns false, rejecting valid inputs. Mirror the figmaScale pattern: coerce with Number() first, then validate. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cli/index.ts | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/cli/index.ts b/src/cli/index.ts index 2855d683..73c528c6 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -939,16 +939,20 @@ cli process.exit(1); } - if (options.width !== undefined && (!Number.isFinite(options.width) || options.width <= 0)) { + // CAC passes option values as strings — coerce to numbers before validation + const width = options.width !== undefined ? Number(options.width) : undefined; + const height = options.height !== undefined ? 
Number(options.height) : undefined; + + if (width !== undefined && (!Number.isFinite(width) || width <= 0)) { console.error("Error: --width must be a positive number"); process.exit(1); } - if (options.height !== undefined && (!Number.isFinite(options.height) || options.height <= 0)) { + if (height !== undefined && (!Number.isFinite(height) || height <= 0)) { console.error("Error: --height must be a positive number"); process.exit(1); } - const hasViewportOverride = options.width !== undefined || options.height !== undefined; + const hasViewportOverride = width !== undefined || height !== undefined; console.log("Comparing..."); const result = await visualCompare({ @@ -960,8 +964,8 @@ cli ...(hasViewportOverride ? { viewport: { - ...(options.width !== undefined ? { width: options.width } : {}), - ...(options.height !== undefined ? { height: options.height } : {}), + ...(width !== undefined ? { width } : {}), + ...(height !== undefined ? { height } : {}), }, } : {}),