53 changes: 44 additions & 9 deletions .claude/agents/calibration/converter.md
@@ -53,14 +53,37 @@ Read and follow `.claude/skills/design-to-code/PROMPT.md` for all code generatio
- Each node in the tree maps 1:1 to an HTML element
- Copy style values directly — they are already CSS-ready
- Follow all rules from DESIGN-TO-CODE-PROMPT.md
3. Save to `$RUN_DIR/output.html`
4. Run visual comparison:
```
npx canicode visual-compare $RUN_DIR/output.html --figma-url "https://www.figma.com/design/<fileKey>/file?node-id=<rootNodeId>" --output $RUN_DIR
```
4. Save to `$RUN_DIR/output.html`
5. Run visual comparison:

```bash
npx canicode visual-compare $RUN_DIR/output.html \
--figma-url "https://www.figma.com/design/<fileKey>/file?node-id=<rootNodeId>" \
--output $RUN_DIR
```

This saves `figma.png`, `code.png`, and `diff.png` into the run directory.
Replace `:` with `-` in the nodeId when building the URL (e.g. `562:9069` becomes `562-9069`).
5. Use similarity to determine overall difficulty (thresholds defined in `src/agents/orchestrator.ts` → `SIMILARITY_DIFFICULTY_THRESHOLDS`):
6. **Responsive comparison** (if expanded screenshot exists):

List `screenshot-*.png` in the fixture directory. Extract the width number from each filename, sort numerically. If 2+ screenshots exist, the smallest width is the original and the largest is the expanded viewport.

```bash
# Example: screenshot-1200.png (original), screenshot-1920.png (expanded)
SCREENSHOTS=($(ls <fixture-path>/screenshot-*.png | sort -t- -k2 -n))
LARGEST="${SCREENSHOTS[-1]}"
LARGEST_WIDTH=$(basename "$LARGEST" | sed -E 's/screenshot-([0-9]+)\.png/\1/')

npx canicode visual-compare $RUN_DIR/output.html \
--figma-url "https://www.figma.com/design/<fileKey>/file?node-id=<rootNodeId>" \
--figma-screenshot "$LARGEST" \
--width "$LARGEST_WIDTH" \
--output $RUN_DIR/responsive
```

The command outputs JSON to stdout with a `similarity` field. Record it as `responsiveSimilarity` and calculate `responsiveDelta = similarity - responsiveSimilarity`.
If only 1 screenshot exists, skip responsive comparison and set `responsiveSimilarity`, `responsiveDelta`, and `responsiveViewport` to `null`.
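The bookkeeping in steps 5 and 6 reduces to a small calculation; a minimal sketch (the null case covers single-screenshot fixtures):

```typescript
/**
 * Delta between original-viewport and expanded-viewport similarity.
 * Positive values mean the design breaks at the expanded viewport;
 * null means no responsive comparison was available.
 */
function responsiveDelta(
  similarity: number,
  responsiveSimilarity: number | null,
): number | null {
  if (responsiveSimilarity == null) return null; // single-screenshot fixture
  return similarity - responsiveSimilarity;
}

console.log(responsiveDelta(87, 72)); // 15, as in the example conversion.json
console.log(responsiveDelta(87, null)); // null
```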
7. Use similarity to determine overall difficulty (thresholds defined in `src/agents/orchestrator.ts` → `SIMILARITY_DIFFICULTY_THRESHOLDS`):

| Similarity | Difficulty |
|-----------|-----------|
@@ -69,13 +92,18 @@ Read and follow `.claude/skills/design-to-code/PROMPT.md` for all code generatio
| 50-69% | hard |
| <50% | failed |
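As a sketch, the lookup might read as below. The actual values live in `SIMILARITY_DIFFICULTY_THRESHOLDS`; the 90% boundary for `easy` is an assumption, since this diff folds the top rows of the table:

```typescript
type Difficulty = "easy" | "moderate" | "hard" | "failed";

// Sketch of the table above. The <50 and 50-69 bounds come from the visible
// rows; the 90 boundary for "easy" is an assumed value, not confirmed here.
function similarityToDifficulty(similarity: number): Difficulty {
  if (similarity < 50) return "failed";
  if (similarity < 70) return "hard";
  if (similarity < 90) return "moderate"; // e.g. the 87% example conversion
  return "easy";
}
```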

6. **MANDATORY — Rule Impact Assessment**: For EVERY rule ID in `nodeIssueSummaries[].flaggedRuleIds`, assess its actual impact on conversion. Read the analysis JSON, collect all unique `flaggedRuleIds`, and for each one write an entry in `ruleImpactAssessment`. This array MUST NOT be empty if there are flagged rules.
8. **MANDATORY — Rule Impact Assessment**: For EVERY rule ID in `nodeIssueSummaries[].flaggedRuleIds`, assess its actual impact on conversion. Read the analysis JSON, collect all unique `flaggedRuleIds`, and for each one write an entry in `ruleImpactAssessment`. This array MUST NOT be empty if there are flagged rules.
- Did this rule's issue actually make the conversion harder?
- What was its real impact on the final similarity score?
- Rate as: `easy` (no real difficulty), `moderate` (some guessing needed), `hard` (significant pixel loss), `failed` (could not reproduce)
7. Note any difficulties NOT covered by existing rules as `uncoveredStruggles`
- **Only include design-related issues** — problems in the Figma file structure, missing tokens, ambiguous layout, etc.
- **Exclude environment/tooling issues** — font CDN availability, screenshot DPI/retina scaling, browser rendering quirks, network issues, CI limitations. These are not design problems and create noise in rule discovery.
9. **Code metrics** (recorded for analysis/reporting — not consumed by evaluation):
- `htmlBytes`: file size in bytes
- `htmlLines`: line count
- `cssClassCount`: unique CSS class selectors in `<style>` block
- `cssVariableCount`: unique CSS custom properties (e.g., `--primary-color:`, `--spacing-md:`) in `<style>` block
10. Note any difficulties NOT covered by existing rules as `uncoveredStruggles`
- **Only include design-related issues** — problems in the Figma file structure, missing tokens, ambiguous layout, etc.
- **Exclude environment/tooling issues** — font CDN availability, screenshot DPI/retina scaling, browser rendering quirks, network issues, CI limitations. These are not design problems and create noise in rule discovery.
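The CSS metrics in step 9 can be computed with small helpers; this sketch mirrors the `countCssClasses` / `countCssVariables` helpers exported from `src/core/engine/visual-compare-helpers.ts`:

```typescript
/** Unique CSS class selectors in the first <style> block (cssClassCount). */
function countCssClasses(html: string): number {
  const styleMatch = html.match(/<style[\s\S]*?<\/style>/i);
  if (!styleMatch) return 0;
  const classes = styleMatch[0].match(/\.[a-zA-Z][\w-]*\s*[{,:]/g);
  return new Set(classes?.map((c) => c.replace(/\s*[{,:]$/, ""))).size;
}

/** Unique CSS custom properties in the <style> block (cssVariableCount). */
function countCssVariables(html: string): number {
  const styleMatch = html.match(/<style[\s\S]*?<\/style>/i);
  if (!styleMatch) return 0;
  const vars = styleMatch[0].match(/--[\w-]+\s*:/g);
  return new Set(vars?.map((v) => v.replace(/\s*:$/, ""))).size;
}

// htmlBytes and htmlLines are simpler still:
// Buffer.byteLength(html, "utf8") and html.split("\n").length.

const html = '<style>.card{--gap:8px}.card,.btn{color:red}</style>';
console.log(countCssClasses(html));   // 2 (.card, .btn)
console.log(countCssVariables(html)); // 1 (--gap)
```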

## Output

Expand All @@ -88,6 +116,13 @@ Write results to `$RUN_DIR/conversion.json`.
"rootNodeId": "562:9069",
"generatedCode": "// The full HTML page",
"similarity": 87,
"responsiveSimilarity": 72,
"responsiveDelta": 15,
"responsiveViewport": 1920,
"htmlBytes": 42000,
"htmlLines": 850,
"cssClassCount": 45,
"cssVariableCount": 12,
"difficulty": "moderate",
"notes": "Summary of the conversion experience",
"ruleImpactAssessment": [
7 changes: 6 additions & 1 deletion .claude/commands/calibrate-loop.md
@@ -79,9 +79,14 @@ ls $RUN_DIR/conversion.json $RUN_DIR/output.html

If `conversion.json` is missing, write it yourself from the Converter's returned summary.

**Record token usage**: The subagent result includes `total_tokens`, `tool_uses`, `duration_ms` in usage metadata. Read `conversion.json`, add these fields, and write back:
- `converterTokens`: total tokens consumed by the Converter subagent
- `converterToolUses`: number of tool calls
- `converterDurationMs`: execution time in milliseconds
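A minimal sketch of that read-merge-write step (the usage-metadata field names are from the text above; the object shape is otherwise an assumption):

```typescript
// Hypothetical shape of the subagent's usage metadata.
interface SubagentUsage {
  total_tokens: number;
  tool_uses: number;
  duration_ms: number;
}

/** Merge Converter usage metadata into a parsed conversion.json object. */
function withConverterUsage<T extends Record<string, unknown>>(
  conversion: T,
  usage: SubagentUsage,
) {
  return {
    ...conversion,
    converterTokens: usage.total_tokens,
    converterToolUses: usage.tool_uses,
    converterDurationMs: usage.duration_ms,
  };
}

// Read-merge-write against $RUN_DIR/conversion.json:
//   const path = `${runDir}/conversion.json`;
//   const merged = withConverterUsage(JSON.parse(readFileSync(path, "utf8")), usage);
//   writeFileSync(path, JSON.stringify(merged, null, 2) + "\n");
```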

Append to `$RUN_DIR/activity.jsonl`:
```json
{"step":"Converter","timestamp":"<ISO8601>","result":"similarity=<N>% difficulty=<level>","durationMs":<ms>}
{"step":"Converter","timestamp":"<ISO8601>","result":"similarity=<N>% difficulty=<level> tokens=<N>","durationMs":<ms>}
```

### Step 3 — Gap Analysis
23 changes: 4 additions & 19 deletions src/agents/ablation/helpers.ts
@@ -7,7 +7,7 @@ import { resolve, join } from "node:path";
import Anthropic from "@anthropic-ai/sdk";

import { renderCodeScreenshot } from "../../core/engine/visual-compare.js";
import { compareScreenshots } from "../../core/engine/visual-compare-helpers.js";
import { compareScreenshots, inferExportScale } from "../../core/engine/visual-compare-helpers.js";

// --- Configuration ---

@@ -94,21 +94,9 @@ export function getResponseText(response: Anthropic.Message): string {
.join("\n");
}

// --- CSS metrics ---
// --- CSS metrics (re-export from core) ---

export function countCssClasses(html: string): number {
const styleMatch = html.match(/<style[\s\S]*?<\/style>/i);
if (!styleMatch) return 0;
const classes = styleMatch[0].match(/\.[a-zA-Z][\w-]*\s*[{,:]/g);
return new Set(classes?.map((c) => c.replace(/\s*[{,:]$/, ""))).size;
}

export function countCssVariables(html: string): number {
const styleMatch = html.match(/<style[\s\S]*?<\/style>/i);
if (!styleMatch) return 0;
const vars = styleMatch[0].match(/--[\w-]+\s*:/g);
return new Set(vars?.map((v) => v.replace(/\s*:$/, ""))).size;
}
export { countCssClasses, countCssVariables } from "../../core/engine/visual-compare-helpers.js";

// --- File operations ---

@@ -162,10 +150,7 @@ export async function renderAndCompare(
const { PNG } = await import("pngjs");
const figmaImage = PNG.sync.read(readFileSync(figmaScreenshotPath));
const figmaWidth = figmaImage.width;
// Figma save-fixture exports at @2x by default. 1920/768 condition screenshots are @1x.
// Detect: if width matches a known @1x size (1920, 768), use scale 1. Otherwise @2x.
const KNOWN_1X_WIDTHS = [1920, 768];
const exportScale = KNOWN_1X_WIDTHS.includes(figmaWidth) ? 1 : 2;
const exportScale = inferExportScale(figmaWidth);
const logicalW = Math.max(1, Math.round(figmaWidth / exportScale));
const logicalH = Math.max(1, Math.round(figmaImage.height / exportScale));

6 changes: 6 additions & 0 deletions src/agents/contracts/evaluation-agent.ts
@@ -47,6 +47,12 @@ export interface EvaluationAgentInput {
}>;
}>;
ruleScores: Record<string, { score: number; severity: string }>;
/**
* Responsive viewport comparison delta (similarity - responsiveSimilarity).
* Positive = design breaks at expanded viewport. Used to evaluate responsive-critical rules.
* null/undefined = no responsive comparison available.
*/
responsiveDelta?: number | null | undefined;
}

export interface EvaluationAgentOutput {
124 changes: 124 additions & 0 deletions src/agents/evaluation-agent.test.ts
@@ -189,6 +189,130 @@ describe("runEvaluationAgent", () => {
expect(result.validatedRules).toContain("rule-a");
});

it("overrides responsive-critical rule from validated to underscored when responsiveDelta is high", () => {
const input: EvaluationAgentInput = {
nodeIssueSummaries: [
{ nodeId: "node-1", nodePath: "Page > Frame", flaggedRuleIds: ["fixed-size-in-auto-layout"] },
],
conversionRecords: [
{
nodeId: "node-1",
nodePath: "Page > Frame",
difficulty: "easy",
ruleRelatedStruggles: [
{ ruleId: "fixed-size-in-auto-layout", description: "Looked fine", actualImpact: "easy" },
],
uncoveredStruggles: [],
},
],
ruleScores: {
"fixed-size-in-auto-layout": { score: -6, severity: "risk" },
},
responsiveDelta: 25,
};

const result = runEvaluationAgent(input);

const match = result.mismatches.find(m => m.ruleId === "fixed-size-in-auto-layout");
expect(match).toBeDefined();
// AI said "easy" but responsiveDelta=25 → hard → score -6 is underscored (expected -8 to -12)
expect(match!.type).toBe("underscored");
expect(match!.actualDifficulty).toBe("hard");
expect(match!.reasoning).toContain("responsive");
// Must NOT be in validatedRules (was validated before override, removed after)
expect(result.validatedRules).not.toContain("fixed-size-in-auto-layout");
});

it("keeps responsive-critical rule validated when responsiveDelta is low", () => {
const input: EvaluationAgentInput = {
nodeIssueSummaries: [
{ nodeId: "node-1", nodePath: "Page > Frame", flaggedRuleIds: ["missing-size-constraint"] },
],
conversionRecords: [
{
nodeId: "node-1",
nodePath: "Page > Frame",
difficulty: "easy",
ruleRelatedStruggles: [
{ ruleId: "missing-size-constraint", description: "Fine", actualImpact: "easy" },
],
uncoveredStruggles: [],
},
],
ruleScores: {
"missing-size-constraint": { score: -2, severity: "suggestion" },
},
responsiveDelta: 3,
};

const result = runEvaluationAgent(input);

const match = result.mismatches.find(m => m.ruleId === "missing-size-constraint");
expect(match).toBeDefined();
expect(match!.type).toBe("validated");
expect(match!.actualDifficulty).toBe("easy");
expect(result.validatedRules).toContain("missing-size-constraint");
});

it("does not override non-responsive-critical rules even with high responsiveDelta", () => {
const input: EvaluationAgentInput = {
nodeIssueSummaries: [
{ nodeId: "node-1", nodePath: "Page > Frame", flaggedRuleIds: ["raw-value"] },
],
conversionRecords: [
{
nodeId: "node-1",
nodePath: "Page > Frame",
difficulty: "easy",
ruleRelatedStruggles: [
{ ruleId: "raw-value", description: "Easy", actualImpact: "easy" },
],
uncoveredStruggles: [],
},
],
ruleScores: {
"raw-value": { score: -3, severity: "missing-info" },
},
responsiveDelta: 30,
};

const result = runEvaluationAgent(input);

const match = result.mismatches.find(m => m.ruleId === "raw-value");
expect(match).toBeDefined();
// raw-value is token-management, not responsive-critical — no override
expect(match!.type).toBe("validated");
});

it("treats negative responsiveDelta as easy", () => {
const input: EvaluationAgentInput = {
nodeIssueSummaries: [
{ nodeId: "node-1", nodePath: "Page > Frame", flaggedRuleIds: ["fixed-size-in-auto-layout"] },
],
conversionRecords: [
{
nodeId: "node-1",
nodePath: "Page > Frame",
difficulty: "easy",
ruleRelatedStruggles: [
{ ruleId: "fixed-size-in-auto-layout", description: "Fine", actualImpact: "easy" },
],
uncoveredStruggles: [],
},
],
ruleScores: {
"fixed-size-in-auto-layout": { score: -2, severity: "suggestion" },
},
responsiveDelta: -5,
};

const result = runEvaluationAgent(input);

const match = result.mismatches.find(m => m.ruleId === "fixed-size-in-auto-layout");
expect(match).toBeDefined();
expect(match!.actualDifficulty).toBe("easy");
});

it("returns empty mismatches and validatedRules for empty input", () => {
const input: EvaluationAgentInput = {
nodeIssueSummaries: [],
41 changes: 41 additions & 0 deletions src/agents/evaluation-agent.ts
Expand Up @@ -6,6 +6,8 @@ import type {
} from "./contracts/evaluation-agent.js";
import type { Difficulty } from "./contracts/conversion-agent.js";
import type { Severity } from "../core/contracts/severity.js";
import type { RuleId } from "../core/contracts/rule.js";
import { RULE_ID_CATEGORY } from "../core/rules/rule-config.js";

/**
* Difficulty-to-score range mapping.
@@ -167,8 +169,47 @@
}
}

// Override responsive-critical rule evaluations with measured responsiveDelta
if (input.responsiveDelta != null) {
const responsiveDifficulty = responsiveDeltaToDifficulty(input.responsiveDelta);
for (const mismatch of mismatches) {
if (!mismatch.ruleId) continue;
if (!(mismatch.ruleId in RULE_ID_CATEGORY)) continue;
const category = RULE_ID_CATEGORY[mismatch.ruleId as RuleId];
if (category !== "responsive-critical") continue;

const prevType = mismatch.type;
const newType = classifyFlaggedRule(mismatch.currentScore ?? 0, responsiveDifficulty);
mismatch.type = newType;
mismatch.actualDifficulty = responsiveDifficulty;
mismatch.reasoning = buildReasoning(newType, mismatch.ruleId, mismatch.currentScore, responsiveDifficulty)
+ ` (responsive: delta=${input.responsiveDelta}%p, overrides AI opinion "${prevType}")`;

if (newType === "validated") {
validatedRuleSet.add(mismatch.ruleId);
} else {
validatedRuleSet.delete(mismatch.ruleId);
}
}
}

return {
mismatches,
validatedRules: [...validatedRuleSet],
};
}

/**
* Map responsiveDelta to difficulty.
* Based on ablation Experiment 04: structural similarity drops by 32%p at a different viewport.
* Higher delta = more responsive breakage = harder to implement.
*/
function responsiveDeltaToDifficulty(delta: number): Difficulty {
// Negative delta = expanded viewport matches better than original (unusual).
// Treat as easy — the design is not breaking at wider viewport.
const d = Math.max(0, delta);
if (d <= 5) return "easy"; // minimal responsive breakage
if (d <= 15) return "moderate"; // noticeable breakage
if (d <= 30) return "hard"; // severe breakage
return "failed"; // completely broken at expanded viewport
}
6 changes: 6 additions & 0 deletions src/agents/orchestrator.ts
@@ -304,6 +304,11 @@ export function runCalibrationEvaluate(
conversionRecords = [];
}

// Extract responsive comparison data if available
const responsiveDelta = typeof conversionJson["responsiveDelta"] === "number"
? conversionJson["responsiveDelta"] as number
: null;

const evaluationOutput = runEvaluationAgent({
nodeIssueSummaries: analysisJson.nodeIssueSummaries.map((s) => ({
nodeId: s.nodeId,
@@ -312,6 +317,7 @@
})),
conversionRecords,
ruleScores,
responsiveDelta,
});

// Load prior evidence if collecting
Expand Down