diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index ca3d6d8a..1672a728 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -35,8 +35,8 @@ Read `$RUN_DIR/analysis.json`. If `issueCount` is 0, stop here. Read the `calibrationTier` field from `analysis.json`. The CLI determines the tier based on grade percentage. Branch accordingly: -- **`"full"`**: Full pipeline — proceed to Step 2 (Converter + visual-compare + Gap Analysis) -- **`"visual-only"`**: Converter + visual-compare, but **skip Step 3 (Gap Analysis)**. Gap analysis on diff images is only meaningful at high similarity. +- **`"full"`**: Full pipeline — proceed to Step 2 (Converter + Measurements + Gap Analysis) +- **`"visual-only"`**: Converter + Measurements, but **skip Step 3 (Gap Analysis)**. Gap analysis on diff images is only meaningful at high similarity. **Always run the Converter** regardless of tier. Low-scoring designs need score validation the most. diff --git a/.claude/skills/design-to-code/PROMPT.md b/.claude/skills/design-to-code/PROMPT.md index 149fde85..0f99324a 100644 --- a/.claude/skills/design-to-code/PROMPT.md +++ b/.claude/skills/design-to-code/PROMPT.md @@ -2,7 +2,7 @@ This prompt is used by all code generation pipelines: - Calibration Converter -- Rule Discovery A/B Validation +- Ablation experiments (API-based) - User-facing `canicode implement` command (default prompt) ## Stack diff --git a/CLAUDE.md b/CLAUDE.md index 6bf25517..f312ad9b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -107,9 +107,9 @@ Calibration commands are NOT exposed as CLI commands. They run exclusively insid **`/calibrate-loop` (Claude Code command)** - Role: Autonomous rule-config.ts improvement via fixture-based calibration - Input: fixture directory path (e.g. `fixtures/material3-kit`) -- Flow: Analysis → Converter (baseline + strip ablation → HTML + visual-compare) → Gap Analyzer → Evaluation → Critic → Arbitrator → Prune Evidence -- Converter implements the full scoped design as one HTML page, runs `visual-compare` for pixel-level similarity -- **Strip ablation**: Converter also converts 6 stripped design-trees (`DESIGN_TREE_INFO_TYPES` in `src/core/design-tree/strip.ts`: layout-direction-spacing, size-constraints, component-references, node-names-hierarchy, variable-references, style-references) → measures similarity delta vs baseline (plus tokens/HTML/CSS/responsive where recorded) → objective difficulty per rule category +- Flow: Analysis → Converter (HTML generation) → Measurements (html-postprocess + visual-compare + code-metrics) → Gap Analyzer → Evaluation → Critic → Arbitrator → Prune Evidence +- Converter implements the full scoped design as one HTML page + 6 stripped variants; orchestrator runs all measurements (visual-compare, code-metrics, responsive comparison) +- **Strip ablation**: Orchestrator measures 6 stripped design-trees (`DESIGN_TREE_INFO_TYPES` in `src/core/design-tree/strip.ts`: layout-direction-spacing, size-constraints, component-references, node-names-hierarchy, variable-references, style-references) → similarity delta vs baseline (plus tokens/HTML/CSS/responsive) → objective difficulty per rule category - Gap Analyzer examines the diff image, categorizes pixel differences, saves to run directory - Cross-run evidence: Evaluation appends overscored/underscored findings to `data/calibration-evidence.json` - After Arbitrator applies changes, evidence for applied rules is pruned (`calibrate-prune-evidence`) diff --git a/README.md b/README.md index af7531f0..5040b670 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ CanICode solves this: ### Scores You Can Trust -Rule scores aren't guesswork. A 6-agent calibration pipeline converts real Figma designs to HTML, measures pixel-level similarity (via `visual-compare`), and adjusts scores based on actual implementation difficulty. +Rule scores aren't guesswork. The calibration pipeline converts real Figma designs to HTML, measures pixel-level similarity (via `visual-compare`), and adjusts scores based on actual implementation difficulty. - Design that's hard to implement accurately → rule score goes **up** - Design that's easy despite the flag → rule score goes **down** diff --git a/src/agents/orchestrator.ts b/src/agents/orchestrator.ts index a88a98ff..041c5e7d 100644 --- a/src/agents/orchestrator.ts +++ b/src/agents/orchestrator.ts @@ -38,8 +38,8 @@ function normalizeActualImpact(impact: string): string { /** * Calibration tier thresholds (percentage-based). - * - "full": Converter + visual-compare + Gap Analysis - * - "visual-only": Converter + visual-compare (Gap Analysis skipped) + * - "full": Converter + Measurements + Gap Analysis + * - "visual-only": Converter + Measurements (Gap Analysis skipped) */ export const CALIBRATION_TIER_THRESHOLDS = { full: 90, // A or higher