diff --git a/.claude/agents/calibration/arbitrator.md b/.claude/agents/calibration/arbitrator.md index 85c9482e..0c2dfaf0 100644 --- a/.claude/agents/calibration/arbitrator.md +++ b/.claude/agents/calibration/arbitrator.md @@ -1,7 +1,7 @@ --- name: calibration-arbitrator description: Makes final calibration decisions by weighing Runner and Critic. Applies approved changes to rule-config.ts and commits. Use after calibration-critic completes. -tools: Read, Write, Edit, Bash +tools: Read, Edit, Bash model: claude-sonnet-4-6 --- @@ -13,7 +13,7 @@ You receive the Runner's proposals and the Critic's reviews, and make final deci - **Both APPROVE** → apply Runner's proposed value - **Critic REJECT** → keep current score (no change) - **Critic REVISE** → apply the Critic's revised value -- **New rule proposals** → append to `logs/calibration/new-rule-proposals.md` only, do NOT add to `rule-config.ts` +- **New rule proposals** → record in `$RUN_DIR/debate.json` only, do NOT add to `rule-config.ts` ## After Deciding @@ -27,16 +27,28 @@ You receive the Runner's proposals and the Critic's reviews, and make final deci ## Output -**CRITICAL: Your prompt will contain a line like `Activity log: logs/activity/2026-03-20-22-30-material3-kit.jsonl`. You MUST append your summary to that EXACT file path. Do NOT use any other path. Do NOT create `agent-activity-*.jsonl` or any other file.** +**Do NOT write to any log files. Return your decisions as JSON text so the orchestrator can save it.** -The log uses **JSON Lines format** — append exactly one JSON object on a single line: +Only `rule-config.ts` may be edited directly (for approved score changes). All log writes are the orchestrator's job. 
+ +Return this JSON structure: ```json -{"step":"Arbitrator","timestamp":"","result":"applied=2 rejected=1 revised=1 newProposals=0","durationMs":,"decisions":[{"ruleId":"X","decision":"applied","before":-10,"after":-7,"reason":"Critic revised, midpoint applied"},{"ruleId":"X","decision":"rejected","reason":"Critic rejection compelling — insufficient evidence"}]} +{ + "timestamp": "", + "summary": "applied=2 rejected=1 revised=1 newProposals=0", + "decisions": [ + {"ruleId": "X", "decision": "applied", "before": -10, "after": -7, "reason": "Critic revised, midpoint applied"}, + {"ruleId": "X", "decision": "rejected", "reason": "Critic rejection compelling — insufficient evidence"} + ], + "newRuleProposals": [] +} ``` ## Rules +- **Do NOT write to ANY file except `src/rules/rule-config.ts`.** No log files, no `new-rule-proposals.md`, no `debate.json`, no `activity.jsonl`. The orchestrator handles ALL other file I/O. +- **Do NOT create files.** Only Edit existing `rule-config.ts` when applying approved score changes. - Only modify `rule-config.ts` for approved score/severity changes. - Never force-push or amend existing commits. - If tests fail, revert everything and report which change caused the failure. diff --git a/.claude/agents/calibration/converter.md b/.claude/agents/calibration/converter.md index 9b578773..8cfa2c71 100644 --- a/.claude/agents/calibration/converter.md +++ b/.claude/agents/calibration/converter.md @@ -10,11 +10,11 @@ You are the Converter agent in a calibration pipeline. 
Your job is to implement ## Input You will be given: -- A path to an analysis JSON file (`logs/calibration/calibration-analysis.json`) +- A run directory path (`$RUN_DIR`) containing `analysis.json` - The original fixture path or Figma URL - The `fileKey` and root `nodeId` from the analysis -Read the analysis JSON to get: +Read `$RUN_DIR/analysis.json` to get: - `fileKey`: The Figma file key - `nodeIssueSummaries`: Issues grouped by node (used for per-rule impact assessment, not for selecting what to convert) @@ -28,7 +28,7 @@ Use BOTH sources together for accurate conversion: **Primary source — design tree (structure + CSS-ready values):** ``` -npx canicode design-tree --output /tmp/design-tree.txt +npx canicode design-tree --output $RUN_DIR/design-tree.txt ``` This produces a 4KB DOM-like tree with inline CSS styles instead of 250KB+ raw JSON. Each node = one HTML element. Every style value is CSS-ready. @@ -55,11 +55,12 @@ Read and follow `.claude/skills/design-to-code/PROMPT.md` for all code generatio - Each node in the tree maps 1:1 to an HTML element - Copy style values directly — they are already CSS-ready - Follow all rules from DESIGN-TO-CODE-PROMPT.md -3. Save to `/tmp/calibration-output.html` +3. Save to `$RUN_DIR/output.html` 4. Run visual comparison: ``` - npx canicode visual-compare /tmp/calibration-output.html --figma-url "https://www.figma.com/design//file?node-id=" + npx canicode visual-compare $RUN_DIR/output.html --figma-url "https://www.figma.com/design//file?node-id=" --output $RUN_DIR ``` + This saves `figma.png`, `code.png`, and `diff.png` into the run directory. Replace `:` with `-` in the nodeId for the URL. 5. Use similarity to determine overall difficulty: @@ -70,14 +71,17 @@ Read and follow `.claude/skills/design-to-code/PROMPT.md` for all code generatio | 50-70% | hard | | <50% | failed | -6. Review each issue in `nodeIssueSummaries`: +6. 
**MANDATORY — Rule Impact Assessment**: For EVERY rule ID in `nodeIssueSummaries[].flaggedRuleIds`, assess its actual impact on conversion. Read the analysis JSON, collect all unique `flaggedRuleIds`, and for each one write an entry in `ruleImpactAssessment`. This array MUST NOT be empty if there are flagged rules. - Did this rule's issue actually make the conversion harder? - What was its real impact on the final similarity score? -7. Note any difficulties NOT covered by existing rules + - Rate as: `easy` (no real difficulty), `moderate` (some guessing needed), `hard` (significant pixel loss), `failed` (could not reproduce) +7. Note any difficulties NOT covered by existing rules as `uncoveredStruggles` ## Output -Write results to `logs/calibration/calibration-conversion.json`: +Write results to `$RUN_DIR/conversion.json`. + +**CRITICAL: `ruleImpactAssessment` MUST contain one entry per unique flagged rule ID. An empty array means the calibration pipeline cannot evaluate rule scores.** ```json { @@ -90,8 +94,14 @@ Write results to `logs/calibration/calibration-conversion.json`: { "ruleId": "raw-color", "issueCount": 4, - "actualImpact": "easy | moderate | hard | failed", - "description": "How this rule's issues affected the overall conversion" + "actualImpact": "easy", + "description": "Colors were directly available in design tree, no difficulty" + }, + { + "ruleId": "detached-instance", + "issueCount": 2, + "actualImpact": "easy", + "description": "Detached instances rendered identically to attached ones" } ], "interpretations": [ @@ -108,16 +118,9 @@ Write results to `logs/calibration/calibration-conversion.json`: } ``` -Also append a brief summary to the activity log file specified by the orchestrator. -The log uses **JSON Lines format** — append exactly one JSON object on a single line: - -```json -{"step":"Converter","timestamp":"","result":"similarity=% difficulty=","durationMs":} -``` - ## Rules -- Do NOT modify any source files. 
Only write to `logs/` and `/tmp/`. +- Do NOT modify any source files. Only write to the run directory. - Implement the FULL design, not individual nodes. - If visual-compare fails (rate limit, etc.), set similarity to -1 and explain in notes. - Return a brief summary so the orchestrator can proceed. diff --git a/.claude/agents/calibration/critic.md b/.claude/agents/calibration/critic.md index e411f111..fbc21faa 100644 --- a/.claude/agents/calibration/critic.md +++ b/.claude/agents/calibration/critic.md @@ -1,7 +1,7 @@ --- name: calibration-critic description: Challenges calibration proposals from Runner. Rejects low-confidence or over-aggressive adjustments. Use after calibration-runner completes. -tools: Read, Write +tools: Read model: claude-sonnet-4-6 --- @@ -35,16 +35,25 @@ For each proposal, output ONE of: ## Output -**CRITICAL: Your prompt will contain a line like `Append your critique to: logs/activity/2026-03-20-22-30-material3-kit.jsonl`. You MUST append your output to that EXACT file path. Do NOT use any other path. Do NOT create `agent-activity-*.jsonl` or any other file.** +**Do NOT write any files. 
Return your critique as JSON text so the orchestrator can save it.** -The log uses **JSON Lines format** — append exactly one JSON object on a single line: +Return this JSON structure: ```json -{"step":"Critic","timestamp":"","result":"approved=1 rejected=1 revised=1","durationMs":,"reviews":[{"ruleId":"X","decision":"APPROVE","reason":"3 cases, high confidence"},{"ruleId":"X","decision":"REJECT","reason":"Rule 1 — only 1 case with low confidence"},{"ruleId":"X","decision":"REVISE","revised":-7,"reason":"Rule 2 — change too large, midpoint applied"}]} +{ + "timestamp": "", + "summary": "approved=1 rejected=1 revised=1", + "reviews": [ + {"ruleId": "X", "decision": "APPROVE", "reason": "3 cases, high confidence"}, + {"ruleId": "X", "decision": "REJECT", "reason": "Rule 1 — only 1 case with low confidence"}, + {"ruleId": "X", "decision": "REVISE", "revised": -7, "reason": "Rule 2 — change too large, midpoint applied"} + ] +} ``` ## Rules +- **Do NOT write any files.** The orchestrator handles all file I/O. - Do NOT modify `src/rules/rule-config.ts`. - Be strict. When in doubt, REJECT or REVISE. - Return your full critique so the Arbitrator can decide. diff --git a/.claude/agents/calibration/gap-analyzer.md b/.claude/agents/calibration/gap-analyzer.md index f4ff448b..3fb4902c 100644 --- a/.claude/agents/calibration/gap-analyzer.md +++ b/.claude/agents/calibration/gap-analyzer.md @@ -1,7 +1,7 @@ --- name: calibration-gap-analyzer description: Analyzes visual diff between Figma screenshot and AI-generated code to identify specific causes of pixel differences. Accumulates gap data for rule discovery. -tools: Bash, Read, Write +tools: Bash, Read model: claude-sonnet-4-6 --- @@ -10,14 +10,15 @@ You are the Gap Analyzer agent in a calibration pipeline. 
Your job is to examine ## Input You will be given: -- Figma screenshot path (e.g., `/tmp/canicode-visual-compare/figma.png`) -- Code screenshot path (e.g., `/tmp/canicode-visual-compare/code.png`) -- Diff image path (e.g., `/tmp/canicode-visual-compare/diff.png`) +- Figma screenshot path (e.g., `$RUN_DIR/figma.png`) +- Code screenshot path (e.g., `$RUN_DIR/code.png`) +- Diff image path (e.g., `$RUN_DIR/diff.png`) - Similarity score (e.g., 95%) - The generated HTML code path - The fixture path (for reference) -- The analysis JSON (nodeIssueSummaries) +- The analysis JSON (`$RUN_DIR/analysis.json`) - The Converter's interpretations list (values that were guessed, not from data) +- A run directory path (`$RUN_DIR`) ## Steps @@ -52,7 +53,9 @@ You will be given: ## Output -Write gap analysis to `logs/calibration/gaps/-.json`: +**Do NOT write any files. Return the gap analysis as JSON text so the orchestrator can save it.** + +Return this JSON structure: ```json { @@ -68,14 +71,6 @@ Write gap analysis to `logs/calibration/gaps/-.json`: "causedByInterpretation": false, "actionable": true, "suggestedRuleCategory": "layout" - }, - { - "category": "typography", - "description": "System font fallback — Inter not available in Playwright", - "pixelImpact": "medium", - "coveredByRule": null, - "actionable": false, - "reason": "Rendering environment limitation" } ], "summary": { @@ -88,16 +83,9 @@ Write gap analysis to `logs/calibration/gaps/-.json`: } ``` -Also append a summary to the activity log file specified by the orchestrator. -The log uses **JSON Lines format** — append exactly one JSON object on a single line: - -```json -{"step":"Gap Analyzer","timestamp":"","result":"similarity=95% gaps=5 actionable=3 newRuleCandidates=2","durationMs":} -``` - ## Rules -- Do NOT modify any source files. Only write to `logs/`. +- **Do NOT write any files.** The orchestrator handles all file I/O. - Be specific about pixel values — "4px off" not "slightly off". 
- Distinguish actionable gaps from rendering artifacts clearly. - This data accumulates over time — future rule discovery agents will read it. diff --git a/.claude/agents/calibration/runner.md b/.claude/agents/calibration/runner.md index 0b2d1d08..af6846b7 100644 --- a/.claude/agents/calibration/runner.md +++ b/.claude/agents/calibration/runner.md @@ -9,23 +9,22 @@ You are the Runner agent in a calibration pipeline. You perform analysis only ## Steps -1. Run `pnpm exec canicode calibrate-analyze $input --output logs/calibration/calibration-analysis.json` -2. Read the generated `logs/calibration/calibration-analysis.json` +1. Run `pnpm exec canicode calibrate-analyze $input --run-dir $RUN_DIR` +2. Read the generated `$RUN_DIR/analysis.json` 3. Extract the analysis summary: node count, issue count, grade, and the list of `nodeIssueSummaries` ## Output -Append your report to the activity log file specified by the orchestrator. -If no log file is specified, use `logs/activity/YYYY-MM-DD-HH-mm-.jsonl`. +Append your report to `$RUN_DIR/activity.jsonl` (the run directory is provided by the orchestrator). The log uses **JSON Lines format** — append exactly one JSON object on a single line: ```json -{"step":"Runner","timestamp":"","result":"nodes= issues= grade=","durationMs":,"fixture":"","analysisOutput":"logs/calibration/calibration-analysis.json"} +{"step":"Runner","timestamp":"","result":"nodes= issues= grade=","durationMs":,"fixture":"","analysisOutput":"$RUN_DIR/analysis.json"} ``` ## Rules -- Do NOT modify any source files. Only write to `logs/`. +- Do NOT modify any source files. Only write to the run directory. - Return your full report text so the orchestrator can proceed. - If the analysis produces zero issues, return: "No issues found — calibration not needed." 
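
The single-line JSONL append that runner.md (and the orchestrators below) describe can be sketched as a small shell helper. This is an illustrative sketch, not part of the pipeline; the helper name is hypothetical, and it assumes the `result` string contains no double quotes (a real implementation would build the object with `jq` to get proper JSON escaping):

```shell
# Hypothetical helper: append exactly one JSON object per line to activity.jsonl.
log_step() {
  local run_dir="$1" step="$2" result="$3" duration_ms="$4"
  mkdir -p "$run_dir"
  printf '{"step":"%s","timestamp":"%s","result":"%s","durationMs":%s}\n' \
    "$step" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$result" "$duration_ms" \
    >> "$run_dir/activity.jsonl"
}

# Example append for the Runner step:
log_step logs/calibration/example-run "Runner" "nodes=120 issues=14 grade=B" 4200
```

Because each call emits a single `\n`-terminated line, the file stays valid JSON Lines even when several steps append to it over one run.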
diff --git a/.claude/agents/rule-discovery/critic.md b/.claude/agents/rule-discovery/critic.md index c25b81d0..70b4c876 100644 --- a/.claude/agents/rule-discovery/critic.md +++ b/.claude/agents/rule-discovery/critic.md @@ -1,7 +1,7 @@ --- name: rule-discovery-critic description: Challenges whether a new rule adds real value. Decides keep, adjust, or drop based on Evaluator's data. -tools: Read, Write +tools: Read model: claude-sonnet-4-6 --- @@ -41,8 +41,9 @@ You will receive: ## Output -Append your critique to the activity log file specified by the orchestrator. -The log uses **JSON Lines format** — append exactly one JSON object on a single line: +**Do NOT write any files. Return your decision as JSON text so the orchestrator can save it.** + +Return this JSON structure: ```json {"step":"Critic","timestamp":"","result":" for rule ","durationMs":,"ruleId":"","decision":"","evidenceStrength":"","falsePositiveConcern":"","difficultyCorrelation":"","adjustments":{"score":-7,"severity":"blocking","triggerChange":"..."},"dropReason":"..."} diff --git a/.claude/agents/rule-discovery/designer.md b/.claude/agents/rule-discovery/designer.md index 34eb90dc..cf87a3d6 100644 --- a/.claude/agents/rule-discovery/designer.md +++ b/.claude/agents/rule-discovery/designer.md @@ -1,7 +1,7 @@ --- name: rule-discovery-designer description: Proposes rule specification based on Researcher findings. Defines check logic, severity, category, and initial score. -tools: Read, Write +tools: Read model: claude-sonnet-4-6 --- @@ -30,8 +30,9 @@ You will receive: ## Output -Append your proposal to the activity log file specified by the orchestrator. -The log uses **JSON Lines format** — append exactly one JSON object on a single line: +**Do NOT write any files. 
Return your proposal as JSON text so the orchestrator can save it.** + +Return this JSON structure: ```json {"step":"Designer","timestamp":"","result":"proposed rule ","durationMs":,"ruleId":"","category":"","severity":"","initialScore":-5,"trigger":"","requiresTransformerChanges":false} diff --git a/.claude/agents/rule-discovery/evaluator.md b/.claude/agents/rule-discovery/evaluator.md index d8355df2..22f47c72 100644 --- a/.claude/agents/rule-discovery/evaluator.md +++ b/.claude/agents/rule-discovery/evaluator.md @@ -1,7 +1,7 @@ --- name: rule-discovery-evaluator description: Tests new rule against fixtures. Reports issue count, false positive rate, and score impact. -tools: Bash, Read, Write +tools: Bash, Read model: claude-sonnet-4-6 --- @@ -33,8 +33,9 @@ You will receive: ## Output -Append your evaluation to the activity log file specified by the orchestrator. -The log uses **JSON Lines format** — append exactly one JSON object on a single line: +**Do NOT write any files. Return your evaluation as JSON text so the orchestrator can save it.** + +Return this JSON structure: ```json {"step":"Evaluator","timestamp":"","result":"verdict= falsePositiveRate=%","durationMs":,"ruleId":"","fixtures":[{"name":"material3-kit.json","issues":0,"nodesAffected":0,"scoreImpact":"-X%"}],"falsePositiveRate":"%","verdict":"","verdictReason":"..."} diff --git a/.claude/agents/rule-discovery/implementer.md b/.claude/agents/rule-discovery/implementer.md index 489b418a..c3e3b0e9 100644 --- a/.claude/agents/rule-discovery/implementer.md +++ b/.claude/agents/rule-discovery/implementer.md @@ -35,12 +35,12 @@ You will receive: ## Output -Append your implementation summary to the activity log file specified by the orchestrator. -The log uses **JSON Lines format** — append exactly one JSON object on a single line: +**Do NOT write to log files.** The orchestrator handles activity logging. 
-```json -{"step":"Implementer","timestamp":"","result":"implemented rule lintOk=true testsOk=true buildOk=true","durationMs":,"ruleId":"","filesModified":["src/core/rules//index.ts","src/core/rules/rule-config.ts","src/core/rules/index.ts"],"newTests":0,"lintOk":true,"testsOk":true,"buildOk":true} -``` +Return a summary of what you did, including: +- Rule ID +- Files modified +- Whether lint, tests, and build passed ## Rules diff --git a/.claude/agents/rule-discovery/researcher.md b/.claude/agents/rule-discovery/researcher.md index 93b919b8..50c7bec2 100644 --- a/.claude/agents/rule-discovery/researcher.md +++ b/.claude/agents/rule-discovery/researcher.md @@ -12,6 +12,7 @@ You are the Researcher agent in a rule discovery pipeline. Your job is to explor You will receive: - A **concept** to investigate (e.g., "component description", "annotations", "component properties") - One or more **fixture paths** (e.g., `fixtures/material3-kit.json`) +- A **run directory** (`$RUN_DIR`) ## Steps @@ -25,7 +26,7 @@ You will receive: - Is it stored in `src/core/contracts/figma-node.ts`? - Are there existing rules that use it? 4. Check the Figma REST API spec (`@figma/rest-api-spec`) for the field's type and availability -5. Read accumulated gap data in `logs/calibration/gaps/*.json`: +5. Read accumulated gap data in `logs/calibration/*/gaps.json`: - Are there recurring gaps related to this concept? - How many times has this gap appeared across runs? - What pixel impact does it have? @@ -36,8 +37,9 @@ You will receive: ## Output -Append your report to the activity log file specified by the orchestrator. -The log uses **JSON Lines format** — append exactly one JSON object on a single line: +**Do NOT write any files. 
Return your findings as JSON text so the orchestrator can save it.** + +Return this JSON structure: ```json {"step":"Researcher","timestamp":"","result":"concept= feasible=","durationMs":,"concept":"","fixtures":[""],"fieldAvailable":true,"parsedInTransformer":false,"requiresTransformerChanges":true,"feasible":true,"suggestedDirection":"..."} @@ -45,6 +47,6 @@ The log uses **JSON Lines format** — append exactly one JSON object on a singl ## Rules -- Do NOT modify any source files. Only write to `logs/`. +- **Do NOT write any files.** The orchestrator handles all file I/O. - Be thorough — the Designer agent depends on your data. - If the concept doesn't exist in the fixture data, say so clearly. diff --git a/.claude/commands/add-rule.md b/.claude/commands/add-rule.md index 6142e30f..acf3d8ac 100644 --- a/.claude/commands/add-rule.md +++ b/.claude/commands/add-rule.md @@ -6,27 +6,33 @@ Input: $ARGUMENTS (concept + fixture path, e.g. `"component description" fixture You are the orchestrator. Do NOT implement rules yourself. Only pass data between agents and run CLI steps. +**CRITICAL: You are responsible for writing ALL files to $RUN_DIR. Subagents return text/JSON — you write files. Never rely on a subagent to write to the correct path.** + ### Step 0 — Setup Parse the input: first argument is the concept (quoted string), remaining arguments are fixture paths. -Generate the activity log filename: +Create the run directory: ``` -LOG_FILE=logs/activity/YYYY-MM-DD-HH-mm-rule-.md +RUN_DIR=logs/rule-discovery/--/ +mkdir -p $RUN_DIR ``` -Create the file with a header. +Create `$RUN_DIR/activity.jsonl` with a session-start entry. ### Step 1 — Researcher Spawn the `rule-discovery-researcher` subagent. Provide: - The concept to investigate - The fixture paths -- The activity log path: +- **Tell the agent: "Return your findings as JSON. Do NOT write any files."** -``` -Append your report to: +After the Researcher returns, **you** write the JSON to `$RUN_DIR/research.json`. 
+ +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Researcher","timestamp":"","result":"concept= feasible=","durationMs":} ``` If the Researcher says the concept is not feasible, stop here and report why. @@ -36,9 +42,13 @@ If the Researcher says the concept is not feasible, stop here and report why. Spawn the `rule-discovery-designer` subagent. Provide: - The Researcher's report (copy the findings) - The concept +- **Tell the agent: "Return your proposal as JSON. Do NOT write any files."** -``` -Append your proposal to: +After the Designer returns, **you** write the JSON to `$RUN_DIR/design.json`. + +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Designer","timestamp":"","result":"proposed rule ","durationMs":} ``` ### Step 3 — Implementer @@ -46,38 +56,41 @@ Append your proposal to: Spawn the `rule-discovery-implementer` subagent. Provide: - The Designer's rule proposal -``` -Append your summary to: -``` +The Implementer DOES modify source files (this is the only agent allowed to). After implementation, rebuild: `pnpm build` +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Implementer","timestamp":"","result":"implemented rule ","durationMs":} +``` + ### Step 4 — A/B Visual Validation Run an A/B comparison on the entire design to measure the rule's actual impact on pixel-perfect accuracy: 1. Extract `fileKey` and root `nodeId` from the fixture or Figma URL. -2. Generate design tree: `npx canicode design-tree --output /tmp/design-tree.txt` +2. Generate design tree: `npx canicode design-tree --output $RUN_DIR/design-tree.txt` 3. 
Spawn a general-purpose subagent for **Test A (without the rule's data)**: - Read and follow `.claude/skills/design-to-code/PROMPT.md` for code generation rules - Use the design tree to convert the ENTIRE design to a single HTML page - - Strip or withhold the information the rule checks for from the tree (e.g., remove descriptions if testing missing-component-description) - - Save to `/tmp/visual-a.html` - - Run: `npx canicode visual-compare /tmp/visual-a.html --figma-url ""` + - Strip or withhold the information the rule checks for from the tree + - Save to `$RUN_DIR/visual-a.html` + - Run: `npx canicode visual-compare $RUN_DIR/visual-a.html --figma-url "" --output $RUN_DIR/visual-a` - Record similarity_a 4. Spawn a general-purpose subagent for **Test B (with the rule's data)**: - Read and follow `.claude/skills/design-to-code/PROMPT.md` for code generation rules - - Same design tree, but this time INCLUDE the information (e.g., generate component descriptions via AI and add them to the tree) - - Save to `/tmp/visual-b.html` - - Run: `npx canicode visual-compare /tmp/visual-b.html --figma-url ""` + - Same design tree, but this time INCLUDE the information + - Save to `$RUN_DIR/visual-b.html` + - Run: `npx canicode visual-compare $RUN_DIR/visual-b.html --figma-url "" --output $RUN_DIR/visual-b` - Record similarity_b 5. Compare: if similarity_b > similarity_a → the rule catches something that genuinely improves implementation quality. -5. Record both scores for the Evaluator. +6. Record both scores for the Evaluator. ### Step 5 — Evaluator @@ -85,9 +98,13 @@ Spawn the `rule-discovery-evaluator` subagent. Provide: - The rule ID - The fixture paths - The visual comparison results from Step 4 +- **Tell the agent: "Return your evaluation as JSON. Do NOT write any files."** -``` -Append your evaluation to: +After the Evaluator returns, **you** write the JSON to `$RUN_DIR/evaluation.json`. 
+ +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Evaluator","timestamp":"","result":"verdict=","durationMs":} ``` ### Step 6 — Critic @@ -95,9 +112,13 @@ Append your evaluation to: Spawn the `rule-discovery-critic` subagent. Provide: - The Designer's proposal - The Evaluator's results (including visual scores) +- **Tell the agent: "Return your decision as JSON. Do NOT write any files."** -``` -Append your critique to: +After the Critic returns, **you** write the JSON to `$RUN_DIR/decision.json`. + +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Critic","timestamp":"","result":" for rule ","durationMs":} ``` ### Step 7 — Apply Decision @@ -117,4 +138,5 @@ Report the final decision and summary. - Pass only structured data between agents — never raw reasoning. - Only the Implementer may modify source files. - If the Critic says DROP, revert ALL source changes (`git checkout -- src/`). -- **CRITICAL**: Every subagent prompt MUST contain the exact LOG_FILE path. +- **CRITICAL: YOU write all files to $RUN_DIR. Tell every subagent (except Implementer): "Do NOT write any files." You handle all file I/O.** +- **CRITICAL: After each step, append to $RUN_DIR/activity.jsonl yourself.** diff --git a/.claude/commands/calibrate-loop-deep.md b/.claude/commands/calibrate-loop-deep.md index b61e6106..4a8dd1da 100644 --- a/.claude/commands/calibrate-loop-deep.md +++ b/.claude/commands/calibrate-loop-deep.md @@ -6,34 +6,37 @@ Input: $ARGUMENTS (Figma URL with node-id, e.g. `https://www.figma.com/design/AB You are the orchestrator. Do NOT make calibration decisions yourself. Only pass data between agents and run deterministic CLI steps. +**CRITICAL: You are responsible for writing ALL files to $RUN_DIR. Subagents return text/JSON — you write files. Never rely on a subagent to write to the correct path.** + ### Step 0 — Setup -Generate the activity log filename. Extract a short name from the URL (fileKey or design name). 
Build the path: +Extract a short name from the URL (fileKey or design name). Create the run directory: ``` -LOG_FILE=logs/activity/YYYY-MM-DD-HH-mm-.jsonl +RUN_DIR=logs/calibration/--/ +mkdir -p $RUN_DIR ``` -Create the file and write the first JSON Lines entry: +Create `$RUN_DIR/activity.jsonl` and write the first JSON Lines entry: ```json {"step":"session-start","timestamp":"","result":"Calibration activity log initialized","durationMs":0} ``` -The log uses **JSON Lines format** (one JSON object per line). Each entry has this shape: -```json -{"step":"","timestamp":"","result":"","durationMs":} -``` - -Store the exact path — you will paste it verbatim into every subagent prompt below. +Store the exact `RUN_DIR` path — you will paste it verbatim into every subagent prompt below. ### Step 1 — Analysis (CLI) ``` -npx canicode calibrate-analyze "$ARGUMENTS" --output logs/calibration/calibration-analysis.json +npx canicode calibrate-analyze "$ARGUMENTS" --run-dir $RUN_DIR ``` -Read `logs/calibration/calibration-analysis.json`. If `issueCount` is 0, stop here. +Read `$RUN_DIR/analysis.json`. If `issueCount` is 0, stop here. + +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Analysis","timestamp":"","result":"nodes= issues= grade=","durationMs":} +``` ### Step 2 — Converter @@ -46,66 +49,101 @@ This is a Figma URL. Use `get_design_context` MCP tool with fileKey and root nod Figma URL: fileKey: Root nodeId: -Activity log: -Append a brief summary to this EXACT file. Do NOT write to any other log file. +Run directory: +``` + +After the Converter returns, **verify** files exist in $RUN_DIR: +```bash +ls $RUN_DIR/conversion.json $RUN_DIR/output.html ``` -The Converter will implement the ENTIRE design as one HTML page and run visual-compare. +If `conversion.json` is missing, write it yourself from the Converter's returned summary. 
+ +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Converter","timestamp":"","result":"similarity=% difficulty=","durationMs":} +``` ### Step 3 — Gap Analysis -Before spawning the Gap Analyzer, check whether the visual-compare screenshots were produced by the Converter: +Check whether screenshots were produced: ```bash -test -f /tmp/canicode-visual-compare/figma.png && echo "EXISTS" || echo "MISSING" +test -f $RUN_DIR/figma.png && echo "EXISTS" || echo "MISSING" ``` -- **If `/tmp/canicode-visual-compare/figma.png` does NOT exist**: skip Gap Analyzer entirely. Log a warning to `LOG_FILE`: - ``` - WARNING: Gap Analyzer skipped — /tmp/canicode-visual-compare/figma.png not found. Visual-compare may have failed or been skipped by Converter. - ``` - Then proceed directly to Step 4. +**If MISSING**: append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Gap Analyzer","timestamp":"","result":"SKIPPED — figma.png not found","durationMs":0} +``` +Proceed to Step 4. -- **If the file exists**: spawn the `calibration-gap-analyzer` subagent. Provide: - - Screenshot paths: `/tmp/canicode-visual-compare/figma.png`, `/tmp/canicode-visual-compare/code.png`, `/tmp/canicode-visual-compare/diff.png` - - Similarity score from the Converter's output - - Generated HTML path: `/tmp/calibration-output.html` - - Figma URL - - Analysis JSON path: `logs/calibration/calibration-analysis.json` +**If EXISTS**: spawn the `calibration-gap-analyzer` subagent. In the prompt include: +- Screenshot paths: `$RUN_DIR/figma.png`, `$RUN_DIR/code.png`, `$RUN_DIR/diff.png` +- Similarity score, HTML path, fixture/URL, analysis JSON path +- The Converter's interpretations list +- **Tell the agent: "Return the gap analysis as JSON. Do NOT write any files."** - ``` - Append your summary to: - ``` +After the Gap Analyzer returns, **you** write the JSON to `$RUN_DIR/gaps.json`. -Gap data is saved to `logs/calibration/gaps/` and accumulates over time for rule discovery. 
+Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Gap Analyzer","timestamp":"","result":"gaps= actionable=","durationMs":} +``` ### Step 4 — Evaluation (CLI) ``` -npx canicode calibrate-evaluate logs/calibration/calibration-analysis.json logs/calibration/calibration-conversion.json +npx canicode calibrate-evaluate _ _ --run-dir $RUN_DIR +``` + +Read `$RUN_DIR/summary.md`, extract proposals. + +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Evaluation","timestamp":"","result":"overscored= underscored= validated= proposals=","durationMs":} ``` -Read the generated report, extract proposals. If zero proposals, stop. +If zero proposals, write `$RUN_DIR/debate.json` with skip reason and jump to Step 7: +```json +{"critic": null, "arbitrator": null, "skipped": "zero proposals from evaluation"} +``` ### Step 5 — Critic -Spawn the `calibration-critic` subagent. The prompt MUST include this exact line: +Spawn the `calibration-critic` subagent. In the prompt: +- Include only the proposal list (NOT the Converter's reasoning) +- **Tell the agent: "Return your reviews as JSON. Do NOT write any files."** -``` -Append your critique to: +After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`. + +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Critic","timestamp":"","result":"approved= rejected= revised=","durationMs":} ``` ### Step 6 — Arbitrator -Spawn the `calibration-arbitrator` subagent. The prompt MUST include this exact line: +Spawn the `calibration-arbitrator` subagent. In the prompt: +- Include proposals and the Critic's reviews +- **Tell the agent: "Return your decisions as JSON. Only edit rule-config.ts if applying changes. Do NOT write to logs."** + +After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — add the `arbitrator` field. 
+ +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Arbitrator","timestamp":"","result":"applied= rejected=","durationMs":} +``` + +### Step 7 — Generate Report ``` -Activity log: +npx canicode calibrate-gap-report --output logs/calibration/REPORT.md ``` ### Done -Report the final summary from the Arbitrator. +Report the final summary: similarity, proposals, decisions, and path to `logs/calibration/REPORT.md`. ## Rules @@ -113,5 +151,6 @@ Report the final summary from the Arbitrator. - Pass only structured data between agents — never raw reasoning. - The Critic must NOT see the Runner's or Converter's reasoning, only the proposal list. - Only the Arbitrator may edit `rule-config.ts`. -- Steps 1 and 3 are CLI commands — run them directly with Bash. -- **CRITICAL**: Every subagent prompt MUST contain the exact LOG_FILE path. Do NOT use placeholders. Paste the actual path string. +- Steps 1, 4, 7 are CLI commands — run them directly with Bash. +- **CRITICAL: YOU write all files to $RUN_DIR. Subagents (Gap Analyzer, Critic, Arbitrator) MUST return JSON as text — tell them "Do NOT write any files." You are the only one who writes to $RUN_DIR.** +- **CRITICAL: After each step, append to $RUN_DIR/activity.jsonl yourself. Do NOT rely on subagents to append.** diff --git a/.claude/commands/calibrate-loop.md b/.claude/commands/calibrate-loop.md index 0c07a313..7c400b0a 100644 --- a/.claude/commands/calibrate-loop.md +++ b/.claude/commands/calibrate-loop.md @@ -6,34 +6,37 @@ Input: $ARGUMENTS (fixture path, e.g. `fixtures/material3-kit.json`) You are the orchestrator. Do NOT make calibration decisions yourself. Only pass data between agents and run deterministic CLI steps. +**CRITICAL: You are responsible for writing ALL files to $RUN_DIR. Subagents return text/JSON — you write files. Never rely on a subagent to write to the correct path.** + ### Step 0 — Setup -Generate the activity log filename. Extract the fixture name (e.g. 
`fixtures/material3-kit.json` → `material3-kit`). Build the path: +Extract the fixture name (e.g. `fixtures/material3-kit.json` → `material3-kit`). Create the run directory: ``` -LOG_FILE=logs/activity/YYYY-MM-DD-HH-mm-.jsonl +RUN_DIR=logs/calibration/--/ +mkdir -p $RUN_DIR ``` -Create the file and write the first JSON Lines entry: +Create `$RUN_DIR/activity.jsonl` and write the first JSON Lines entry: ```json {"step":"session-start","timestamp":"","result":"Calibration activity log initialized","durationMs":0} ``` -The log uses **JSON Lines format** (one JSON object per line). Each entry has this shape: -```json -{"step":"","timestamp":"","result":"","durationMs":} -``` - -Store the exact path — you will paste it verbatim into every subagent prompt below. +Store the exact `RUN_DIR` path — you will paste it verbatim into every subagent prompt below. ### Step 1 — Analysis (CLI) ``` -npx canicode calibrate-analyze $ARGUMENTS --output logs/calibration/calibration-analysis.json +npx canicode calibrate-analyze $ARGUMENTS --run-dir $RUN_DIR ``` -Read `logs/calibration/calibration-analysis.json`. If `issueCount` is 0, stop here. +Read `$RUN_DIR/analysis.json`. If `issueCount` is 0, stop here. + +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Analysis","timestamp":"","result":"nodes= issues= grade=","durationMs":} +``` ### Step 2 — Converter @@ -45,67 +48,126 @@ Spawn a `general-purpose` subagent. In the prompt, include the full converter in Fixture path: fileKey: Root nodeId: -Activity log: -Append a brief summary to this EXACT file. Do NOT write to any other log file. +Run directory: +``` + +The Converter writes `output.html`, `conversion.json`, `design-tree.txt` to $RUN_DIR and runs `visual-compare --output $RUN_DIR` which creates `figma.png`, `code.png`, `diff.png`. 
+ +After the Converter returns, **verify** these files exist in $RUN_DIR: +```bash +ls $RUN_DIR/conversion.json $RUN_DIR/output.html ``` -The Converter will implement the ENTIRE design as one HTML page and run visual-compare. +If `conversion.json` is missing, write it yourself from the Converter's returned summary. + +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Converter","timestamp":"","result":"similarity=% difficulty=","durationMs":} +``` ### Step 3 — Gap Analysis -Before spawning the Gap Analyzer, check whether the visual-compare screenshots were produced by the Converter: +Check whether screenshots were produced: ```bash -test -f /tmp/canicode-visual-compare/figma.png && echo "EXISTS" || echo "MISSING" +test -f $RUN_DIR/figma.png && echo "EXISTS" || echo "MISSING" ``` -- **If `/tmp/canicode-visual-compare/figma.png` does NOT exist**: skip Gap Analyzer entirely. Log a warning to `LOG_FILE`: - ``` - WARNING: Gap Analyzer skipped — /tmp/canicode-visual-compare/figma.png not found. Visual-compare may have failed or been skipped by Converter. - ``` - Then proceed directly to Step 4. +**If MISSING**: append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Gap Analyzer","timestamp":"","result":"SKIPPED — figma.png not found","durationMs":0} +``` +Proceed to Step 4. -- **If the file exists**: spawn the `calibration-gap-analyzer` subagent. Provide: - - Screenshot paths: `/tmp/canicode-visual-compare/figma.png`, `/tmp/canicode-visual-compare/code.png`, `/tmp/canicode-visual-compare/diff.png` - - Similarity score from the Converter's output - - Generated HTML path: `/tmp/calibration-output.html` - - Fixture path - - Analysis JSON path: `logs/calibration/calibration-analysis.json` +**If EXISTS**: spawn the `calibration-gap-analyzer` subagent. 
In the prompt include: +- Screenshot paths: `$RUN_DIR/figma.png`, `$RUN_DIR/code.png`, `$RUN_DIR/diff.png` +- Similarity score from the Converter's output +- Generated HTML path: `$RUN_DIR/output.html` +- Fixture path and analysis JSON path: `$RUN_DIR/analysis.json` +- The Converter's interpretations list +- **Tell the agent: "Return the gap analysis as JSON. Do NOT write any files."** - ``` - Append your summary to: - ``` +After the Gap Analyzer returns, **you** write the JSON to `$RUN_DIR/gaps.json`. -Gap data is saved to `logs/calibration/gaps/` and accumulates over time for rule discovery. +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Gap Analyzer","timestamp":"","result":"gaps= actionable=","durationMs":} +``` ### Step 4 — Evaluation (CLI) - ``` -npx canicode calibrate-evaluate logs/calibration/calibration-analysis.json logs/calibration/calibration-conversion.json +npx canicode calibrate-evaluate _ _ --run-dir $RUN_DIR ``` -Read the generated report, extract proposals. If zero proposals, stop. +Read `$RUN_DIR/summary.md`, extract proposals. + +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Evaluation","timestamp":"","result":"overscored= underscored= validated= proposals=","durationMs":} +``` + +If zero proposals, write `$RUN_DIR/debate.json` with skip reason and jump to Step 7: +```json +{"critic": null, "arbitrator": null, "skipped": "zero proposals from evaluation"} +``` ### Step 5 — Critic -Spawn the `calibration-critic` subagent. The prompt MUST include this exact line: +Spawn the `calibration-critic` subagent. In the prompt: +- Include only the proposal list (NOT the Converter's reasoning) +- **Tell the agent: "Return your reviews as JSON. Do NOT write any files."** +After the Critic returns, **you** write the JSON to `$RUN_DIR/debate.json`: +```json +{ + "critic": { + "timestamp": "", + "summary": "approved= rejected= revised=", + "reviews": [ ... 
] + } +} ``` -Append your critique to: + +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Critic","timestamp":"","result":"approved= rejected= revised=","durationMs":} ``` ### Step 6 — Arbitrator -Spawn the `calibration-arbitrator` subagent. The prompt MUST include this exact line: +Spawn the `calibration-arbitrator` subagent. In the prompt: +- Include proposals and the Critic's reviews from `$RUN_DIR/debate.json` +- **Tell the agent: "Return your decisions as JSON. Only edit rule-config.ts if applying changes. Do NOT write to logs."** +After the Arbitrator returns, **you** update `$RUN_DIR/debate.json` — read the existing content and add the `arbitrator` field: +```json +{ + "critic": { ... }, + "arbitrator": { + "timestamp": "", + "summary": "applied= rejected= revised=", + "decisions": [ ... ] + } +} ``` -Activity log: + +Append to `$RUN_DIR/activity.jsonl`: +```json +{"step":"Arbitrator","timestamp":"","result":"applied= rejected=","durationMs":} ``` +### Step 7 — Generate Report + +``` +npx canicode calibrate-gap-report --output logs/calibration/REPORT.md +``` + +This aggregates all run directories into a single report. + ### Done -Report the final summary from the Arbitrator. +Report the final summary: similarity, proposals, decisions, and path to `logs/calibration/REPORT.md`. ## Rules @@ -113,5 +175,6 @@ Report the final summary from the Arbitrator. - Pass only structured data between agents — never raw reasoning. - The Critic must NOT see the Runner's or Converter's reasoning, only the proposal list. - Only the Arbitrator may edit `rule-config.ts`. -- Steps 1 and 3 are CLI commands — run them directly with Bash. -- **CRITICAL**: Every subagent prompt MUST contain the exact LOG_FILE path. Do NOT use placeholders. Paste the actual path string. +- Steps 1, 4, 7 are CLI commands — run them directly with Bash. +- **CRITICAL: YOU write all files to $RUN_DIR. 
Subagents (Gap Analyzer, Critic, Arbitrator) MUST return JSON as text — tell them "Do NOT write any files." You are the only one who writes to $RUN_DIR.** +- **CRITICAL: After each step, append to $RUN_DIR/activity.jsonl yourself. Do NOT rely on subagents to append.** diff --git a/.claude/commands/calibrate-night.md b/.claude/commands/calibrate-night.md new file mode 100644 index 00000000..ad98a06e --- /dev/null +++ b/.claude/commands/calibrate-night.md @@ -0,0 +1,73 @@ +Run nightly calibration across fixtures, then generate a gap-based rule review report. + +Input: $ARGUMENTS (optional: fixture directory path, default `fixtures`) + +## Instructions + +You are the nightly orchestrator. Scan for active fixtures, run `/calibrate-loop` on each, move converged ones to `done/`, then generate the aggregate report. + +### Step 0 — Discover fixtures + +Determine the fixture directory from the input (default: `fixtures`). + +```bash +ls /*.json +``` + +These are the **active** fixtures to calibrate. Fixtures in `/done/` have already converged and are skipped. + +If no `.json` files found, stop with a message: "No active fixtures found." + +### Step 1 — Run calibration for each fixture + +For each active fixture, run `/calibrate-loop` with that fixture path. + +- Run them **sequentially** (not in parallel) — each one modifies `rule-config.ts` +- If one fixture fails, log the failure and continue to the next +- Track pass/fail counts + +After each fixture, briefly report: +``` +[1/6] fixtures/material3-kit.json — Complete (applied=2) +[2/6] fixtures/simple-ds.json — Complete (applied=0, converged) +[3/6] fixtures/figma-ui3-kit.json — Failed (reason) +``` + +### Step 2 — Move converged fixtures + +After each successful run, check the run's `debate.json` for the Arbitrator's summary. + +If `applied=0` (no score changes were made), this fixture has converged: + +```bash +mkdir -p /done +mv /done/ +``` + +Report which fixtures were moved to `done/`. 
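The converged-fixture check above can be sketched as two small helpers. This is a minimal sketch under stated assumptions: `applied_count` and `maybe_retire_fixture` are illustrative names (not part of the repo), and the Arbitrator summary is assumed to have the form `applied=N rejected=M ...`.

```bash
#!/usr/bin/env bash
set -euo pipefail

# Hypothetical helper: pull N out of an Arbitrator summary such as
# "applied=2 rejected=1 revised=1"; prints "?" when no applied= token exists.
applied_count() {
  local part
  for part in $1; do
    case "$part" in
      applied=*) echo "${part#applied=}"; return 0 ;;
    esac
  done
  echo "?"
}

# Hypothetical helper: retire a fixture once a run applied zero changes.
maybe_retire_fixture() {
  local fixture="$1" summary="$2"
  if [ "$(applied_count "$summary")" = "0" ]; then
    mkdir -p "$(dirname "$fixture")/done"
    mv "$fixture" "$(dirname "$fixture")/done/"
    echo "converged: $fixture"
  fi
}
```

Keeping the parse in a function makes the convergence rule easy to tighten later (for example, requiring `applied=0` across two consecutive runs).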
+ +### Step 3 — Generate aggregate report + +After all fixtures are done, build and run the gap report: + +```bash +pnpm build +npx canicode calibrate-gap-report --output logs/calibration/REPORT.md +``` + +### Step 4 — Summary + +Report: +- How many fixtures ran / passed / failed / converged +- Which fixtures were moved to `done/` +- Which fixtures remain active +- Where the aggregate report is: `logs/calibration/REPORT.md` +- Remind: "Review the report, then run `/add-rule` when you want to implement a new rule." + +## Rules + +- Run fixtures sequentially, not in parallel. +- If a fixture fails, continue to the next — don't stop the whole run. +- Each `/calibrate-loop` creates its own run directory under `logs/calibration/`. +- Do NOT modify source files yourself — `/calibrate-loop` handles that via its agent pipeline. +- Only move a fixture to `done/` when `applied=0` — meaning the Arbitrator made zero changes. diff --git a/CLAUDE.md b/CLAUDE.md index 0692d2b1..0cc702b6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -87,7 +87,8 @@ Calibration commands are NOT exposed as CLI commands. They run exclusively insid - Input: fixture JSON path (e.g. `fixtures/material3-kit.json`) - Flow: Analysis → Converter (entire design → HTML + visual-compare) → Gap Analyzer → Evaluation → Critic → Arbitrator - Converter implements the full scoped design as one HTML page, runs `visual-compare` for pixel-level similarity -- Gap Analyzer examines the diff image, categorizes pixel differences, saves to `logs/calibration/gaps/` +- Gap Analyzer examines the diff image, categorizes pixel differences, saves to run directory +- Each run creates a self-contained directory: `logs/calibration/--/` - No Figma MCP or API keys needed — works fully offline - Auto-commits agreed score changes @@ -96,21 +97,41 @@ Calibration commands are NOT exposed as CLI commands. They run exclusively insid - Input: Figma URL (e.g. 
`https://www.figma.com/design/ABC123/MyDesign?node-id=1-234`)
- Flow: Same as `/calibrate-loop` but Converter uses Figma MCP `get_design_context` for richer style data

+**`/calibrate-night` (Claude Code command)**
+- Role: Run calibration on multiple fixtures sequentially, then generate aggregate report
+- Input: optional fixture directory path (default `fixtures`); runs every active `*.json`, skipping fixtures already moved to `done/`
+- Flow: sequential `/calibrate-loop` per fixture → `calibrate-gap-report` → `logs/calibration/REPORT.md`
+
 **`/add-rule` (Claude Code command)**
 - Role: Research, design, implement, and evaluate new analysis rules
 - Input: concept + fixture path (e.g. `"component description" fixtures/material3-kit.json`)
 - Flow: Researcher → Designer → Implementer → A/B Visual Validation → Evaluator → Critic
-- Researcher reads accumulated gap data from `logs/calibration/gaps/` to find recurring patterns
+- Researcher reads accumulated gap data from `logs/calibration/*/gaps.json` to find recurring patterns
+- Each run creates a directory: `logs/rule-discovery/--/`
 - A/B Validation: implements entire design with/without the rule's data, compares similarity
 - Critic decides KEEP / ADJUST / DROP

 ### File Output Structure

 ```
-reports/          # HTML reports (canicode analyze)
-logs/calibration/ # Calibration analysis results (internal)
-logs/calibration/gaps/ # Accumulated gap analysis data (internal)
-logs/activity/    # Agent activity logs (internal)
+reports/                      # HTML reports (canicode analyze)
+logs/calibration/             # Calibration runs (internal)
+logs/calibration/--/          # One calibration run = one folder
+  ├── analysis.json           # Rule analysis result
+  ├── conversion.json         # HTML conversion + similarity
+  ├── gaps.json               # Pixel gap analysis
+  ├── debate.json             # Critic + Arbitrator decisions
+  ├── activity.jsonl          # Agent step-by-step timeline
+  ├── summary.md              # Human-readable summary
+  ├── output.html             # Generated HTML page
+  ├── design-tree.txt         # Design tree (structure)
+  ├── figma.png               # Figma screenshot
+  ├── code.png                # Code
rendering screenshot + └── diff.png # Pixel diff image +logs/calibration/REPORT.md # Cross-run aggregate report +logs/rule-discovery/ # Rule discovery runs (internal) +logs/rule-discovery/--/ # One rule discovery = one folder +logs/activity/ # Nightly orchestration logs ``` ## Analysis Scope Policy @@ -192,7 +213,7 @@ Process: 5. Compare conversion difficulty vs rule scores (`canicode calibrate-evaluate`) 6. 6-agent debate loop (`/calibrate-loop`): Analysis → Converter → Gap Analyzer → Evaluation → Critic → Arbitrator -Gap data accumulates in `logs/calibration/gaps/` and feeds into rule discovery (`/add-rule`). +Gap data accumulates in each run's `gaps.json` and feeds into rule discovery (`/add-rule`). Final score adjustments in `rule-config.ts` are always reviewed by the developer via the Arbitrator's decisions. diff --git a/PRIVACY.md b/PRIVACY.md index 0fc8fbf6..dc5c505f 100644 --- a/PRIVACY.md +++ b/PRIVACY.md @@ -60,8 +60,9 @@ CanICode stores data locally on your machine: |----------|----------| | `~/.canicode/config.json` | User configuration (Figma token, telemetry preference) | | `reports/` (project directory) | Generated HTML analysis reports | -| `logs/calibration/` | Calibration analysis results (internal/development use) | -| `logs/activity/` | Agent activity logs (internal/development use) | +| `logs/calibration/` | Calibration run data — each run in its own directory (internal/development use) | +| `logs/rule-discovery/` | Rule discovery run data (internal/development use) | +| `logs/activity/` | Nightly orchestration logs (internal/development use) | No local data is uploaded or shared unless you explicitly choose to do so. diff --git a/docs/CALIBRATION-PLAYBOOK.md b/docs/CALIBRATION-PLAYBOOK.md new file mode 100644 index 00000000..eb888c15 --- /dev/null +++ b/docs/CALIBRATION-PLAYBOOK.md @@ -0,0 +1,214 @@ +# Calibration & Rule Discovery Playbook + +How to run calibration, review results, and discover new rules. 
For technical details on the pipeline architecture, see [CALIBRATION.md](./CALIBRATION.md). + +--- + +## 1. Fixture Preparation + +Save Figma designs as local JSON fixtures for offline analysis: + +```bash +npx canicode save-fixture "https://www.figma.com/design/ABC123/MyDesign?node-id=1-234" +# → fixtures/ABC123.json (includes sourceUrl for future reference) +``` + +- One fixture = one scoped section or page (not a full file) +- Add `?node-id=` to scope to a specific section +- Fixtures in `fixtures/` are active; converged ones get moved to `fixtures/done/` + +--- + +## 2. Single Calibration Run + +``` +/calibrate-loop fixtures/material3-kit-1.json +``` + +### What happens + +| Step | Agent | Output | Description | +|------|-------|--------|-------------| +| 0 | Orchestrator | Run directory created | `logs/calibration/--/` | +| 1 | CLI | `analysis.json` | Rule analysis — which rules flagged what | +| 2 | Converter | `output.html`, `figma.png`, `code.png`, `diff.png`, `conversion.json` | Implements the entire design as HTML, runs visual-compare | +| 3 | Gap Analyzer | `gaps.json` | Categorizes pixel differences between Figma and code | +| 4 | CLI | `summary.md` | Score vs actual impact comparison | +| 5 | Critic | `debate.json` | Reviews proposals: APPROVE / REJECT / REVISE | +| 6 | Arbitrator | `debate.json` (appended), `rule-config.ts` | Makes final decisions, applies approved changes, commits | + +### What you see + +``` +Arbitrator result: + applied=2: raw-color (-10 → -7), inconsistent-spacing (-8 → -6) + rejected=1: missing-token (insufficient evidence) +``` + +### Your decision + +None — fully automatic. Review the commit if you want. + +--- + +## 3. 
Nightly Calibration (Multiple Fixtures) + +### In Claude Code / Cursor + +``` +/calibrate-night +``` + +### On a server + +```bash +./scripts/calibrate-night.sh +./scripts/calibrate-night.sh --deep # uses Figma MCP for richer data +``` + +### What happens + +``` +Scan fixtures/*.json → 6 active fixtures found + +[1/6] fixtures/material3-kit-1.json — Complete (applied=2) +[2/6] fixtures/material3-kit-2.json — Complete (applied=0, converged) + → moved to fixtures/done/ +[3/6] fixtures/simple-ds-card-grid.json — Complete (applied=1) +[4/6] fixtures/simple-ds-page.json — Complete (applied=0, converged) + → moved to fixtures/done/ +[5/6] fixtures/simple-ds-panel.json — Failed (timeout) +[6/6] fixtures/figma-ui3-kit.json — Complete (applied=3) + +Phase 2: logs/calibration/REPORT.md generated +``` + +- `applied=0` means the fixture has converged (scores are stable) → moved to `fixtures/done/` +- Next run automatically skips converged fixtures +- To re-calibrate a converged fixture, move it back from `fixtures/done/` to `fixtures/` + +--- + +## 4. Reviewing the Report + +Open `logs/calibration/REPORT.md` the next morning. Key sections: + +| Section | What to look for | Action | +|---------|-----------------|--------| +| **Similarity per run** | Low similarity = hard design | Consider adding more rules for that pattern | +| **Repeating patterns** | Same gap in 3+ fixtures | Strong candidate for `/add-rule` | +| **Rule score vs impact** | Overscored in most runs | Score will auto-adjust in next calibration | +| **New rule candidates** | `text-alignment-mismatch` in 4/6 | Run `/add-rule` | +| **Never flagged rules** | Rule never triggered | Consider `enabled: false` in `rule-config.ts` | + +### Decisions you make + +- **"This pattern should be a rule"** → Go to Step 5 (Rule Discovery) +- **"This rule is overscored"** → Next nightly will auto-adjust, or manually edit `rule-config.ts` +- **"This rule never fires"** → Set `enabled: false` in `rule-config.ts` + +--- + +## 5. 
Rule Discovery + +When the report identifies a new pattern worth codifying: + +``` +/add-rule "text-alignment-mismatch" fixtures/material3-kit-1.json +``` + +### What happens + +| Step | Agent | Output | Description | +|------|-------|--------|-------------| +| 0 | Orchestrator | Run directory | `logs/rule-discovery/--/` | +| 1 | Researcher | `research.json` | Checks if the concept exists in fixture data, reads accumulated gaps | +| 2 | Designer | `design.json` | Proposes rule spec: ID, category, severity, score, trigger logic | +| 3 | Implementer | Source code | Writes rule code + tests, builds | +| 4 | Orchestrator | `visual-a.html`, `visual-b.html` | A/B test: converts design with/without the rule's data | +| 5 | Evaluator | `evaluation.json` | Measures false positive rate, visual improvement | +| 6 | Critic | `decision.json` | Final verdict | + +### Possible outcomes + +| Decision | Meaning | What happens | +|----------|---------|-------------| +| **KEEP** | Rule is valuable | Auto-committed: `feat: add rule ` | +| **ADJUST** | Good idea, tweak needed | Score/severity adjusted, then committed | +| **DROP** | Not worth it | All source changes reverted | + +### Early stops + +- **Researcher says not feasible** → Pipeline stops at Step 1 +- **Build/test fails** → Implementer attempts fix; if can't, pipeline stops +- **A/B shows no improvement** → Evaluator likely recommends DROP + +### Your decision + +None during execution — fully automatic. After completion: +- If KEEP/ADJUST: review the commit, revert if you disagree +- If DROP: nothing to do, code was already reverted + +--- + +## 6. Inspecting a Run + +Every run is a self-contained directory. 
Open it to see everything: + +```bash +ls logs/calibration/material3-kit-1--2026-03-24-0800/ +``` + +``` +analysis.json # Which rules flagged what +conversion.json # Conversion result + similarity score +output.html # Generated HTML (open in browser) +design-tree.txt # Design tree used for conversion +figma.png # Original Figma screenshot +code.png # AI-generated code screenshot +diff.png # Pixel diff (red = differences) +gaps.json # Why differences exist, categorized +debate.json # Critic + Arbitrator decisions +activity.jsonl # Step-by-step timeline with durations +summary.md # Human-readable summary +``` + +For rule discovery: + +```bash +ls logs/rule-discovery/text-alignment-mismatch--2026-03-25/ +``` + +``` +research.json # Researcher findings +design.json # Rule specification +evaluation.json # Test results + verdict +decision.json # Critic's KEEP/ADJUST/DROP +activity.jsonl # Timeline +summary.md # Human-readable summary +``` + +--- + +## 7. Full Cycle + +``` + Prepare fixtures (save-fixture) + ↓ + Nightly (/calibrate-night) ←────────────────┐ + scan fixtures/*.json │ + run /calibrate-loop per fixture │ + converged → fixtures/done/ │ + generate REPORT.md │ + ↓ │ + Morning review (REPORT.md) │ + repeating gaps → new rule candidates │ + overscored rules → auto-adjusted next run │ + never-flagged → disable manually │ + ↓ │ + Rule discovery (/add-rule) │ + 6-agent pipeline │ + KEEP / ADJUST / DROP │ + ↓ │ + New rule included in next calibration ───────┘ +``` diff --git a/docs/CALIBRATION.md b/docs/CALIBRATION.md index adae5080..1029a4e5 100644 --- a/docs/CALIBRATION.md +++ b/docs/CALIBRATION.md @@ -99,7 +99,7 @@ Each adjustment requires: - Severity changes require high confidence - All changes committed with fixture source and reasoning -The full calibration log is auto-generated via `/calibrate-loop` and stored in `logs/activity/`. +The full calibration log is auto-generated via `/calibrate-loop` and stored in `logs/calibration/--/activity.jsonl`. 
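The per-run activity log is plain JSON Lines, one object per step. Appending an entry can be sketched as follows; `log_step` and the field values are illustrative, not part of the CLI.

```bash
#!/usr/bin/env bash
set -euo pipefail

# Hypothetical helper: append one JSON Lines entry per pipeline step.
# Note: printf does no JSON escaping, so step/result must not contain
# double quotes or backslashes.
log_step() {
  local run_dir="$1" step="$2" result="$3" duration_ms="$4"
  printf '{"step":"%s","timestamp":"%s","result":"%s","durationMs":%s}\n' \
    "$step" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$result" "$duration_ms" \
    >> "$run_dir/activity.jsonl"
}

RUN_DIR="$(mktemp -d)"   # a real run would use its logs/calibration/ directory
log_step "$RUN_DIR" "Analysis" "nodes=120 issues=14 grade=B" 5200
cat "$RUN_DIR/activity.jsonl"
```

Because each entry is a complete object on one line, the log stays valid even if a run is interrupted mid-pipeline, and it can be inspected line by line with ordinary text tools.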
## Running Calibration @@ -219,7 +219,7 @@ The Gap Analyzer examines the diff image between Figma screenshot and AI-generat - **Actionable but no rule?** → candidate for rule discovery - **Rendering artifact?** → not actionable (font smoothing, anti-aliasing) -Gap data accumulates in `logs/calibration/gaps/` across runs. The rule discovery pipeline reads this data to find recurring patterns worth turning into new rules. +Gap data accumulates in each run's `gaps.json` file (`logs/calibration/*/gaps.json`). The rule discovery pipeline reads this data to find recurring patterns worth turning into new rules. --- @@ -243,7 +243,7 @@ Step 6 — Critic: decide KEEP / ADJUST / DROP ``` Calibration runs accumulate gap data ↓ -logs/calibration/gaps/*.json +logs/calibration/*/gaps.json (one per run directory) ↓ Researcher reads accumulated gaps ↓ diff --git a/scripts/calibrate-night.sh b/scripts/calibrate-night.sh index 46ece564..e539d556 100755 --- a/scripts/calibrate-night.sh +++ b/scripts/calibrate-night.sh @@ -1,186 +1,220 @@ #!/usr/bin/env bash set -euo pipefail -# Nightly calibration script (loop mode, fixture-based) -# Runs /calibrate-loop against local JSON fixtures, repeats if changes detected. -# Stops on: no changes, max cycles reached, or error. +# Nightly calibration: scan fixtures directory, run calibration, move converged fixtures to done/. +# +# Each /calibrate-loop invocation creates its own run directory under logs/calibration/--/. +# +# Phase 1 — For each active fixture (fixtures/*.json): run calibration. +# If applied=0 (converged), move fixture to fixtures/done/. +# Phase 2 — canicode calibrate-gap-report → logs/calibration/REPORT.md +# Phase 3 — Manual: review the report, then run /add-rule in Claude Code. 
+#
# Usage:
-#   ./scripts/calibrate-night.sh        # fixture-only (fast)
-#   ./scripts/calibrate-night.sh --deep # Figma MCP deep validation
+#   ./scripts/calibrate-night.sh                      # scan fixtures/ dir
+#   ./scripts/calibrate-night.sh --fixture-dir path/  # custom fixture directory
+#   ./scripts/calibrate-night.sh --deep               # uses /calibrate-loop-deep
+#
+# Optional:
+#   CALIBRATE_SKIP_PHASE2=1 — only Phase 1 (no gap report)
+#   CALIBRATE_SKIP_BUILD=1  — skip pnpm build before Phase 2
+#   CALIBRATE_AUTO_COMMIT=1 — git commit + push at end

-MAX_CYCLES=5
-WAIT_SECONDS=1800  # 30 minutes
 COMMAND="/calibrate-loop"
-
-# ── Parse flags ────────────────────────────────────────────────────
+FIXTURE_DIR="fixtures"

+# Parse flags without consuming "$@": the caffeinate re-exec below passes
+# "$@" through again, so the argument list must stay intact. A shift-based
+# parse would also misread the value when --fixture-dir follows --deep.
+EXPECT_FIXTURE_DIR=0
 for arg in "$@"; do
+  if [ "$EXPECT_FIXTURE_DIR" = "1" ]; then
+    FIXTURE_DIR="$arg"
+    EXPECT_FIXTURE_DIR=0
+    continue
+  fi
   case "$arg" in
     --deep)
       COMMAND="/calibrate-loop-deep"
+      ;;
+    --fixture-dir)
+      EXPECT_FIXTURE_DIR=1
       ;;
   esac
 done

-# ── Load .env ───────────────────────────────────────────────────────
-
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$PROJECT_ROOT"

 if [ -f "$PROJECT_ROOT/.env" ]; then
   set -a
+  # shellcheck source=/dev/null
   source "$PROJECT_ROOT/.env"
   set +a
 fi

-# ── Validate env vars ───────────────────────────────────────────────
-
-if [ -z "${CALIBRATE_FIXTURES:-}" ]; then
-  echo "Error: CALIBRATE_FIXTURES is not set."
-  echo ""
-  echo "Usage:"
-  echo "  export CALIBRATE_FIXTURES=\"fixtures/a.json,fixtures/b.json\""
-  echo "  ./scripts/calibrate-night.sh"
-  echo ""
-  echo "Or add to .env:"
-  echo "  CALIBRATE_FIXTURES=fixtures/material3-kit.json,fixtures/simple-ds-card-grid.json"
-  exit 1
-fi
-
-# Split comma-separated list into array
-IFS=',' read -ra FIXTURES <<< "$CALIBRATE_FIXTURES"
-
-# Verify all fixtures exist
-for f in "${FIXTURES[@]}"; do
-  if [ !
-f "$f" ]; then - echo "Error: Fixture not found: $f" - exit 1 - fi +# Discover active fixtures (skip done/) +FIXTURES=() +for f in "$FIXTURE_DIR"/*.json; do + [ -f "$f" ] && FIXTURES+=("$f") done -# ── Logging setup ─────────────────────────────────────────────────── +if [ ${#FIXTURES[@]} -eq 0 ]; then + echo "No active fixtures found in $FIXTURE_DIR/*.json" + echo "All fixtures may have converged (moved to $FIXTURE_DIR/done/)." + exit 0 +fi -LOG_DIR="logs/activity" -mkdir -p "$LOG_DIR" +# Nightly-level log (tracks the orchestration itself) +NIGHTLY_LOG_DIR="logs/activity" +mkdir -p "$NIGHTLY_LOG_DIR" DATETIME=$(date +%Y-%m-%d-%H-%M) -LOG_FILE="$LOG_DIR/${DATETIME}-nightly.md" +NIGHTLY_LOG="$NIGHTLY_LOG_DIR/${DATETIME}-nightly.md" log() { local timestamp timestamp=$(date +%H:%M) - echo "## $timestamp — $1" >> "$LOG_FILE" - echo "" >> "$LOG_FILE" + echo "## $timestamp — $1" >> "$NIGHTLY_LOG" + echo "" >> "$NIGHTLY_LOG" if [ -n "${2:-}" ]; then - echo "$2" >> "$LOG_FILE" - echo "" >> "$LOG_FILE" + echo "$2" >> "$NIGHTLY_LOG" + echo "" >> "$NIGHTLY_LOG" fi } -echo "# Calibration Activity Log — $DATETIME" > "$LOG_FILE" -echo "" >> "$LOG_FILE" - -# ── Prevent sleep (re-exec under caffeinate if not already) ───────── +echo "# Calibration night — $DATETIME" > "$NIGHTLY_LOG" +echo "" >> "$NIGHTLY_LOG" if [ -z "${CAFFEINATED:-}" ]; then echo "Wrapping in caffeinate to prevent sleep..." 
CAFFEINATED=1 exec caffeinate -i "$0" "$@" fi -# ── Cycle loop ────────────────────────────────────────────────────── - TOTAL_START=$SECONDS -STOP_REASON="" - -log "Nightly Calibration Started" "Command: $COMMAND | Max cycles: $MAX_CYCLES | Wait between cycles: ${WAIT_SECONDS}s | Fixtures: ${#FIXTURES[@]}" -echo "Starting nightly calibration (max $MAX_CYCLES cycles, ${#FIXTURES[@]} fixtures per cycle, command: $COMMAND)" -echo "" - -for cycle in $(seq 1 "$MAX_CYCLES"); do - CYCLE_START=$SECONDS - PASS=0 - FAIL=0 +BEFORE_HASH=$(git hash-object src/rules/rule-config.ts 2>/dev/null || echo "none") - echo "=== Cycle $cycle/$MAX_CYCLES ===" - log "Cycle $cycle Start" "Fixtures: ${#FIXTURES[@]}" +log "Phase 1 started" "Command: $COMMAND | Active fixtures: ${#FIXTURES[@]}" - # Snapshot rule-config.ts before this cycle - BEFORE_HASH=$(git hash-object src/rules/rule-config.ts 2>/dev/null || echo "none") - - for i in "${!FIXTURES[@]}"; do - fixture="${FIXTURES[$i]}" - idx=$((i + 1)) - - echo " [$idx/${#FIXTURES[@]}] $fixture" - log "Cycle $cycle — Fixture $idx Start" "File: $fixture" +echo "Phase 1: calibrate ${#FIXTURES[@]} active fixture(s) with ${COMMAND}" +echo " (converged fixtures in $FIXTURE_DIR/done/ are skipped)" +echo "" - RUN_START=$SECONDS +PASS=0 +FAIL=0 +CONVERGED=0 +CONVERGED_LIST="" + +for i in "${!FIXTURES[@]}"; do + fixture="${FIXTURES[$i]}" + idx=$((i + 1)) + base="$(basename "$fixture" .json)" + + echo " [$idx/${#FIXTURES[@]}] $fixture" + log "Fixture $idx start" "File: $fixture" + + RUN_START=$SECONDS + if claude --dangerously-skip-permissions "$COMMAND" "$fixture"; then + DURATION=$(( SECONDS - RUN_START )) + + # Check if converged: find the latest run dir for this fixture and check debate.json + LATEST_RUN_DIR=$(ls -d logs/calibration/"${base}"--* 2>/dev/null | sort | tail -1) + APPLIED="?" 
+      if [ -n "$LATEST_RUN_DIR" ] && [ -f "$LATEST_RUN_DIR/debate.json" ]; then
+        # Extract applied count from the arbitrator summary.
+        # "or {}" also covers the skipped case where "arbitrator" is null.
+        APPLIED=$(python3 -c "
+import json, sys
+try:
+    d = json.load(open('$LATEST_RUN_DIR/debate.json'))
+    s = (d.get('arbitrator') or {}).get('summary', '')
+    # Parse 'applied=N' from the summary string
+    for part in s.split():
+        if part.startswith('applied='):
+            print(part.split('=')[1])
+            sys.exit(0)
+    print('?')
+except Exception:
+    print('?')
+" 2>/dev/null || echo "?")
+      fi

-    if claude --dangerously-skip-permissions "$COMMAND" "$fixture"; then
-      DURATION=$(( SECONDS - RUN_START ))
-      log "Cycle $cycle — Fixture $idx Complete" "Duration: ${DURATION}s"
-      echo "    Complete (${DURATION}s)"
-      PASS=$((PASS + 1))
+      if [ "$APPLIED" = "0" ]; then
+        # Converged — move fixture to done/
+        mkdir -p "$FIXTURE_DIR/done"
+        mv "$fixture" "$FIXTURE_DIR/done/"
+        CONVERGED=$((CONVERGED + 1))
+        CONVERGED_LIST="${CONVERGED_LIST}  → $fixture (moved to done/)\n"
+        log "Fixture $idx converged" "Duration: ${DURATION}s — moved to done/"
+        echo "    Complete (${DURATION}s) — converged, moved to done/"
       else
-        DURATION=$(( SECONDS - RUN_START ))
-        log "Cycle $cycle — Fixture $idx Failed" "Duration: ${DURATION}s — exit code: $?"
- echo " Failed (${DURATION}s)" - FAIL=$((FAIL + 1)) + log "Fixture $idx complete" "Duration: ${DURATION}s — applied=$APPLIED" + echo " Complete (${DURATION}s) — applied=$APPLIED" fi - done + PASS=$((PASS + 1)) + else + DURATION=$(( SECONDS - RUN_START )) + log "Fixture $idx failed" "Duration: ${DURATION}s" + echo " Failed (${DURATION}s)" + FAIL=$((FAIL + 1)) + fi +done - CYCLE_DURATION=$(( SECONDS - CYCLE_START )) +PHASE1_DURATION=$(( SECONDS - TOTAL_START )) +log "Phase 1 finished" "Passed: $PASS | Failed: $FAIL | Converged: $CONVERGED | Duration: ${PHASE1_DURATION}s" - # Check if rule-config.ts changed - AFTER_HASH=$(git hash-object src/rules/rule-config.ts 2>/dev/null || echo "none") - HAS_CHANGES=false - if [ "$BEFORE_HASH" != "$AFTER_HASH" ]; then - HAS_CHANGES=true - fi +echo "" +echo "Phase 1 done: ${PASS} passed, ${FAIL} failed, ${CONVERGED} converged (${PHASE1_DURATION}s)" +if [ -n "$CONVERGED_LIST" ]; then + echo -e "$CONVERGED_LIST" +fi +echo "" - # Commit & push if changed - if [ "$HAS_CHANGES" = true ]; then - git add src/rules/rule-config.ts logs/ - git commit -m "chore: calibrate rule scores — cycle $cycle ($DATETIME) +GAP_REPORT_PATH="logs/calibration/REPORT.md" -Passed: $PASS / ${#FIXTURES[@]}, Failed: $FAIL -Cycle duration: ${CYCLE_DURATION}s" - git push - log "Cycle $cycle Complete — Pushed" "Duration: ${CYCLE_DURATION}s | Passed: $PASS | Failed: $FAIL | Changes committed." - echo " Cycle $cycle: changes committed and pushed (${CYCLE_DURATION}s)" - else - log "Cycle $cycle Complete — No Changes" "Duration: ${CYCLE_DURATION}s | Passed: $PASS | Failed: $FAIL" - echo " Cycle $cycle: no changes (${CYCLE_DURATION}s)" - STOP_REASON="no-changes" - break +if [ -z "${CALIBRATE_SKIP_PHASE2:-}" ]; then + echo "Phase 2: gap rule review report → ${GAP_REPORT_PATH}" + + if [ -z "${CALIBRATE_SKIP_BUILD:-}" ]; then + pnpm build fi - # Check if this was the last cycle - if [ "$cycle" -eq "$MAX_CYCLES" ]; then - STOP_REASON="max-cycles" - break + if [ ! 
-f dist/cli/index.js ]; then + echo "Error: dist/cli/index.js not found. Run pnpm build or unset CALIBRATE_SKIP_BUILD." + exit 1 fi - # Wait before next cycle - echo "" - echo " Waiting ${WAIT_SECONDS}s before next cycle..." - log "Waiting" "${WAIT_SECONDS}s until cycle $((cycle + 1))" - sleep "$WAIT_SECONDS" -done + node dist/cli/index.js calibrate-gap-report --output "$GAP_REPORT_PATH" -# ── Final summary ────────────────────────────────────────────────── + log "Phase 2 complete" "Report: ${GAP_REPORT_PATH}" + echo "" + echo "Phase 2 done." + echo " Report: ${GAP_REPORT_PATH}" + echo "" + echo "Phase 3 (manual): read the report, then run /add-rule in Claude Code when you add a rule." +else + echo "Phase 2 skipped (CALIBRATE_SKIP_PHASE2=1)." +fi TOTAL_DURATION=$(( SECONDS - TOTAL_START )) +REMAINING=$(ls "$FIXTURE_DIR"/*.json 2>/dev/null | wc -l | tr -d ' ') +log "Nightly finished" "Total: ${TOTAL_DURATION}s | Remaining active: $REMAINING | Converged: $CONVERGED" -if [ -z "$STOP_REASON" ]; then - STOP_REASON="completed" +echo "Log: $NIGHTLY_LOG" +echo "Active fixtures remaining: $REMAINING" +echo "Total time: ${TOTAL_DURATION}s" + +AFTER_HASH=$(git hash-object src/rules/rule-config.ts 2>/dev/null || echo "none") +HAS_CHANGES=false +if [ "$BEFORE_HASH" != "$AFTER_HASH" ]; then + HAS_CHANGES=true fi -log "Nightly Calibration Finished" "Reason: $STOP_REASON | Total duration: ${TOTAL_DURATION}s" +if [ "${CALIBRATE_AUTO_COMMIT:-}" = "1" ]; then + if [ "$HAS_CHANGES" = true ] || [ -n "$(git status --porcelain logs/ 2>/dev/null)" ]; then + git add src/rules/rule-config.ts logs/ || true + if git diff --cached --quiet; then + echo "No staged changes to commit." + else + git commit -m "chore: nightly calibration — ${DATETIME} -echo "" -echo "Nightly calibration finished." 
-echo " Reason: $STOP_REASON" -echo " Total: ${TOTAL_DURATION}s" -echo " Log: $LOG_FILE" +Phase 1: ${PASS}/${#FIXTURES[@]} passed, ${CONVERGED} converged +Report: ${GAP_REPORT_PATH}" + git push + echo "Committed and pushed calibration changes." + fi + else + echo "No rule-config or logs changes to commit." + fi +fi diff --git a/src/agents/activity-logger.test.ts b/src/agents/activity-logger.test.ts index 336ea0c1..407a488b 100644 --- a/src/agents/activity-logger.test.ts +++ b/src/agents/activity-logger.test.ts @@ -27,8 +27,8 @@ describe("ActivityLogger", () => { }); it("logStep creates directory and file if they don't exist, file contains step data", async () => { - const logDir = join(tempDir, "nested", "logs"); - const logger = new ActivityLogger("fixtures/http-design.json", logDir); + const runDir = join(tempDir, "nested", "run"); + const logger = new ActivityLogger(runDir); await logger.logStep({ step: "Analyze Node", @@ -39,6 +39,7 @@ describe("ActivityLogger", () => { const logPath = logger.getLogPath(); expect(existsSync(logPath)).toBe(true); + expect(logPath).toContain("activity.jsonl"); const entries = readJsonLines(logPath); // First entry is the session-start header, second is our step @@ -51,7 +52,7 @@ describe("ActivityLogger", () => { }); it("logStep with nodePath includes nodePath field in entry", async () => { - const logger = new ActivityLogger("fixtures/sample.json", tempDir); + const logger = new ActivityLogger(tempDir); await logger.logStep({ step: "Convert Component", @@ -66,7 +67,7 @@ describe("ActivityLogger", () => { }); it("logStep without nodePath omits nodePath field", async () => { - const logger = new ActivityLogger("fixtures/sample.json", tempDir); + const logger = new ActivityLogger(tempDir); await logger.logStep({ step: "Initialize Pipeline", @@ -83,7 +84,7 @@ describe("ActivityLogger", () => { }); it("logSummary writes summary entry with all fields", async () => { - const logger = new ActivityLogger("fixtures/sample.json", tempDir); 
+ const logger = new ActivityLogger(tempDir); await logger.logSummary({ totalDurationMs: 5000, @@ -106,7 +107,7 @@ describe("ActivityLogger", () => { }); it("multiple logStep calls append to the same file (not overwrite)", async () => { - const logger = new ActivityLogger("fixtures/sample.json", tempDir); + const logger = new ActivityLogger(tempDir); await logger.logStep({ step: "First Step", @@ -130,24 +131,11 @@ describe("ActivityLogger", () => { expect(secondEntry!["result"]).toBe("done"); }); - it("getLogPath contains fixture name, datetime, and .jsonl extension", () => { - const logger = new ActivityLogger("fixtures/http-design.json", tempDir); + it("getLogPath returns activity.jsonl inside the run directory", () => { + const logger = new ActivityLogger(tempDir); const logPath = logger.getLogPath(); - expect(logPath).toContain("http-design"); - expect(logPath.endsWith(".jsonl")).toBe(true); - - const now = new Date(); - const year = now.getFullYear(); - const month = String(now.getMonth() + 1).padStart(2, "0"); - const day = String(now.getDate()).padStart(2, "0"); - const todayStr = `${year}-${month}-${day}`; - - expect(logPath).toContain(todayStr); - }); - - it("defaults fixture name to unknown when not provided", () => { - const logger = new ActivityLogger(undefined, tempDir); - expect(logger.getLogPath()).toContain("unknown"); + expect(logPath.endsWith("activity.jsonl")).toBe(true); + expect(logPath).toContain(tempDir); }); }); diff --git a/src/agents/activity-logger.ts b/src/agents/activity-logger.ts index 44465aa4..59639687 100644 --- a/src/agents/activity-logger.ts +++ b/src/agents/activity-logger.ts @@ -1,6 +1,6 @@ import { existsSync, mkdirSync } from "node:fs"; import { appendFile, writeFile } from "node:fs/promises"; -import { resolve, dirname } from "node:path"; +import { resolve, join } from "node:path"; export interface ActivityStep { step: string; @@ -13,33 +13,12 @@ function getIsoTimestamp(): string { return new Date().toISOString(); } 
-function getDateTimeString(): string {
-  const now = new Date();
-  const year = now.getFullYear();
-  const month = String(now.getMonth() + 1).padStart(2, "0");
-  const day = String(now.getDate()).padStart(2, "0");
-  const hours = String(now.getHours()).padStart(2, "0");
-  const minutes = String(now.getMinutes()).padStart(2, "0");
-  return `${year}-${month}-${day}-${hours}-${minutes}`;
-}
-
-/**
- * Extract a short fixture name from a file path.
- * e.g. "fixtures/http-design.json" → "http-design"
- */
-function extractFixtureName(fixturePath: string): string {
-  const fileName = fixturePath.split("/").pop() ?? fixturePath;
-  return fileName.replace(/\.json$/, "");
-}
-
 export class ActivityLogger {
   private logPath: string;
   private initialized = false;
 
-  constructor(fixturePath?: string, logDir = "logs/activity") {
-    const dateTimeStr = getDateTimeString();
-    const fixtureName = fixturePath ? extractFixtureName(fixturePath) : "unknown";
-    this.logPath = resolve(logDir, `${dateTimeStr}-${fixtureName}.jsonl`);
+  constructor(runDir: string) {
+    this.logPath = resolve(join(runDir, "activity.jsonl"));
   }
 
   /**
@@ -48,7 +27,7 @@ export class ActivityLogger {
   private async ensureInitialized(): Promise<void> {
     if (this.initialized) return;
 
-    const dir = dirname(this.logPath);
+    const dir = resolve(this.logPath, "..");
     if (!existsSync(dir)) {
       mkdirSync(dir, { recursive: true });
     }
diff --git a/src/agents/contracts/calibration.ts b/src/agents/contracts/calibration.ts
index 635d48ff..940ddfc0 100644
--- a/src/agents/contracts/calibration.ts
+++ b/src/agents/contracts/calibration.ts
@@ -21,6 +21,7 @@ export const CalibrationConfigSchema = z.object({
   maxConversionNodes: z.number().int().positive().default(20),
   samplingStrategy: SamplingStrategySchema.default("top-issues"),
   outputPath: z.string().default("logs/calibration/calibration-report.md"),
+  runDir: z.string().optional(),
 });
 
 export type CalibrationConfig = z.infer<typeof CalibrationConfigSchema>;
diff --git a/src/agents/gap-rule-report.test.ts
b/src/agents/gap-rule-report.test.ts new file mode 100644 index 00000000..29e66815 --- /dev/null +++ b/src/agents/gap-rule-report.test.ts @@ -0,0 +1,109 @@ +import { mkdirSync, writeFileSync, rmSync } from "node:fs"; +import { join } from "node:path"; +import { generateGapRuleReport } from "./gap-rule-report.js"; + +describe("generateGapRuleReport", () => { + const tmpRoot = join(process.cwd(), "logs/calibration/.gap-report-test"); + + afterEach(() => { + try { + rmSync(tmpRoot, { recursive: true, force: true }); + } catch { + /* ignore */ + } + }); + + it("aggregates gap files from run directories and writes markdown sections", () => { + // Create two run directories with gaps.json + const runA = join(tmpRoot, "fx-a--2026-03-24-0100"); + const runB = join(tmpRoot, "fx-b--2026-03-24-0200"); + mkdirSync(runA, { recursive: true }); + mkdirSync(runB, { recursive: true }); + + writeFileSync( + join(runA, "gaps.json"), + JSON.stringify({ + fileKey: "fx-a", + gaps: [ + { + category: "layout", + area: "Title", + description: "Alignment mismatch", + coveredByExistingRule: false, + actionable: true, + }, + ], + newRuleSuggestions: [{ ruleId: "text-alignment-mismatch" }], + }), + "utf-8" + ); + + writeFileSync( + join(runB, "gaps.json"), + JSON.stringify({ + fileKey: "fx-b", + gaps: [ + { + category: "layout", + area: "Title", + description: "Alignment mismatch", + coveredByExistingRule: false, + actionable: true, + }, + ], + newRuleSuggestions: [{ ruleId: "text-alignment-mismatch" }], + }), + "utf-8" + ); + + const { markdown, runCount, gapRunCount } = generateGapRuleReport({ + calibrationDir: tmpRoot, + minPatternRepeat: 2, + }); + + expect(gapRunCount).toBe(2); + expect(runCount).toBe(0); // No analysis.json + conversion.json in these dirs + expect(markdown).toContain("layout"); + expect(markdown).toContain("text-alignment-mismatch"); + expect(markdown).toContain("Repeating patterns"); + }); + + it("extracts fixture key from run directory name when no fileKey in JSON", () 
=> {
+    const runDir = join(tmpRoot, "material3-kit--2026-03-24-0300");
+    mkdirSync(runDir, { recursive: true });
+
+    writeFileSync(
+      join(runDir, "gaps.json"),
+      JSON.stringify({
+        gaps: [
+          {
+            category: "spacing",
+            description: "Padding off by 4px",
+            actionable: true,
+          },
+        ],
+      }),
+      "utf-8"
+    );
+
+    const { markdown } = generateGapRuleReport({
+      calibrationDir: tmpRoot,
+      minPatternRepeat: 1,
+    });
+
+    expect(markdown).toContain("material3-kit");
+  });
+
+  it("returns empty report when no run directories exist", () => {
+    mkdirSync(tmpRoot, { recursive: true });
+
+    const { markdown, runCount, gapRunCount } = generateGapRuleReport({
+      calibrationDir: tmpRoot,
+      minPatternRepeat: 2,
+    });
+
+    expect(runCount).toBe(0);
+    expect(gapRunCount).toBe(0);
+    expect(markdown).toContain("No gap entries found");
+  });
+});
diff --git a/src/agents/gap-rule-report.ts b/src/agents/gap-rule-report.ts
new file mode 100644
index 00000000..8ff7d426
--- /dev/null
+++ b/src/agents/gap-rule-report.ts
@@ -0,0 +1,493 @@
+import { readdirSync, readFileSync, existsSync, statSync } from "node:fs";
+import { join, resolve } from "node:path";
+import { RULE_CONFIGS } from "../core/rules/rule-config.js";
+import type { RuleId } from "../core/contracts/rule.js";
+import { runCalibrationEvaluate } from "./orchestrator.js";
+
+type CalibrationAnalysisJson = Parameters<typeof runCalibrationEvaluate>[0] & {
+  ruleScores: Record<string, number>;
+};
+
+export interface GapRuleReportOptions {
+  calibrationDir: string;
+  minPatternRepeat: number;
+}
+
+export interface GapRuleReportResult {
+  markdown: string;
+  runCount: number;
+  gapRunCount: number;
+}
+
+interface NormalizedGap {
+  category: string;
+  description: string;
+  area?: string;
+  coveredByExistingRule: boolean;
+  existingRule: string | null;
+  actionable: boolean;
+  fixtureKey: string;
+}
+
+interface ParsedGapFile {
+  runDir: string;
+  fixtureKey: string;
+  similarity: number | undefined;
+  gaps: NormalizedGap[];
+  newRuleSuggestions: Array<{ ruleId: string; rationale?:
string }>;
+}
+
+function fixtureKeyFromRunDir(runDir: string, raw: Record<string, unknown>): string {
+  const fromJson =
+    (typeof raw["fileKey"] === "string" && raw["fileKey"]) ||
+    (typeof raw["fixture"] === "string" && raw["fixture"]);
+  if (fromJson) return fromJson;
+  // Extract fixture name from run dir name: <fixture>--<timestamp>
+  const dirName = runDir.split(/[/\\]/).pop() ?? runDir;
+  const idx = dirName.lastIndexOf("--");
+  return idx === -1 ? dirName : dirName.slice(0, idx);
+}
+
+function normalizeGapEntry(
+  raw: Record<string, unknown>,
+  fixtureKey: string
+): NormalizedGap | null {
+  const category = typeof raw["category"] === "string" ? raw["category"] : "unknown";
+  const description =
+    typeof raw["description"] === "string" ? raw["description"] : "";
+  const area = typeof raw["area"] === "string" ? raw["area"] : undefined;
+
+  let covered = false;
+  if (typeof raw["coveredByExistingRule"] === "boolean") {
+    covered = raw["coveredByExistingRule"];
+  } else if (raw["coveredByRule"] === true) {
+    covered = true;
+  }
+
+  let existingRule: string | null = null;
+  if (typeof raw["existingRule"] === "string") {
+    existingRule = raw["existingRule"];
+  }
+
+  const actionable = raw["actionable"] !== false;
+
+  if (!description && !area) return null;
+
+  return {
+    category,
+    description,
+    ...(area !== undefined ?
{ area } : {}),
+    coveredByExistingRule: covered,
+    existingRule,
+    actionable,
+    fixtureKey,
+  };
+}
+
+function parseGapFile(runDir: string, gapsPath: string): ParsedGapFile | null {
+  let raw: Record<string, unknown>;
+  try {
+    raw = JSON.parse(readFileSync(gapsPath, "utf-8")) as Record<string, unknown>;
+  } catch {
+    return null;
+  }
+
+  const fixtureKey = fixtureKeyFromRunDir(runDir, raw);
+  const gapsRaw = raw["gaps"];
+  const gaps: NormalizedGap[] = [];
+  if (Array.isArray(gapsRaw)) {
+    for (const g of gapsRaw) {
+      if (!g || typeof g !== "object") continue;
+      const n = normalizeGapEntry(g as Record<string, unknown>, fixtureKey);
+      if (n) gaps.push(n);
+    }
+  }
+
+  const newRuleSuggestions: Array<{ ruleId: string; rationale?: string }> = [];
+  const sug = raw["newRuleSuggestions"];
+  if (Array.isArray(sug)) {
+    for (const s of sug) {
+      if (!s || typeof s !== "object") continue;
+      const o = s as Record<string, unknown>;
+      if (typeof o["ruleId"] === "string") {
+        const entry: { ruleId: string; rationale?: string } = { ruleId: o["ruleId"] };
+        if (typeof o["rationale"] === "string") {
+          entry.rationale = o["rationale"];
+        }
+        newRuleSuggestions.push(entry);
+      }
+    }
+  }
+
+  return {
+    runDir,
+    fixtureKey,
+    similarity: typeof raw["similarity"] === "number" ? raw["similarity"] : undefined,
+    gaps,
+    newRuleSuggestions,
+  };
+}
+
+function patternKey(g: NormalizedGap): string {
+  const label = (g.area ?? g.description).trim().slice(0, 120);
+  return `${g.category}|${label.toLowerCase().replace(/\s+/g, " ")}`;
+}
+
+/**
+ * List all run directories under the calibration dir.
+ * Each run dir is expected to be `<fixture>--<timestamp>`.
+ */
+function listRunDirs(calibrationDir: string): string[] {
+  if (!existsSync(calibrationDir)) return [];
+  return readdirSync(calibrationDir, { withFileTypes: true })
+    .filter((e) => e.isDirectory() && e.name.includes("--"))
+    .map((e) => join(calibrationDir, e.name))
+    .sort();
+}
+
+interface RunSnapshot {
+  dir: string;
+  label: string;
+  analysis: CalibrationAnalysisJson;
+  conversion: Record<string, unknown>;
+}
+
+function loadRunSnapshot(dir: string): RunSnapshot | null {
+  const aPath = join(dir, "analysis.json");
+  const cPath = join(dir, "conversion.json");
+  if (!existsSync(aPath) || !existsSync(cPath)) return null;
+  try {
+    const analysis = JSON.parse(readFileSync(aPath, "utf-8")) as CalibrationAnalysisJson;
+    const conversion = JSON.parse(readFileSync(cPath, "utf-8")) as Record<string, unknown>;
+    if (!analysis.nodeIssueSummaries || !analysis.ruleScores) return null;
+    const label = dir.split(/[/\\]/).pop() ?? dir;
+    return { dir, label, analysis, conversion };
+  } catch {
+    return null;
+  }
+}
+
+function enabledRuleIds(): RuleId[] {
+  return (Object.keys(RULE_CONFIGS) as RuleId[]).filter(
+    (id) => RULE_CONFIGS[id]?.enabled !== false
+  );
+}
+
+/**
+ * Aggregates gap data and calibration snapshots from run directories into a markdown report.
+ */
+export function generateGapRuleReport(options: GapRuleReportOptions): GapRuleReportResult {
+  const calibrationDir = resolve(options.calibrationDir);
+  const minRepeat = options.minPatternRepeat;
+
+  const runDirs = listRunDirs(calibrationDir);
+
+  // Parse gaps from each run directory
+  const parsed: ParsedGapFile[] = [];
+  for (const dir of runDirs) {
+    const gapsPath = join(dir, "gaps.json");
+    if (!existsSync(gapsPath) || !statSync(gapsPath).isFile()) continue;
+    const g = parseGapFile(dir, gapsPath);
+    if (g && (g.gaps.length > 0 || g.newRuleSuggestions.length > 0)) parsed.push(g);
+  }
+
+  const allGaps = parsed.flatMap((f) => f.gaps);
+  const fixtureKeys = [...new Set(parsed.map((p) => p.fixtureKey))];
+  const totalFixtures = fixtureKeys.length;
+
+  const byCategory = new Map<string, number>();
+  for (const g of allGaps) {
+    byCategory.set(g.category, (byCategory.get(g.category) ?? 0) + 1);
+  }
+
+  const patternMap = new Map<
+    string,
+    { count: number; fixtures: Set<string>; sample: string; category: string }
+  >();
+  for (const g of allGaps) {
+    const key = patternKey(g);
+    const cur = patternMap.get(key);
+    if (cur) {
+      cur.count++;
+      cur.fixtures.add(g.fixtureKey);
+    } else {
+      patternMap.set(key, {
+        count: 1,
+        fixtures: new Set([g.fixtureKey]),
+        sample: g.description.slice(0, 200),
+        category: g.category,
+      });
+    }
+  }
+
+  const repeatingPatterns = [...patternMap.entries()]
+    .filter(([, v]) => v.fixtures.size >= minRepeat)
+    .sort((a, b) => b[1].fixtures.size - a[1].fixtures.size);
+
+  const existingRuleMentions = new Map<string, Set<string>>();
+  for (const g of allGaps) {
+    if (g.existingRule) {
+      let set = existingRuleMentions.get(g.existingRule);
+      if (!set) {
+        set = new Set<string>();
+        existingRuleMentions.set(g.existingRule, set);
+      }
+      set.add(g.fixtureKey);
+    }
+  }
+
+  const notCoveredActionable = allGaps.filter((g) => !g.coveredByExistingRule && g.actionable);
+  const suggestionCounts = new Map<string, { count: number; fixtures: Set<string> }>();
+  for (const f of parsed) {
+    for (const s of f.newRuleSuggestions) {
+      const id =
s.ruleId.trim();
+      if (!id) continue;
+      const cur = suggestionCounts.get(id);
+      if (cur) {
+        cur.count++;
+        cur.fixtures.add(f.fixtureKey);
+      } else {
+        suggestionCounts.set(id, { count: 1, fixtures: new Set([f.fixtureKey]) });
+      }
+    }
+  }
+
+  // Load run snapshots for score-vs-impact analysis
+  const runs: RunSnapshot[] = [];
+  for (const dir of runDirs) {
+    const snap = loadRunSnapshot(dir);
+    if (snap) runs.push(snap);
+  }
+
+  const flaggedRules = new Set<string>();
+  const overscoredRuns = new Map<string, Set<number>>();
+  const underscoredRuns = new Map<string, Set<number>>();
+  const validatedRuns = new Map<string, Set<number>>();
+
+  for (let i = 0; i < runs.length; i++) {
+    const snap = runs[i];
+    if (!snap) continue;
+    for (const n of snap.analysis.nodeIssueSummaries) {
+      for (const id of n.flaggedRuleIds) {
+        flaggedRules.add(id);
+      }
+    }
+
+    const a = snap.analysis;
+    const { evaluationOutput } = runCalibrationEvaluate(
+      {
+        nodeIssueSummaries: a.nodeIssueSummaries,
+        scoreReport: a.scoreReport,
+        fileKey: a.fileKey,
+        fileName: a.fileName,
+        analyzedAt: a.analyzedAt,
+        nodeCount: a.nodeCount,
+        issueCount: a.issueCount,
+      },
+      snap.conversion,
+      a.ruleScores
+    );
+
+    const seenO = new Set<string>();
+    const seenU = new Set<string>();
+    const seenV = new Set<string>();
+    for (const m of evaluationOutput.mismatches) {
+      if (!m.ruleId) continue;
+      if (m.type === "overscored") {
+        if (!seenO.has(m.ruleId)) {
+          seenO.add(m.ruleId);
+          let s = overscoredRuns.get(m.ruleId);
+          if (!s) {
+            s = new Set<number>();
+            overscoredRuns.set(m.ruleId, s);
+          }
+          s.add(i);
+        }
+      } else if (m.type === "underscored") {
+        if (!seenU.has(m.ruleId)) {
+          seenU.add(m.ruleId);
+          let s = underscoredRuns.get(m.ruleId);
+          if (!s) {
+            s = new Set<number>();
+            underscoredRuns.set(m.ruleId, s);
+          }
+          s.add(i);
+        }
+      } else if (m.type === "validated") {
+        if (!seenV.has(m.ruleId)) {
+          seenV.add(m.ruleId);
+          let s = validatedRuns.get(m.ruleId);
+          if (!s) {
+            s = new Set<number>();
+            validatedRuns.set(m.ruleId, s);
+          }
+          s.add(i);
+        }
+      }
+    }
+  }
+
+  const nRuns = runs.length;
+  const neverFlagged =
enabledRuleIds().filter((id) => !flaggedRules.has(id)); + + // Similarity summary per run + const similaritySummary: Array<{ label: string; similarity: number | undefined }> = []; + for (const f of parsed) { + const dirName = f.runDir.split(/[/\\]/).pop() ?? f.runDir; + similaritySummary.push({ label: dirName, similarity: f.similarity }); + } + + const lines: string[] = []; + lines.push("# Gap-based rule review"); + lines.push(""); + lines.push(`Generated: ${new Date().toISOString()}`); + lines.push(""); + lines.push("## Summary"); + lines.push(""); + lines.push(`| Metric | Value |`); + lines.push(`| --- | --- |`); + lines.push(`| Run directories scanned | ${runDirs.length} |`); + lines.push(`| Runs with gap data | ${parsed.length} |`); + lines.push(`| Runs with analysis+conversion | ${nRuns} |`); + lines.push(`| Distinct fixtures (from gaps) | ${totalFixtures} |`); + lines.push(`| Total gap entries | ${allGaps.length} |`); + lines.push(`| Actionable gaps not covered by existing rule | ${notCoveredActionable.length} |`); + lines.push(""); + + if (similaritySummary.length > 0) { + lines.push("## Similarity per run"); + lines.push(""); + lines.push("| Run | Similarity |"); + lines.push("| --- | --- |"); + for (const s of similaritySummary) { + lines.push(`| ${s.label} | ${s.similarity != null ? `${s.similarity}%` : "N/A"} |`); + } + lines.push(""); + } + + lines.push("## Gaps by category"); + lines.push(""); + if (byCategory.size === 0) { + lines.push("_No gap entries found._"); + } else { + lines.push("| Category | Count |"); + lines.push("| --- | --- |"); + for (const [k, v] of [...byCategory.entries()].sort((a, b) => b[1] - a[1])) { + lines.push(`| ${k} | ${v} |`); + } + } + lines.push(""); + + lines.push(`## Repeating patterns (${minRepeat}+ fixtures)`); + lines.push(""); + lines.push( + "_Patterns use category + area/description. 
Review for **new rule** candidates when not covered by existing rules._" + ); + lines.push(""); + if (repeatingPatterns.length === 0) { + lines.push(`_No patterns appearing in at least ${minRepeat} distinct fixtures._`); + } else { + lines.push("| Pattern (category) | Fixtures | Sample |"); + lines.push("| --- | --- | --- |"); + for (const [, info] of repeatingPatterns) { + const fx = [...info.fixtures].sort().join(", "); + const safe = info.sample.replace(/\|/g, "\\|").replace(/\n/g, " "); + lines.push(`| ${info.category} | ${info.fixtures.size} (${fx}) | ${safe} |`); + } + } + lines.push(""); + + lines.push("## Existing rules mentioned in gaps"); + lines.push(""); + lines.push("_When a gap is attributed to an existing rule, which fixtures reported it._"); + lines.push(""); + if (existingRuleMentions.size === 0) { + lines.push("_None._"); + } else { + lines.push("| Rule ID | Fixture count | Fixtures |"); + lines.push("| --- | --- | --- |"); + for (const [ruleId, set] of [...existingRuleMentions.entries()].sort( + (a, b) => b[1].size - a[1].size + )) { + const fx = [...set].sort().join(", "); + lines.push(`| \`${ruleId}\` | ${set.size} | ${fx} |`); + } + } + lines.push(""); + + lines.push("## New rule candidates (from gap files)"); + lines.push(""); + const strongSuggestions = [...suggestionCounts.entries()].filter( + ([, v]) => v.fixtures.size >= minRepeat + ); + if (strongSuggestions.length === 0) { + lines.push(`_No suggestion keys appearing in ${minRepeat}+ fixtures. 
Lower the threshold or add more gap data._`); + } else { + lines.push("| Candidate | Appearances | Fixtures |"); + lines.push("| --- | --- | --- |"); + for (const [id, v] of strongSuggestions.sort((a, b) => b[1].fixtures.size - a[1].fixtures.size)) { + const fx = [...v.fixtures].sort().join(", "); + lines.push(`| ${id} | ${v.count} | ${fx} |`); + } + } + lines.push(""); + + lines.push("## Rule score vs conversion impact (from run snapshots)"); + lines.push(""); + if (nRuns === 0) { + lines.push( + "_No runs with both `analysis.json` and `conversion.json`. Run calibration first to populate this section._" + ); + } else { + lines.push( + "_Per run, `calibrate-evaluate`-style comparison: **overscored** means the rule penalty looks too harsh for actual impact; **underscored** means too lenient._" + ); + lines.push(""); + lines.push(`| Rule ID | Overscored (runs) | Underscored (runs) | Validated (runs) |`); + lines.push("| --- | --- | --- | --- |"); + const ruleIds = new Set([ + ...overscoredRuns.keys(), + ...underscoredRuns.keys(), + ...validatedRuns.keys(), + ]); + for (const id of [...ruleIds].sort()) { + const o = overscoredRuns.get(id)?.size ?? 0; + const u = underscoredRuns.get(id)?.size ?? 0; + const val = validatedRuns.get(id)?.size ?? 0; + lines.push(`| \`${id}\` | ${o}/${nRuns} | ${u}/${nRuns} | ${val}/${nRuns} |`); + } + lines.push(""); + lines.push("**Heuristic:** many **overscored** rows with high similarity → consider lowering severity or score in `rule-config.ts`. Many **underscored** → consider raising."); + } + lines.push(""); + + lines.push("## Enabled rules never flagged in any run"); + lines.push(""); + if (nRuns === 0) { + lines.push("_Skipped (no run snapshots)._"); + } else if (neverFlagged.length === 0) { + lines.push("_Every enabled rule was flagged at least once across runs._"); + } else { + lines.push( + `_These rules did not appear in \`flaggedRuleIds\` in any saved analysis. 
They may still be valuable for other designs._` + ); + lines.push(""); + for (const id of neverFlagged) { + lines.push(`- \`${id}\``); + } + } + lines.push(""); + + lines.push("## Next step (manual)"); + lines.push(""); + lines.push( + "Review this report, then run **`/add-rule`** in Claude Code with a concrete concept and fixture path when you want to implement a new rule." + ); + lines.push(""); + + return { + markdown: lines.join("\n"), + runCount: nRuns, + gapRunCount: parsed.length, + }; +} diff --git a/src/agents/orchestrator.ts b/src/agents/orchestrator.ts index fe3f7647..d7ba1704 100644 --- a/src/agents/orchestrator.ts +++ b/src/agents/orchestrator.ts @@ -27,6 +27,7 @@ import { runEvaluationAgent } from "./evaluation-agent.js"; import { runTuningAgent } from "./tuning-agent.js"; import { generateCalibrationReport } from "./report-generator.js"; import { ActivityLogger } from "./activity-logger.js"; +import { createCalibrationRunDir, extractFixtureName } from "./run-directory.js"; export interface CalibrationRunOptions { enableActivityLog?: boolean; @@ -372,7 +373,8 @@ export async function runCalibration( const parsed = CalibrationConfigSchema.parse(config); const pipelineStart = Date.now(); const startedAt = new Date().toISOString(); - const logger = options?.enableActivityLog ? new ActivityLogger(parsed.input) : null; + const runDir = parsed.runDir ?? createCalibrationRunDir(extractFixtureName(parsed.input)); + const logger = options?.enableActivityLog ? 
new ActivityLogger(runDir) : null; try { // Step 1: Load and analyze diff --git a/src/agents/run-directory.test.ts b/src/agents/run-directory.test.ts new file mode 100644 index 00000000..fc8b303a --- /dev/null +++ b/src/agents/run-directory.test.ts @@ -0,0 +1,129 @@ +import { mkdtempSync, existsSync } from "node:fs"; +import { join, basename } from "node:path"; +import { tmpdir } from "node:os"; +import { rm } from "node:fs/promises"; +import { + extractFixtureName, + parseRunDirName, + createCalibrationRunDir, + createRuleDiscoveryRunDir, + listCalibrationRuns, +} from "./run-directory.js"; + +describe("extractFixtureName", () => { + it("extracts name from path with directory and .json", () => { + expect(extractFixtureName("fixtures/material3-kit.json")).toBe("material3-kit"); + }); + + it("extracts name from bare filename", () => { + expect(extractFixtureName("my-design.json")).toBe("my-design"); + }); + + it("returns as-is when no .json extension", () => { + expect(extractFixtureName("fixtures/something")).toBe("something"); + }); + + it("handles nested paths", () => { + expect(extractFixtureName("a/b/c/deep-nested.json")).toBe("deep-nested"); + }); +}); + +describe("parseRunDirName", () => { + it("splits on last double-dash", () => { + const result = parseRunDirName("material3-kit--2026-03-24-0200"); + expect(result.name).toBe("material3-kit"); + expect(result.timestamp).toBe("2026-03-24-0200"); + }); + + it("handles names with multiple dashes", () => { + const result = parseRunDirName("simple-ds-card-grid--2026-03-24-0200"); + expect(result.name).toBe("simple-ds-card-grid"); + expect(result.timestamp).toBe("2026-03-24-0200"); + }); + + it("returns full string as name when no double-dash", () => { + const result = parseRunDirName("no-separator"); + expect(result.name).toBe("no-separator"); + expect(result.timestamp).toBe(""); + }); +}); + +describe("createCalibrationRunDir", () => { + const origCwd = process.cwd(); + let tempDir: string; + + beforeEach(() => { 
+ tempDir = mkdtempSync(join(tmpdir(), "run-dir-test-")); + process.chdir(tempDir); + }); + + afterEach(async () => { + process.chdir(origCwd); + await rm(tempDir, { recursive: true, force: true }); + }); + + it("creates directory and returns path with fixture name and timestamp", () => { + const runDir = createCalibrationRunDir("material3-kit"); + expect(existsSync(runDir)).toBe(true); + + const dirName = basename(runDir); + expect(dirName).toMatch(/^material3-kit--\d{4}-\d{2}-\d{2}-\d{4}$/); + }); + + it("creates directory under logs/calibration/", () => { + const runDir = createCalibrationRunDir("test-fixture"); + expect(runDir).toContain("logs/calibration/"); + }); +}); + +describe("createRuleDiscoveryRunDir", () => { + const origCwd = process.cwd(); + let tempDir: string; + + beforeEach(() => { + tempDir = mkdtempSync(join(tmpdir(), "run-dir-test-")); + process.chdir(tempDir); + }); + + afterEach(async () => { + process.chdir(origCwd); + await rm(tempDir, { recursive: true, force: true }); + }); + + it("creates directory with date-only timestamp", () => { + const runDir = createRuleDiscoveryRunDir("text-alignment"); + expect(existsSync(runDir)).toBe(true); + + const dirName = basename(runDir); + expect(dirName).toMatch(/^text-alignment--\d{4}-\d{2}-\d{2}$/); + }); +}); + +describe("listCalibrationRuns", () => { + const origCwd = process.cwd(); + let tempDir: string; + + beforeEach(() => { + tempDir = mkdtempSync(join(tmpdir(), "run-dir-test-")); + process.chdir(tempDir); + }); + + afterEach(async () => { + process.chdir(origCwd); + await rm(tempDir, { recursive: true, force: true }); + }); + + it("returns empty array when no runs exist", () => { + expect(listCalibrationRuns()).toEqual([]); + }); + + it("lists run directories sorted, ignoring non-run files", () => { + const dir1 = createCalibrationRunDir("aaa-fixture"); + const dir2 = createCalibrationRunDir("zzz-fixture"); + const runs = listCalibrationRuns(); + + expect(runs.length).toBe(2); + 
+    expect(runs[0]).toBe(dir1);
+    expect(runs[1]).toBe(dir2);
+  });
+});
diff --git a/src/agents/run-directory.ts b/src/agents/run-directory.ts
new file mode 100644
index 00000000..517cccf1
--- /dev/null
+++ b/src/agents/run-directory.ts
@@ -0,0 +1,103 @@
+import { existsSync, mkdirSync, readdirSync } from "node:fs";
+import { resolve, join } from "node:path";
+
+const CALIBRATION_DIR = "logs/calibration";
+const RULE_DISCOVERY_DIR = "logs/rule-discovery";
+
+function getDateTimeString(): string {
+  const now = new Date();
+  const year = now.getFullYear();
+  const month = String(now.getMonth() + 1).padStart(2, "0");
+  const day = String(now.getDate()).padStart(2, "0");
+  const hours = String(now.getHours()).padStart(2, "0");
+  const minutes = String(now.getMinutes()).padStart(2, "0");
+  return `${year}-${month}-${day}-${hours}${minutes}`;
+}
+
+function getDateString(): string {
+  const now = new Date();
+  const year = now.getFullYear();
+  const month = String(now.getMonth() + 1).padStart(2, "0");
+  const day = String(now.getDate()).padStart(2, "0");
+  return `${year}-${month}-${day}`;
+}
+
+/**
+ * Extract a short fixture name from a file path.
+ * e.g. "fixtures/http-design.json" → "http-design"
+ */
+export function extractFixtureName(fixturePath: string): string {
+  const fileName = fixturePath.split("/").pop() ?? fixturePath;
+  return fileName.replace(/\.json$/, "");
+}
+
+/**
+ * Build a run directory name: `<name>--<timestamp>`
+ * Double dash separates name from timestamp (names can contain single dashes).
+ */
+function buildRunDirName(name: string, timestamp: string): string {
+  return `${name}--${timestamp}`;
+}
+
+/**
+ * Parse a run directory name into its components.
+ * e.g. "material3-kit--2026-03-24-0200" → { name: "material3-kit", timestamp: "2026-03-24-0200" }
+ */
+export function parseRunDirName(dirName: string): { name: string; timestamp: string } {
+  const idx = dirName.lastIndexOf("--");
+  if (idx === -1) {
+    return { name: dirName, timestamp: "" };
+  }
+  return {
+    name: dirName.slice(0, idx),
+    timestamp: dirName.slice(idx + 2),
+  };
+}
+
+/**
+ * Create a calibration run directory and return its absolute path.
+ * Format: `logs/calibration/<name>--<timestamp>/`
+ */
+export function createCalibrationRunDir(fixtureName: string): string {
+  const timestamp = getDateTimeString();
+  const dirName = buildRunDirName(fixtureName, timestamp);
+  const dirPath = resolve(CALIBRATION_DIR, dirName);
+  mkdirSync(dirPath, { recursive: true });
+  return dirPath;
+}
+
+/**
+ * Create a rule discovery run directory and return its absolute path.
+ * Format: `logs/rule-discovery/<slug>--<date>/`
+ */
+export function createRuleDiscoveryRunDir(conceptSlug: string): string {
+  const timestamp = getDateString();
+  const dirName = buildRunDirName(conceptSlug, timestamp);
+  const dirPath = resolve(RULE_DISCOVERY_DIR, dirName);
+  mkdirSync(dirPath, { recursive: true });
+  return dirPath;
+}
+
+/**
+ * List all calibration run directories, sorted by name (oldest first).
+ */
+export function listCalibrationRuns(): string[] {
+  const dir = resolve(CALIBRATION_DIR);
+  if (!existsSync(dir)) return [];
+  return readdirSync(dir, { withFileTypes: true })
+    .filter((e) => e.isDirectory() && e.name.includes("--"))
+    .map((e) => join(dir, e.name))
+    .sort();
+}
+
+/**
+ * List all rule discovery run directories, sorted by name (oldest first).
+ */
+export function listRuleDiscoveryRuns(): string[] {
+  const dir = resolve(RULE_DISCOVERY_DIR);
+  if (!existsSync(dir)) return [];
+  return readdirSync(dir, { withFileTypes: true })
+    .filter((e) => e.isDirectory() && e.name.includes("--"))
+    .map((e) => join(dir, e.name))
+    .sort();
+}
diff --git a/src/cli/index.ts b/src/cli/index.ts
index dc5435a0..e931d90e 100644
--- a/src/cli/index.ts
+++ b/src/cli/index.ts
@@ -30,6 +30,7 @@ import {
   runCalibrationEvaluate,
   filterConversionCandidates,
 } from "../agents/orchestrator.js";
+import { generateGapRuleReport } from "../agents/gap-rule-report.js";
 import { handleDocs } from "./docs.js";
 import { initMonitoring, trackEvent, trackError, shutdownMonitoring, EVENTS } from "../core/monitoring/index.js";
 import { POSTHOG_API_KEY as BUILTIN_PH_KEY, SENTRY_DSN as BUILTIN_SENTRY_DSN } from "../core/monitoring/keys.js";
@@ -303,6 +304,7 @@ cli
 interface CalibrateAnalyzeOptions {
   output?: string;
+  runDir?: string;
   token?: string;
   targetNodeId?: string;
 }
@@ -313,6 +315,7 @@ cli
     "Run calibration analysis and output JSON for conversion step"
   )
   .option("--output <path>", "Output JSON path", { default: "logs/calibration/calibration-analysis.json" })
+  .option("--run-dir <dir>", "Run directory (overrides --output, writes to <dir>/analysis.json)")
   .option("--token <token>", "Figma API token (or use FIGMA_TOKEN env var)")
   .option("--target-node-id <id>", "Scope analysis to a specific node")
   .action(async (input: string, options: CalibrateAnalyzeOptions) => {
@@ -348,7 +351,9 @@ cli
         ruleScores,
       };
 
-      const outputPath = resolve(options.output ?? "logs/calibration/calibration-analysis.json");
+      const outputPath = options.runDir
+        ? resolve(options.runDir, "analysis.json")
+        : resolve(options.output ??
"logs/calibration/calibration-analysis.json"); const outputDir = dirname(outputPath); if (!existsSync(outputDir)) { mkdirSync(outputDir, { recursive: true }); @@ -372,6 +377,7 @@ cli interface CalibrateEvaluateOptions { output?: string; + runDir?: string; } cli @@ -380,12 +386,17 @@ cli "Evaluate conversion results and generate calibration report" ) .option("--output ", "Report output path") + .option("--run-dir ", "Run directory (reads analysis.json + conversion.json, writes summary.md)") .action(async (analysisJsonPath: string, conversionJsonPath: string, options: CalibrateEvaluateOptions) => { try { console.log("Running calibration evaluation..."); - const analysisPath = resolve(analysisJsonPath); - const conversionPath = resolve(conversionJsonPath); + const analysisPath = options.runDir + ? resolve(options.runDir, "analysis.json") + : resolve(analysisJsonPath); + const conversionPath = options.runDir + ? resolve(options.runDir, "conversion.json") + : resolve(conversionJsonPath); if (!existsSync(analysisPath)) { throw new Error(`Analysis file not found: ${analysisPath}`); @@ -404,10 +415,16 @@ cli analysisData.ruleScores ); - const calNow = new Date(); - const calTs = `${calNow.getFullYear()}-${String(calNow.getMonth() + 1).padStart(2, "0")}-${String(calNow.getDate()).padStart(2, "0")}-${String(calNow.getHours()).padStart(2, "0")}-${String(calNow.getMinutes()).padStart(2, "0")}`; - const defaultCalOutput = `logs/calibration/calibration-${calTs}.md`; - const outputPath = resolve(options.output ?? 
defaultCalOutput); + let outputPath: string; + if (options.runDir) { + outputPath = resolve(options.runDir, "summary.md"); + } else if (options.output) { + outputPath = resolve(options.output); + } else { + const calNow = new Date(); + const calTs = `${calNow.getFullYear()}-${String(calNow.getMonth() + 1).padStart(2, "0")}-${String(calNow.getDate()).padStart(2, "0")}-${String(calNow.getHours()).padStart(2, "0")}-${String(calNow.getMinutes()).padStart(2, "0")}`; + outputPath = resolve(`logs/calibration/calibration-${calTs}.md`); + } const calOutputDir = dirname(outputPath); if (!existsSync(calOutputDir)) { mkdirSync(calOutputDir, { recursive: true }); @@ -442,6 +459,85 @@ cli } }); +interface CalibrateGapReportOptions { + calibrationDir?: string; + output?: string; + minRepeat?: string; + json?: boolean; +} + +cli + .command( + "calibrate-gap-report", + "Aggregate gap data and calibration runs into a rule review report" + ) + .option("--calibration-dir ", "Calibration runs directory", { + default: "logs/calibration", + }) + .option("--output ", "Markdown report path", { + default: "logs/calibration/REPORT.md", + }) + .option("--min-repeat ", "Minimum distinct fixtures to treat as a repeating pattern", { + default: "2", + }) + .option("--json", "Print JSON summary to stdout") + .action(async (options: CalibrateGapReportOptions) => { + try { + const minRepeat = Math.max(1, parseInt(options.minRepeat ?? "2", 10) || 2); + const result = generateGapRuleReport({ + calibrationDir: resolve(options.calibrationDir ?? "logs/calibration"), + minPatternRepeat: minRepeat, + }); + + const outPath = resolve(options.output ?? 
"logs/calibration/REPORT.md"); + const outDir = dirname(outPath); + if (!existsSync(outDir)) { + mkdirSync(outDir, { recursive: true }); + } + + // Backup existing report with timestamp before overwriting + if (existsSync(outPath)) { + const { readFile: readFileAsync } = await import("node:fs/promises"); + const existing = await readFileAsync(outPath, "utf-8"); + // Extract timestamp from the "Generated:" line + const match = existing.match(/Generated:\s*(\d{4}-\d{2}-\d{2}T[\d:.]+Z)/); + if (match?.[1]) { + const ts = match[1].replace(/[:.]/g, "-").replace("T", "-").replace("Z", ""); + const backupPath = outPath.replace(/\.md$/, `--${ts}.md`); + await writeFile(backupPath, existing, "utf-8"); + console.log(` Previous report backed up: ${backupPath}`); + } + } + + await writeFile(outPath, result.markdown, "utf-8"); + + console.log("Gap rule review report written."); + console.log(` Runs with gaps: ${result.gapRunCount}`); + console.log(` Runs with snapshots: ${result.runCount}`); + console.log(` Output: ${outPath}`); + + if (options.json) { + console.log( + JSON.stringify( + { + gapRunCount: result.gapRunCount, + runCount: result.runCount, + outputPath: outPath, + }, + null, + 2 + ) + ); + } + } catch (error) { + console.error( + "\nError:", + error instanceof Error ? error.message : String(error) + ); + process.exit(1); + } + }); + interface CalibrateRunOptions { output?: string; token?: string; @@ -551,6 +647,11 @@ cli const { file } = await loadFile(input, options.token); + // Store original Figma URL in fixture for future reference + if (isFigmaUrl(input)) { + file.sourceUrl = input; + } + const outputPath = resolve( options.output ?? 
`fixtures/${file.fileKey}.json` ); diff --git a/src/core/contracts/figma-node.ts b/src/core/contracts/figma-node.ts index fde04c7b..f2f0c17c 100644 --- a/src/core/contracts/figma-node.ts +++ b/src/core/contracts/figma-node.ts @@ -128,6 +128,7 @@ export const AnalysisFileSchema = z.object({ name: z.string(), lastModified: z.string(), version: z.string(), + sourceUrl: z.string().optional(), document: AnalysisNodeSchema, components: z.record( z.string(),