From 6c071a21536d2ab063d224b73bdfb5682fbd0648 Mon Sep 17 00:00:00 2001
From: let-sunny <sunny.dev.js@gmail.com>
Date: Thu, 26 Mar 2026 18:50:01 +0900
Subject: [PATCH] feat: use calibrated per-rule scores in final percentage
 calculation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the flat severity weights (blocking=3.0, risk=2.0, etc.) in the
density calculation with the absolute value of each issue's
calculatedScore from the rule engine. This makes the per-rule scores and
depthWeight from rule-config.ts actually influence the user-facing
score, connecting the calibration pipeline to the output.

Before: no-auto-layout (-10) and missing-size-constraint (-3) both
contributed 3.0 (same severity = same weight). depthWeight was computed
but never consumed by scoring.

After: no-auto-layout at root contributes 15 (|-10 × 1.5|), while
missing-size-constraint contributes 3 (|-3 × 1.0|). Calibration loop
score adjustments now flow through to final percentages.

Closes #104
---
 src/core/engine/scoring.test.ts | 30 +++++++++++++++++++++++-------
 src/core/engine/scoring.ts      | 27 +++++++++------------------
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/src/core/engine/scoring.test.ts b/src/core/engine/scoring.test.ts
index a2e3297f..cee9eea2 100644
--- a/src/core/engine/scoring.test.ts
+++ b/src/core/engine/scoring.test.ts
@@ -86,18 +86,34 @@ describe("calculateScores", () => {
     expect(scores.summary.totalIssues).toBe(4);
   });
 
-  it("applies severity density weights (blocking=3.0 > risk=2.0 > missing-info=1.0 > suggestion=0.5)", () => {
-    const blocking = calculateScores(makeResult([
-      makeIssue({ ruleId: "no-auto-layout", category: "structure", severity: "blocking" }),
+  it("uses calculatedScore for density: higher score = more density impact", () => {
+    const heavy = calculateScores(makeResult([
+      makeIssue({ ruleId: "no-auto-layout", category: "structure", severity: "blocking", score: -10 }),
     ], 100));
 
-    const suggestion = calculateScores(makeResult([
-      makeIssue({ ruleId: "numeric-suffix-name", category: "naming", severity: "suggestion" }),
+    const light = calculateScores(makeResult([
+      makeIssue({ ruleId: "unnecessary-node", category: "structure", severity: "suggestion", score: -2 }),
+    ], 100));
+
+    expect(heavy.byCategory.structure.densityScore).toBeLessThan(
+      light.byCategory.structure.densityScore
+    );
+  });
+
+  it("differentiates rules within the same severity by score", () => {
+    const highScore = calculateScores(makeResult([
+      makeIssue({ ruleId: "no-auto-layout", category: "structure", severity: "blocking", score: -10 }),
+    ], 100));
+
+    const lowScore = calculateScores(makeResult([
+      makeIssue({ ruleId: "absolute-position-in-auto-layout", category: "structure", severity: "blocking", score: -3 }),
     ], 100));
 
-    expect(blocking.byCategory.structure.densityScore).toBeLessThan(
-      suggestion.byCategory.naming.densityScore
+    expect(highScore.byCategory.structure.densityScore).toBeLessThan(
+      lowScore.byCategory.structure.densityScore
     );
+    expect(highScore.byCategory.structure.weightedIssueCount).toBe(10);
+    expect(lowScore.byCategory.structure.weightedIssueCount).toBe(3);
   });
 
   it("density score decreases as weighted issue count increases relative to node count", () => {
diff --git a/src/core/engine/scoring.ts b/src/core/engine/scoring.ts
index 178bc7fa..c2c47542 100644
--- a/src/core/engine/scoring.ts
+++ b/src/core/engine/scoring.ts
@@ -47,26 +47,17 @@ export interface ScoreReport {
 export type Grade = "S" | "A+" | "A" | "B+" | "B" | "C+" | "C" | "D" | "F";
 
 /**
- * Severity weights for density calculation.
+ * Density weighting uses the absolute value of each issue's `calculatedScore`
+ * from the rule engine, which incorporates the calibrated rule score and depthWeight.
  *
- * Rationale (initial intuition, pending calibration validation):
- * - blocking (3.0): issues that prevent correct implementation — weighted highest
- * - risk (2.0): implementable now but will break later — significant but less than blocking
- * - missing-info (1.0): forces guessing — baseline weight
- * - suggestion (0.5): nice-to-have improvements — minimal impact on implementation accuracy
+ * Previously, flat severity weights (blocking=3.0, risk=2.0, etc.) were used,
+ * making all rules within the same severity contribute equally and rendering
+ * the per-rule scores in rule-config.ts effectively unused.
  *
- * The 3:2:1:0.5 ratio reflects relative implementation difficulty. A single blocking
- * issue (e.g., missing auto-layout) costs more effort than 3 suggestions combined.
- * These weights are multiplied with issue counts to produce a weighted density score.
- *
- * Status: initial values. To be validated via /calibrate-loop against visual-compare results.
+ * Example: `no-auto-layout` (score: -10, depthWeight: 1.5) at root contributes
+ * |-10 × 1.5| = 15 to density, while `unnecessary-node` (score: -2, no depthWeight)
+ * contributes 2, so calibration-loop score adjustments reach user-facing scores.
  */
-const SEVERITY_DENSITY_WEIGHT: Record<Severity, number> = {
-  blocking: 3.0,
-  risk: 2.0,
-  "missing-info": 1.0,
-  suggestion: 0.5,
-};
 
 /**
  * Total rules per category — used as denominator for diversity scoring.
@@ -181,7 +172,7 @@ export function calculateScores(result: AnalysisResult): ScoreReport {
 
     categoryScores[category].issueCount++;
     categoryScores[category].bySeverity[severity]++;
-    categoryScores[category].weightedIssueCount += SEVERITY_DENSITY_WEIGHT[severity];
+    categoryScores[category].weightedIssueCount += Math.abs(issue.calculatedScore);
     uniqueRulesPerCategory.get(category)!.add(ruleId);
   }