From 6c071a21536d2ab063d224b73bdfb5682fbd0648 Mon Sep 17 00:00:00 2001 From: let-sunny Date: Thu, 26 Mar 2026 18:50:01 +0900 Subject: [PATCH] feat: use calibrated per-rule scores in final percentage calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace flat severity weights (blocking=3.0, risk=2.0, etc.) with calculatedScore from rule engine in density calculation. This makes per-rule scores and depthWeight from rule-config.ts actually influence the user-facing score, connecting the calibration pipeline to output. Before: no-auto-layout (-10) and missing-size-constraint (-3) both contributed 3.0 (same severity = same weight). depthWeight was computed but never consumed by scoring. After: no-auto-layout at root contributes 15 (|-10 × 1.5|), while missing-size-constraint contributes 3 (|-3 × 1.0|). Calibration loop score adjustments now flow through to final percentages. Closes #104 --- src/core/engine/scoring.test.ts | 30 +++++++++++++++++++++++------- src/core/engine/scoring.ts | 27 +++++++++------------------ 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/src/core/engine/scoring.test.ts b/src/core/engine/scoring.test.ts index a2e3297f..cee9eea2 100644 --- a/src/core/engine/scoring.test.ts +++ b/src/core/engine/scoring.test.ts @@ -86,18 +86,34 @@ describe("calculateScores", () => { expect(scores.summary.totalIssues).toBe(4); }); - it("applies severity density weights (blocking=3.0 > risk=2.0 > missing-info=1.0 > suggestion=0.5)", () => { - const blocking = calculateScores(makeResult([ - makeIssue({ ruleId: "no-auto-layout", category: "structure", severity: "blocking" }), + it("uses calculatedScore for density: higher score = more density impact", () => { + const heavy = calculateScores(makeResult([ + makeIssue({ ruleId: "no-auto-layout", category: "structure", severity: "blocking", score: -10 }), ], 100)); - const suggestion = calculateScores(makeResult([ - makeIssue({ ruleId: 
"numeric-suffix-name", category: "naming", severity: "suggestion" }), + const light = calculateScores(makeResult([ + makeIssue({ ruleId: "unnecessary-node", category: "structure", severity: "suggestion", score: -2 }), + ], 100)); + + expect(heavy.byCategory.structure.densityScore).toBeLessThan( + light.byCategory.structure.densityScore + ); + }); + + it("differentiates rules within the same severity by score", () => { + const highScore = calculateScores(makeResult([ + makeIssue({ ruleId: "no-auto-layout", category: "structure", severity: "blocking", score: -10 }), + ], 100)); + + const lowScore = calculateScores(makeResult([ + makeIssue({ ruleId: "absolute-position-in-auto-layout", category: "structure", severity: "blocking", score: -3 }), ], 100)); - expect(blocking.byCategory.structure.densityScore).toBeLessThan( - suggestion.byCategory.naming.densityScore + expect(highScore.byCategory.structure.densityScore).toBeLessThan( + lowScore.byCategory.structure.densityScore ); + expect(highScore.byCategory.structure.weightedIssueCount).toBe(10); + expect(lowScore.byCategory.structure.weightedIssueCount).toBe(3); }); it("density score decreases as weighted issue count increases relative to node count", () => { diff --git a/src/core/engine/scoring.ts b/src/core/engine/scoring.ts index 178bc7fa..c2c47542 100644 --- a/src/core/engine/scoring.ts +++ b/src/core/engine/scoring.ts @@ -47,26 +47,17 @@ export interface ScoreReport { export type Grade = "S" | "A+" | "A" | "B+" | "B" | "C+" | "C" | "D" | "F"; /** - * Severity weights for density calculation. + * Density weighting now uses per-rule `calculatedScore` from the rule engine, + * which incorporates both the calibrated rule score and depthWeight. 
* - Rationale (initial intuition, pending calibration validation): - * - blocking (3.0): issues that prevent correct implementation — weighted highest - * - risk (2.0): implementable now but will break later — significant but less than blocking - * - missing-info (1.0): forces guessing — baseline weight - * - suggestion (0.5): nice-to-have improvements — minimal impact on implementation accuracy + * Previously, flat severity weights (blocking=3.0, risk=2.0, etc.) were used, + * making all rules within the same severity contribute equally and rendering + * the per-rule scores in rule-config.ts effectively unused. * - * The 3:2:1:0.5 ratio reflects relative implementation difficulty. A single blocking - * issue (e.g., missing auto-layout) costs more effort than 3 suggestions combined. - * These weights are multiplied with issue counts to produce a weighted density score. - * - * Status: initial values. To be validated via /calibrate-loop against visual-compare results. + * Now: `no-auto-layout` (score: -10, depthWeight: 1.5) at root contributes 15 + * to density, while `unnecessary-node` (score: -2, no depthWeight) contributes 2. + * This makes calibration loop score adjustments flow through to user-facing scores. */ -const SEVERITY_DENSITY_WEIGHT: Record<Severity, number> = { - blocking: 3.0, - risk: 2.0, - "missing-info": 1.0, - suggestion: 0.5, -}; /** * Total rules per category — used as denominator for diversity scoring. @@ -181,7 +172,7 @@ export function calculateScores(result: AnalysisResult): ScoreReport { categoryScores[category].issueCount++; categoryScores[category].bySeverity[severity]++; - categoryScores[category].weightedIssueCount += SEVERITY_DENSITY_WEIGHT[severity]; + categoryScores[category].weightedIssueCount += Math.abs(issue.calculatedScore); uniqueRulesPerCategory.get(category)!.add(ruleId); }