Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
cb4b4e9
fix(cfg): route early exits through finally with target-relative thre…
magyargergo Jun 10, 2026
d72c8bd
feat(cfg): harvest per-statement def/use facts into the side channel …
magyargergo Jun 10, 2026
a49d32d
feat(cfg): add reaching-definitions solver with GEN/KILL fixpoint + s…
magyargergo Jun 10, 2026
8f2ffb8
feat(cfg): persist budgeted REACHING_DEF projection with RepoMeta coh…
magyargergo Jun 10, 2026
f9a1443
test(cfg): REACHING_DEF snapshot, pipeline both-sinks, and cache-seam…
magyargergo Jun 10, 2026
ac8968c
bench(cfg): reaching-defs scaling gates — dense-bindings + fact-fanou…
magyargergo Jun 10, 2026
157de5e
fix(mcp): exclude BasicBlock pseudo-symbols from detect_changes on pd…
magyargergo Jun 10, 2026
cf360d4
style: prettier pass over M2 files
magyargergo Jun 10, 2026
cebc8fb
fix(cfg): review-pass fixes — defKey overflow guard, catch-param bloc…
magyargergo Jun 10, 2026
5fd049c
Merge branch 'main' into feat/reaching-def-m2
magyargergo Jun 10, 2026
e356803
test(run-analyze): model the M2 RepoMeta.pdg stamp in resolvePdgConfi…
magyargergo Jun 11, 2026
a32958c
test(cfg): reassign the shadowing fixture's bindings — fixes prefer-c…
magyargergo Jun 11, 2026
a271f25
fix(cfg): validate entry/exit indices in the emit-safety guard
magyargergo Jun 11, 2026
04da731
fix(cfg): report the def-key stride bail-out as a distinct 'overflow'…
magyargergo Jun 11, 2026
e8fa243
perf(cfg): cache the nearest enclosing scope per node during the prescan
magyargergo Jun 11, 2026
a951459
fix(cfg): stop harvesting initializer-less var declarators as defs
magyargergo Jun 11, 2026
3d8102a
fix(cfg): unwrap parenthesized/non-null lvalue wrappers before def de…
magyargergo Jun 11, 2026
a67a566
fix(cfg): conditionally-evaluated defs are MAY-defs — gen without kill
magyargergo Jun 11, 2026
1426247
fix(cfg): model labeled statements generically — break keeps its real…
magyargergo Jun 11, 2026
98f19c3
fix(cfg): throw edges deliver ALL of a block's defs to the handler
magyargergo Jun 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 27 additions & 6 deletions gitnexus/bench/cfg/baselines.json
Original file line number Diff line number Diff line change
@@ -1,23 +1,44 @@
{
"straight-line": {
"fingerprint": "f5524690b5b7d484573710938c5e9a28e08ef0882fea95111f01575c71f4a66a",
"fingerprint": "792229965a726d2c6b527f9ee65440a2b3023839ee71cb51522fc30e2f2cb454",
"scaling_budget": 1.5,
"disk_bytes_budget": 1.2,
"heap_budget": 1.3,
"_note": "#2081 M1: ONE function, N coalescing statements (extendBlock text accumulation). Runs at 2000->8000 (larger than the other scenarios — output is constant 4 blocks, so disk/heap can't see this path; the TIME ratio is the sole guard). Verified at this N: the array-join impl is ~1.0, a V8-rope-optimized `+=` is also ~1.0 (correctly NOT a real regression — ropes keep naive concat linear), but a genuine O(n²) accumulation (e.g. re-join-the-array-every-append) is ~3.8 — so budget 1.5 catches a true superlinear regression while passing linear concat. disk ~1.03, retained heap ~0.98. Re-baseline the fingerprint only on an intentional CFG-shape change."
"rd_scaling_budget": 2.0,
"disk_bytes_large_max": 1309481,
"_note": "#2081 M1 / #2082 M2: ONE function, N coalescing statements (extendBlock text accumulation + per-statement fact harvest). Runs at 2000->8000. M2 REWROTE the old 'output is constant 4 blocks' note: statement facts make disk/heap LINEAR in N (a free gate on the harvest payload); TIME still guards the concat path (array-join ~1.0; a genuine O(n^2) re-join accumulation is ~3.8). M2 adds rd_scaling_budget (measured ~0.74) and disk_bytes_large_max -- an ABSOLUTE ceiling ~1.35x the measured indexed-encoding bytes (969,986 at N=8000, ~121 B/stmt); a named-record encoding regression (~4x facts bytes) blows it. Re-baseline the fingerprint only on an intentional CFG/harvest-shape change (the canon now includes statements+bindings)."
},
"many-functions": {
"fingerprint": "c167ccd83086254e2b71eca153ca4a833be14b2d2a3827ab76b49f643aad13d5",
"fingerprint": "f3bcc5e6ef4cf58aefe4e7d801a8fea0215494b9688833e501c2afc6df029c1b",
"scaling_budget": 1.5,
"disk_bytes_budget": 1.2,
"heap_budget": 1.3,
"_note": "#2081 M1: N small branchy functions (collect walk + per-function build). Time ~1.0, disk ~1.01, retained heap ~1.0 (~1KB/function; ~2MB at 2000 fns)."
"rd_scaling_budget": 2.0,
"_note": "#2081 M1 / #2082 M2: N small branchy functions (collect walk + per-function build + per-function solve). Time ~1.0, disk ~1.01, heap ~1.0, rd ~0.86 (solver is per-function; N functions scale linearly)."
},
"branchy": {
"fingerprint": "944ab56ffc70e195f74d8533a8aadf4930d37d13bcfa47cc4feff29e74ddca5c",
"fingerprint": "5b5886521ab21604df8f78af98c8c28a6be8e64c24f3d67b165c2d96ba2a3d52",
"scaling_budget": 1.8,
"disk_bytes_budget": 1.2,
"heap_budget": 1.3,
"_note": "#2081 M1: ONE function, N sequential ifs (block/edge growth in one CFG). Time ~1.1-1.25 (REPS=15 median; noisiest scenario), disk ~1.04, retained heap ~1.0. Time budget 1.8 absorbs noise while catching ~4.0 quadratic."
"rd_scaling_budget": 2.0,
"_note": "#2081 M1 / #2082 M2: ONE function, N sequential ifs (block/edge growth in one CFG). Time ~1.1-1.25 (noisiest scenario; budget 1.8 absorbs noise, catches ~4.0 quadratic), disk ~1.03, heap ~1.0, rd ~0.7."
},
"dense-bindings": {
"fingerprint": "e4d7eb3c7e8b3772423af25cef391e0e6b68067b554819e81b543439a487403f",
"scaling_budget": 1.8,
"disk_bytes_budget": 1.2,
"heap_budget": 1.3,
"rd_scaling_budget": 10.0,
"_note": "#2082 M2: N bindings live across ~N blocks in one loop -- bindings x blocks scale JOINTLY (the solver-lattice stressor). The overlay design measures rd ~5.2 normalized: the OUT spine copy on genning blocks is O(V) per block, which is quadratic when V scales with B (bounded in prod by maxFunctionLines; real functions have V~10-40). Budget 10 deliberately tolerates that known shape and exists to catch the repo's recurring per-item-rescan class (a per-use scan over all defs is O(n^3) here, ratio >=16). If rd drops well below 5, tighten."
},
"fact-fanout": {
"fingerprint": "488e63e072d514a9229e21872615e32c7b099ccbd65ec8c045ba517568fd3e5d",
"scaling_budget": 1.8,
"disk_bytes_budget": 1.2,
"heap_budget": 1.3,
"rd_scaling_budget": 3.0,
"facts_large_max": 16000,
"_note": "#2082 M2: N switch-arm defs of one variable + N later uses -- facts are O(defs x uses) BY SPEC, so the gate is BOUNDEDNESS, not linearity: with the production fact limit engaged (DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION=16000) the materialized fact count stays pinned at the limit as N grows (facts_large_max), and rd time stays bounded (measured ~1.4). Losing the maxFacts early-stop shows as facts_large exploding quadratically."
}
}
104 changes: 102 additions & 2 deletions gitnexus/bench/cfg/measure.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ import { fileURLToPath } from 'node:url';
import Parser from 'tree-sitter';
import TypeScript from 'tree-sitter-typescript';
import { collectFunctionCfgs } from '../../src/core/ingestion/cfg/collect.ts';
import { computeReachingDefs } from '../../src/core/ingestion/cfg/reaching-defs.ts';
import { DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION } from '../../src/core/ingestion/cfg/emit.ts';
import { createTypeScriptCfgVisitor } from '../../src/core/ingestion/cfg/visitors/typescript.ts';
import { getTreeSitterBufferSize } from '../../src/core/ingestion/constants.ts';

Expand Down Expand Up @@ -102,6 +104,43 @@ const SCENARIOS = [
return s + '}\n';
},
},
{
name: 'dense-bindings',
// #2082 M2: N bindings live across ~N blocks inside one loop — bindings ×
// blocks scale JOINTLY, the discriminator for solver-lattice quadratics.
// The overlay design (KTD2: sets shared by reference, OUT spine-copied
// only on gen) is expected to scale ~linearly-with-a-spine-copy here
// (normalized ratio low single digits); the regression this scenario
// exists to catch is the repo's recurring per-item-rescan shape — a
// per-use scan over all defs (O(n³) here) blows the ratio past ~16.
// rd time is the gated metric (rd_scaling_budget).
rdMaxFacts: 0, // measure the algorithm, not the cap
gen: (n) => {
let s = 'function f(c: number) {\n';
for (let i = 0; i < n; i++) s += ` let v${i} = ${i};\n`;
s += ' while (c > 0) {\n';
for (let i = 0; i < n; i++) s += ` if (c > ${i}) { v${i} = v${(i + 1) % n} + 1; }\n`;
return s + ' c = c - 1;\n }\n return v0;\n}\n';
},
},
{
name: 'fact-fanout',
// #2082 M2: N parallel case-arm defs of one variable + N later uses —
// facts are O(defs×uses) BY SPEC, so a linearity ratio gate is the wrong
// shape. The gate here is BOUNDEDNESS: with the production fact limit
// engaged, the materialized fact count stays FLAT (== limit) as N grows
// past it (facts_large_max), and rd time stays bounded. An unbounded
// materialization regression (losing the maxFacts early-stop) shows as
// facts_large exploding quadratically.
rdMaxFacts: DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION,
gen: (n) => {
let s = 'function f(c: number) {\n let x = 0;\n switch (c) {\n';
for (let i = 0; i < n; i++) s += ` case ${i}: x = ${i}; break;\n`;
s += ' }\n';
for (let i = 0; i < n; i++) s += ` u${i}(x);\n`;
return s + '}\n';
},
},
];

const SMALL = 500;
Expand Down Expand Up @@ -130,6 +169,7 @@ function measureCollect(src, file, reps) {
}
return {
ms: median(samples),
cfgs: out.cfgs,
blockCount: out.cfgs.reduce((a, c) => a + c.blocks.length, 0),
// DISK growth: utf8 byte size of the serialized cfgSideChannel — exactly
// what a --pdg run writes onto every ParsedFile shard in the durable store
Expand All @@ -140,6 +180,25 @@ function measureCollect(src, file, reps) {
};
}

// ---- reaching-defs solve cost (#2082 M2) ----

// Times computeReachingDefs over a scenario's collected CFGs (the exact work
// the scope-resolution emit loop adds per file on a --pdg run). `maxFacts`
// mirrors the per-scenario production posture: 0 (unlimited) measures the
// algorithm; the production default exercises the boundedness contract.
function measureReachingDefs(cfgs, reps, maxFacts) {
for (const c of cfgs) computeReachingDefs(c, { maxFacts }); // warm JIT
const samples = [];
let facts = 0;
for (let i = 0; i < reps; i++) {
const start = process.hrtime.bigint();
facts = 0;
for (const c of cfgs) facts += computeReachingDefs(c, { maxFacts }).facts.length;
samples.push(Number(process.hrtime.bigint() - start) / 1e6);
}
return { ms: median(samples), facts };
}

// ---- memory growth: retained heap of the cfgSideChannel payload ----

// Needs `node --expose-gc` to force collection for a clean delta; without it the
Expand Down Expand Up @@ -169,10 +228,17 @@ function retainedHeapBytes(src, file) {

function canonicalizeCfg(cfg) {
const blocks = cfg.blocks
.map((b) => `B|${b.index}|${b.startLine}-${b.endLine}|${b.kind}|${b.text}`)
.map(
(b) =>
`B|${b.index}|${b.startLine}-${b.endLine}|${b.kind}|${b.text}|` +
// #2082 M2: statement facts join the canon so harvest drift (lost
// defs/uses, changed binding resolution) trips the fingerprint gate.
JSON.stringify(b.statements ?? null),
)
.sort();
const edges = cfg.edges.map((e) => `E|${e.from}->${e.to}|${e.kind}`).sort();
return `${cfg.functionStartLine}:${cfg.functionStartColumn}\n${blocks.join('\n')}\n${edges.join('\n')}`;
const bindings = JSON.stringify(cfg.bindings ?? null);
return `${cfg.functionStartLine}:${cfg.functionStartColumn}\n${bindings}\n${blocks.join('\n')}\n${edges.join('\n')}`;
}

function fingerprint(scenario) {
Expand Down Expand Up @@ -205,6 +271,14 @@ function measureScenario(scenario) {
? heapLarge / heapSmall / sizeRatio
: null;

// #2082 M2: reaching-defs solve cost over the same CFGs.
const rdMaxFacts = scenario.rdMaxFacts ?? 0;
const rdSmall = measureReachingDefs(small.cfgs, REPS, rdMaxFacts);
const rdLarge = measureReachingDefs(large.cfgs, REPS, rdMaxFacts);
// Clamp the denominator: a 0.000ms small-N median would otherwise yield
// ratio 0 and the gate would self-disable exactly when the solver is fast.
const rdRatio = rdLarge.ms / Math.max(rdSmall.ms, 0.001) / sizeRatio;

return {
scenario: scenario.name,
elapsed_ms_small: Number(small.ms.toFixed(3)),
Expand All @@ -218,6 +292,11 @@ function measureScenario(scenario) {
heap_ratio: heapRatio === null ? null : Number(heapRatio.toFixed(3)),
blocks_small: small.blockCount,
blocks_large: large.blockCount,
rd_ms_small: Number(rdSmall.ms.toFixed(3)),
rd_ms_large: Number(rdLarge.ms.toFixed(3)),
rd_scaling_ratio: Number(rdRatio.toFixed(3)),
facts_small: rdSmall.facts,
facts_large: rdLarge.facts,
...fingerprint(scenario),
};
}
Expand Down Expand Up @@ -267,6 +346,27 @@ if (!CHECK) {
`${base.disk_bytes_budget} (bytes ${r.disk_bytes_small}->${r.disk_bytes_large})`,
);
}
// #2082 M2 gates — rd solve-time scaling, fact-count boundedness, and an
// ABSOLUTE side-channel size ceiling (a ratio gate is blind to a
// constant-factor encoding bloat like named records vs indexed facts).
if (base.rd_scaling_budget !== undefined && r.rd_scaling_ratio >= base.rd_scaling_budget) {
failures.push(
`${r.scenario}: reaching-defs scaling ratio ${r.rd_scaling_ratio} >= budget ` +
`${base.rd_scaling_budget} (ms ${r.rd_ms_small}->${r.rd_ms_large})`,
);
}
if (base.facts_large_max !== undefined && r.facts_large > base.facts_large_max) {
failures.push(
`${r.scenario}: fact materialization ${r.facts_large} > bound ${base.facts_large_max} ` +
`(the maxFacts early-stop is the boundedness contract)`,
);
}
if (base.disk_bytes_large_max !== undefined && r.disk_bytes_large > base.disk_bytes_large_max) {
failures.push(
`${r.scenario}: cfgSideChannel absolute size ${r.disk_bytes_large} > ceiling ` +
`${base.disk_bytes_large_max} bytes (constant-factor encoding bloat)`,
);
}
// Heap gate only when measured (--expose-gc present) AND a budget exists.
if (
base.heap_budget !== undefined &&
Expand Down
53 changes: 48 additions & 5 deletions gitnexus/src/core/ingestion/cfg/cfg-builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,14 @@
* hand-built block sequences, which is how the classic CFG hazards are pinned
* before the tree-sitter visitor (U2) drives it.
*/
import type { BasicBlockData, CfgEdgeData, CfgEdgeKind, FunctionCfg } from './types.js';
import type {
BasicBlockData,
BindingEntry,
CfgEdgeData,
CfgEdgeKind,
FunctionCfg,
StatementFacts,
} from './types.js';

interface MutableBlock {
startLine: number;
Expand All @@ -26,6 +33,13 @@ interface MutableBlock {
*/
textParts: string[];
kind: BasicBlockData['kind'];
/**
* Per-statement def/use facts in execution order (#2082 M2 U1). Parallel to
* the statements that accrued to this block — but self-describing (each
* record carries its line): facts-only attaches (ENTRY params, catch params)
* mean fact index ≠ text-fragment index.
*/
statements: StatementFacts[];
}

export class CfgBuilder {
Expand Down Expand Up @@ -54,8 +68,15 @@ export class CfgBuilder {
endLine: number,
text: string,
kind: BasicBlockData['kind'] = 'normal',
facts?: StatementFacts,
): number {
this.blocks.push({ startLine, endLine, textParts: text ? [text] : [], kind });
this.blocks.push({
startLine,
endLine,
textParts: text ? [text] : [],
kind,
statements: facts ? [facts] : [],
});
return this.blocks.length - 1;
}

Expand All @@ -73,20 +94,40 @@ export class CfgBuilder {
}

/** Extend a block's end line as more statements accrue to it. */
extendBlock(index: number, endLine: number, appendText?: string): void {
extendBlock(index: number, endLine: number, appendText?: string, facts?: StatementFacts): void {
  const target = this.blocks[index];
  // No block at this index: nothing to extend.
  if (!target) {
    return;
  }
  // The span only ever grows; an earlier endLine leaves it untouched.
  if (target.endLine < endLine) {
    target.endLine = endLine;
  }
  // Empty or omitted text contributes no fragment.
  if (appendText) {
    target.textParts.push(appendText);
  }
  // Facts go after any records the block already holds.
  if (facts) {
    target.statements.push(facts);
  }
}

/**
 * Record a facts-only statement on an existing block (#2082 M2 U1).
 *
 * Only the block's `statements` list is appended to; its text fragments and
 * start/end lines stay exactly as they were. Bench fingerprints and CFG
 * snapshots include block text, so harvesting facts must never perturb it.
 * ENTRY-block param defs are the canonical use; records that must precede a
 * walked body get their own facts-only block instead (see the catch-param
 * handling in visitTry).
 *
 * An index with no block is ignored.
 */
attachFacts(index: number, facts: StatementFacts): void {
  this.blocks[index]?.statements.push(facts);
}

get blockCount(): number {
return this.blocks.length;
}

/** Produce the serializable CFG. Caller is responsible for having wired the
* function's dangling exits to {@link exitIndex} before calling. */
finish(): FunctionCfg {
* function's dangling exits to {@link exitIndex} before calling.
*
* Pass `bindings` (the function's binding table, possibly empty) to emit
* statement facts (#2082 M2 U1) — every block then carries a `statements`
* array. Omit it (hand-built test CFGs, pre-M2 producers) and both fields
* are absent, which the reaching-defs solver reports as `no-facts`. */
finish(bindings?: readonly BindingEntry[]): FunctionCfg {
const withFacts = bindings !== undefined;
return {
filePath: this.filePath,
functionStartLine: this.functionStartLine,
Expand All @@ -100,8 +141,10 @@ export class CfgBuilder {
endLine: b.endLine,
text: b.textParts.join('\n'),
kind: b.kind,
...(withFacts ? { statements: b.statements } : {}),
})),
edges: [...this.edges],
...(withFacts ? { bindings } : {}),
};
}
}
Expand Down
Loading
Loading