diff --git a/gitnexus/bench/cfg/baselines.json b/gitnexus/bench/cfg/baselines.json index f830cc6805..7e3d718d83 100644 --- a/gitnexus/bench/cfg/baselines.json +++ b/gitnexus/bench/cfg/baselines.json @@ -1,23 +1,44 @@ { "straight-line": { - "fingerprint": "f5524690b5b7d484573710938c5e9a28e08ef0882fea95111f01575c71f4a66a", + "fingerprint": "792229965a726d2c6b527f9ee65440a2b3023839ee71cb51522fc30e2f2cb454", "scaling_budget": 1.5, "disk_bytes_budget": 1.2, "heap_budget": 1.3, - "_note": "#2081 M1: ONE function, N coalescing statements (extendBlock text accumulation). Runs at 2000->8000 (larger than the other scenarios — output is constant 4 blocks, so disk/heap can't see this path; the TIME ratio is the sole guard). Verified at this N: the array-join impl is ~1.0, a V8-rope-optimized `+=` is also ~1.0 (correctly NOT a real regression — ropes keep naive concat linear), but a genuine O(n²) accumulation (e.g. re-join-the-array-every-append) is ~3.8 — so budget 1.5 catches a true superlinear regression while passing linear concat. disk ~1.03, retained heap ~0.98. Re-baseline the fingerprint only on an intentional CFG-shape change." + "rd_scaling_budget": 2.0, + "disk_bytes_large_max": 1309481, + "_note": "#2081 M1 / #2082 M2: ONE function, N coalescing statements (extendBlock text accumulation + per-statement fact harvest). Runs at 2000->8000. M2 REWROTE the old 'output is constant 4 blocks' note: statement facts make disk/heap LINEAR in N (a free gate on the harvest payload); TIME still guards the concat path (array-join ~1.0; a genuine O(n^2) re-join accumulation is ~3.8). M2 adds rd_scaling_budget (measured ~0.74) and disk_bytes_large_max -- an ABSOLUTE ceiling ~1.35x the measured indexed-encoding bytes (969,986 at N=8000, ~121 B/stmt); a named-record encoding regression (~4x facts bytes) blows it. Re-baseline the fingerprint only on an intentional CFG/harvest-shape change (the canon now includes statements+bindings)." 
}, "many-functions": { - "fingerprint": "c167ccd83086254e2b71eca153ca4a833be14b2d2a3827ab76b49f643aad13d5", + "fingerprint": "f3bcc5e6ef4cf58aefe4e7d801a8fea0215494b9688833e501c2afc6df029c1b", "scaling_budget": 1.5, "disk_bytes_budget": 1.2, "heap_budget": 1.3, - "_note": "#2081 M1: N small branchy functions (collect walk + per-function build). Time ~1.0, disk ~1.01, retained heap ~1.0 (~1KB/function; ~2MB at 2000 fns)." + "rd_scaling_budget": 2.0, + "_note": "#2081 M1 / #2082 M2: N small branchy functions (collect walk + per-function build + per-function solve). Time ~1.0, disk ~1.01, heap ~1.0, rd ~0.86 (solver is per-function; N functions scale linearly)." }, "branchy": { - "fingerprint": "944ab56ffc70e195f74d8533a8aadf4930d37d13bcfa47cc4feff29e74ddca5c", + "fingerprint": "5b5886521ab21604df8f78af98c8c28a6be8e64c24f3d67b165c2d96ba2a3d52", "scaling_budget": 1.8, "disk_bytes_budget": 1.2, "heap_budget": 1.3, - "_note": "#2081 M1: ONE function, N sequential ifs (block/edge growth in one CFG). Time ~1.1-1.25 (REPS=15 median; noisiest scenario), disk ~1.04, retained heap ~1.0. Time budget 1.8 absorbs noise while catching ~4.0 quadratic." + "rd_scaling_budget": 2.0, + "_note": "#2081 M1 / #2082 M2: ONE function, N sequential ifs (block/edge growth in one CFG). Time ~1.1-1.25 (noisiest scenario; budget 1.8 absorbs noise, catches ~4.0 quadratic), disk ~1.03, heap ~1.0, rd ~0.7." + }, + "dense-bindings": { + "fingerprint": "e4d7eb3c7e8b3772423af25cef391e0e6b68067b554819e81b543439a487403f", + "scaling_budget": 1.8, + "disk_bytes_budget": 1.2, + "heap_budget": 1.3, + "rd_scaling_budget": 10.0, + "_note": "#2082 M2: N bindings live across ~N blocks in one loop -- bindings x blocks scale JOINTLY (the solver-lattice stressor). The overlay design measures rd ~5.2 normalized: the OUT spine copy on genning blocks is O(V) per block, which is quadratic when V scales with B (bounded in prod by maxFunctionLines; real functions have V~10-40). 
Budget 10 deliberately tolerates that known shape and exists to catch the repo's recurring per-item-rescan class (a per-use scan over all defs is O(n^3) here, ratio >=16). If rd drops well below 5, tighten." + }, + "fact-fanout": { + "fingerprint": "488e63e072d514a9229e21872615e32c7b099ccbd65ec8c045ba517568fd3e5d", + "scaling_budget": 1.8, + "disk_bytes_budget": 1.2, + "heap_budget": 1.3, + "rd_scaling_budget": 3.0, + "facts_large_max": 16000, + "_note": "#2082 M2: N switch-arm defs of one variable + N later uses -- facts are O(defs x uses) BY SPEC, so the gate is BOUNDEDNESS, not linearity: with the production fact limit engaged (DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION=16000) the materialized fact count stays pinned at the limit as N grows (facts_large_max), and rd time stays bounded (measured ~1.4). Losing the maxFacts early-stop shows as facts_large exploding quadratically." } } diff --git a/gitnexus/bench/cfg/measure.mjs b/gitnexus/bench/cfg/measure.mjs index 115c878a8e..2451b5d62f 100644 --- a/gitnexus/bench/cfg/measure.mjs +++ b/gitnexus/bench/cfg/measure.mjs @@ -44,6 +44,8 @@ import { fileURLToPath } from 'node:url'; import Parser from 'tree-sitter'; import TypeScript from 'tree-sitter-typescript'; import { collectFunctionCfgs } from '../../src/core/ingestion/cfg/collect.ts'; +import { computeReachingDefs } from '../../src/core/ingestion/cfg/reaching-defs.ts'; +import { DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION } from '../../src/core/ingestion/cfg/emit.ts'; import { createTypeScriptCfgVisitor } from '../../src/core/ingestion/cfg/visitors/typescript.ts'; import { getTreeSitterBufferSize } from '../../src/core/ingestion/constants.ts'; @@ -102,6 +104,43 @@ const SCENARIOS = [ return s + '}\n'; }, }, + { + name: 'dense-bindings', + // #2082 M2: N bindings live across ~N blocks inside one loop — bindings × + // blocks scale JOINTLY, the discriminator for solver-lattice quadratics. 
+ // The overlay design (KTD2: sets shared by reference, OUT spine-copied + // only on gen) is expected to scale ~linearly-with-a-spine-copy here + // (normalized ratio low single digits); the regression this scenario + // exists to catch is the repo's recurring per-item-rescan shape — a + // per-use scan over all defs (O(n³) here) blows the ratio past ~16. + // rd time is the gated metric (rd_scaling_budget). + rdMaxFacts: 0, // measure the algorithm, not the cap + gen: (n) => { + let s = 'function f(c: number) {\n'; + for (let i = 0; i < n; i++) s += ` let v${i} = ${i};\n`; + s += ' while (c > 0) {\n'; + for (let i = 0; i < n; i++) s += ` if (c > ${i}) { v${i} = v${(i + 1) % n} + 1; }\n`; + return s + ' c = c - 1;\n }\n return v0;\n}\n'; + }, + }, + { + name: 'fact-fanout', + // #2082 M2: N parallel case-arm defs of one variable + N later uses — + // facts are O(defs×uses) BY SPEC, so a linearity ratio gate is the wrong + // shape. The gate here is BOUNDEDNESS: with the production fact limit + // engaged, the materialized fact count stays FLAT (== limit) as N grows + // past it (facts_large_max), and rd time stays bounded. An unbounded + // materialization regression (losing the maxFacts early-stop) shows as + // facts_large exploding quadratically. 
+ rdMaxFacts: DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION, + gen: (n) => { + let s = 'function f(c: number) {\n let x = 0;\n switch (c) {\n'; + for (let i = 0; i < n; i++) s += ` case ${i}: x = ${i}; break;\n`; + s += ' }\n'; + for (let i = 0; i < n; i++) s += ` u${i}(x);\n`; + return s + '}\n'; + }, + }, ]; const SMALL = 500; @@ -130,6 +169,7 @@ function measureCollect(src, file, reps) { } return { ms: median(samples), + cfgs: out.cfgs, blockCount: out.cfgs.reduce((a, c) => a + c.blocks.length, 0), // DISK growth: utf8 byte size of the serialized cfgSideChannel — exactly // what a --pdg run writes onto every ParsedFile shard in the durable store @@ -140,6 +180,25 @@ function measureCollect(src, file, reps) { }; } +// ---- reaching-defs solve cost (#2082 M2) ---- + +// Times computeReachingDefs over a scenario's collected CFGs (the exact work +// the scope-resolution emit loop adds per file on a --pdg run). `maxFacts` +// mirrors the per-scenario production posture: 0 (unlimited) measures the +// algorithm; the production default exercises the boundedness contract. 
+function measureReachingDefs(cfgs, reps, maxFacts) { + for (const c of cfgs) computeReachingDefs(c, { maxFacts }); // warm JIT + const samples = []; + let facts = 0; + for (let i = 0; i < reps; i++) { + const start = process.hrtime.bigint(); + facts = 0; + for (const c of cfgs) facts += computeReachingDefs(c, { maxFacts }).facts.length; + samples.push(Number(process.hrtime.bigint() - start) / 1e6); + } + return { ms: median(samples), facts }; +} + // ---- memory growth: retained heap of the cfgSideChannel payload ---- // Needs `node --expose-gc` to force collection for a clean delta; without it the @@ -169,10 +228,17 @@ function retainedHeapBytes(src, file) { function canonicalizeCfg(cfg) { const blocks = cfg.blocks - .map((b) => `B|${b.index}|${b.startLine}-${b.endLine}|${b.kind}|${b.text}`) + .map( + (b) => + `B|${b.index}|${b.startLine}-${b.endLine}|${b.kind}|${b.text}|` + + // #2082 M2: statement facts join the canon so harvest drift (lost + // defs/uses, changed binding resolution) trips the fingerprint gate. + JSON.stringify(b.statements ?? null), + ) .sort(); const edges = cfg.edges.map((e) => `E|${e.from}->${e.to}|${e.kind}`).sort(); - return `${cfg.functionStartLine}:${cfg.functionStartColumn}\n${blocks.join('\n')}\n${edges.join('\n')}`; + const bindings = JSON.stringify(cfg.bindings ?? null); + return `${cfg.functionStartLine}:${cfg.functionStartColumn}\n${bindings}\n${blocks.join('\n')}\n${edges.join('\n')}`; } function fingerprint(scenario) { @@ -205,6 +271,14 @@ function measureScenario(scenario) { ? heapLarge / heapSmall / sizeRatio : null; + // #2082 M2: reaching-defs solve cost over the same CFGs. + const rdMaxFacts = scenario.rdMaxFacts ?? 0; + const rdSmall = measureReachingDefs(small.cfgs, REPS, rdMaxFacts); + const rdLarge = measureReachingDefs(large.cfgs, REPS, rdMaxFacts); + // Clamp the denominator: a 0.000ms small-N median would otherwise yield + // ratio 0 and the gate would self-disable exactly when the solver is fast. 
+ const rdRatio = rdLarge.ms / Math.max(rdSmall.ms, 0.001) / sizeRatio; + return { scenario: scenario.name, elapsed_ms_small: Number(small.ms.toFixed(3)), @@ -218,6 +292,11 @@ function measureScenario(scenario) { heap_ratio: heapRatio === null ? null : Number(heapRatio.toFixed(3)), blocks_small: small.blockCount, blocks_large: large.blockCount, + rd_ms_small: Number(rdSmall.ms.toFixed(3)), + rd_ms_large: Number(rdLarge.ms.toFixed(3)), + rd_scaling_ratio: Number(rdRatio.toFixed(3)), + facts_small: rdSmall.facts, + facts_large: rdLarge.facts, ...fingerprint(scenario), }; } @@ -267,6 +346,27 @@ if (!CHECK) { `${base.disk_bytes_budget} (bytes ${r.disk_bytes_small}->${r.disk_bytes_large})`, ); } + // #2082 M2 gates — rd solve-time scaling, fact-count boundedness, and an + // ABSOLUTE side-channel size ceiling (a ratio gate is blind to a + // constant-factor encoding bloat like named records vs indexed facts). + if (base.rd_scaling_budget !== undefined && r.rd_scaling_ratio >= base.rd_scaling_budget) { + failures.push( + `${r.scenario}: reaching-defs scaling ratio ${r.rd_scaling_ratio} >= budget ` + + `${base.rd_scaling_budget} (ms ${r.rd_ms_small}->${r.rd_ms_large})`, + ); + } + if (base.facts_large_max !== undefined && r.facts_large > base.facts_large_max) { + failures.push( + `${r.scenario}: fact materialization ${r.facts_large} > bound ${base.facts_large_max} ` + + `(the maxFacts early-stop is the boundedness contract)`, + ); + } + if (base.disk_bytes_large_max !== undefined && r.disk_bytes_large > base.disk_bytes_large_max) { + failures.push( + `${r.scenario}: cfgSideChannel absolute size ${r.disk_bytes_large} > ceiling ` + + `${base.disk_bytes_large_max} bytes (constant-factor encoding bloat)`, + ); + } // Heap gate only when measured (--expose-gc present) AND a budget exists. 
if ( base.heap_budget !== undefined && diff --git a/gitnexus/src/core/ingestion/cfg/cfg-builder.ts b/gitnexus/src/core/ingestion/cfg/cfg-builder.ts index 976b468246..b6e69126bf 100644 --- a/gitnexus/src/core/ingestion/cfg/cfg-builder.ts +++ b/gitnexus/src/core/ingestion/cfg/cfg-builder.ts @@ -12,7 +12,14 @@ * hand-built block sequences, which is how the classic CFG hazards are pinned * before the tree-sitter visitor (U2) drives it. */ -import type { BasicBlockData, CfgEdgeData, CfgEdgeKind, FunctionCfg } from './types.js'; +import type { + BasicBlockData, + BindingEntry, + CfgEdgeData, + CfgEdgeKind, + FunctionCfg, + StatementFacts, +} from './types.js'; interface MutableBlock { startLine: number; @@ -26,6 +33,13 @@ interface MutableBlock { */ textParts: string[]; kind: BasicBlockData['kind']; + /** + * Per-statement def/use facts in execution order (#2082 M2 U1). Parallel to + * the statements that accrued to this block — but self-describing (each + * record carries its line): facts-only attaches (ENTRY params, catch params) + * mean fact index ≠ text-fragment index. + */ + statements: StatementFacts[]; } export class CfgBuilder { @@ -54,8 +68,15 @@ export class CfgBuilder { endLine: number, text: string, kind: BasicBlockData['kind'] = 'normal', + facts?: StatementFacts, ): number { - this.blocks.push({ startLine, endLine, textParts: text ? [text] : [], kind }); + this.blocks.push({ + startLine, + endLine, + textParts: text ? [text] : [], + kind, + statements: facts ? [facts] : [], + }); return this.blocks.length - 1; } @@ -73,11 +94,25 @@ export class CfgBuilder { } /** Extend a block's end line as more statements accrue to it. 
*/ - extendBlock(index: number, endLine: number, appendText?: string): void { + extendBlock(index: number, endLine: number, appendText?: string, facts?: StatementFacts): void { const b = this.blocks[index]; if (!b) return; if (endLine > b.endLine) b.endLine = endLine; if (appendText) b.textParts.push(appendText); + if (facts) b.statements.push(facts); + } + + /** + * Attach a facts-only statement record to a block WITHOUT touching its text + * or line span (#2082 M2 U1) — bench fingerprints and CFG snapshots include + * block text, so harvesting must never perturb it (ENTRY-block param defs + * are the canonical use; records that must precede a walked body get their + * own facts-only block instead, see the catch-param handling in visitTry). + */ + attachFacts(index: number, facts: StatementFacts): void { + const b = this.blocks[index]; + if (!b) return; + b.statements.push(facts); } get blockCount(): number { @@ -85,8 +120,14 @@ export class CfgBuilder { } /** Produce the serializable CFG. Caller is responsible for having wired the - * function's dangling exits to {@link exitIndex} before calling. */ - finish(): FunctionCfg { + * function's dangling exits to {@link exitIndex} before calling. + * + * Pass `bindings` (the function's binding table, possibly empty) to emit + * statement facts (#2082 M2 U1) — every block then carries a `statements` + * array. Omit it (hand-built test CFGs, pre-M2 producers) and both fields + * are absent, which the reaching-defs solver reports as `no-facts`. */ + finish(bindings?: readonly BindingEntry[]): FunctionCfg { + const withFacts = bindings !== undefined; return { filePath: this.filePath, functionStartLine: this.functionStartLine, @@ -100,8 +141,10 @@ export class CfgBuilder { endLine: b.endLine, text: b.textParts.join('\n'), kind: b.kind, + ...(withFacts ? { statements: b.statements } : {}), })), edges: [...this.edges], + ...(withFacts ? 
{ bindings } : {}), }; } } diff --git a/gitnexus/src/core/ingestion/cfg/control-flow-context.ts b/gitnexus/src/core/ingestion/cfg/control-flow-context.ts index 38c7bcbb8a..6b9bf6c1c2 100644 --- a/gitnexus/src/core/ingestion/cfg/control-flow-context.ts +++ b/gitnexus/src/core/ingestion/cfg/control-flow-context.ts @@ -1,12 +1,24 @@ /** - * ControlFlowContext (issue #2081, M1). + * ControlFlowContext (issue #2081 M1; finalizer frames added by #2082 M2 U2). * * Resolves the targets of `break`/`continue` (plain and labeled) as the visitor * descends through loops and switches. Loops and switches push a target frame * on entry and pop it on exit; a labeled statement attaches its label to the * frame of the construct it labels, so `break outer` / `continue outer` resolve * against the right enclosing loop/switch rather than the nearest one. + * + * M2 adds FINALIZER frames, interleaved on the SAME stack as loop/switch frames + * — interleaving is load-bearing: a jump must route through exactly the + * `finally` bodies lexically BETWEEN it and its target (target-relative + * threading). A `break` whose loop lives entirely inside the `try` crosses no + * finally and must keep its direct edge; re-routing it anyway would force the + * only path to the in-try continuation through the finally, letting a finally + * redefinition falsely KILL in-loop definitions for the downstream + * reaching-defs pass (a taint false negative). A parallel stack cannot express + * that between-ness, which is why the frames live here. */ +import type { CfgBuilder } from './cfg-builder.js'; +import type { CfgEdgeKind } from './types.js'; interface LoopFrame { readonly kind: 'loop'; @@ -14,27 +26,76 @@ interface LoopFrame { readonly continueTo: number; /** Block a `break` jumps to (the loop exit / join). */ readonly breakTo: number; - readonly label?: string; + /** All labels naming this construct (`outer: inner: for` carries both). 
*/ + readonly labels: readonly string[]; } interface SwitchFrame { readonly kind: 'switch'; /** Block a `break` jumps to (after the switch). `continue` is invalid here. */ readonly breakTo: number; - readonly label?: string; + readonly labels: readonly string[]; } -type Frame = LoopFrame | SwitchFrame; +/** + * A labeled NON-loop statement (`blk: { … break blk; … }`) — break-to-label + * targets the synthesized join after the body (tri-review P1: routing such a + * break to EXIT removed the real continuation and falsely killed every def + * live at the jump for post-construct uses). Matched ONLY by a labeled break + * naming it; unlabeled breaks and continues skip it. + */ +interface BlockFrame { + readonly kind: 'block'; + readonly breakTo: number; + readonly labels: readonly string[]; +} + +/** A `finally` whose body any crossing jump must route through. */ +export interface FinalizerFrame { + readonly kind: 'finalizer'; + /** Entry block of the finally body. */ + readonly entry: number; + /** + * Completion legs registered by jumps that crossed this finally: once the + * owning try pops the frame, it wires `finally-exits → to` with `kind` for + * each entry. Mutated by the jump handlers via {@link ControlFlowContext}. + */ + readonly pending: { to: number; kind: CfgEdgeKind }[]; +} + +type Frame = LoopFrame | SwitchFrame | BlockFrame | FinalizerFrame; +type TargetFrame = LoopFrame | SwitchFrame | BlockFrame; + +/** A resolved jump: its ultimate target + the finallys it crosses (inner→outer). 
*/ +export interface JumpResolution { + readonly target: number; + readonly finalizers: readonly FinalizerFrame[]; +} export class ControlFlowContext { private readonly stack: Frame[] = []; - pushLoop(continueTo: number, breakTo: number, label?: string): void { - this.stack.push({ kind: 'loop', continueTo, breakTo, label }); + pushLoop(continueTo: number, breakTo: number, labels: readonly string[] = []): void { + this.stack.push({ kind: 'loop', continueTo, breakTo, labels }); + } + + pushSwitch(breakTo: number, labels: readonly string[] = []): void { + this.stack.push({ kind: 'switch', breakTo, labels }); + } + + /** Push a labeled non-loop statement's break-target frame. */ + pushLabeledBlock(breakTo: number, labels: readonly string[]): void { + this.stack.push({ kind: 'block', breakTo, labels }); } - pushSwitch(breakTo: number, label?: string): void { - this.stack.push({ kind: 'switch', breakTo, label }); + /** + * Push a finalizer frame and return it — the owning `visitTry` keeps the + * reference to wire {@link FinalizerFrame.pending} after popping it. + */ + pushFinalizer(entry: number): FinalizerFrame { + const frame: FinalizerFrame = { kind: 'finalizer', entry, pending: [] }; + this.stack.push(frame); + return frame; } pop(): void { @@ -42,29 +103,114 @@ export class ControlFlowContext { } /** - * Target block for a `break`. With a label, the nearest enclosing frame - * carrying that label (loop or switch); without, the nearest frame of any - * kind. Returns `undefined` if there is no valid target (malformed input). + * Resolve a `break`: the nearest enclosing loop/switch frame (or, with a + * label, the nearest frame carrying that label) plus every finalizer frame + * stacked ABOVE it — i.e. exactly the finallys the jump crosses, innermost + * first. Returns `undefined` if there is no valid target (malformed input or + * an unmodeled label) — the caller falls back to its conservative routing and + * threads nothing. 
*/ - breakTarget(label?: string): number | undefined { + resolveBreak(label?: string): JumpResolution | undefined { + return this.resolve((f) => + label === undefined + ? f.kind !== 'block' // an unlabeled break never targets a labeled block + : f.labels.includes(label), + ); + } + + /** Resolve a `continue`: like {@link resolveBreak} but only loop frames match. */ + resolveContinue(label?: string): JumpResolution | undefined { + return this.resolve( + (f) => f.kind === 'loop' && (label === undefined || f.labels.includes(label)), + (f) => (f as LoopFrame).continueTo, + ); + } + + /** Every active finalizer, innermost first — what a `return` must cross. */ + finalizersForReturn(): readonly FinalizerFrame[] { + const fins: FinalizerFrame[] = []; for (let i = this.stack.length - 1; i >= 0; i--) { const f = this.stack[i]; - if (label === undefined || f.label === label) return f.breakTo; + if (f.kind === 'finalizer') fins.push(f); } - return undefined; + return fins; } /** - * Target block for a `continue`. With a label, the nearest enclosing **loop** - * carrying that label; without, the nearest loop (switches are skipped — you - * cannot `continue` a switch). Returns `undefined` if there is no valid loop. + * Target block for a `break` (no finalizer info) — see {@link resolveBreak}. + * Prefer `resolveBreak` + {@link wireJumpThroughFinalizers} in visitors: a + * target-only lookup silently loses finalizer threading (the M2 soundness + * fix). Kept for target-shape assertions in tests. */ + breakTarget(label?: string): number | undefined { + return this.resolveBreak(label)?.target; + } + + /** Target block for a `continue` — same caveat as {@link breakTarget}. 
*/ continueTarget(label?: string): number | undefined { + return this.resolveContinue(label)?.target; + } + + private resolve( + matches: (f: TargetFrame) => boolean, + targetOf: (f: TargetFrame) => number = (f) => f.breakTo, + ): JumpResolution | undefined { + const crossed: FinalizerFrame[] = []; for (let i = this.stack.length - 1; i >= 0; i--) { const f = this.stack[i]; - if (f.kind !== 'loop') continue; - if (label === undefined || f.label === label) return f.continueTo; + if (f.kind === 'finalizer') { + crossed.push(f); + continue; + } + if (matches(f)) return { target: targetOf(f), finalizers: crossed }; } return undefined; } } + +/** + * Wire a jump from `from` to `target`, routing through the finallys it + * crosses (innermost first). The first leg keeps the bare jump `kind` + * (preserving the "kind ⟹ source-block terminator" invariant in types.ts); + * each finally's completion leg is registered as pending on its frame with the + * matching `finally-*` kind and wired by the owning try via + * {@link drainFinalizerPending} once the finally's exits are known. + * + * Language-agnostic on purpose (#2082 M2): the threading protocol encodes + * three subtle invariants every future language visitor needs identically — + * keeping it here means a new visitor cannot drift on any of them. + */ +export function wireJumpThroughFinalizers( + builder: CfgBuilder, + from: number, + finalizers: readonly FinalizerFrame[], + target: number, + kind: 'return' | 'break' | 'continue', +): void { + if (finalizers.length === 0) { + builder.edge(from, target, kind); + return; + } + const completionKind = `finally-${kind}` as CfgEdgeKind; + builder.edge(from, finalizers[0].entry, kind); + for (let i = 0; i < finalizers.length; i++) { + const to = i + 1 < finalizers.length ? finalizers[i + 1].entry : target; + finalizers[i].pending.push({ to, kind: completionKind }); + } +} + +/** + * Wire a popped finalizer frame's pending completion legs from the finally's + * exit blocks. 
A finally that itself always jumps (`finally { return 2; }`) + * has no exits — its pending legs wire nowhere, matching JS's + * finally-override semantics. + */ +export function drainFinalizerPending( + builder: CfgBuilder, + frame: FinalizerFrame, + finallyExits: readonly number[], +): void { + for (const p of frame.pending) { + builder.connect(finallyExits, p.to, p.kind); + } +} diff --git a/gitnexus/src/core/ingestion/cfg/emit.ts b/gitnexus/src/core/ingestion/cfg/emit.ts index 6531b723ed..3b1a1e50d0 100644 --- a/gitnexus/src/core/ingestion/cfg/emit.ts +++ b/gitnexus/src/core/ingestion/cfg/emit.ts @@ -20,7 +20,8 @@ */ import type { KnowledgeGraph } from '../../graph/types.js'; import { generateId } from '../../../lib/utils.js'; -import type { FunctionCfg } from './types.js'; +import { computeReachingDefs } from './reaching-defs.js'; +import type { BindingEntry, FunctionCfg } from './types.js'; /** * Default per-function CFG edge cap. A pathological generated function could @@ -31,6 +32,30 @@ import type { FunctionCfg } from './types.js'; */ export const DEFAULT_MAX_CFG_EDGES_PER_FUNCTION = 5000; +/** + * Default per-function REACHING_DEF edge cap (#2082 M2 KTD9). 4000 mirrors + * Joern's per-method `maxNumberOfDefinitions` — the closest production prior + * art — but truncates-and-warns instead of silently skipping the function. + * Counts (defBlock, useBlock, binding) DEDUPED edges, not statement-level + * facts. `0` ⇒ unlimited; `undefined` ⇒ this default. + */ +export const DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION = 4000; + +/** + * Fact-materialization headroom over the edge cap (#2082 M2 U3/F3): facts are + * O(defs×uses) BY SPEC in merge-heavy code, and the edge cap alone bounds the + * GRAPH, not the per-function memory spike of materializing facts before + * dedup. 
{@link emitFileReachingDefs} hands `edgeCap × this` to + * `computeReachingDefs` as `maxFacts` (unlimited when the edge cap is 0) — + * single source of truth; the DEFAULT constant below is derived, never the + * mechanism. + */ +export const REACHING_DEF_FACTS_PER_EDGE_CAP = 4; + +/** Derived emit-path fact limit at the default edge cap (bench/doc anchor). */ +export const DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION = + REACHING_DEF_FACTS_PER_EDGE_CAP * DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION; + export interface CfgEmitResult { blocks: number; edges: number; @@ -73,12 +98,82 @@ export const isEmitSafeCfg = (cfg: FunctionCfg | undefined | null): cfg is Funct ) { return false; } - const blockIndices = new Set(); + // Contiguity (index === position), not just integer-ness: every consumer — + // this module's id templating AND the reaching-defs solver's + // position-indexed adjacency arrays — assumes blocks[i].index === i. A + // membership-only check would admit a compacted channel ({index:0},{index:5}) + // whose edge 0→5 passes membership but indexes past the arrays downstream. + for (let i = 0; i < cfg.blocks.length; i++) { + if (cfg.blocks[i]?.index !== i) return false; + } + const n = cfg.blocks.length; + // entry/exit must land on real blocks — the solver feeds entryIndex straight + // into its RPO walk, where an out-of-range index throws and (worse than this + // one element) costs the whole FILE's REACHING_DEF pass (tri-review P3). 
+ if ( + !Number.isInteger(cfg.entryIndex) || + cfg.entryIndex < 0 || + cfg.entryIndex >= n || + !Number.isInteger(cfg.exitIndex) || + cfg.exitIndex < 0 || + cfg.exitIndex >= n + ) { + return false; + } + return cfg.edges.every( + (e) => + Number.isInteger(e?.from) && + Number.isInteger(e?.to) && + e.from >= 0 && + e.from < n && + e.to >= 0 && + e.to < n, + ); +}; + +/** + * Whether a structurally-valid CFG's M2 statement facts are safe to feed to + * the reaching-defs solver + REACHING_DEF id templating (#2082 U1/U4): the + * binding table's name/declLine/declColumn template into edge ids, and + * statement def/use indices must stay IN RANGE of the table (an escaping + * index would fabricate `undefined`-keyed ids). Deliberately SEPARATE from + * {@link isEmitSafeCfg}: malformed facts must cost only the function's + * REACHING_DEF projection — degrading to M1 behavior (CFG emitted, no facts) + * — never the BasicBlock/CFG layer itself. + */ +export const hasEmitSafeFacts = (cfg: FunctionCfg): boolean => { + const bindings = cfg.bindings; + if (bindings === undefined) { + // Pre-M2 channel — statements must be absent too. 
+ return cfg.blocks.every((b) => b.statements === undefined); + } + if (!Array.isArray(bindings)) return false; + for (const b of bindings) { + if ( + typeof b?.name !== 'string' || + !Number.isInteger(b.declLine) || + !Number.isInteger(b.declColumn) + ) { + return false; + } + } + const bindingCount = bindings.length; + const inRange = (i: number): boolean => Number.isInteger(i) && i >= 0 && i < bindingCount; for (const b of cfg.blocks) { - if (!Number.isInteger(b?.index)) return false; - blockIndices.add(b.index); + const stmts = b.statements; + if (stmts === undefined) continue; + if (!Array.isArray(stmts)) return false; + for (const s of stmts) { + if (!Number.isInteger(s?.line) || !Array.isArray(s.defs) || !Array.isArray(s.uses)) { + return false; + } + if (!s.defs.every(inRange) || !s.uses.every(inRange)) return false; + if (s.mayDefs !== undefined) { + if (!Array.isArray(s.mayDefs) || !s.mayDefs.every(inRange)) return false; + } + } } - return cfg.edges.every((e) => blockIndices.has(e?.from) && blockIndices.has(e?.to)); + return true; }; /** @@ -145,3 +240,173 @@ export function emitFileCfgs( return result; } + +export interface ReachingDefEmitResult { + /** Deduped (defBlock, useBlock, binding) edges persisted. */ + edges: number; + /** Deduped edges dropped by the per-function edge cap. */ + droppedEdges: number; + cappedFunctions: number; + /** Functions whose FACT materialization hit the solver's maxFacts limit. */ + truncatedFunctions: number; + /** Functions whose facts failed {@link hasEmitSafeFacts} (CFG kept, facts skipped). */ + malformedFactFunctions: number; + /** Total statement-level facts the solver produced (pre-dedup telemetry). */ + facts: number; +} + +/** + * Stable identity for a binding inside edge ids (#2082 M2 KTD3/KTD9): + * `name:declLine:declCol` for declared bindings, `name@module` for synthetic + * ones. Distinct same-name bindings never share a key; identifier characters + * cannot contain the id separators. 
+ */ +const bindingKey = (b: BindingEntry): string => + b.synthetic ? `${b.name}@module` : `${b.name}:${b.declLine}:${b.declColumn}`; + +/** + * Compute reaching definitions per function and persist the bounded + * REACHING_DEF projection (#2082 M2 U4). + * + * Facts are DEDUPED to (defBlock, useBlock, binding) before budgeting — the + * persisted columns (`from,to,type,confidence,reason,step`; relationship ids + * are in-memory-only, the CodeRelation table has no id column) cannot + * distinguish finer rows, so statement-indexed ids would only manufacture + * byte-identical duplicate rows that burn budget. Statement granularity lives + * in the in-memory {@link computeReachingDefs} result, which the M3 taint + * engine recomputes on demand — the budget here governs only this projection + * and can never drop a taint fact. + * + * R7 (no silent truncation) covers BOTH layers: the per-function edge cap AND + * the solver's fact-materialization limit (which can fire without the edge + * cap ever being reached, since dedup is many-to-one) each produce one + * unconditional `onWarn`. The edge-cap warn names the top bindings by fact + * count — overflow is almost always one variable, which is exactly the datum + * M3 tuning wants. + */ +export function emitFileReachingDefs( + graph: KnowledgeGraph, + cfgs: readonly FunctionCfg[], + maxEdgesPerFunction: number = DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, + onWarn?: (message: string) => void, +): ReachingDefEmitResult { + const result: ReachingDefEmitResult = { + edges: 0, + droppedEdges: 0, + cappedFunctions: 0, + truncatedFunctions: 0, + malformedFactFunctions: 0, + facts: 0, + }; + const cap = maxEdgesPerFunction > 0 ? maxEdgesPerFunction : Infinity; + const maxFacts = Number.isFinite(cap) ? 
(cap as number) * REACHING_DEF_FACTS_PER_EDGE_CAP : 0; // 0 ⇒ unlimited + + for (const cfg of cfgs) { + // Graceful degradation: malformed M2 facts cost only this function's + // REACHING_DEF projection — its BasicBlock/CFG layer was already emitted. + if (!hasEmitSafeFacts(cfg)) { + result.malformedFactFunctions++; + onWarn?.( + `[reaching-defs] ${cfg.filePath}:${cfg.functionStartLine}: malformed ` + + `statement facts (bad binding table or out-of-range fact indices) — ` + + `REACHING_DEF skipped for this function; its CFG is unaffected`, + ); + continue; + } + const r = computeReachingDefs(cfg, { maxFacts }); + if (r.status === 'no-facts') continue; + result.facts += r.facts.length; + + const { filePath, functionStartLine, functionStartColumn } = cfg; + if (r.status === 'truncated') { + result.truncatedFunctions++; + onWarn?.( + `[reaching-defs] ${filePath}:${functionStartLine}: fact materialization ` + + `limit (${maxFacts}) reached — facts beyond it were not computed; ` + + `the persisted REACHING_DEF projection for this function is sparse`, + ); + } else if (r.status === 'overflow') { + result.truncatedFunctions++; + onWarn?.( + `[reaching-defs] ${filePath}:${functionStartLine}: a basic block exceeds ` + + `the def-key stride (≥2^21 coalesced statements — minified/generated ` + + `code) — REACHING_DEF skipped for this function (computing any facts ` + + `would risk wrong-block aliasing); its CFG is unaffected`, + ); + continue; + } + + // Dedup to (defBlock, useBlock, binding) — facts arrive sorted, so the + // deduped order (and therefore cap truncation) is deterministic. 
+ const seen = new Set(); + const deduped: { defBlock: number; useBlock: number; bindingIdx: number }[] = []; + for (const f of r.facts) { + const key = `${f.def.blockIndex}:${f.use.blockIndex}:${f.bindingIdx}`; + if (seen.has(key)) continue; + seen.add(key); + deduped.push({ + defBlock: f.def.blockIndex, + useBlock: f.use.blockIndex, + bindingIdx: f.bindingIdx, + }); + } + + let emittedForFn = 0; + for (const edge of deduped) { + if (emittedForFn >= cap) { + const dropped = deduped.length - emittedForFn; + result.droppedEdges += dropped; + result.cappedFunctions++; + // Tallied lazily — cap overflow is the rare path; the common uncapped + // case must not pay a per-fact counting pass. + const factsPerBinding = new Map(); + for (const f of r.facts) { + factsPerBinding.set(f.bindingIdx, (factsPerBinding.get(f.bindingIdx) ?? 0) + 1); + } + const top = [...factsPerBinding.entries()] + .sort((a, b) => b[1] - a[1] || a[0] - b[0]) + .slice(0, 2) + .map(([idx, count]) => `${r.bindings[idx]?.name ?? `#${idx}`}(${count} facts)`) + .join(', '); + onWarn?.( + `[reaching-defs] ${filePath}:${functionStartLine}: per-function ` + + `REACHING_DEF edge cap (${maxEdgesPerFunction}) reached — dropped ` + + `${dropped} of ${deduped.length} edges; top bindings: ${top}`, + ); + break; + } + const binding = r.bindings[edge.bindingIdx]; + const sourceId = basicBlockId( + filePath, + functionStartLine, + functionStartColumn, + edge.defBlock, + ); + const targetId = basicBlockId( + filePath, + functionStartLine, + functionStartColumn, + edge.useBlock, + ); + graph.addRelationship({ + // Single function anchor — the two block ids share it, so templating + // it once halves the id size (ids are in-memory-only but ~4000 of + // them per capped function is real transient heap). 
+ id: generateId( + 'REACHING_DEF', + `${filePath}:${functionStartLine}:${functionStartColumn}:` + + `${edge.defBlock}->${edge.useBlock}:${bindingKey(binding)}`, + ), + type: 'REACHING_DEF', + sourceId, + targetId, + confidence: 1.0, + reason: binding.name, // plain source-level name (M0/S1 verdict) — queryable + }); + result.edges++; + emittedForFn++; + } + } + + return result; +} diff --git a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts new file mode 100644 index 0000000000..14c7b3745e --- /dev/null +++ b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts @@ -0,0 +1,448 @@ +/** + * Reaching definitions (#2082 M2 U3) — classic GEN/KILL monotone fixpoint over + * one function's CFG, plus the canonical intra-block statement sweep that + * recovers statement-granular def→use facts from M1's coalesced blocks + * WITHOUT re-splitting the CFG. + * + * PURE AND DETERMINISTIC (load-bearing contract): + * - Pure function of its inputs — no graph, no logger (warnings are the + * caller's job), importable outside the worker. The M3 taint engine calls + * this same function in-phase (facts are recomputed on demand, never + * retained run-wide — the persisted REACHING_DEF edges are a bounded + * projection, never the taint substrate). + * - Deterministic — predecessors merge in sorted block-index order, + * insertion-ordered Maps/Sets throughout, and the output fact array is + * explicitly sorted. Snapshot tests and content-derived edge ids rely on it. + * + * COMPLEXITY DISCIPLINE (the four-times-repeated repo bug shape is per-item + * re-derivation inside the loop): def-sets are SHARED BY REFERENCE, never + * deep-copied — a MUST def's kill is total per binding, so a transfer either + * aliases the incoming set or replaces it; a MAY def (conditional context — + * see StatementFacts.mayDefs) unions WITHOUT killing via a copy-on-extend. 
+ * Single-predecessor blocks alias the predecessor's OUT map outright; + * multi-pred merges union only bindings whose incoming sets differ by + * reference. Iteration is reverse post-order, seeded with every block + * (unreachable blocks keep ⊥ IN — correct, their defs reach nothing). + * Convergence: sets grow monotonically within the finite def-site universe ⇒ + * ≤ loop-depth+1 passes in practice. + * + * `limits.maxFacts` bounds materialization: facts are O(defs×uses) BY SPEC in + * merge-heavy code (N branch-arm defs × N later uses = N² facts), and a + * 2000-line function can spike 100k+ fact objects on the main thread. The + * emit path passes its per-function edge cap × REACHING_DEF_FACTS_PER_EDGE_CAP (emit.ts; 0 ⇒ unlimited when the cap is off); + * M3 passes its own large-but-finite limit and treats `status: 'truncated'` + * as a per-function taint-coverage gap. + */ +import type { BindingEntry, FunctionCfg } from './types.js'; + +/** A statement-granular program point within one function's CFG. */ +export interface ProgramPoint { + readonly blockIndex: number; + /** Statement index within the block's `statements` array. */ + readonly stmtIndex: number; + readonly line: number; +} + +/** One def→use fact: the definition at `def` reaches the use at `use`. */ +export interface DefUseFact { + /** Index into {@link FunctionDefUse.bindings}. */ + readonly bindingIdx: number; + readonly def: ProgramPoint; + readonly use: ProgramPoint; +} + +export interface ReachingDefsLimits { + /** + * Maximum number of facts to materialize; the sweep stops early and reports + * `status: 'truncated'`. `undefined`/0 ⇒ unlimited. + */ + readonly maxFacts?: number; +} + +export interface FunctionDefUse { + /** + * `computed` — full facts. + * `no-facts` — the CFG carries no statement facts (hand-built or pre-M2 + * side channel); empty facts, NOT an error. + * `truncated` — `limits.maxFacts` hit; `facts` is a deterministic prefix. 
+ * `overflow` — a block's statement count breaches the def-key stride; no + * facts at all (computing any would risk key aliasing — + * wrong-block facts are strictly worse than none). Distinct + * from `truncated` so the caller's diagnostic doesn't + * misname it as the fact-materialization limit. + */ + readonly status: 'computed' | 'no-facts' | 'truncated' | 'overflow'; + /** Pass-through of the CFG's binding table (empty for `no-facts`). */ + readonly bindings: readonly BindingEntry[]; + /** Sorted by (def block, def stmt, use block, use stmt, binding). */ + readonly facts: readonly DefUseFact[]; + /** Total def / use sites seen (telemetry; independent of truncation; 0 on `no-facts`/`overflow`). */ + readonly defCount: number; + readonly useCount: number; +} + +/** + * def-site key: packs (blockIndex, stmtIndex) into one number. The stride is + * a per-BLOCK statement bound, and `maxFunctionLines` caps LINES, not + * statements — a minified one-line function coalesces arbitrarily many + * statements into one block, so an overflow would silently alias + * (block b, stmt STRIDE+k) with (block b+1, stmt k) and fabricate wrong-block + * facts. computeReachingDefs therefore range-checks up front and bails to a + * sound empty `overflow` result instead of ever letting a key alias. + * 2^21 statements per block × blocks ≤ 2^32 stays inside Number's 2^53. + */ +const STMT_STRIDE = 1 << 21; +const defKey = (blockIndex: number, stmtIndex: number): number => + blockIndex * STMT_STRIDE + stmtIndex; + +type DefSet = Set; +/** bindingIdx → def-site keys reaching this program point. */ +type Lattice = Map; + +const EMPTY_LATTICE: Lattice = new Map(); + +/** + * Compute reaching definitions for one function. See the module doc for the + * purity/determinism/sharing contract. 
+ */ +export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimits): FunctionDefUse { + if (!cfg.bindings) { + return { status: 'no-facts', bindings: [], facts: [], defCount: 0, useCount: 0 }; + } + + const blocks = cfg.blocks; + const n = blocks.length; + + // Key-aliasing guard (see STMT_STRIDE): a block with ≥ STRIDE statements + // cannot be keyed without aliasing into the next block's def sites, which + // would fabricate wrong-block facts — strictly worse than producing none. + // Bail to a sound empty `overflow` result (the emit path warns distinctly). + for (const b of blocks) { + if ((b.statements?.length ?? 0) >= STMT_STRIDE) { + return { status: 'overflow', bindings: cfg.bindings, facts: [], defCount: 0, useCount: 0 }; + } + } + + // ── adjacency (sorted for deterministic merges) ───────────────────────── + // A `throw` edge contributes IN(from) ∪ allDefs(from) to its handler, not + // OUT: an exception can fire BEFORE the block's defs complete (the seed def + // in `let x = seed(); try { x = risky(); } catch { sink(x) }` must reach the + // sink) AND between any two defs of a multi-def coalesced block (the parse + // def in `x = parse(a); x = normalize(x);` is live exactly when normalize + // throws — OUT's last-def-wins misses it). Sound over-approximation; + // monotone, so the fixpoint absorbs it. See mergePreds. + const preds: { from: number; viaThrow: boolean }[][] = Array.from({ length: n }, () => []); + const succs: number[][] = Array.from({ length: n }, () => []); + // Handlers whose IN depends on this block's IN (throw edges) — requeued on + // IN change, since a genned binding can absorb IN growth without changing + // OUT, which would otherwise leave the handler stale. 
+ const throwSuccs: number[][] = Array.from({ length: n }, () => []); + for (const e of cfg.edges) { + // Optional-chained pushes drop out-of-range endpoints defensively — the + // emit path validates via isEmitSafeCfg, but this pure function also runs + // on hand-built CFGs. + succs[e.from]?.push(e.to); + preds[e.to]?.push({ from: e.from, viaThrow: e.kind === 'throw' }); + if (e.kind === 'throw') throwSuccs[e.from]?.push(e.to); + } + for (const list of preds) { + list.sort((a, b) => a.from - b.from || Number(a.viaThrow) - Number(b.viaThrow)); + // duplicate (from, throw+non-throw) pairs both survive — the throw leg + // adds IN(from); the merge dedups set-wise. + } + for (const list of succs) list.sort((a, b) => a - b); + + // ── per-block GEN + def/use telemetry ──────────────────────────────────── + // gen[b]: bindingIdx → { set, kills }. A MUST def resets the accumulated + // set (kill is total); a MAY def (conditionally-evaluated context — see + // StatementFacts.mayDefs) only ADDS: the binding's incoming defs survive, + // so the transfer is out[x] = kills ? set : in[x] ∪ set. + interface GenEntry { + set: DefSet; + kills: boolean; + } + const gen: (Map | null)[] = new Array(n).fill(null); + // allDefsGen[b]: bindingIdx → EVERY def-site key in the block (must + may). + // This is what a throw edge delivers to its handler: an exception can fire + // between any two statements, so every intermediate def may be the live one + // at the handler — IN∪OUT alone misses defs overwritten later in the same + // coalesced block (`try { x = parse(a); x = normalize(x); } catch { sink(x) }` + // — parse's value is exactly what sink sees when normalize throws). 
+ const allDefsGen: (Lattice | null)[] = new Array(n).fill(null); + const defLine = new Map(); // defKey → source line + let defCount = 0; + let useCount = 0; + for (const b of blocks) { + const stmts = b.statements; + if (!stmts || stmts.length === 0) continue; + let g: Map | null = null; + let all: Lattice | null = null; + for (let i = 0; i < stmts.length; i++) { + const s = stmts[i]; + useCount += s.uses.length; + const key = defKey(b.index, i); + const record = (d: number, kills: boolean): void => { + defCount += 1; + defLine.set(key, s.line); + if (!g) g = new Map(); + const entry = g.get(d); + if (kills || !entry) { + g.set(d, { set: new Set([key]), kills: kills || (entry?.kills ?? false) }); + } else { + entry.set.add(key); // may-def accumulates; never clears + } + if (!all) all = new Map(); + const allSet = all.get(d); + if (allSet) allSet.add(key); + else all.set(d, new Set([key])); + }; + if (s.mayDefs) for (const d of s.mayDefs) record(d, false); + for (const d of s.defs) record(d, true); + } + gen[b.index] = g; + allDefsGen[b.index] = all; + } + + // ── iteration order: RPO over reachable blocks, then the rest by index ── + const order = reversePostOrder(cfg.entryIndex, succs, n); + + // ── fixpoint ──────────────────────────────────────────────────────────── + const inSets: Lattice[] = new Array(n).fill(EMPTY_LATTICE); + const outSets: Lattice[] = new Array(n).fill(EMPTY_LATTICE); + + const inWorklist = new Array(n).fill(true); + let pending = n; + while (pending > 0) { + for (const b of order) { + if (!inWorklist[b]) continue; + inWorklist[b] = false; + pending -= 1; + + const p = preds[b]; + const inB: Lattice = + p.length === 0 + ? EMPTY_LATTICE + : p.length === 1 && !p[0].viaThrow + ? 
outSets[p[0].from] // alias — zero allocation on straight-line chains + : mergePreds(p, inSets, outSets, allDefsGen); + const inChanged = !latticeEquals(inSets[b], inB); + inSets[b] = inB; + + const g = gen[b]; + // OUT = overlay(IN): a KILLING gen entry replaces the binding's set; a + // may-def-only entry unions with the incoming set (never kills). When + // nothing is genned, OUT aliases IN outright. + let outB: Lattice; + if (!g) { + outB = inB; + } else { + outB = new Map(inB); // copies REFERENCES, never set contents + for (const [bindingIdx, entry] of g) { + if (entry.kills) { + outB.set(bindingIdx, entry.set); + } else { + const incoming = inB.get(bindingIdx); + outB.set(bindingIdx, incoming ? unionSets(incoming, entry.set) : entry.set); + } + } + } + + const requeue = (s: number): void => { + if (!inWorklist[s]) { + inWorklist[s] = true; + pending += 1; + } + }; + if (!latticeEquals(outSets[b], outB)) { + outSets[b] = outB; + for (const s of succs[b]) requeue(s); + } + if (inChanged) for (const s of throwSuccs[b]) requeue(s); + } + } + + // ── statement sweep: recover statement-granular def→use facts ─────────── + const maxFacts = limits?.maxFacts && limits.maxFacts > 0 ? limits.maxFacts : Infinity; + const facts: DefUseFact[] = []; + let truncated = false; + + outer: for (const b of blocks) { + const stmts = b.statements; + if (!stmts || stmts.length === 0) continue; + // Lazy overlay of IN — entries are replaced (never mutated) on def, so the + // shared sets stay intact. + let reach: Lattice | null = null; + for (let i = 0; i < stmts.length; i++) { + const s = stmts[i]; + // A use's binding that the SAME statement also defines could be a + // read-then-write (`x += 1` — sees prior defs) OR a write-then-read + // (`if ((m = re.exec(s)) && m[1])` — sees the same-statement def). + // StatementFacts carries no intra-statement order, so emit BOTH: prior + // defs ∪ the same-statement def. 
Sound over-approximation — the extra + // self-fact on compound assignments is harmless; missing the + // assign-and-test def→use (the most common JS idiom) would be a taint + // false negative. May-defs join the self-key set the same way. + const sameStmtDefs = + s.defs.length > 0 || s.mayDefs?.length ? new Set([...s.defs, ...(s.mayDefs ?? [])]) : null; + for (const u of s.uses) { + const reaching = (reach ?? inSets[b.index]).get(u); + const selfKey = sameStmtDefs?.has(u) ? defKey(b.index, i) : undefined; + if (!reaching && selfKey === undefined) continue; + const keys = + selfKey !== undefined && !reaching?.has(selfKey) + ? [...(reaching ?? []), selfKey] + : [...(reaching ?? [])]; + for (const key of keys) { + if (facts.length >= maxFacts) { + truncated = true; + break outer; + } + const defBlock = Math.floor(key / STMT_STRIDE); + const defStmt = key % STMT_STRIDE; + facts.push({ + bindingIdx: u, + def: { blockIndex: defBlock, stmtIndex: defStmt, line: defLine.get(key) ?? s.line }, + use: { blockIndex: b.index, stmtIndex: i, line: s.line }, + }); + } + } + if (s.mayDefs?.length) { + // Gen WITHOUT kill: the conditional def joins the binding's set. + if (!reach) reach = new Map(inSets[b.index]); + const key = defKey(b.index, i); + for (const d of s.mayDefs) { + const prior = reach.get(d); + reach.set(d, prior ? unionSets(prior, new Set([key])) : new Set([key])); + } + } + if (s.defs.length > 0) { + if (!reach) reach = new Map(inSets[b.index]); + for (const d of s.defs) reach.set(d, new Set([defKey(b.index, i)])); // kill + gen + } + } + } + + facts.sort( + (a, b) => + a.def.blockIndex - b.def.blockIndex || + a.def.stmtIndex - b.def.stmtIndex || + a.use.blockIndex - b.use.blockIndex || + a.use.stmtIndex - b.use.stmtIndex || + a.bindingIdx - b.bindingIdx, + ); + + return { + status: truncated ? 
'truncated' : 'computed', + bindings: cfg.bindings, + facts, + defCount, + useCount, + }; +} + +/** RPO over blocks reachable from `entry`; unreachable blocks appended by index. */ +function reversePostOrder(entry: number, succs: readonly number[][], n: number): number[] { + const visited = new Array(n).fill(false); + const post: number[] = []; + // Iterative DFS with an explicit phase stack (children pushed in reverse so + // they pop in sorted order — determinism). + const stack: { node: number; childIdx: number }[] = [{ node: entry, childIdx: 0 }]; + visited[entry] = true; + while (stack.length) { + const top = stack[stack.length - 1]; + const children = succs[top.node]; + if (top.childIdx < children.length) { + const next = children[top.childIdx]; + top.childIdx += 1; + if (!visited[next]) { + visited[next] = true; + stack.push({ node: next, childIdx: 0 }); + } + } else { + post.push(top.node); + stack.pop(); + } + } + const order = post.reverse(); + for (let b = 0; b < n; b++) if (!visited[b]) order.push(b); + return order; +} + +/** + * Union predecessor lattices, sharing sets where possible. A normal edge + * contributes OUT(from). A THROW edge contributes IN(from) ∪ allDefs(from): + * an exception may fire before, between, or after any of the block's defs, so + * the handler can observe the incoming state OR any intermediate def — OUT + * alone (last-def-wins) misses defs overwritten later in the same block. + * IN ∪ allDefs ⊇ OUT, so the throw contribution subsumes it. 
+ */ +function mergePreds( + preds: readonly { from: number; viaThrow: boolean }[], + inSets: readonly Lattice[], + outSets: readonly Lattice[], + allDefsGen: readonly (Lattice | null)[], +): Lattice { + const merged: Lattice = new Map(); + const mergeOne = (source: Lattice): void => { + for (const [bindingIdx, set] of source) { + const existing = merged.get(bindingIdx); + if (!existing) { + merged.set(bindingIdx, set); // share the first contributor's set + } else if (existing !== set) { + // Union only when the references differ. Copy-on-extend: `existing` + // may be a shared set from another block — never mutate it. + let target = existing; + let copied = false; + for (const key of set) { + if (!target.has(key)) { + if (!copied) { + target = new Set(existing); + copied = true; + } + target.add(key); + } + } + if (copied) merged.set(bindingIdx, target); + } + } + }; + for (const p of preds) { + if (p.viaThrow) { + mergeOne(inSets[p.from]); // exception may fire pre-defs… + const all = allDefsGen[p.from]; + if (all) mergeOne(all); // …or after ANY of the block's defs + } else { + mergeOne(outSets[p.from]); + } + } + return merged; +} + +/** Order-stable union of two def-sets (shares `a` when `b` adds nothing). */ +function unionSets(a: DefSet, b: DefSet): DefSet { + let target = a; + let copied = false; + for (const key of b) { + if (!target.has(key)) { + if (!copied) { + target = new Set(a); + copied = true; + } + target.add(key); + } + } + return target; +} + +/** Per-binding equality with a reference fast path (sets only ever grow). 
*/ +function latticeEquals(a: Lattice, b: Lattice): boolean { + if (a === b) return true; + if (a.size !== b.size) return false; + for (const [k, bSet] of b) { + const aSet = a.get(k); + if (aSet === bSet) continue; + if (!aSet || aSet.size !== bSet.size) return false; + for (const v of bSet) if (!aSet.has(v)) return false; + } + return true; +} diff --git a/gitnexus/src/core/ingestion/cfg/types.ts b/gitnexus/src/core/ingestion/cfg/types.ts index 0b28c60898..ed789ec30c 100644 --- a/gitnexus/src/core/ingestion/cfg/types.ts +++ b/gitnexus/src/core/ingestion/cfg/types.ts @@ -11,6 +11,60 @@ * array of them is what rides on `ParsedFile.cfgSideChannel`. */ +/** + * One distinct declared variable (binding) within a function (#2082 M2 U1). + * + * Statement facts reference bindings by integer index into + * {@link FunctionCfg.bindings} — names appear once per binding instead of once + * per occurrence (measured ~4× smaller serialized payload than named records). + * Distinct bindings of the same name (shadowing) get distinct entries, which is + * what keeps an inner `let x` from falsely killing the outer `x`'s definitions + * in the reaching-defs solver. NOTE: no field here may be named `nodeId` — the + * durable parsedfile-store reviver dedups objects keyed on that field name. + */ +export interface BindingEntry { + /** Source-level variable name (what the persisted edge's `reason` carries). */ + readonly name: string; + /** + * 1-based line/0-based column of the canonical declaration site — `var` + * multi-declarations canonicalize to the FIRST declaration in source order. + * Both 0 for synthetic bindings. + */ + readonly declLine: number; + readonly declColumn: number; + /** How the binding was introduced (param/catch matter to the M3 taint pass). 
*/ + readonly kind: 'var' | 'let' | 'const' | 'param' | 'catch' | 'function' | 'class' | 'module'; + /** + * True when the name has no in-function declaration site (implicit global, + * import, or a variable captured from an enclosing function) — keyed + * `name@module` in edge ids instead of `name:line:col`. + */ + readonly synthetic?: boolean; +} + +/** + * Def/use facts for one harvested statement (or construct header), in + * execution order within its block (#2082 M2 U1). `defs`/`uses` are indices + * into {@link FunctionCfg.bindings}. A compound assignment / update expression + * lists its binding in BOTH. Self-describing — `line` is carried here, never + * inferred from the block's text fragments (facts-only records exist, e.g. + * params on ENTRY and catch params). + * + * `mayDefs` (tri-review P1): defs harvested inside CONDITIONALLY-EVALUATED + * subexpressions — short-circuit right operands (`a && (x = v)`, + * `c ?? (c = load())`), ternary arms, logical-assignment operators, and + * switch case-test expressions. The solver treats them as GEN WITHOUT KILL: + * treating them as must-defs would falsely kill the prior def on the + * not-taken path (a taint false negative on core JS idioms). Optional — + * absent means none. + */ +export interface StatementFacts { + readonly line: number; + readonly defs: readonly number[]; + readonly uses: readonly number[]; + readonly mayDefs?: readonly number[]; +} + /** A basic block: a maximal straight-line run of statements between leaders. */ export interface BasicBlockData { /** Block index within its function. The synthetic ENTRY is always 0. */ @@ -20,20 +74,40 @@ export interface BasicBlockData { /** Source snippet for the block (empty for synthetic ENTRY/EXIT). */ readonly text: string; readonly kind: 'entry' | 'exit' | 'normal'; + /** + * Per-statement def/use facts in execution order (#2082 M2 U1). 
Present only + * when the producing visitor harvests (TS/JS under `--pdg`); absent on + * hand-built or pre-M2 CFGs — the reaching-defs solver reports `no-facts`. + */ + readonly statements?: readonly StatementFacts[]; } -/** Why one block flows to another — drives the `reason` on the emitted CFG edge. */ +/** + * Why one block flows to another — drives the `reason` on the emitted CFG edge. + * + * Kind invariant (M2): a bare jump kind (`return`/`break`/`continue`) means the + * SOURCE block's terminator is that jump statement. A `finally-*` kind marks a + * COMPLETION edge out of a `finally` body's exit — the leg that resumes a jump + * which was re-routed through the finally (issue #2082 U2). Reusing the bare + * kinds on completion edges would silently break consumers that infer the + * source block's terminator from the kind, and a single generic kind would lose + * WHICH jump each completion edge completes when a shared finally has several + * pending targets. + */ export type CfgEdgeKind = | 'seq' // straight-line fallthrough | 'cond-true' // branch taken (if/while/for condition true) | 'cond-false' // branch not taken / loop exit | 'loop-back' // back-edge to a loop header - | 'break' // break → loop/switch exit - | 'continue' // continue → loop header - | 'return' // return → function EXIT + | 'break' // break → loop/switch exit (or the finally it must cross) + | 'continue' // continue → loop header (or the finally it must cross) + | 'return' // return → function EXIT (or the finally it must cross) | 'throw' // throw → nearest handler / finally / EXIT | 'switch-case' // dispatch to a case - | 'fallthrough'; // switch case → next case (no break) + | 'fallthrough' // switch case → next case (no break) + | 'finally-return' // finally exit → resumed return target (EXIT / outer finally) + | 'finally-break' // finally exit → resumed break target + | 'finally-continue'; // finally exit → resumed continue target export interface CfgEdgeData { readonly from: number; 
@@ -60,6 +134,11 @@ export interface FunctionCfg { readonly exitIndex: number; readonly blocks: readonly BasicBlockData[]; readonly edges: readonly CfgEdgeData[]; + /** + * The function's binding table (#2082 M2 U1) — referenced by index from + * {@link BasicBlockData.statements}. Present iff statement facts are. + */ + readonly bindings?: readonly BindingEntry[]; } /** diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts new file mode 100644 index 0000000000..81823c97a9 --- /dev/null +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts @@ -0,0 +1,656 @@ +/** + * TS/JS def/use harvester (#2082 M2 U1). + * + * Runs in the parse worker next to the CFG visitor, extracting per-statement + * variable definition/use facts that ride the side channel for the + * reaching-defs solver (`cfg/reaching-defs.ts`). Output is the per-function + * binding table ({@link BindingEntry}[]) plus {@link StatementFacts} records + * the visitor attaches to blocks as it walks. + * + * TWO-PHASE, ORDER-INDEPENDENT (load-bearing): the CFG walk is NOT source-order + * — `visitTry` builds the finally body before the protected body, `visitFor` + * creates the init block after walking the body, `visitDoWhile` the condition + * before the body. Resolving names against a scope stack populated *during* + * that walk would mis-resolve common code (`try { var v = 1; } finally + * { use(v); }` keys the use synthetically while the def gets the real binding — + * the def→use fact silently never forms, a taint false negative). So phase 1 + * pre-scans the whole function subtree once, collecting every declaration into + * a completed lexical scope tree (also resolving `var` hoisting and multi-decl + * canonicalization order-independently, eslint-scope style); phase 2 resolves + * defs/uses against that finished tree from any walk order. 
+ * + * v1 def-semantics scope (plan KTD4): var/let/const declarations, assignments + * (plain/compound/destructuring), update expressions, function/class + * declarations, parameters (incl. defaults/rest/destructured), catch params, + * for-in/of heads. EXCLUDED, deliberately: property/member writes (`this.x=`, + * `obj.p=` — TypeScript-CFA precedent), and BOTH directions of nested-function + * capture — writes to outer variables from nested bodies AND reads of captured + * variables inside nested bodies are invisible (nested functions are opaque + * blocks in the enclosing CFG; callback flows like `arr.forEach(() => sink(y))` + * register no use of `y` — closure/callback dataflow is M4 territory and the + * M3 consumer contract must name it). + * + * Identifiers with no in-function declaration (implicit globals, imports, + * variables captured from an enclosing function) resolve to a SYNTHETIC + * module-level binding (`name@module`), applied identically by def and use + * harvesting so `notDeclared = 1; use(notDeclared)` still forms a fact. + * + * NOTE: nothing serialized here may carry a field named `nodeId` — the durable + * parsedfile-store reviver dedups objects keyed on that field name. + */ +import type { SyntaxNode } from '../../utils/ast-helpers.js'; +import type { BindingEntry, StatementFacts } from '../types.js'; + +/** Node types that own a nested CFG — their subtrees are opaque to harvesting. */ +const NESTED_FUNCTION_TYPES = new Set([ + 'function_declaration', + 'function_expression', + 'arrow_function', + 'method_definition', + 'generator_function_declaration', + 'generator_function', + 'async_function_declaration', + 'async_arrow_function', +]); + +/** Function-ish declaration statements whose NAME still binds in the enclosing scope. 
*/ +const FUNCTION_DECL_TYPES = new Set([ + 'function_declaration', + 'generator_function_declaration', + 'async_function_declaration', +]); + +/** + * Nodes that open a lexical scope for `let`/`const`/`class`/catch bindings. + * A `switch` BODY is deliberately ONE scope shared by all case arms (JS + * semantics: `case 1: let x = 1; case 2: use(x)` is the same binding). + */ +const SCOPE_TYPES = new Set([ + 'statement_block', + 'for_statement', + 'for_in_statement', + 'for_of_statement', + 'catch_clause', + 'switch_body', +]); + +/** Type-position subtrees — identifiers inside them are not value uses. */ +const TYPE_CONTEXT_TYPES = new Set([ + 'type_annotation', + 'type_arguments', + 'type_parameters', + 'type_predicate_annotation', + 'asserts_annotation', +]); + +interface Scope { + readonly parent: Scope | null; + /** name → binding index */ + readonly table: Map; +} + +export class TsHarvester { + private readonly bindings: BindingEntry[] = []; + /** Scope-opening node id → its scope. */ + private readonly scopeByNode = new Map(); + private readonly root: Scope = { parent: null, table: new Map() }; + /** name → synthetic binding index (implicit global / import / captured). */ + private readonly synthetic = new Map(); + private readonly fnId: number; + /** + * Innermost enclosing scope per visited node id, filled during the prescan + * (which already touches every named node once). Makes phase-2 resolution + * O(scope-chain) instead of O(AST-depth) per identifier — a deeply-chained + * single-statement expression (generated code) otherwise turns the + * parent-chain walk quadratic (tri-review perf finding). + */ + private readonly nearestScopeCache = new Map(); + /** + * >0 while walking a conditionally-evaluated subexpression (short-circuit + * right operand, ternary arm, logical-assignment target, case test). Defs + * found there are MAY-defs — gen without kill (tri-review P1: a must-def + * here falsely kills the prior def on the not-taken path). 
+ */ + private conditionalDepth = 0; + + constructor(private readonly fnNode: SyntaxNode) { + this.fnId = fnNode.id; + this.scopeByNode.set(fnNode.id, this.root); + this.declareParams(fnNode); + const body = fnNode.childForFieldName('body'); + if (body) + this.prescan(body, body.type === 'statement_block' ? this.openScope(body) : this.root); + } + + /** The completed binding table — pass to `CfgBuilder.finish`. */ + table(): readonly BindingEntry[] { + return this.bindings; + } + + // ── phase 1: declaration pre-scan ──────────────────────────────────────── + + private openScope(node: SyntaxNode): Scope { + const existing = this.scopeByNode.get(node.id); + if (existing) return existing; + const scope: Scope = { parent: this.nearestScopeOf(node), table: new Map() }; + this.scopeByNode.set(node.id, scope); + return scope; + } + + private nearestScopeOf(node: SyntaxNode): Scope { + for (let p = node.parent; p; p = p.parent) { + const s = this.scopeByNode.get(p.id); + if (s) return s; + if (p.id === this.fnId) break; + } + return this.root; + } + + private declare( + nameNode: SyntaxNode, + kind: BindingEntry['kind'], + scope: Scope, + hoistToRoot: boolean, + ): void { + const target = hoistToRoot ? this.root : scope; + const name = nameNode.text; + // `var` multi-declaration (and a param + `var` of the same name) is ONE + // binding — first declaration in source order is canonical. The dedup is + // scoped to the single target table, so an inner `let x` shadowing a root + // `var x` still gets its own entry in its own scope. + if (target.table.has(name)) return; + target.table.set(name, this.bindings.length); + this.bindings.push({ + name, + declLine: nameNode.startPosition.row + 1, + declColumn: nameNode.startPosition.column, + kind, + }); + } + + private declareParams(fnNode: SyntaxNode): void { + const params = fnNode.childForFieldName('parameters') ?? 
fnNode.childForFieldName('parameter'); + if (!params) return; + if (params.type === 'identifier') { + this.declare(params, 'param', this.root, true); // `x => …` single-param arrow + return; + } + for (let i = 0; i < params.namedChildCount; i++) { + const p = params.namedChild(i); + if (!p) continue; + // TS wraps each param (required_parameter/optional_parameter, field + // `pattern`); plain JS puts the pattern directly in formal_parameters. + const pattern = p.childForFieldName('pattern') ?? p; + this.declarePattern(pattern, 'param', this.root, true); + } + } + + /** Declare every name bound by a (possibly destructuring) pattern. */ + private declarePattern( + node: SyntaxNode, + kind: BindingEntry['kind'], + scope: Scope, + hoistToRoot: boolean, + ): void { + switch (node.type) { + case 'identifier': + case 'shorthand_property_identifier_pattern': + this.declare(node, kind, scope, hoistToRoot); + return; + case 'rest_pattern': + case 'object_pattern': + case 'array_pattern': + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c) this.declarePattern(c, kind, scope, hoistToRoot); + } + return; + case 'pair_pattern': { + const value = node.childForFieldName('value'); + if (value) this.declarePattern(value, kind, scope, hoistToRoot); + return; + } + case 'assignment_pattern': + case 'object_assignment_pattern': { + const left = node.childForFieldName('left'); + if (left) this.declarePattern(left, kind, scope, hoistToRoot); + return; + } + default: + // Type annotations / unknown wrappers — descend defensively. 
+ for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c && !TYPE_CONTEXT_TYPES.has(c.type)) { + this.declarePattern(c, kind, scope, hoistToRoot); + } + } + } + } + + private prescan(node: SyntaxNode, scope: Scope): void { + this.nearestScopeCache.set(node.id, scope); + const t = node.type; + if (NESTED_FUNCTION_TYPES.has(t) && node.id !== this.fnId) { + // A nested function's NAME binds in the enclosing scope; its body is opaque. + if (FUNCTION_DECL_TYPES.has(t)) { + const name = node.childForFieldName('name'); + if (name) this.declare(name, 'function', scope, false); + } + return; + } + + let childScope = scope; + if (SCOPE_TYPES.has(t)) childScope = this.openScope(node); + + switch (t) { + case 'lexical_declaration': { + const kind = node.child(0)?.type === 'const' ? 'const' : 'let'; + this.declareDeclarators(node, kind, childScope, false); + break; + } + case 'variable_declaration': + this.declareDeclarators(node, 'var', childScope, true); + break; + case 'class_declaration': { + const name = node.childForFieldName('name'); + if (name) this.declare(name, 'class', childScope, false); + break; + } + case 'catch_clause': { + const param = node.childForFieldName('parameter'); + if (param) this.declarePattern(param, 'catch', childScope, false); + break; + } + case 'for_in_statement': + case 'for_of_statement': { + // `for (const x of xs)` — the `kind` keyword marks a declaration; a bare + // `for (x of xs)` left is an assignment, resolved at use time instead. + const kindNode = node.childForFieldName('kind'); + const left = node.childForFieldName('left'); + if (kindNode && left) { + const k = kindNode.type === 'var' ? 'var' : kindNode.type === 'const' ? 
'const' : 'let'; + this.declarePattern(left, k, childScope, k === 'var'); + } + break; + } + default: + break; + } + + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c) this.prescan(c, childScope); + } + } + + private declareDeclarators( + declNode: SyntaxNode, + kind: 'var' | 'let' | 'const', + scope: Scope, + hoistToRoot: boolean, + ): void { + for (let i = 0; i < declNode.namedChildCount; i++) { + const d = declNode.namedChild(i); + if (d?.type !== 'variable_declarator') continue; + const name = d.childForFieldName('name'); + if (name) this.declarePattern(name, kind, scope, hoistToRoot); + } + } + + // ── phase 2: per-statement fact extraction ─────────────────────────────── + + /** + * Def/use facts for one statement (or construct-header expression) node. + * Safe from any walk order — resolution consults the completed scope tree. + */ + facts(node: SyntaxNode): StatementFacts { + const acc = new FactAccumulator(node.startPosition.row + 1); + this.walkValue(node, acc); + return acc.finish(); + } + + /** + * Facts for an expression whose WHOLE evaluation is conditional (switch + * case tests, which only run when earlier cases didn't match) — every def + * inside becomes a may-def. + */ + factsConditional(node: SyntaxNode): StatementFacts { + const acc = new FactAccumulator(node.startPosition.row + 1); + this.conditional(() => this.walkValue(node, acc)); + return acc.finish(); + } + + /** Facts for a `for (left in/of right)` head: left binds/assigns, right is used. */ + forInHeadFacts(stmt: SyntaxNode): StatementFacts { + const acc = new FactAccumulator(stmt.startPosition.row + 1); + const left = stmt.childForFieldName('left'); + const right = stmt.childForFieldName('right'); + if (left) this.walkDefPattern(left, acc); + if (right) this.walkValue(right, acc); + return acc.finish(); + } + + /** ENTRY-block facts for the function's parameters (defs + default-value uses). 
*/ + paramFacts(): StatementFacts | undefined { + const fnNode = this.fnNode; + const params = fnNode.childForFieldName('parameters') ?? fnNode.childForFieldName('parameter'); + if (!params) return undefined; + const acc = new FactAccumulator(fnNode.startPosition.row + 1); + if (params.type === 'identifier') { + this.def(params, acc); + } else { + for (let i = 0; i < params.namedChildCount; i++) { + const p = params.namedChild(i); + if (!p) continue; + const pattern = p.childForFieldName('pattern') ?? p; + this.walkDefPattern(pattern, acc); + const dflt = p.childForFieldName('value'); + if (dflt) this.walkValue(dflt, acc); + } + } + return acc.defCount() || acc.useCount() ? acc.finish() : undefined; + } + + /** Def fact for a `catch (e)` parameter — prepend to the handler entry block. */ + catchParamFacts(catchClause: SyntaxNode): StatementFacts | undefined { + const param = catchClause.childForFieldName('parameter'); + if (!param) return undefined; + const acc = new FactAccumulator(catchClause.startPosition.row + 1); + this.walkDefPattern(param, acc); + return acc.defCount() ? acc.finish() : undefined; + } + + private resolve(nameNode: SyntaxNode): number { + const name = nameNode.text; + // Fast path: the prescan cached every visited node's innermost scope, so + // resolution walks the SCOPE chain (shallow), not the AST parent chain + // (arbitrarily deep in chained expressions). The parent-chain walk remains + // as fallback for the few nodes the prescan never visits (e.g. a nested + // function declaration's own name node). + const cached = this.nearestScopeCache.get(nameNode.id); + let startScope: Scope | null = cached ?? null; + if (!startScope) { + for (let p: SyntaxNode | null = nameNode; p; p = p.parent) { + const scope = this.scopeByNode.get(p.id) ?? 
this.nearestScopeCache.get(p.id); + if (scope) { + startScope = scope; + break; + } + if (p.id === this.fnId) { + startScope = this.root; + break; + } + } + } + for (let s: Scope | null = startScope; s; s = s.parent) { + const idx = s.table.get(name); + if (idx !== undefined) return idx; + } + // No in-function declaration — synthetic module-level binding, shared by + // defs and uses so `notDeclared = 1; use(notDeclared)` still forms a fact. + let idx = this.synthetic.get(name); + if (idx === undefined) { + idx = this.bindings.length; + this.synthetic.set(name, idx); + this.bindings.push({ name, declLine: 0, declColumn: 0, kind: 'module', synthetic: true }); + } + return idx; + } + + private def(nameNode: SyntaxNode, acc: FactAccumulator): void { + if (this.conditionalDepth > 0) acc.addMayDef(this.resolve(nameNode)); + else acc.addDef(this.resolve(nameNode)); + } + + /** Run `fn` with defs demoted to may-defs (conditionally-evaluated context). */ + private conditional(fn: () => void): void { + this.conditionalDepth++; + try { + fn(); + } finally { + this.conditionalDepth--; + } + } + + /** Strip wrappers that don't change the lvalue (`(x) += 1`, `x! ++`). */ + private unwrapLvalue(node: SyntaxNode): SyntaxNode { + let n = node; + while (n.type === 'parenthesized_expression' || n.type === 'non_null_expression') { + const inner = n.namedChild(0); + if (!inner) break; + n = inner; + } + return n; + } + + private use(nameNode: SyntaxNode, acc: FactAccumulator): void { + acc.addUse(this.resolve(nameNode)); + } + + /** Value-position walk: collect uses; route def positions to the pattern walk. */ + private walkValue(node: SyntaxNode, acc: FactAccumulator): void { + const t = node.type; + if (TYPE_CONTEXT_TYPES.has(t)) return; + if (NESTED_FUNCTION_TYPES.has(t) && node.id !== this.fnId) { + // Opaque nested function: its NAME (function declaration) is a def in + // the enclosing scope; captured reads/writes inside are invisible (KTD4). 
+ if (FUNCTION_DECL_TYPES.has(t)) { + const name = node.childForFieldName('name'); + if (name) this.def(name, acc); + } + return; + } + + switch (t) { + case 'identifier': + case 'shorthand_property_identifier': + this.use(node, acc); + return; + case 'lexical_declaration': + case 'variable_declaration': + for (let i = 0; i < node.namedChildCount; i++) { + const d = node.namedChild(i); + if (d?.type !== 'variable_declarator') continue; + const name = d.childForFieldName('name'); + const value = d.childForFieldName('value'); + // A bare `var x;` mid-function is hoisted and writes NOTHING at + // runtime — harvesting it as a def would fabricate a kill of the + // live def (`x = source(); var x; sink(x)` must keep source→sink; + // tri-review P2). `let`/`const` declarators genuinely initialize. + if (name && (value || t === 'lexical_declaration')) { + this.walkDefPattern(name, acc); + } + if (value) this.walkValue(value, acc); + } + return; + case 'assignment_expression': { + const left = node.childForFieldName('left'); + const right = node.childForFieldName('right'); + if (left) this.walkDefPattern(this.unwrapLvalue(left), acc); + if (right) this.walkValue(right, acc); + return; + } + case 'augmented_assignment_expression': { + // `x += y` both defines and uses x. The logical-assignment operators + // (`||=`, `&&=`, `??=`) only WRITE conditionally — their def is a + // may-def (the read always happens). + const left = node.childForFieldName('left') + ? this.unwrapLvalue(node.childForFieldName('left') as SyntaxNode) + : null; + const right = node.childForFieldName('right'); + const op = node.childForFieldName('operator')?.type ?? 
''; + const logical = op === '||=' || op === '&&=' || op === '??='; + if (left?.type === 'identifier') { + if (logical) this.conditional(() => this.def(left, acc)); + else this.def(left, acc); + this.use(left, acc); + } else if (left) { + this.walkValue(left, acc); // member/subscript target — uses only + } + // The RHS of a logical assignment is itself conditionally evaluated. + if (right) { + if (logical) this.conditional(() => this.walkValue(right, acc)); + else this.walkValue(right, acc); + } + return; + } + case 'update_expression': { + const rawArg = node.childForFieldName('argument'); + const arg = rawArg ? this.unwrapLvalue(rawArg) : null; + if (arg?.type === 'identifier') { + this.def(arg, acc); + this.use(arg, acc); + } else if (arg) { + this.walkValue(arg, acc); + } + return; + } + case 'binary_expression': { + // Short-circuit operators evaluate their RIGHT operand conditionally: + // a def inside it (`a && (x = clean())`, `c ?? (c = load())`) must be + // a may-def or the not-taken path's prior def is falsely killed + // (tri-review P1). Other binary operators evaluate both sides. + const left = node.childForFieldName('left'); + const right = node.childForFieldName('right'); + const op = node.childForFieldName('operator')?.type ?? ''; + if (left) this.walkValue(left, acc); + if (right) { + if (op === '&&' || op === '||' || op === '??') { + this.conditional(() => this.walkValue(right, acc)); + } else { + this.walkValue(right, acc); + } + } + return; + } + case 'ternary_expression': { + // Each arm is conditionally evaluated — defs inside are may-defs. 
+ const cond = node.childForFieldName('condition'); + const consequence = node.childForFieldName('consequence'); + const alternative = node.childForFieldName('alternative'); + if (cond) this.walkValue(cond, acc); + if (consequence) this.conditional(() => this.walkValue(consequence, acc)); + if (alternative) this.conditional(() => this.walkValue(alternative, acc)); + return; + } + case 'class_declaration': { + // The class NAME is a def (prescan declared the binding) — without + // this case the default walk would record it as a bogus USE in plain + // JS (the name is an `identifier` there; in TS it's a type_identifier + // and would be silently skipped, losing the def either way). The body + // walk picks up field-initializer uses; methods are opaque nested fns. + const name = node.childForFieldName('name'); + if (name) this.def(name, acc); + const body = node.childForFieldName('body'); + if (body) this.walkValue(body, acc); + return; + } + case 'class': { + // Class EXPRESSION: its name (if any) binds only inside the class — + // not a def in the enclosing function. Walk only the body. + const body = node.childForFieldName('body'); + if (body) this.walkValue(body, acc); + return; + } + default: + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c) this.walkValue(c, acc); + } + } + } + + /** Assignment-target walk: identifiers bind; member/subscript targets are uses. 
*/ + private walkDefPattern(node: SyntaxNode, acc: FactAccumulator): void { + switch (node.type) { + case 'identifier': + case 'shorthand_property_identifier_pattern': + this.def(node, acc); + return; + case 'rest_pattern': + case 'object_pattern': + case 'array_pattern': + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c) this.walkDefPattern(c, acc); + } + return; + case 'pair_pattern': { + const key = node.childForFieldName('key'); + const value = node.childForFieldName('value'); + if (key?.type === 'computed_property_name') this.walkValue(key, acc); + if (value) this.walkDefPattern(value, acc); + return; + } + case 'assignment_pattern': + case 'object_assignment_pattern': { + const left = node.childForFieldName('left'); + const right = node.childForFieldName('right'); + if (left) this.walkDefPattern(left, acc); + if (right) this.walkValue(right, acc); + return; + } + case 'member_expression': + case 'subscript_expression': + // Property/element write — NOT a scalar def (KTD4); its identifiers + // (object, computed key) are uses. + this.walkValue(node, acc); + return; + default: + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c && !TYPE_CONTEXT_TYPES.has(c.type)) this.walkDefPattern(c, acc); + } + } + } +} + +/** Ordered, deduplicating def/use collector for one statement record. */ +class FactAccumulator { + private readonly defs: number[] = []; + private readonly uses: number[] = []; + private readonly mayDefs: number[] = []; + private readonly defSeen = new Set(); + private readonly useSeen = new Set(); + private readonly mayDefSeen = new Set(); + + constructor(private readonly line: number) {} + + addDef(idx: number): void { + if (this.defSeen.has(idx)) return; + this.defSeen.add(idx); + this.defs.push(idx); + } + + /** A def that may not execute (conditional context) — gen without kill. 
*/ + addMayDef(idx: number): void { + if (this.mayDefSeen.has(idx)) return; + this.mayDefSeen.add(idx); + this.mayDefs.push(idx); + } + + addUse(idx: number): void { + if (this.useSeen.has(idx)) return; + this.useSeen.add(idx); + this.uses.push(idx); + } + + defCount(): number { + return this.defs.length + this.mayDefs.length; + } + + useCount(): number { + return this.uses.length; + } + + finish(): StatementFacts { + return { + line: this.line, + defs: this.defs, + uses: this.uses, + // Optional field stays absent when empty — keeps the serialized + // side-channel payload lean (most statements have no may-defs). + ...(this.mayDefs.length > 0 ? { mayDefs: this.mayDefs } : {}), + }; + } +} diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts index 79643b8d42..1382fdb668 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts @@ -18,23 +18,40 @@ * - `try/catch/finally` routes both normal completion AND a `throw` in the try * through `finally` (the finally block post-dominates the try/catch); a * `throw` with no catch propagates through finally to the enclosing handler. - * - labeled `break`/`continue` resolve against the labeled loop's frame. + * - EARLY EXITS THROUGH FINALLY (#2082 M2 U2, closes the M1 soundness gap): a + * `break`/`continue`/`return` whose jump CROSSES a `finally` is re-routed to + * the finally entry (keeping its bare jump kind), and the finally's exits + * gain a `finally-return`/`finally-break`/`finally-continue` completion edge + * to the resumed target. Threading is TARGET-RELATIVE via finalizer frames + * interleaved on the {@link ControlFlowContext} stack: only the finallys + * lexically between the jump and its target thread (a `break` whose loop is + * wholly inside the try keeps its direct edge — re-routing it would let a + * finally redefinition falsely kill in-loop defs for reaching-defs). 
Nested + * finallys chain inner→outer; finally-as-shared-join conflates exit paths + * (sound over-approximation; duplication-per-exit-path was rejected). An + * empty/comment-only finally pushes no frame — jumps keep direct edges. + * - labeled `break`/`continue` resolve against the labeled construct's frame: + * loops/switches carry their full label LIST (`outer: inner: for` resolves + * both), and a labeled NON-loop statement (`blk: { … break blk; … }`) gets + * a break-target frame whose target is a synthesized join after the body — + * the M1 route-to-EXIT fallback removed the real continuation and falsely + * killed defs for reaching-defs (tri-review P1). * - * Known M1 limitations: - * - SOUNDNESS GAP (M2 blocker, not mere precision): a non-local jump - * (`break`/`continue`/`return`) out of a `try` that has a `finally` edges - * directly to its target rather than routing THROUGH the `finally` block - * first. A future taint/PDG pass will therefore MISS flow mediated by a - * `finally` on the early-exit path (e.g. a value the `finally` taints or - * sanitizes before the `return` reaches its target) — a false negative. The - * general fix duplicates `finally` per exit path; deferred past M1 and - * tracked for M2. Normal completion and `throw` DO route through `finally`. - * - A `break`/`continue` to a label on a non-loop/non-switch block, and the - * OUTER label of a doubly-labeled construct (`outer: inner: for (...)`), are - * not modeled. The jump is conservatively routed to the function EXIT (a - * sound over-approximation that keeps the graph single-exit — see visitBreak) - * rather than left as a dangling sink; only the precise labeled target is - * unmodeled. Single-labeled loops/switches resolve correctly. 
+ * Known limitations: + * - A jump whose label STILL fails to resolve (malformed source) keeps the + * conservative route-to-EXIT + thread-all-finallys fallback in + * visitBreak/visitContinue — single-exit preserved, no finally bypassed, + * but the continuation path is approximate. + * - Exceptional flow stays the sound over-approximation: EVERY protected-region + * block edges to the handler (an exception may fire mid-block), which + * over-supplies reaching-defs facts into `catch` — extra facts, never false + * kills. Per-leader throw precision is deliberately deferred (M3 decides). + * - Def/use harvest scope (#2082 M2, see typescript-harvest.ts for the full + * v1 semantics table): member/property writes are not scalar defs; nested + * function bodies are opaque in BOTH directions (writes to and reads of + * captured outer variables are invisible — callback flows are M4 territory); + * `case x:` test uses attach to the switch dispatch block (sound + * over-approximation of in-order case evaluation). * * Block/edge accounting and reachability are pinned in * `test/unit/cfg/cfg-builder.test.ts` (core) and @@ -42,9 +59,14 @@ */ import type { SyntaxNode } from '../../utils/ast-helpers.js'; import { CfgBuilder } from '../cfg-builder.js'; -import { ControlFlowContext } from '../control-flow-context.js'; +import { + ControlFlowContext, + drainFinalizerPending, + wireJumpThroughFinalizers, +} from '../control-flow-context.js'; import type { TraversalResult } from '../traversal-result.js'; import type { CfgVisitor, FunctionCfg } from '../types.js'; +import { TsHarvester } from './typescript-harvest.js'; /** TS/JS node types that own a CFG-bearing function body. */ const TS_FUNCTION_TYPES = new Set([ @@ -100,10 +122,15 @@ class TsCfgWalk { private readonly cfc = new ControlFlowContext(); /** Stack of exception-handler entry blocks (catch/finally) a `throw` jumps to. 
*/ private readonly handlers: number[] = []; - /** Label awaiting the loop/switch it immediately precedes (labeled_statement). */ - private pendingLabel: string | undefined; + /** Labels awaiting the construct they precede (`outer: inner: for` = both). */ + private pendingLabels: string[] = []; - constructor(private readonly builder: CfgBuilder) {} + constructor( + private readonly builder: CfgBuilder, + /** Def/use fact extractor (#2082 M2 U1) — phase-2 only; its scope tree is + * already complete, so any walk order resolves names correctly. */ + private readonly harvest: TsHarvester, + ) {} /** Statements of a block node, ignoring comments. */ private statementsOf(block: SyntaxNode): SyntaxNode[] { @@ -141,13 +168,24 @@ class TsCfgWalk { } else { // Simple statement — coalesce into the current straight-line block. if (openSimple === undefined) { - const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); + const idx = this.builder.newBlock( + startLineOf(stmt), + endLineOf(stmt), + stmt.text, + 'normal', + this.harvest.facts(stmt), + ); if (entry === undefined) entry = idx; else this.builder.connect(dangling, idx, 'seq'); openSimple = idx; dangling = [idx]; } else { - this.builder.extendBlock(openSimple, endLineOf(stmt), stmt.text); + this.builder.extendBlock( + openSimple, + endLineOf(stmt), + stmt.text, + this.harvest.facts(stmt), + ); } } } @@ -192,59 +230,119 @@ class TsCfgWalk { } private visitSimple(stmt: SyntaxNode): TraversalResult { - const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); + const idx = this.builder.newBlock( + startLineOf(stmt), + endLineOf(stmt), + stmt.text, + 'normal', + this.harvest.facts(stmt), + ); return { entry: idx, exits: [idx] }; } private visitReturn(stmt: SyntaxNode): TraversalResult { - const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); - this.builder.edge(idx, this.builder.exitIndex, 'return'); + // Harvest the argument expression's uses — 
`return x` blocks live in this + // dedicated handler, not visitSeq, and were a silently-missed site once. + const idx = this.builder.newBlock( + startLineOf(stmt), + endLineOf(stmt), + stmt.text, + 'normal', + this.harvest.facts(stmt), + ); + // A return crosses EVERY active finally before reaching EXIT. + wireJumpThroughFinalizers( + this.builder, + idx, + this.cfc.finalizersForReturn(), + this.builder.exitIndex, + 'return', + ); return { entry: idx, exits: [] }; } private visitThrow(stmt: SyntaxNode): TraversalResult { - const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); + const idx = this.builder.newBlock( + startLineOf(stmt), + endLineOf(stmt), + stmt.text, + 'normal', + this.harvest.facts(stmt), + ); this.builder.edge(idx, this.currentHandler(), 'throw'); return { entry: idx, exits: [] }; } private visitBreak(stmt: SyntaxNode): TraversalResult { const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); - const target = this.cfc.breakTarget(this.labelOf(stmt)); - // An unresolved target — a label this M1 visitor doesn't model (a stacked + const res = this.cfc.resolveBreak(this.labelOf(stmt)); + // An unresolved target — a label this visitor doesn't model (a stacked // outer label like `outer: inner: for`, or a labeled non-loop block) — // would otherwise leave this block with NO out-edge, stranding it and // breaking the single-exit invariant a downstream post-dominator / PDG pass // relies on. Conservatively route an unresolved jump to the function EXIT - // ("escapes the function"): sound over-approximation, keeps single-exit. - this.builder.edge(idx, target ?? this.builder.exitIndex, 'break'); + // ("escapes the function") and thread ALL active finallys — a superset of + // the truly-crossed set (the real target is somewhere in the function, so + // execution provably runs every finally between the jump and wherever it + // lands... up to the ones the conservative EXIT routing over-includes). 
+ // Sound for dataflow either way: extra paths, never a bypassed finally. + const { target, finalizers } = res ?? { + target: this.builder.exitIndex, + finalizers: this.cfc.finalizersForReturn(), + }; + wireJumpThroughFinalizers(this.builder, idx, finalizers, target, 'break'); return { entry: idx, exits: [] }; } private visitContinue(stmt: SyntaxNode): TraversalResult { const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); - const target = this.cfc.continueTarget(this.labelOf(stmt)); - // See visitBreak: an unresolved label routes to EXIT to preserve single-exit. - this.builder.edge(idx, target ?? this.builder.exitIndex, 'continue'); + const res = this.cfc.resolveContinue(this.labelOf(stmt)); + // See visitBreak: an unresolved label routes to EXIT (threading all + // active finallys) to preserve single-exit without bypassing a finally. + const { target, finalizers } = res ?? { + target: this.builder.exitIndex, + finalizers: this.cfc.finalizersForReturn(), + }; + wireJumpThroughFinalizers(this.builder, idx, finalizers, target, 'continue'); return { entry: idx, exits: [] }; } private visitLabeled(stmt: SyntaxNode): SeqResult { const body = stmt.childForFieldName('body') ?? stmt.namedChildren[stmt.namedChildren.length - 1]; - if (body && LOOP_OR_SWITCH_TYPES.has(body.type)) { - this.pendingLabel = this.labelOf(stmt); + const label = this.labelOf(stmt); + if (body && (LOOP_OR_SWITCH_TYPES.has(body.type) || body.type === 'labeled_statement')) { + // Loop/switch consumes the accumulated labels via takeLabels(); a nested + // labeled_statement keeps accumulating (`outer: inner: for` → both + // labels land on the loop frame). 
+ if (label) this.pendingLabels.push(label); const res = this.visitStmt(body); - this.pendingLabel = undefined; // clear even if the construct didn't consume it + this.pendingLabels = []; // clear leftovers if the construct didn't consume return res; } - // Labeled non-loop blocks (break-to-block-label) are not modeled in M1. - return this.visitBody(body); + // Labeled NON-loop statement (`blk: { … break blk; … }`): break-to-label + // targets a synthesized join after the body. Routing it to EXIT instead + // (the M1 behavior) removed the real continuation and falsely killed + // every def live at the jump for post-construct uses (tri-review P1). + const labels = [...this.pendingLabels, ...(label ? [label] : [])]; + this.pendingLabels = []; + const join = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); + this.cfc.pushLabeledBlock(join, labels); + const res = this.visitBody(body); + this.cfc.pop(); + if (res) this.builder.connect(res.exits, join, 'seq'); + return { entry: res?.entry ?? join, exits: [join] }; } private visitIf(stmt: SyntaxNode): TraversalResult { const cond = stmt.childForFieldName('condition') ?? stmt; - const condBlock = this.builder.newBlock(startLineOf(stmt), endLineOf(cond), cond.text); + const condBlock = this.builder.newBlock( + startLineOf(stmt), + endLineOf(cond), + cond.text, + 'normal', + this.harvest.facts(cond), + ); const exits: number[] = []; @@ -283,12 +381,18 @@ class TsCfgWalk { } private visitWhile(stmt: SyntaxNode): TraversalResult { - const label = this.takeLabel(); + const labels = this.takeLabels(); const cond = stmt.childForFieldName('condition') ?? 
stmt; - const header = this.builder.newBlock(startLineOf(stmt), endLineOf(cond), cond.text); + const header = this.builder.newBlock( + startLineOf(stmt), + endLineOf(cond), + cond.text, + 'normal', + this.harvest.facts(cond), + ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); - this.cfc.pushLoop(header, loopExit, label); + this.cfc.pushLoop(header, loopExit, labels); const body = this.visitBody(this.bodyBlockOf(stmt)); this.cfc.pop(); @@ -303,12 +407,18 @@ class TsCfgWalk { } private visitDoWhile(stmt: SyntaxNode): TraversalResult { - const label = this.takeLabel(); + const labels = this.takeLabels(); const cond = stmt.childForFieldName('condition') ?? stmt; - const condBlock = this.builder.newBlock(startLineOf(cond), endLineOf(cond), cond.text); + const condBlock = this.builder.newBlock( + startLineOf(cond), + endLineOf(cond), + cond.text, + 'normal', + this.harvest.facts(cond), + ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); - this.cfc.pushLoop(condBlock, loopExit, label); + this.cfc.pushLoop(condBlock, loopExit, labels); const body = this.visitBody(this.bodyBlockOf(stmt)); this.cfc.pop(); @@ -320,7 +430,7 @@ class TsCfgWalk { } private visitFor(stmt: SyntaxNode): TraversalResult { - const label = this.takeLabel(); + const labels = this.takeLabels(); const init = stmt.childForFieldName('initializer'); const cond = stmt.childForFieldName('condition'); const incr = stmt.childForFieldName('increment'); @@ -329,16 +439,24 @@ class TsCfgWalk { startLineOf(stmt), cond ? endLineOf(cond) : startLineOf(stmt), cond ? cond.text : 'for(;;)', + 'normal', + cond ? 
this.harvest.facts(cond) : undefined, ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); let incrBlock = header; if (incr) { - incrBlock = this.builder.newBlock(startLineOf(incr), endLineOf(incr), incr.text); + incrBlock = this.builder.newBlock( + startLineOf(incr), + endLineOf(incr), + incr.text, + 'normal', + this.harvest.facts(incr), + ); this.builder.edge(incrBlock, header, 'loop-back'); } - this.cfc.pushLoop(incrBlock, loopExit, label); + this.cfc.pushLoop(incrBlock, loopExit, labels); const body = this.visitBody(this.bodyBlockOf(stmt)); this.cfc.pop(); @@ -359,7 +477,13 @@ class TsCfgWalk { let entry = header; if (init) { - const initBlock = this.builder.newBlock(startLineOf(init), endLineOf(init), init.text); + const initBlock = this.builder.newBlock( + startLineOf(init), + endLineOf(init), + init.text, + 'normal', + this.harvest.facts(init), + ); this.builder.edge(initBlock, header, 'seq'); entry = initBlock; } @@ -367,15 +491,19 @@ class TsCfgWalk { } private visitForIn(stmt: SyntaxNode): TraversalResult { - const label = this.takeLabel(); + const labels = this.takeLabels(); + // Header text is SYNTHESIZED, so facts come from the left/right AST nodes + // directly (the loop variable is a def, the iterated expression a use). const header = this.builder.newBlock( startLineOf(stmt), startLineOf(stmt), this.forInHeaderText(stmt), + 'normal', + this.harvest.forInHeadFacts(stmt), ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); - this.cfc.pushLoop(header, loopExit, label); + this.cfc.pushLoop(header, loopExit, labels); const body = this.visitBody(this.bodyBlockOf(stmt)); this.cfc.pop(); @@ -396,17 +524,35 @@ class TsCfgWalk { } private visitSwitch(stmt: SyntaxNode): TraversalResult { - const label = this.takeLabel(); + const labels = this.takeLabels(); const value = stmt.childForFieldName('value') ?? 
stmt; - const dispatch = this.builder.newBlock(startLineOf(stmt), endLineOf(value), value.text); + const dispatch = this.builder.newBlock( + startLineOf(stmt), + endLineOf(value), + value.text, + 'normal', + this.harvest.facts(value), + ); const switchExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); - this.cfc.pushSwitch(switchExit, label); + this.cfc.pushSwitch(switchExit, labels); const body = stmt.childForFieldName('body'); const cases = body ? body.namedChildren.filter((c) => c.type === 'switch_case' || c.type === 'switch_default') : []; + // `case x:` test expressions live in no block (caseStatements filters the + // value node out) — harvest their uses onto the dispatch block, one record + // per case in source order (a sound over-approximation of JS's in-order + // case evaluation). Conditionally: a later case test only evaluates when + // earlier cases didn't match, so any def inside one is a may-def — as a + // must-def on the always-executed dispatch block it would falsely kill + // prior defs for earlier-matching arms (tri-review). + for (const c of cases) { + const caseValue = c.childForFieldName('value'); + if (caseValue) this.builder.attachFacts(dispatch, this.harvest.factsConditional(caseValue)); + } + const caseResults = cases.map((c) => this.visitSeq(this.caseStatements(c))); const hasDefault = cases.some((c) => c.type === 'switch_default'); @@ -453,11 +599,20 @@ class TsCfgWalk { } // Build finally first so its entry is known as both a normal join and a - // handler target. The finally body runs in the OUTER handler context. + // handler target. The finally body runs in the OUTER handler context — and + // OUTSIDE this try's finalizer frame: a return inside the finally must not + // thread itself (it threads only outer finallys, matching JS semantics). const finallyRes = finallyClause ? 
this.visitSeq(this.statementsOf(this.bodyBlockOf(finallyClause) as SyntaxNode)) : null; + // Finalizer frame for early-exit threading (#2082 M2 U2): active while the + // catch and protected bodies are walked, so a crossing `return`/`break`/ + // `continue` inside either routes through the finally. An empty/comment-only + // finally (`finallyRes` null — the #2099-F2 empty-catch bug shape) pushes + // NO frame: it can define nothing, so jumps soundly keep direct edges. + const finFrame = finallyRes ? this.cfc.pushFinalizer(finallyRes.entry) : null; + // A throw inside catch propagates to finally (if any), else the outer handler. let catchRes: SeqResult = null; if (catchClause) { @@ -477,6 +632,27 @@ class TsCfgWalk { const idx = this.builder.newBlock(startLineOf(catchClause), endLineOf(catchClause), ''); catchRes = { entry: idx, exits: [idx] }; } + // `catch (e)` has no header block — the param def gets its OWN + // facts-only block in front of the body entry. It must NOT be prepended + // into the body's entry block: when the catch body STARTS with a loop, + // that entry is the loop HEADER, re-entered on every iteration — the + // param def would re-gen there and falsely KILL loop-carried + // redefinitions of the param (`catch (e) { while (c) { e = fix(e); } + // sink(e); }` would lose the fix→sink fact, a taint false negative). + // The param block becomes the handler entry, which is also semantically + // right: the binding happens exactly once, on handler entry. + const paramFacts = this.harvest.catchParamFacts(catchClause); + if (paramFacts) { + const paramBlock = this.builder.newBlock( + startLineOf(catchClause), + startLineOf(catchClause), + '', + 'normal', + paramFacts, + ); + this.builder.edge(paramBlock, catchRes.entry, 'seq'); + catchRes = { entry: paramBlock, exits: catchRes.exits }; + } } // Handler for the try body: catch if present, else finally, else outer. 
@@ -500,6 +676,15 @@ class TsCfgWalk { } } + // The finalizer frame closes once the protected/catch walks are done; any + // jumps that crossed it left their completion legs on `pending`, wired + // here from the finally's exits (see drainFinalizerPending for the + // finally-override semantics of an always-jumping finally). + if (finFrame && finallyRes) { + this.cfc.pop(); + drainFinalizerPending(this.builder, finFrame, finallyRes.exits); + } + const exits: number[] = []; if (finallyRes) { // Normal completion of try AND catch both flow through finally. @@ -523,11 +708,11 @@ class TsCfgWalk { return this.handlers.length ? this.handlers[this.handlers.length - 1] : this.builder.exitIndex; } - /** Consume the label awaiting the loop/switch this call is building. */ - private takeLabel(): string | undefined { - const label = this.pendingLabel; - this.pendingLabel = undefined; - return label; + /** Consume the labels awaiting the loop/switch this call is building. */ + private takeLabels(): string[] { + const labels = this.pendingLabels; + this.pendingLabels = []; + return labels; } private labelOf(stmt: SyntaxNode): string | undefined { @@ -549,23 +734,39 @@ function buildFunctionCfg(fnNode: SyntaxNode, filePath: string): FunctionCfg | u const body = fnNode.childForFieldName('body'); if (!body) return undefined; // overload signature / abstract method — no body + // Phase-1 declaration pre-scan (#2082 M2 U1) — must complete before any + // facts are extracted; the CFG walk below is not source-order. + const harvest = new TsHarvester(fnNode); + + // Parameters define at ENTRY (facts only — never touch the entry block's + // text or span: bench fingerprints and CFG snapshots include block text). + const paramFacts = harvest.paramFacts(); + if (paramFacts) builder.attachFacts(builder.entryIndex, paramFacts); + if (body.type !== 'statement_block') { // Expression-bodied arrow: `() => expr` — one block whose value is returned. 
- const blk = builder.newBlock(startLineOf(body), endLineOf(body), body.text); + // Lives outside the walk class, so it harvests explicitly. + const blk = builder.newBlock( + startLineOf(body), + endLineOf(body), + body.text, + 'normal', + harvest.facts(body), + ); builder.edge(builder.entryIndex, blk, 'seq'); builder.edge(blk, builder.exitIndex, 'return'); - return builder.finish(); + return builder.finish(harvest.table()); } - const walk = new TsCfgWalk(builder); + const walk = new TsCfgWalk(builder, harvest); const res = walk.visitSeq(body.namedChildren.filter((c) => c.type !== 'comment')); if (!res) { builder.edge(builder.entryIndex, builder.exitIndex, 'seq'); // empty body - return builder.finish(); + return builder.finish(harvest.table()); } builder.edge(builder.entryIndex, res.entry, 'seq'); builder.connect(res.exits, builder.exitIndex, 'seq'); // normal fall-off → EXIT - return builder.finish(); + return builder.finish(harvest.table()); } /** Whether a node is a TS/JS function this visitor builds a CFG for. */ diff --git a/gitnexus/src/core/ingestion/pipeline.ts b/gitnexus/src/core/ingestion/pipeline.ts index 32216645e8..f1f8326321 100644 --- a/gitnexus/src/core/ingestion/pipeline.ts +++ b/gitnexus/src/core/ingestion/pipeline.ts @@ -73,6 +73,15 @@ export interface PipelineOptions { * silent truncation). No CLI flag in M1 — programmatic / server path only. */ pdgMaxEdgesPerFunction?: number; + /** + * Per-function REACHING_DEF edge cap for the scope-resolution emit step + * (#2082 M2). `undefined` ⇒ `DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION` + * (4000); `0` ⇒ no cap (unlimited). Emit-time-only — NOT folded into the + * parse-cache chunk key (the worker never sees it); recorded in + * `RepoMeta.pdg` so a cap change forces a full writeback. No CLI flag — + * programmatic / server path only, like the M1 caps. + */ + pdgMaxReachingDefEdgesPerFunction?: number; /** * Request parsing with the worker pool disabled. 
The sequential parser was * removed — the worker pool is the sole parse path — so setting this now diff --git a/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts b/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts index ec72c76ee7..7d0fd811e0 100644 --- a/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts +++ b/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts @@ -353,6 +353,7 @@ export const scopeResolutionPhase: PipelinePhase = { // CFG/PDG emission (#2081 M1) — opt-in; off ⇒ byte-identical graph. pdg: ctx.options?.pdg === true, pdgMaxEdgesPerFunction: ctx.options?.pdgMaxEdgesPerFunction, + pdgMaxReachingDefEdgesPerFunction: ctx.options?.pdgMaxReachingDefEdgesPerFunction, recordResolutionOutcome: (outcome) => { resolutionOutcomes.push(outcome); }, diff --git a/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts b/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts index d937b05b98..e8d5fe6b57 100644 --- a/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts +++ b/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts @@ -34,7 +34,13 @@ import { extractParsedFile } from '../../scope-extractor-bridge.js'; import { finalizeScopeModel } from '../../finalize-orchestrator.js'; import { resolveReferenceSites, type ResolveStats } from '../../resolve-references.js'; import { buildGraphNodeLookup } from '../graph-bridge/node-lookup.js'; -import { emitFileCfgs, isEmitSafeCfg, DEFAULT_MAX_CFG_EDGES_PER_FUNCTION } from '../../cfg/emit.js'; +import { + emitFileCfgs, + emitFileReachingDefs, + isEmitSafeCfg, + DEFAULT_MAX_CFG_EDGES_PER_FUNCTION, + DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, +} from '../../cfg/emit.js'; import type { FunctionCfg } from '../../cfg/types.js'; import { resolveDefGraphId } from '../graph-bridge/ids.js'; import { buildPopulatedMethodDispatch } from '../graph-bridge/method-dispatch.js'; @@ -264,6 +270,9 @@ interface RunScopeResolutionInput { /** Per-function CFG 
edge cap. `undefined` ⇒ {@link DEFAULT_MAX_CFG_EDGES_PER_FUNCTION}; * `0` ⇒ no cap (unlimited). */ readonly pdgMaxEdgesPerFunction?: number; + /** Per-function REACHING_DEF edge cap (#2082 M2). `undefined` ⇒ + * {@link DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION}; `0` ⇒ no cap. */ + readonly pdgMaxReachingDefEdgesPerFunction?: number; /** * Optional graph-node lookup built ONCE by the caller and shared across * every language pass. `buildGraphNodeLookup` scans the whole graph and is @@ -698,10 +707,20 @@ export function runScopeResolution( // disk store is cleared right after this orchestrator returns, see phase.ts). // A post-`mro` phase would read empty data (KTD1). Off by default ⇒ zero // BasicBlock/CFG nodes/edges and a byte-identical graph. + // Accumulated M2 reaching-defs time (solve + dedup + REACHING_DEF emit), + // reported as the PROF `pdg=` segment. It is a SUBSET of `emit=` — the M1 + // CFG emit and the M2 solve interleave per file, so a separate checkpoint + // pair can't bracket them; without this accumulator the M2 cost would + // silently disappear into `emit=` and field regressions would be invisible. + let pdgMs = 0; if (input.pdg === true) { let cfgBlocks = 0; let cfgEdges = 0; let cfgDroppedEdges = 0; + let rdEdges = 0; + let rdDropped = 0; + let rdFacts = 0; + let rdTruncated = 0; for (const pf of emitParsedFiles) { const cfgs = pf.cfgSideChannel; // Defensive: cfgSideChannel is opaque (`unknown`) and crosses the cache / @@ -739,6 +758,25 @@ export function runScopeResolution( cfgBlocks += emitted.blocks; cfgEdges += emitted.edges; cfgDroppedEdges += emitted.droppedEdges; + + // M2 (#2082 U4): reaching definitions over the same validated CFGs. + // In-memory facts are computed per function and dropped after the + // bounded (defBlock, useBlock, binding) projection is persisted — + // M3 recomputes via the same pure solver in-phase (KTD8). Timing is + // PROF-gated like every other checkpoint here (zero cost when off). + const t0 = PROF ? 
performance.now() : 0; + const rd = emitFileReachingDefs( + graph, + wellFormed, + input.pdgMaxReachingDefEdgesPerFunction ?? + DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, + (message) => logger.warn(message), // unconditional — R7, both layers + ); + if (PROF) pdgMs += performance.now() - t0; + rdEdges += rd.edges; + rdDropped += rd.droppedEdges; + rdFacts += rd.facts; + rdTruncated += rd.truncatedFunctions; } catch (err) { // Last-resort isolation, mirroring the worker-side per-file try/catch: // a shape the predicate misses must cost this one file's CFG, not @@ -757,7 +795,10 @@ export function runScopeResolution( logger.debug( `[scope-resolution] CFG emit (lang=${provider.language}): ` + `${cfgBlocks} BasicBlock nodes, ${cfgEdges} CFG edges` + - (cfgDroppedEdges > 0 ? `, ${cfgDroppedEdges} edges dropped (per-function cap)` : ''), + (cfgDroppedEdges > 0 ? `, ${cfgDroppedEdges} edges dropped (per-function cap)` : '') + + `; ${rdEdges} REACHING_DEF edges (${rdFacts} facts)` + + (rdDropped > 0 ? `, ${rdDropped} REACHING_DEF edges dropped (per-function cap)` : '') + + (rdTruncated > 0 ? `, ${rdTruncated} function(s) hit the fact limit` : ''), ); } } @@ -771,6 +812,8 @@ export function runScopeResolution( ` propagate=${ns(tFinalize, tPropagate).toFixed(0)}ms` + ` resolve=${ns(tPropagate, tResolve).toFixed(0)}ms` + ` emit=${ns(tResolve, tEnd).toFixed(0)}ms` + + // pdg ⊆ emit: the M2 reaching-defs share of the emit bucket (#2082 U4). + (input.pdg === true ? 
` pdg=${pdgMs.toFixed(0)}ms` : '') + ` total=${ns(tStart, tEnd).toFixed(0)}ms` + ` (${parsedFiles.length} files)`, ); diff --git a/gitnexus/src/core/run-analyze.ts b/gitnexus/src/core/run-analyze.ts index f8fe4163e3..66300b8b31 100644 --- a/gitnexus/src/core/run-analyze.ts +++ b/gitnexus/src/core/run-analyze.ts @@ -45,7 +45,10 @@ import { type RepoMeta, } from '../storage/repo-manager.js'; import { DEFAULT_PDG_MAX_FUNCTION_LINES } from './ingestion/cfg/collect.js'; -import { DEFAULT_MAX_CFG_EDGES_PER_FUNCTION } from './ingestion/cfg/emit.js'; +import { + DEFAULT_MAX_CFG_EDGES_PER_FUNCTION, + DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, +} from './ingestion/cfg/emit.js'; import { computeFileHashes, diffFileHashes } from '../storage/file-hash.js'; import { extractChangedSubgraph, @@ -135,6 +138,9 @@ export interface AnalyzeOptions { pdgMaxFunctionLines?: number; /** Per-function CFG edge cap. Forwarded to `PipelineOptions.pdgMaxEdgesPerFunction`. */ pdgMaxEdgesPerFunction?: number; + /** Per-function REACHING_DEF edge cap (#2082 M2). Forwarded to + * `PipelineOptions.pdgMaxReachingDefEdgesPerFunction`. */ + pdgMaxReachingDefEdgesPerFunction?: number; /** * Default branch threaded into generated AGENTS.md / CLAUDE.md so the * regression-compare example uses the configured branch instead of a @@ -335,13 +341,19 @@ export const collectBranchCacheKeys = async ( * defaults so an explicit-default run compares equal to a default run * (`0` = unlimited is preserved as `0`). Pure + exported for testing. */ -type PdgOptions = Pick; +type PdgOptions = Pick< + AnalyzeOptions, + 'pdg' | 'pdgMaxFunctionLines' | 'pdgMaxEdgesPerFunction' | 'pdgMaxReachingDefEdgesPerFunction' +>; export const resolvePdgConfig = (options: PdgOptions): RepoMeta['pdg'] => options.pdg === true ? { maxFunctionLines: options.pdgMaxFunctionLines ?? DEFAULT_PDG_MAX_FUNCTION_LINES, maxEdgesPerFunction: options.pdgMaxEdgesPerFunction ?? 
DEFAULT_MAX_CFG_EDGES_PER_FUNCTION, + maxReachingDefEdgesPerFunction: + options.pdgMaxReachingDefEdgesPerFunction ?? + DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, } : undefined; @@ -358,10 +370,20 @@ export const pdgModeMismatch = (recorded: RepoMeta['pdg'], options: PdgOptions): const requested = resolvePdgConfig(options); if (!requested && !recorded) return false; if (!requested || !recorded) return true; - return ( - requested.maxFunctionLines !== recorded.maxFunctionLines || - requested.maxEdgesPerFunction !== recorded.maxEdgesPerFunction - ); + // Structural comparison over the KEY UNION of both resolved records — not a + // hand-maintained field list. Both sides come fully resolved from + // resolvePdgConfig, so any new emit-affecting knob added there joins the + // comparison automatically (M1's hand-extended comparator was the trap this + // closes: a knob it missed would silently strand a stale projection). It is + // also what makes the M1→M2 upgrade work with zero extra code: an M1-era + // stamp lacks maxReachingDefEdgesPerFunction, so `4000 !== undefined` trips + // a full writeback that populates REACHING_DEF rows without `--force`. 
+ const reqRecord = requested as Record; + const recRecord = recorded as Record; + for (const key of new Set([...Object.keys(reqRecord), ...Object.keys(recRecord)])) { + if (reqRecord[key] !== recRecord[key]) return true; + } + return false; }; export async function runFullAnalysis( @@ -730,6 +752,7 @@ export async function runFullAnalysis( pdg: options.pdg === true, pdgMaxFunctionLines: options.pdgMaxFunctionLines, pdgMaxEdgesPerFunction: options.pdgMaxEdgesPerFunction, + pdgMaxReachingDefEdgesPerFunction: options.pdgMaxReachingDefEdgesPerFunction, fetchWrappers: options.fetchWrappers, }, ); diff --git a/gitnexus/src/mcp/local/local-backend.ts b/gitnexus/src/mcp/local/local-backend.ts index 2abfdfbc9f..f2d17b3bd0 100644 --- a/gitnexus/src/mcp/local/local-backend.ts +++ b/gitnexus/src/mcp/local/local-backend.ts @@ -1677,9 +1677,14 @@ export class LocalBackend { ) : await executeParameterized( repo.lbugPath, + // Same BasicBlock exclusion as detect_changes (#2082 U7): on a + // --pdg index a function-heavy file has far more BasicBlock rows + // than symbols, so an unfiltered LIMIT 3 would surface nameless + // substrate rows and displace the real symbols. ` MATCH (n) WHERE n.filePath = $filePath + AND NOT n.id STARTS WITH 'BasicBlock:' RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath, n.startLine AS startLine, n.endLine AS endLine LIMIT 3 `, @@ -2922,8 +2927,20 @@ export class LocalBackend { queryParams[`hunkEnd${i}`] = hunk.endLine; }); + // Exclude BasicBlock rows by id prefix: on a --pdg index every edited + // function otherwise contributes N nameless BasicBlock pseudo-"symbols" + // (they carry filePath/start/end but no name), inflating changed_count + // and risk level with rows no consumer can act on (#2082 U7). Blocks + // are implementation substrate, not symbols — the owning Function row + // already represents the change. 
The id prefix (`BasicBlock::…`, + // cfg/emit.ts basicBlockId) beats a label predicate (`labels(n)[0]` is + // known to come back empty for several node types — see + // enrichCandidateLabels) AND beats `n.name IS NOT NULL` (which would + // also drop legitimate symbols whose name loaded as NULL, e.g. + // quoted-empty CSV fields for anonymous constructs). const symbolQuery = ` MATCH (n) WHERE n.filePath ENDS WITH $filePath + AND NOT n.id STARTS WITH 'BasicBlock:' AND n.startLine IS NOT NULL AND n.endLine IS NOT NULL AND (${overlapConditions}) RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, diff --git a/gitnexus/src/storage/parse-cache.ts b/gitnexus/src/storage/parse-cache.ts index 744748e4b7..837144c5cd 100644 --- a/gitnexus/src/storage/parse-cache.ts +++ b/gitnexus/src/storage/parse-cache.ts @@ -55,7 +55,7 @@ import type { ParseWorkerResult } from '../core/ingestion/workers/parse-worker.j // the main thread (the #1983 OOM). Because the two stores share this version, // any future change to the `ParsedFile` serialization shape MUST bump // SCHEMA_BUMP so both invalidate in lockstep. -const SCHEMA_BUMP = 5; // #2081 M1: ParsedFile gained `cfgSideChannel` +const SCHEMA_BUMP = 6; // #2082 M2: cfgSideChannel gained bindings + per-block statement facts const GITNEXUS_PKG_VERSION = (() => { try { // package.json sits at gitnexus/package.json — two levels up from diff --git a/gitnexus/src/storage/repo-manager.ts b/gitnexus/src/storage/repo-manager.ts index 80e815ebfa..5e9fd972f8 100644 --- a/gitnexus/src/storage/repo-manager.ts +++ b/gitnexus/src/storage/repo-manager.ts @@ -150,6 +150,14 @@ export interface RepoMeta { maxFunctionLines: number; /** Emit-side per-function CFG edge cap, resolved (0 = unlimited). */ maxEdgesPerFunction: number; + /** + * Emit-side per-function REACHING_DEF edge cap, resolved (0 = unlimited; + * #2082 M2). 
ABSENT on an M1-era stamp — which is exactly what makes + * `pdgModeMismatch` trip on the first M2 run over an M1 index and force + * the full writeback that populates REACHING_DEF rows. Optional in the + * type for that reason; resolved (always present) on every M2+ write. + */ + maxReachingDefEdgesPerFunction?: number; }; } diff --git a/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap b/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap index 092da4de90..04e592d1ef 100644 --- a/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap +++ b/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap @@ -1,6 +1,6 @@ // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html -exports[`U7 — AC1: 10-function fixture CFG snapshot > matches the committed CFG node/edge set 1`] = ` +exports[`U7 — AC1: ten-functions fixture CFG snapshot > matches the committed CFG node/edge set 1`] = ` [ { "blocks": 3, @@ -107,14 +107,15 @@ exports[`U7 — AC1: 10-function fixture CFG snapshot > matches the committed CF "startLine": 55, }, { - "blocks": 6, + "blocks": 7, "edges": [ - "0->4:seq", - "2->5:seq", + "0->5:seq", + "2->6:seq", "3->2:seq", - "4->2:seq", - "4->3:throw", - "5->1:seq", + "4->3:seq", + "5->2:seq", + "5->4:throw", + "6->1:seq", ], "entry": 0, "exit": 1, @@ -150,5 +151,38 @@ exports[`U7 — AC1: 10-function fixture CFG snapshot > matches the committed CF "exit": 1, "startLine": 87, }, + { + "blocks": 8, + "edges": [ + "0->2:seq", + "2->4:seq", + "3->1:finally-return", + "3->1:throw", + "3->7:seq", + "4->3:throw", + "4->5:cond-true", + "4->6:seq", + "5->3:return", + "5->3:throw", + "6->3:seq", + "6->3:throw", + "7->1:return", + ], + "entry": 0, + "exit": 1, + "startLine": 102, + }, + { + "blocks": 5, + "edges": [ + "0->2:seq", + "2->3:seq", + "3->4:seq", + "4->1:seq", + ], + "entry": 0, + "exit": 1, + "startLine": 115, + }, ] `; diff --git 
a/gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap b/gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap new file mode 100644 index 0000000000..7ad8b7afb3 --- /dev/null +++ b/gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap @@ -0,0 +1,124 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`R5 — REACHING_DEF facts snapshot on the M1 fixture > matches the committed fact set for every fixture function 1`] = ` +[ + { + "defs": 0, + "facts": [], + "startLine": 9, + "status": "computed", + "uses": 2, + }, + { + "defs": 1, + "facts": [ + "0:0->2:0:x:14:23", + ], + "startLine": 14, + "status": "computed", + "uses": 4, + }, + { + "defs": 1, + "facts": [ + "0:0->2:0:x:23:27", + "0:0->4:0:x:23:27", + ], + "startLine": 23, + "status": "computed", + "uses": 6, + }, + { + "defs": 1, + "facts": [ + "0:0->2:0:x:34:26", + ], + "startLine": 34, + "status": "computed", + "uses": 3, + }, + { + "defs": 3, + "facts": [ + "0:0->2:0:n:41:24", + "4:0->2:0:i:42:11", + "4:0->4:0:i:42:11", + "6:0->2:0:i:42:11", + "6:0->4:0:i:42:11", + ], + "startLine": 41, + "status": "computed", + "uses": 5, + }, + { + "defs": 2, + "facts": [ + "0:0->2:0:xs:48:26", + "2:0->4:0:x:49:13", + ], + "startLine": 48, + "status": "computed", + "uses": 4, + }, + { + "defs": 1, + "facts": [ + "0:0->2:0:x:55:27", + ], + "startLine": 55, + "status": "computed", + "uses": 5, + }, + { + "defs": 1, + "facts": [], + "startLine": 69, + "status": "computed", + "uses": 4, + }, + { + "defs": 1, + "facts": [ + "0:0->2:0:x:80:27", + ], + "startLine": 80, + "status": "computed", + "uses": 1, + }, + { + "defs": 2, + "facts": [ + "0:0->2:0:xs:87:27", + "2:0->4:0:x:88:13", + ], + "startLine": 87, + "status": "computed", + "uses": 5, + }, + { + "defs": 3, + "facts": [ + "0:0->4:0:flag:102:37", + "2:0->5:0:val:103:6", + "3:0->7:0:val:103:6", + ], + "startLine": 102, + "status": "computed", + "uses": 5, + }, + { + "defs": 
4, + "facts": [ + "2:0->4:0:s:116:6", + "3:0->3:1:s:118:8", + "3:1->3:1:s:118:8", + "3:1->3:2:s:118:8", + "4:0->4:0:s:116:6", + "4:0->4:1:s:116:6", + ], + "startLine": 115, + "status": "computed", + "uses": 6, + }, +] +`; diff --git a/gitnexus/test/integration/cfg/cfg-emit.test.ts b/gitnexus/test/integration/cfg/cfg-emit.test.ts index d90c21210e..98efe908ae 100644 --- a/gitnexus/test/integration/cfg/cfg-emit.test.ts +++ b/gitnexus/test/integration/cfg/cfg-emit.test.ts @@ -2,7 +2,7 @@ import { describe, it, expect, vi } from 'vitest'; import Parser from 'tree-sitter'; import TypeScript from 'tree-sitter-typescript'; import { collectFunctionCfgs } from '../../../src/core/ingestion/cfg/collect.js'; -import { emitFileCfgs } from '../../../src/core/ingestion/cfg/emit.js'; +import { emitFileCfgs, emitFileReachingDefs } from '../../../src/core/ingestion/cfg/emit.js'; import { getProvider } from '../../../src/core/ingestion/languages/index.js'; import { SupportedLanguages } from '../../../src/config/supported-languages.js'; import type { CfgVisitor, FunctionCfg } from '../../../src/core/ingestion/cfg/types.js'; @@ -179,3 +179,162 @@ describe('U4 — flag-off / empty input emits nothing', () => { expect(r.edges).toBe(0); }); }); + +describe('U4 (#2082 M2) — emitFileReachingDefs', () => { + it('persists deduped (blockPair, binding) edges with reason = plain variable name', () => { + const cfgs = cfgsOf( + `function f(a) { + let x = a; + x = x + 1; + return sink(x); + }`, + 'src/rd.ts', + ); + const { graph, rels } = recordingGraph(); + const r = emitFileReachingDefs(graph, cfgs); + expect(r.edges).toBe(rels.length); + expect(rels.length).toBeGreaterThan(0); + for (const e of rels) { + expect(e.type).toBe('REACHING_DEF'); + expect(e.sourceId).toMatch(/^BasicBlock:src\/rd\.ts:\d+:\d+:\d+$/); + expect(e.targetId).toMatch(/^BasicBlock:src\/rd\.ts:\d+:\d+:\d+$/); + } + // reason carries the plain source-level name (M0/S1 verdict) + const reasons = new Set(rels.map((e) => 
e.reason)); + expect(reasons.has('x')).toBe(true); + expect(reasons.has('a')).toBe(true); + }); + + it('same block pair, two bindings → two distinct edges (id collision-proofing)', () => { + const cfgs = cfgsOf(`function f(a, b) { const c = a + b; use(c); }`, 'two.ts'); + const { graph, rels } = recordingGraph(); + emitFileReachingDefs(graph, cfgs); + const ids = rels.map((e) => e.id); + expect(new Set(ids).size).toBe(ids.length); + // a and b both flow ENTRY→body: same block pair, distinct edges by binding + const entryToBody = rels.filter((e) => e.reason === 'a' || e.reason === 'b'); + expect(entryToBody.length).toBeGreaterThanOrEqual(2); + }); + + it('N statement-level facts on one (blockPair, binding) collapse to ONE edge', () => { + // x defined once, used three times in the same straight-line block: three + // facts, one persisted edge (the persisted columns cannot distinguish). + const cfgs = cfgsOf( + `function f() { + let x = seed(); + a(x); b(x); c(x); + }`, + 'dedup.ts', + ); + const { graph, rels } = recordingGraph(); + const r = emitFileReachingDefs(graph, cfgs); + const xEdges = rels.filter((e) => e.reason === 'x'); + expect(xEdges).toHaveLength(1); // self-pair within the single body block + expect(r.facts).toBeGreaterThan(rels.length); // facts > deduped edges + }); + + it('per-function edge cap: truncates deterministically, warns with top bindings (R7)', () => { + const cfgs = cfgsOf( + `function f(p, q) { + let x = p; + if (p) { x = q; } else { x = p + q; } + s1(x); s2(p); s3(q); + }`, + 'cap.ts', + ); + const full = recordingGraph(); + const rFull = emitFileReachingDefs(full.graph, cfgs); + expect(rFull.edges).toBeGreaterThan(2); + + const capped = recordingGraph(); + const onWarn = vi.fn(); + const r = emitFileReachingDefs(capped.graph, cfgs, 2, onWarn); + expect(capped.rels).toHaveLength(2); + // NOTE: not comparable to rFull.edges — the cap also scales maxFacts (4×), + // so the capped run may dedup fewer facts. 
Within-run consistency only: + expect(r.droppedEdges).toBeGreaterThan(0); + expect(r.cappedFunctions).toBe(1); + // cap=2 also tightens maxFacts (8) below this function's fact count, so + // BOTH R7 layers may warn — assert on the edge-cap warn specifically. + const capWarns = onWarn.mock.calls + .map((c) => c[0] as string) + .filter((m) => m.includes('REACHING_DEF edge cap')); + expect(capWarns).toHaveLength(1); + expect(capWarns[0]).toContain('top bindings'); + // deterministic truncation: same prefix on a second run + const again = recordingGraph(); + emitFileReachingDefs(again.graph, cfgs, 2, vi.fn()); + expect(again.rels.map((e) => e.id)).toEqual(capped.rels.map((e) => e.id)); + }); + + it('cap of 0 means unlimited (no warn)', () => { + const cfgs = cfgsOf(`function f(a) { use(a); }`, 'u.ts'); + const { graph, rels } = recordingGraph(); + const onWarn = vi.fn(); + emitFileReachingDefs(graph, cfgs, 0, onWarn); + expect(rels.length).toBeGreaterThan(0); + expect(onWarn).not.toHaveBeenCalled(); + }); + + it('fact-layer truncation warns even when the edge cap is never reached (R7 both layers)', () => { + // 3 parallel arms defining x + several later uses → facts >> deduped edges. + // Cap edges generously but squeeze maxFacts via a tiny edge cap × 4? No — + // maxFacts derives from the edge cap (4×). Use a cap that bounds facts + // below the fact count while edges stay under it: cap=3 ⇒ maxFacts=12. 
+ const cfgs = cfgsOf( + `function f(c) { + let x = 0; + if (c === 1) { x = 1; } else if (c === 2) { x = 2; } else { x = 3; } + u1(x); u2(x); u3(x); u4(x); u5(x); + }`, + 'trunc.ts', + ); + const probe = recordingGraph(); + const rProbe = emitFileReachingDefs(probe.graph, cfgs); + expect(rProbe.facts).toBeGreaterThan(12); // 3 defs × 5 uses of x alone = 15+ + + const { graph } = recordingGraph(); + const onWarn = vi.fn(); + const r = emitFileReachingDefs(graph, cfgs, 1000, onWarn); + // edge cap (1000) never reached… + expect(r.cappedFunctions).toBe(0); + // …but with cap=3 ⇒ maxFacts=12 < total facts, truncation warns: + const tight = recordingGraph(); + const onWarnTight = vi.fn(); + const rTight = emitFileReachingDefs(tight.graph, cfgs, 3, onWarnTight); + expect(rTight.truncatedFunctions).toBe(1); + const messages = onWarnTight.mock.calls.map((c) => c[0] as string); + expect(messages.some((m) => m.includes('fact materialization'))).toBe(true); + }); + + it('no-facts CFGs (pre-M2 side channel) emit nothing and do not throw', () => { + const bare = { + filePath: 'old.ts', + functionStartLine: 1, + functionEndLine: 2, + functionStartColumn: 0, + entryIndex: 0, + exitIndex: 1, + blocks: [ + { index: 0, startLine: 1, endLine: 1, text: '', kind: 'entry' }, + { index: 1, startLine: 2, endLine: 2, text: '', kind: 'exit' }, + ], + edges: [{ from: 0, to: 1, kind: 'seq' }], + } as unknown as FunctionCfg; + const { graph, rels } = recordingGraph(); + const r = emitFileReachingDefs(graph, [bare]); + expect(rels).toHaveLength(0); + expect(r.edges).toBe(0); + }); + + it('emitting the same function twice is idempotent by id (first-writer-wins safe)', () => { + const cfgs = cfgsOf(`function f(a) { return a; }`, 'i.ts'); + const { graph, rels } = recordingGraph(); + emitFileReachingDefs(graph, cfgs); + const firstIds = rels.map((e) => e.id); + emitFileReachingDefs(graph, cfgs); + // ids deterministic ⇒ the second pass produces the SAME ids (the real + // KnowledgeGraph would 
no-op them; the recorder shows them duplicated) + expect(rels.slice(firstIds.length).map((e) => e.id)).toEqual(firstIds); + }); +}); diff --git a/gitnexus/test/integration/cfg/cfg-snapshot.test.ts b/gitnexus/test/integration/cfg/cfg-snapshot.test.ts index 138fe4f301..116a884a70 100644 --- a/gitnexus/test/integration/cfg/cfg-snapshot.test.ts +++ b/gitnexus/test/integration/cfg/cfg-snapshot.test.ts @@ -69,16 +69,17 @@ function reaches(adj: Map, from: string, to: string): boolean return seen.has(to); } -describe('U7 — AC1: 10-function fixture CFG snapshot', () => { +describe('U7 — AC1: ten-functions fixture CFG snapshot', () => { it('matches the committed CFG node/edge set', () => { const cfgs = cfgsOfFile('ten-functions.ts'); - expect(cfgs).toHaveLength(10); + // 10 M1 functions + 2 M2 additions (early-exit finally, shadowing — #2082 U5) + expect(cfgs).toHaveLength(12); expect(cfgs.map(serialize)).toMatchSnapshot(); }); }); describe('U7 — AC2: every BasicBlock reachable from its function ENTRY', () => { - it('holds for all ten functions (no dead code in the fixture)', () => { + it('holds for all fixture functions (no dead code in the fixture)', () => { const cfgs = cfgsOfFile('ten-functions.ts'); const { graph, nodeIds, rels } = recordingGraph(); emitFileCfgs(graph, cfgs); diff --git a/gitnexus/test/integration/cfg/fixtures/ten-functions.ts b/gitnexus/test/integration/cfg/fixtures/ten-functions.ts index 5a7a6db2a6..f58bd9c8b8 100644 --- a/gitnexus/test/integration/cfg/fixtures/ten-functions.ts +++ b/gitnexus/test/integration/cfg/fixtures/ten-functions.ts @@ -95,6 +95,34 @@ export function withNested(xs: number[]): void { end(); } +// M2 additions (#2082 U5): an early-exit-through-finally and a shadowing case — +// the two reaching-defs acceptance shapes the original ten functions lacked. +// Their CFG topology exercises U2's finally threading; their facts pin R4/R9. 
+ +export function withEarlyExitFinally(flag: boolean): number { + let val = 1; + try { + if (flag) { + return probe(val); + } + work(); + } finally { + val = 2; + } + return val; +} + +export function withShadowing(): void { + let s = 1; + { + let s = 2; + s = s + 1; + use(s); + } + s = s + 1; + done2(s); +} + declare function a(): void; declare function b(): void; declare function c(): void; @@ -113,3 +141,5 @@ declare function after(): void; declare function p(): void; declare function q(): void; declare function end(): void; +declare function probe(n: number): number; +declare function done2(n: number): void; diff --git a/gitnexus/test/integration/cfg/parse-cache-mixed.test.ts b/gitnexus/test/integration/cfg/parse-cache-mixed.test.ts new file mode 100644 index 0000000000..58f1837829 --- /dev/null +++ b/gitnexus/test/integration/cfg/parse-cache-mixed.test.ts @@ -0,0 +1,119 @@ +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { + getDurableParsedFileDir, + persistDurableParsedFileShardSync, + restoreDurableParsedFileShard, + loadParsedFilesForPaths, +} from '../../../src/storage/parsedfile-store.js'; +import type { ParsedFile } from 'gitnexus-shared'; +import type { FunctionCfg } from '../../../src/core/ingestion/cfg/types.js'; + +// #2082 M2 U5 — the warm/mixed cache seam for statement facts. On a warm (or +// mixed) run the unchanged chunk's ParsedFiles are BYTE-COPIED from the +// durable store instead of re-parsed (#2038); if that copy (or the store's +// interning reviver) dropped or aliased the new `bindings`/`statements` +// fields, reaching-defs would silently degrade to `no-facts` for every cached +// file — exactly the field-loss class the #2038 mergeChunkResults lesson +// warns about. This pins the persist → restore → load round-trip at the exact +// seam scope-resolution consumes. 
+ +const factCfg: FunctionCfg = { + filePath: 'src/a.ts', + functionStartLine: 1, + functionEndLine: 5, + functionStartColumn: 0, + entryIndex: 0, + exitIndex: 1, + blocks: [ + { index: 0, startLine: 1, endLine: 1, text: '', kind: 'entry', statements: [] }, + { + index: 1, + startLine: 5, + endLine: 5, + text: '', + kind: 'exit', + statements: [ + { line: 2, defs: [0], uses: [] }, + { line: 3, defs: [1], uses: [0] }, + ], + }, + ], + edges: [{ from: 0, to: 1, kind: 'seq' }], + bindings: [ + { name: 'x', declLine: 2, declColumn: 6, kind: 'let' }, + { name: 'y', declLine: 3, declColumn: 6, kind: 'const' }, + ], +}; + +const mkParsedFile = (filePath: string): ParsedFile => + ({ + filePath, + moduleScope: '', + scopes: [], + parsedImports: [], + localDefs: [], + referenceSites: [], + cfgSideChannel: [factCfg], + }) as unknown as ParsedFile; + +describe('durable ParsedFile store carries M2 statement facts (#2082 U5)', () => { + let tempDir = ''; + beforeEach(() => { + tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'm2-facts-store-')); + }); + afterEach(() => { + if (tempDir) fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it('persist → restore → loadParsedFilesForPaths preserves bindings + statements deep-equal', async () => { + const durableDir = getDurableParsedFileDir(tempDir); + const chunkHash = 'c'.repeat(64); + const files = ['src/a.ts', 'src/b.ts']; + + // What a worker writes at flush on a cache MISS (the cold half of a + // mixed-mode run)… + persistDurableParsedFileShardSync(durableDir, chunkHash, 7, 0, files.map(mkParsedFile)); + // …and what a warm HIT byte-copies into the run-scoped store. 
+ await restoreDurableParsedFileShard(durableDir, tempDir, chunkHash); + + const loaded = await loadParsedFilesForPaths(tempDir, new Set(files)); + expect(loaded.size).toBe(2); + for (const filePath of files) { + const pf = loaded.get(filePath); + expect(pf).toBeDefined(); + const channel = (pf as { cfgSideChannel?: unknown }).cfgSideChannel; + expect(Array.isArray(channel)).toBe(true); + const cfg = (channel as FunctionCfg[])[0]; + // deep-equal: the interning reviver may dedup strings/objects but the + // VALUES must be intact — and no aliasing may merge the two files' + // distinct fact arrays into wrong shapes. + expect(cfg.bindings).toEqual(factCfg.bindings); + expect(cfg.blocks.map((b) => b.statements)).toEqual(factCfg.blocks.map((b) => b.statements)); + } + }); + + it('facts survive even when two files share identical binding tables (reviver dedup safety)', async () => { + // The store reviver interns strings and dedups objects keyed on `nodeId` + // presence — BindingEntry/StatementFacts deliberately carry no such field, + // so dedup must never alias-then-mutate across files. Two files with + // byte-identical channels is the worst case. 
+ const durableDir = getDurableParsedFileDir(tempDir); + const chunkHash = 'd'.repeat(64); + persistDurableParsedFileShardSync(durableDir, chunkHash, 7, 0, [ + mkParsedFile('src/same1.ts'), + mkParsedFile('src/same2.ts'), + ]); + await restoreDurableParsedFileShard(durableDir, tempDir, chunkHash); + const loaded = await loadParsedFilesForPaths( + tempDir, + new Set(['src/same1.ts', 'src/same2.ts']), + ); + const c1 = (loaded.get('src/same1.ts') as { cfgSideChannel?: FunctionCfg[] }).cfgSideChannel; + const c2 = (loaded.get('src/same2.ts') as { cfgSideChannel?: FunctionCfg[] }).cfgSideChannel; + expect(c1?.[0].bindings).toEqual(factCfg.bindings); + expect(c2?.[0].bindings).toEqual(factCfg.bindings); + }); +}); diff --git a/gitnexus/test/integration/cfg/pipeline-pdg.test.ts b/gitnexus/test/integration/cfg/pipeline-pdg.test.ts index 35666fe7dd..69ff6677f1 100644 --- a/gitnexus/test/integration/cfg/pipeline-pdg.test.ts +++ b/gitnexus/test/integration/cfg/pipeline-pdg.test.ts @@ -13,16 +13,22 @@ import type { PipelineResult } from '../../../src/types/pipeline.js'; const FIXTURE = path.join(__dirname, 'fixtures', 'pdg-repo'); -function counts(result: PipelineResult): { basicBlocks: number; cfgEdges: number } { +function counts(result: PipelineResult): { + basicBlocks: number; + cfgEdges: number; + reachingDefs: number; +} { let basicBlocks = 0; result.graph.forEachNode((n) => { if (n.label === 'BasicBlock') basicBlocks++; }); let cfgEdges = 0; + let reachingDefs = 0; for (const rel of result.graph.iterRelationships()) { if (rel.type === 'CFG') cfgEdges++; + if (rel.type === 'REACHING_DEF') reachingDefs++; } - return { basicBlocks, cfgEdges }; + return { basicBlocks, cfgEdges, reachingDefs }; } const tmpDirs: string[] = []; @@ -40,25 +46,34 @@ describe('U7 — end-to-end --pdg pipeline', () => { it('with --pdg on: emits BasicBlock nodes + CFG edges into the graph', async () => { const result = await runPipelineFromRepo(freshRepo(), () => {}, { pdg: true }); - const { 
basicBlocks, cfgEdges } = counts(result); + const { basicBlocks, cfgEdges, reachingDefs } = counts(result); expect(basicBlocks).toBeGreaterThan(0); expect(cfgEdges).toBeGreaterThan(0); + // M2 (#2082 U5): the def→use projection rides the same gate — the fixture + // has a loop-carried accumulator (`sum`), so facts must exist. + expect(reachingDefs).toBeGreaterThan(0); // CFG edges connect BasicBlocks to BasicBlocks — both endpoints exist. const blockIds = new Set(); result.graph.forEachNode((n) => { if (n.label === 'BasicBlock') blockIds.add(n.id); }); for (const rel of result.graph.iterRelationships()) { - if (rel.type !== 'CFG') continue; + if (rel.type !== 'CFG' && rel.type !== 'REACHING_DEF') continue; expect(blockIds.has(rel.sourceId)).toBe(true); expect(blockIds.has(rel.targetId)).toBe(true); + if (rel.type === 'REACHING_DEF') { + // reason carries the plain variable name (M0/S1 verdict) + expect(typeof rel.reason).toBe('string'); + expect(rel.reason.length).toBeGreaterThan(0); + } } }, 60000); it('with --pdg off (default): emits zero BasicBlock nodes and zero CFG edges', async () => { const result = await runPipelineFromRepo(freshRepo(), () => {}); - const { basicBlocks, cfgEdges } = counts(result); + const { basicBlocks, cfgEdges, reachingDefs } = counts(result); expect(basicBlocks).toBe(0); expect(cfgEdges).toBe(0); + expect(reachingDefs).toBe(0); }, 60000); }); diff --git a/gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts b/gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts new file mode 100644 index 0000000000..aa0fb7f0cd --- /dev/null +++ b/gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts @@ -0,0 +1,103 @@ +import { describe, it, expect } from 'vitest'; +import fs from 'fs'; +import path from 'path'; +import Parser from 'tree-sitter'; +import TypeScript from 'tree-sitter-typescript'; +import { collectFunctionCfgs } from '../../../src/core/ingestion/cfg/collect.js'; +import { computeReachingDefs } from 
'../../../src/core/ingestion/cfg/reaching-defs.js'; +import { getProvider } from '../../../src/core/ingestion/languages/index.js'; +import { SupportedLanguages } from '../../../src/config/supported-languages.js'; +import type { FunctionCfg } from '../../../src/core/ingestion/cfg/types.js'; + +// #2082 M2 R5 acceptance: a committed snapshot of the REACHING_DEF facts on +// the M1 fixture (extended in U5 with the early-exit-finally + shadowing +// functions). The serialization is deterministic — sorted fact strings keyed +// by program points + binding identity — so any solver/harvest behavior +// change shows as a reviewable snapshot diff, never silent drift. + +const FIXTURES = path.join(__dirname, 'fixtures'); + +function cfgsOfFile(file: string): readonly FunctionCfg[] { + const visitor = getProvider(SupportedLanguages.TypeScript).cfgVisitor; + if (!visitor) throw new Error('no cfgVisitor'); + const source = fs.readFileSync(path.join(FIXTURES, file), 'utf8'); + const parser = new Parser(); + parser.setLanguage(TypeScript.typescript); + return collectFunctionCfgs(parser.parse(source).rootNode, visitor, file).cfgs; +} + +/** Deterministic rendering: defBlock:stmt->useBlock:stmt:bindingKey */ +function serialize(cfg: FunctionCfg): Record { + const r = computeReachingDefs(cfg); + const key = (idx: number): string => { + const b = r.bindings[idx]; + return b.synthetic ? 
`${b.name}@module` : `${b.name}:${b.declLine}:${b.declColumn}`; + }; + return { + startLine: cfg.functionStartLine, + status: r.status, + defs: r.defCount, + uses: r.useCount, + facts: r.facts.map( + (f) => + `${f.def.blockIndex}:${f.def.stmtIndex}->${f.use.blockIndex}:${f.use.stmtIndex}:${key(f.bindingIdx)}`, + ), + }; +} + +describe('R5 — REACHING_DEF facts snapshot on the M1 fixture', () => { + it('matches the committed fact set for every fixture function', () => { + const cfgs = cfgsOfFile('ten-functions.ts'); + expect(cfgs).toHaveLength(12); + expect(cfgs.map(serialize)).toMatchSnapshot(); + }); + + it('every fixture function computes (no truncation at default limits, no no-facts)', () => { + for (const cfg of cfgsOfFile('ten-functions.ts')) { + const r = computeReachingDefs(cfg); + expect(r.status).toBe('computed'); + } + }); + + it('acceptance shapes: the finally redefinition and the shadowed binding behave per R4/R9', () => { + const cfgs = cfgsOfFile('ten-functions.ts'); + const byLine = new Map(cfgs.map((c) => [c.functionStartLine, c])); + + // withEarlyExitFinally — `val = 2` (finally) is the ONLY def reaching the + // post-try return; the early return's use sees the original `val = 1`. 
+ const early = [...byLine.values()].find((c) => + c.blocks.some((b) => b.text.includes('return probe(val)')), + )!; + const re = computeReachingDefs(early); + const val = re.bindings.findIndex((b) => b.name === 'val'); + const probeUses = re.facts.filter( + (f) => f.bindingIdx === val && early.blocks[f.use.blockIndex].text.includes('probe'), + ); + const finalUses = re.facts.filter( + (f) => f.bindingIdx === val && early.blocks[f.use.blockIndex].text.includes('return val'), + ); + expect(probeUses).toHaveLength(1); + expect(early.blocks[probeUses[0].def.blockIndex].text).toContain('let val = 1'); + expect(finalUses).toHaveLength(1); + expect(early.blocks[finalUses[0].def.blockIndex].text).toContain('val = 2'); + + // withShadowing — two distinct `s` bindings; each use resolves to its own. + const shadow = [...byLine.values()].find((c) => + c.blocks.some((b) => b.text.includes('done2(s)')), + )!; + const rs = computeReachingDefs(shadow); + const sBindings = rs.bindings.filter((b) => b.name === 's'); + expect(sBindings).toHaveLength(2); + const factsByBinding = new Map(); + for (const f of rs.facts) { + factsByBinding.set(f.bindingIdx, (factsByBinding.get(f.bindingIdx) ?? 0) + 1); + } + // each s binding forms its own facts (no cross-kill, no cross-reach): the + // inner block's reassign+use never references the outer binding and vice + // versa — both have facts, and every fact's def and use share the binding + // by construction of DefUseFact, so distinct counts per binding prove the + // bindings never conflated. + const sIdxs = rs.bindings.map((b, i) => (b.name === 's' ? i : -1)).filter((i) => i >= 0); + for (const idx of sIdxs) expect(factsByBinding.get(idx) ?? 
0).toBeGreaterThanOrEqual(2); + }); +}); diff --git a/gitnexus/test/integration/cfg/worker-roundtrip.test.ts b/gitnexus/test/integration/cfg/worker-roundtrip.test.ts index 6589b3b025..c46a425aec 100644 --- a/gitnexus/test/integration/cfg/worker-roundtrip.test.ts +++ b/gitnexus/test/integration/cfg/worker-roundtrip.test.ts @@ -94,6 +94,16 @@ describe('U3 — CFG side-channel JSON round-trip (no AST leakage, no field loss for (const b of c.blocks) expect(typeof b.text).toBe('string'); for (const e of c.edges) expect(typeof e.from).toBe('number'); } + // M2 (#2082 U1): the binding table + statement facts must survive the + // boundary — a future cache-slimming field list that drops them would + // silently break reaching-defs (the #2038 mergeChunkResults lesson). + for (const c of round) { + expect(Array.isArray(c.bindings)).toBe(true); + expect(c.blocks.every((b: { statements?: unknown }) => Array.isArray(b.statements))).toBe( + true, + ); + } + expect(round.some((c: { bindings: unknown[] }) => c.bindings.length > 0)).toBe(true); }); }); @@ -150,3 +160,26 @@ describe('U3 — parse-cache key folds the --pdg flag (R4, #2038-class guard)', ).toBe(base); }); }); + +describe('#2082 M2 — the REACHING_DEF emit cap does NOT perturb the chunk key', () => { + const entries = [ + { filePath: 'b.ts', contentHash: 'h2' }, + { filePath: 'a.ts', contentHash: 'h1' }, + ]; + + it('pdgMaxReachingDefEdgesPerFunction is emit-time-only — same key across values (F3 discipline)', () => { + // The worker never sees the REACHING_DEF edge cap (solve + emit happen in + // scope-resolution on the main thread), so the cached shard is identical + // across cap values. Folding it in would be the #2099-F3 over-correction: + // a spurious full re-parse on every cap change. PdgCacheKey simply has no + // field for it — this test pins that the key API surface stays that way + // (the object form ignores unknown extras rather than hashing them). 
+ const base = computeChunkHash(entries, { pdg: true }); + const withExtra = computeChunkHash(entries, { + pdg: true, + // @ts-expect-error — deliberately passing an unknown field: the key must ignore it + maxReachingDefEdgesPerFunction: 1, + }); + expect(withExtra).toBe(base); + }); +}); diff --git a/gitnexus/test/unit/cfg/emit-guard.test.ts b/gitnexus/test/unit/cfg/emit-guard.test.ts index da71def1ab..2a141ae936 100644 --- a/gitnexus/test/unit/cfg/emit-guard.test.ts +++ b/gitnexus/test/unit/cfg/emit-guard.test.ts @@ -191,3 +191,149 @@ describe('cfgSideChannel emit guard (#2099 F4)', () => { expect(warns()).toHaveLength(0); }); }); + +describe('#2082 M2 — statement-fact emit guard (isEmitSafeCfg extension)', () => { + let cap: ReturnType; + beforeEach(() => { + cap = _captureLogger(); + }); + afterEach(() => { + cap.restore(); + }); + const warns = (): string[] => + cap + .records() + .filter((r) => r.level >= 40) + .map((r) => String(r.msg)); + + const rdEdges = (graph: KnowledgeGraph): number => { + let n = 0; + graph.forEachRelationship((r) => { + if (r.type === 'REACHING_DEF') n++; + }); + return n; + }; + const cfgEdgeCount = (graph: KnowledgeGraph): number => { + let n = 0; + graph.forEachRelationship((r) => { + if (r.type === 'CFG') n++; + }); + return n; + }; + + /** Valid facts-bearing CFG: def at stmt 0 reaches the use at stmt 1. */ + const factCfg = (blocks?: unknown): unknown => ({ + ...validCfg, + bindings: [{ name: 'x', declLine: 1, declColumn: 0, kind: 'let' }], + blocks: blocks ?? 
[ + { index: 0, startLine: 1, endLine: 1, text: '', kind: 'entry', statements: [] }, + { + index: 1, + startLine: 3, + endLine: 3, + text: '', + kind: 'exit', + statements: [ + { line: 2, defs: [0], uses: [] }, + { line: 3, defs: [], uses: [0] }, + ], + }, + ], + }); + + it('a well-formed facts-bearing CFG passes the guard and emits REACHING_DEF', () => { + const graph = emitWith([factCfg()]); + expect(rdEdges(graph)).toBeGreaterThan(0); + expect(warns()).toHaveLength(0); + }); + + it('an OUT-OF-RANGE binding index is rejected per element (would template undefined into ids)', () => { + const bad = factCfg([ + { index: 0, startLine: 1, endLine: 1, text: '', kind: 'entry', statements: [] }, + { + index: 1, + startLine: 3, + endLine: 3, + text: '', + kind: 'exit', + statements: [{ line: 2, defs: [7], uses: [0] }], // 7 escapes the 1-entry table + }, + ]); + const graph = emitWith([bad, validCfg]); + // the malformed element is skipped with a warn; the valid sibling emits CFG + expect(rdEdges(graph)).toBe(0); + expect(cfgEdgeCount(graph)).toBeGreaterThan(0); + expect(warns().some((m) => m.includes('malformed'))).toBe(true); + }); + + it('statements WITHOUT a binding table are rejected (malformed by construction)', () => { + const noTable = { + ...(factCfg() as Record), + bindings: undefined, + }; + const graph = emitWith([noTable]); + expect(rdEdges(graph)).toBe(0); + expect(warns().some((m) => m.includes('malformed'))).toBe(true); + }); + + it('non-integer statement line / non-array defs are rejected per element', () => { + const badLine = factCfg([ + { + index: 0, + startLine: 1, + endLine: 1, + text: '', + kind: 'entry', + statements: [{ line: 'x', defs: [], uses: [] }], + }, + { index: 1, startLine: 3, endLine: 3, text: '', kind: 'exit', statements: [] }, + ]); + const badDefs = factCfg([ + { + index: 0, + startLine: 1, + endLine: 1, + text: '', + kind: 'entry', + statements: [{ line: 2, defs: 'nope', uses: [] }], + }, + { index: 1, startLine: 3, endLine: 3, text: 
'', kind: 'exit', statements: [] }, + ]); + for (const bad of [badLine, badDefs]) { + const graph = emitWith([bad]); + expect(rdEdges(graph)).toBe(0); + } + expect(warns().some((m) => m.includes('malformed'))).toBe(true); + }); + + it('a pre-M2 channel (no bindings, no statements) still passes — CFG emits, REACHING_DEF skips', () => { + const graph = emitWith([validCfg]); + expect(cfgEdgeCount(graph)).toBeGreaterThan(0); + expect(rdEdges(graph)).toBe(0); + expect(warns()).toHaveLength(0); + }); +}); + +describe('#2160 review — entry/exit index validation', () => { + it('an out-of-range entryIndex is rejected per element (would crash the solver mid-file)', () => { + const bad = { ...validCfg, entryIndex: 99 }; + const logs = _captureLogger(); + try { + const graph = emitWith([bad, validCfg]); + // the malformed element is skipped; the valid sibling still emits + let cfgEdges = 0; + graph.forEachRelationship((r) => { + if (r.type === 'CFG') cfgEdges++; + }); + expect(cfgEdges).toBeGreaterThan(0); + expect( + logs + .records() + .filter((r) => r.level >= 40) + .some((r) => String(r.msg).includes('malformed')), + ).toBe(true); + } finally { + logs.restore(); + } + }); +}); diff --git a/gitnexus/test/unit/cfg/harvest.test.ts b/gitnexus/test/unit/cfg/harvest.test.ts new file mode 100644 index 0000000000..6fccf95bb1 --- /dev/null +++ b/gitnexus/test/unit/cfg/harvest.test.ts @@ -0,0 +1,493 @@ +import { describe, it, expect } from 'vitest'; +import Parser from 'tree-sitter'; +import TypeScript from 'tree-sitter-typescript'; +import type { SyntaxNode } from '../../../src/core/ingestion/utils/ast-helpers.js'; +import { + createTypeScriptCfgVisitor, + TS_FUNCTION_TYPES, +} from '../../../src/core/ingestion/cfg/visitors/typescript.js'; +import type { FunctionCfg, StatementFacts } from '../../../src/core/ingestion/cfg/types.js'; + +// U1 (#2082 M2) — per-statement def/use harvesting. 
The two-phase design +// (declaration pre-scan → resolve during the CFG walk) is what makes the +// walk-order traps pass: the visitor walks finally-before-try, for-init-last, +// and do-while-condition-first, so declare-as-you-walk would mis-key common +// code. Each test pins names→binding-index agreement, not just presence. + +const visitor = createTypeScriptCfgVisitor(); + +function parse(code: string): SyntaxNode { + const parser = new Parser(); + parser.setLanguage(TypeScript.typescript); + return parser.parse(code).rootNode; +} + +function collectFunctions(root: SyntaxNode): SyntaxNode[] { + const out: SyntaxNode[] = []; + const stack = [root]; + while (stack.length) { + const n = stack.pop() as SyntaxNode; + if (TS_FUNCTION_TYPES.has(n.type)) out.push(n); + for (let i = n.namedChildCount - 1; i >= 0; i--) { + const c = n.namedChild(i); + if (c) stack.push(c); + } + } + return out; +} + +function cfgOf(code: string, index = 0): FunctionCfg { + const fns = collectFunctions(parse(code)); + const fn = fns[index]; + if (!fn) throw new Error(`no function at index ${index}`); + const cfg = visitor.buildFunctionCfg(fn, 'fixture.ts'); + if (!cfg) throw new Error('buildFunctionCfg returned undefined'); + return cfg; +} + +/** All statement facts of the CFG, flattened in (block, statement) order. */ +function allFacts(cfg: FunctionCfg): StatementFacts[] { + return cfg.blocks.flatMap((b) => [...(b.statements ?? [])]); +} + +/** Binding indices of every entry named `name`. */ +function bindingIdxs(cfg: FunctionCfg, name: string): number[] { + return (cfg.bindings ?? []).map((b, i) => (b.name === name ? i : -1)).filter((i) => i >= 0); +} + +/** The single binding index for `name` (throws when shadowed/ambiguous). 
*/ +function bindingIdx(cfg: FunctionCfg, name: string): number { + const idxs = bindingIdxs(cfg, name); + if (idxs.length !== 1) throw new Error(`expected 1 binding for ${name}, got ${idxs.length}`); + return idxs[0]; +} + +const defsOf = (cfg: FunctionCfg): Set => + new Set(allFacts(cfg).flatMap((f) => [...f.defs])); +const usesOf = (cfg: FunctionCfg): Set => + new Set(allFacts(cfg).flatMap((f) => [...f.uses])); + +describe('TS/JS def/use harvest — basics', () => { + it('declaration, reassignment, and read produce per-statement def/use facts', () => { + const cfg = cfgOf(`function f() { let x = 1; x = 2; const y = x; }`); + const x = bindingIdx(cfg, 'x'); + const y = bindingIdx(cfg, 'y'); + // x and y are the only declared (non-synthetic) bindings + expect((cfg.bindings ?? []).filter((b) => !b.synthetic)).toHaveLength(2); + // the three statements coalesce into ONE block with three fact records + const body = cfg.blocks.find((b) => b.text.includes('let x = 1')); + expect(body?.statements).toHaveLength(3); + const [s0, s1, s2] = body!.statements!; + expect([...s0.defs]).toEqual([x]); + expect([...s1.defs]).toEqual([x]); + expect([...s2.defs]).toEqual([y]); + expect([...s2.uses]).toEqual([x]); + }); + + it('compound assignment and update expressions are def+use of the same binding', () => { + const cfg = cfgOf(`function f(x, y, i) { x += y; i++; }`); + const x = bindingIdx(cfg, 'x'); + const y = bindingIdx(cfg, 'y'); + const i = bindingIdx(cfg, 'i'); + const body = cfg.blocks.find((b) => b.text.includes('x += y')); + const [s0, s1] = body!.statements!; + expect([...s0.defs]).toEqual([x]); + expect([...s0.uses]).toEqual(expect.arrayContaining([x, y])); + expect([...s1.defs]).toEqual([i]); + expect([...s1.uses]).toEqual([i]); + }); + + it('destructuring flattens to one def per bound name; sources are uses', () => { + const cfg = cfgOf(`function f(obj, arr) { + const { a, b: c, ...rest } = obj; + let d, e; + [d = 1, ...e] = arr; + }`); + const defs = defsOf(cfg); + 
for (const name of ['a', 'c', 'rest', 'd', 'e']) { + expect(defs).toContain(bindingIdx(cfg, name)); + } + const uses = usesOf(cfg); + expect(uses).toContain(bindingIdx(cfg, 'obj')); + expect(uses).toContain(bindingIdx(cfg, 'arr')); + // no spurious binding for the renamed pattern key `b` + expect(bindingIdxs(cfg, 'b')).toHaveLength(0); + }); + + it('shadowing: inner let is a DISTINCT binding from the outer one', () => { + const cfg = cfgOf(`function f() { + let x = 1; + { let x = 2; use(x); } + use(x); + }`); + const xs = bindingIdxs(cfg, 'x'); + expect(xs).toHaveLength(2); + const [outer, inner] = xs; // pre-scan is source-order: outer declared first + const facts = allFacts(cfg); + const useFacts = facts.filter((f) => f.uses.includes(outer) || f.uses.includes(inner)); + // inner use(x) sees the inner binding; trailing use(x) sees the outer + expect(useFacts.some((f) => f.uses.includes(inner))).toBe(true); + expect(useFacts.some((f) => f.uses.includes(outer))).toBe(true); + const defFacts = facts.filter((f) => f.defs.length > 0); + expect(defFacts.find((f) => f.defs.includes(outer))?.line).toBeLessThan( + defFacts.find((f) => f.defs.includes(inner))!.line, + ); + }); + + it('var hoisting + multi-declaration canonicalize to ONE function-rooted binding', () => { + const cfg = cfgOf(`function f(c) { + use(v); + if (c) { var v = 1; } + var v; + }`); + expect(bindingIdxs(cfg, 'v')).toHaveLength(1); + const v = bindingIdx(cfg, 'v'); + expect(usesOf(cfg)).toContain(v); + expect(defsOf(cfg)).toContain(v); + // canonical decl site is the FIRST declaration in source order + expect(cfg.bindings![v].declLine).toBe(3); + }); + + it('undeclared assignment targets get one deterministic synthetic binding', () => { + const cfg = cfgOf(`function f() { notDeclared = 1; use(notDeclared); }`); + const idxs = bindingIdxs(cfg, 'notDeclared'); + expect(idxs).toHaveLength(1); + const b = cfg.bindings![idxs[0]]; + expect(b.synthetic).toBe(true); + expect(defsOf(cfg)).toContain(idxs[0]); + 
expect(usesOf(cfg)).toContain(idxs[0]); + }); +}); + +describe('TS/JS def/use harvest — harvest sites beyond visitSeq', () => { + it('parameters define at the ENTRY block (incl. destructured/default/rest)', () => { + const cfg = cfgOf(`function f(a, { b }, c = a, ...rest) { body(); }`); + const entry = cfg.blocks[cfg.entryIndex]; + expect(entry.text).toBe(''); // facts-only attach — never perturbs block text + const entryFacts = entry.statements ?? []; + expect(entryFacts).toHaveLength(1); + const defs = new Set(entryFacts[0].defs); + for (const name of ['a', 'b', 'c', 'rest']) { + expect(defs).toContain(bindingIdx(cfg, name)); + } + expect(entryFacts[0].uses).toContain(bindingIdx(cfg, 'a')); // default-value use + expect(cfg.bindings![bindingIdx(cfg, 'a')].kind).toBe('param'); + }); + + it('return and throw argument expressions are harvested (dedicated handler blocks)', () => { + const cfg = cfgOf(`function f(x, y, err) { + if (x) { return x + y; } + throw err; + }`); + const retBlock = cfg.blocks.find((b) => b.text.includes('return x + y')); + const retUses = new Set(retBlock!.statements!.flatMap((f) => [...f.uses])); + expect(retUses).toContain(bindingIdx(cfg, 'x')); + expect(retUses).toContain(bindingIdx(cfg, 'y')); + const throwBlock = cfg.blocks.find((b) => b.text.includes('throw err')); + const throwUses = new Set(throwBlock!.statements!.flatMap((f) => [...f.uses])); + expect(throwUses).toContain(bindingIdx(cfg, 'err')); + }); + + it('expression-bodied arrow harvests params at ENTRY and body uses', () => { + const cfg = cfgOf(`const f = (p) => p + q;`); + const entryFacts = cfg.blocks[cfg.entryIndex].statements ?? 
[]; + expect(entryFacts[0]?.defs).toContain(bindingIdx(cfg, 'p')); + const body = cfg.blocks.find((b) => b.text.includes('p + q')); + const uses = new Set(body!.statements!.flatMap((f) => [...f.uses])); + expect(uses).toContain(bindingIdx(cfg, 'p')); + expect(uses).toContain(bindingIdx(cfg, 'q')); // synthetic capture + expect(cfg.bindings![bindingIdx(cfg, 'q')].synthetic).toBe(true); + }); + + it('construct headers harvest: if/while conditions, for init/cond/incr, for-of head', () => { + const cfg = cfgOf(`function f(n, list) { + for (let i = 0; i < n; i++) { work(i); } + for (const item of list) { work(item); } + while (n > 0) { n--; } + }`); + const i = bindingIdx(cfg, 'i'); + const item = bindingIdx(cfg, 'item'); + const n = bindingIdx(cfg, 'n'); + const initBlock = cfg.blocks.find((b) => b.text === 'let i = 0;'); + expect(initBlock!.statements![0].defs).toContain(i); + const condBlock = cfg.blocks.find((b) => b.text === 'i < n'); + expect(new Set(condBlock!.statements![0].uses)).toEqual(new Set([i, n])); + const incrBlock = cfg.blocks.find((b) => b.text === 'i++'); + expect(incrBlock!.statements![0].defs).toContain(i); + const forOfHead = cfg.blocks.find((b) => b.text.includes('item'))!; + expect(forOfHead.statements!.some((f) => f.defs.includes(item))).toBe(true); + expect(forOfHead.statements!.some((f) => f.uses.includes(bindingIdx(cfg, 'list')))).toBe(true); + }); + + it('catch param defines in its own facts-only block preceding the body', () => { + const cfg = cfgOf(`function f() { + try { risky(); } catch (e) { use(e); } + }`); + const e = bindingIdx(cfg, 'e'); + expect(cfg.bindings![e].kind).toBe('catch'); + // The param def gets a DEDICATED once-executed block in front of the body + // entry — NOT prepended into the body's entry block, which can be a loop + // header that would re-gen the def per iteration and falsely kill + // loop-carried redefinitions of the param. + const paramBlock = cfg.blocks.find( + (b) => b.text === '' && (b.statements ?? 
[]).some((f) => f.defs.includes(e)), + ); + expect(paramBlock).toBeDefined(); + const body = cfg.blocks.find((b) => b.text.includes('use(e)'))!; + expect(cfg.edges.some((ed) => ed.from === paramBlock!.index && ed.to === body.index)).toBe( + true, + ); + }); + + it('catch body starting with a loop: param def does NOT re-gen on the loop header', () => { + const cfg = cfgOf(`function f(c) { + try { risky(); } catch (e) { while (c) { e = fix(e); } sink(e); } + }`); + const e = bindingIdx(cfg, 'e'); + const header = cfg.blocks.find((b) => b.text === '(c)' || b.text === 'c')!; + // the loop header carries NO def of e — only the dedicated param block does + expect((header.statements ?? []).some((f) => f.defs.includes(e))).toBe(false); + }); + + it('empty catch: param def lands on the synthetic handler block', () => { + const cfg = cfgOf(`function f() { try { risky(); } catch (e) {} }`); + const e = bindingIdx(cfg, 'e'); + const withDef = cfg.blocks.filter((b) => (b.statements ?? []).some((f) => f.defs.includes(e))); + expect(withDef).toHaveLength(1); + expect(withDef[0].text).toBe(''); // the synthetic empty-catch block + }); + + it('switch: discriminant and case-test uses harvest onto the dispatch block', () => { + const cfg = cfgOf(`function f(s, sel) { + switch (s) { + case sel: a(); break; + default: b(); + } + }`); + const dispatch = cfg.blocks.find((b) => b.text === '(s)'); + const uses = new Set(dispatch!.statements!.flatMap((f) => [...f.uses])); + expect(uses).toContain(bindingIdx(cfg, 's')); + expect(uses).toContain(bindingIdx(cfg, 'sel')); + }); +}); + +describe('TS/JS def/use harvest — exclusions (KTD4)', () => { + it('nested function bodies are opaque: no defs/uses of captured names harvested', () => { + const cfg = cfgOf(`function f() { + let outer = 1; + const g = () => { outer = 2; use(outer); }; + }`); + const outer = bindingIdx(cfg, 'outer'); + const g = bindingIdx(cfg, 'g'); + const facts = allFacts(cfg); + // exactly ONE def of outer (its declaration) — 
the nested write is invisible + expect(facts.filter((f) => f.defs.includes(outer))).toHaveLength(1); + expect(facts.some((f) => f.uses.includes(outer))).toBe(false); + // the declaration of g IS a def + expect(facts.some((f) => f.defs.includes(g))).toBe(true); + }); + + it('member/property writes are not defs; their identifiers are uses', () => { + const cfg = cfgOf(`function f(obj, q) { + this.x = 1; + obj.p = q; + }`); + const facts = allFacts(cfg); + const nonParamDefs = facts + .flatMap((f) => [...f.defs]) + .filter((d) => cfg.bindings![d].kind !== 'param'); + expect(nonParamDefs).toHaveLength(0); + const uses = usesOf(cfg); + expect(uses).toContain(bindingIdx(cfg, 'obj')); + expect(uses).toContain(bindingIdx(cfg, 'q')); + expect(bindingIdxs(cfg, 'x')).toHaveLength(0); // property name never binds + expect(bindingIdxs(cfg, 'p')).toHaveLength(0); + }); + + it('type annotations do not produce uses', () => { + const cfg = cfgOf(`function f(v: SomeType): OtherType { const x: Wide = v; return x; }`); + expect(bindingIdxs(cfg, 'SomeType')).toHaveLength(0); + expect(bindingIdxs(cfg, 'OtherType')).toHaveLength(0); + expect(bindingIdxs(cfg, 'Wide')).toHaveLength(0); + }); +}); + +describe('TS/JS def/use harvest — walk-order traps (two-phase pre-scan)', () => { + it('finally walked before try body: var def and finally use share one binding', () => { + const cfg = cfgOf(`function f() { + try { var v = 1; } finally { use(v); } + }`); + expect(bindingIdxs(cfg, 'v')).toHaveLength(1); + const v = bindingIdx(cfg, 'v'); + expect(cfg.bindings![v].synthetic).toBeUndefined(); + expect(defsOf(cfg)).toContain(v); + expect(usesOf(cfg)).toContain(v); + }); + + it('for-init block created after body walk: init def and body use share one binding', () => { + const cfg = cfgOf(`function f(n) { + for (let i = 0; i < n; i++) { use(i); } + }`); + expect(bindingIdxs(cfg, 'i')).toHaveLength(1); + const i = bindingIdx(cfg, 'i'); + expect(defsOf(cfg)).toContain(i); + const bodyBlock = 
cfg.blocks.find((b) => b.text.includes('use(i)')); + expect(bodyBlock!.statements!.some((f) => f.uses.includes(i))).toBe(true); + }); + + it('do-while condition created before body: body var def and condition use share one binding', () => { + const cfg = cfgOf(`function f() { + do { var x = step(); } while (x); + }`); + expect(bindingIdxs(cfg, 'x')).toHaveLength(1); + const x = bindingIdx(cfg, 'x'); + const condBlock = cfg.blocks.find((b) => b.text === 'x' || b.text === '(x)'); + expect(condBlock!.statements!.some((f) => f.uses.includes(x))).toBe(true); + }); + + it('switch body is ONE scope: let in one case resolves in a later case', () => { + const cfg = cfgOf(`function f(s) { + switch (s) { + case 1: let shared = 1; break; + case 2: use(shared); break; + } + }`); + expect(bindingIdxs(cfg, 'shared')).toHaveLength(1); + const shared = bindingIdx(cfg, 'shared'); + expect(defsOf(cfg)).toContain(shared); + expect(usesOf(cfg)).toContain(shared); + }); +}); + +describe('TS/JS def/use harvest — serialization', () => { + it('facts survive a JSON round-trip deep-equal (worker boundary shape)', () => { + const cfg = cfgOf(`function f(a) { + let x = a; + try { x += 1; } catch (e) { use(e); } finally { done(x); } + return x; + }`); + const trip = JSON.parse(JSON.stringify(cfg)) as FunctionCfg; + expect(trip).toEqual(cfg); + expect(trip.bindings).toBeDefined(); + expect(trip.blocks.every((b) => Array.isArray(b.statements))).toBe(true); + }); + + it('binding indices in facts are always in range of the binding table', () => { + const cfg = cfgOf(`function f(a, b) { + const c = a + b; + for (const k in a) { sink(k, c); } + }`); + const n = cfg.bindings!.length; + for (const f of allFacts(cfg)) { + for (const d of f.defs) (expect(d).toBeGreaterThanOrEqual(0), expect(d).toBeLessThan(n)); + for (const u of f.uses) (expect(u).toBeGreaterThanOrEqual(0), expect(u).toBeLessThan(n)); + } + }); +}); + +describe('TS/JS def/use harvest — review-pass regressions (#2082)', () => { + 
it('class declarations harvest the name as a DEF (JS identifier and TS type_identifier)', () => { + const cfg = cfgOf(`function f() { + class A {} + return new A(); + }`); + const a = bindingIdx(cfg, 'A'); + expect(cfg.bindings![a].kind).toBe('class'); + const facts = allFacts(cfg); + expect(facts.some((fa) => fa.defs.includes(a))).toBe(true); + // the `new A()` use resolves to the same binding + expect(facts.some((fa) => fa.uses.includes(a))).toBe(true); + // and the declaration statement records NO bogus use of A + const declFact = facts.find((fa) => fa.defs.includes(a)); + expect(declFact!.uses).not.toContain(a); + }); + + it('write-then-read in one statement (assign-and-test idiom) forms the def→use fact', async () => { + const { computeReachingDefs } = + await import('../../../src/core/ingestion/cfg/reaching-defs.js'); + const cfg = cfgOf(`function f(re, s) { + let m = null; + if ((m = re.exec(s)) && m) { sink(m); } + }`); + const m = bindingIdx(cfg, 'm'); + const r = computeReachingDefs(cfg); + // the `m` read in the condition gets a fact from the SAME-statement + // assignment (write-then-read), not only from the dead `m = null` init + const condUses = r.facts.filter( + (fa) => fa.bindingIdx === m && fa.def.line === fa.use.line && fa.use.line === 3, + ); + expect(condUses.length).toBeGreaterThan(0); + }); +}); + +describe('TS/JS def/use harvest — conditional contexts are MAY-defs (tri-review P1)', () => { + it('short-circuit RHS def lands in mayDefs, not defs', () => { + const cfg = cfgOf(`function f(a) { let x = source(); if (a && (x = clean())) {} sink(x); }`); + const x = bindingIdx(cfg, 'x'); + const cond = cfg.blocks.find((b) => b.text.includes('a && (x = clean())'))!; + const fact = cond.statements!.find((s) => (s.mayDefs ?? []).includes(x)); + expect(fact).toBeDefined(); + expect(fact!.defs).not.toContain(x); + }); + + it('nullish lazy-init (`c ?? 
(c = load())`) and ternary-arm defs are may-defs', () => { + const cfg = cfgOf(`function f(c, k) { + const v = c ?? (c = load()); + const w = k ? (c = a()) : b(); + use(v, w, c); + }`); + const c = bindingIdx(cfg, 'c'); + const all = allFacts(cfg); + expect(all.filter((s) => (s.mayDefs ?? []).includes(c))).toHaveLength(2); + // the only MUST def of c is its ENTRY param record — neither conditional + // assignment is a must-def + const mustDefs = all.filter((s) => s.defs.includes(c)); + expect(mustDefs).toHaveLength(1); + expect(mustDefs[0].line).toBe(1); // the param record + }); + + it('switch case-test defs are may-defs on the dispatch block', () => { + const cfg = cfgOf(`function f(v) { + let y = taint(); + switch (v) { + case probe(): sinkA(y); break; + case (y = 1): sinkB(); break; + } + }`); + const y = bindingIdx(cfg, 'y'); + const dispatch = cfg.blocks.find((b) => b.text === '(v)')!; + expect(dispatch.statements!.some((s) => (s.mayDefs ?? []).includes(y))).toBe(true); + expect(dispatch.statements!.some((s) => s.defs.includes(y))).toBe(false); + }); + + it('logical-assignment operators (`x ||= v`) write conditionally — may-def, but the read is a use', () => { + const cfg = cfgOf(`function f(x) { x ||= fallback(); use(x); }`); + const x = bindingIdx(cfg, 'x'); + const stmt = allFacts(cfg).find((s) => (s.mayDefs ?? 
[]).includes(x)); + expect(stmt).toBeDefined(); + expect(stmt!.defs).not.toContain(x); + expect(stmt!.uses).toContain(x); + }); + + it('plain compound assignment (`x += 1`) stays a MUST def', () => { + const cfg = cfgOf(`function f(x) { x += 1; }`); + const x = bindingIdx(cfg, 'x'); + expect(allFacts(cfg).some((s) => s.defs.includes(x))).toBe(true); + }); + + it('bare `var x;` is a runtime no-op — no def fact (initialized var still defs)', () => { + const cfg = cfgOf(`function f() { x = source(); var x; var y = 1; sink(x, y); }`); + const x = bindingIdx(cfg, 'x'); + const y = bindingIdx(cfg, 'y'); + const defFacts = allFacts(cfg).filter((s) => s.defs.includes(x)); + expect(defFacts).toHaveLength(1); // only the assignment, never the bare declarator + expect(allFacts(cfg).some((s) => s.defs.includes(y))).toBe(true); + }); + + it('parenthesized lvalues unwrap: `(x) += 1` and `(x)++` def+use x', () => { + const cfg = cfgOf(`function f(x) { (x) += 1; (x)++; }`); + const x = bindingIdx(cfg, 'x'); + const withDef = allFacts(cfg).filter((s) => s.defs.includes(x)); + expect(withDef.length).toBeGreaterThanOrEqual(2); + }); +}); diff --git a/gitnexus/test/unit/cfg/reaching-defs.test.ts b/gitnexus/test/unit/cfg/reaching-defs.test.ts new file mode 100644 index 0000000000..01dc598edc --- /dev/null +++ b/gitnexus/test/unit/cfg/reaching-defs.test.ts @@ -0,0 +1,520 @@ +import { describe, it, expect } from 'vitest'; +import Parser from 'tree-sitter'; +import TypeScript from 'tree-sitter-typescript'; +import type { SyntaxNode } from '../../../src/core/ingestion/utils/ast-helpers.js'; +import { + createTypeScriptCfgVisitor, + TS_FUNCTION_TYPES, +} from '../../../src/core/ingestion/cfg/visitors/typescript.js'; +import { + computeReachingDefs, + type DefUseFact, +} from '../../../src/core/ingestion/cfg/reaching-defs.js'; +import type { + BasicBlockData, + BindingEntry, + CfgEdgeData, + FunctionCfg, + StatementFacts, +} from '../../../src/core/ingestion/cfg/types.js'; + +// U3 (#2082 
M2) — the GEN/KILL fixpoint + intra-block statement sweep. The +// classic lattice hazards (kill ordering, branch-merge union, loop-carried +// defs, self-loops, unreachable blocks) are pinned on hand-built FunctionCfg +// literals with zero tree-sitter dependency, mirroring cfg-builder.test.ts; +// shadowing/try-finally acceptance runs parser-direct through the U1 harvest. + +// ── hand-built CFG helpers ────────────────────────────────────────────────── + +interface BlockSpec { + readonly kind?: BasicBlockData['kind']; + readonly stmts?: StatementFacts[]; +} + +function mkCfg(blocks: BlockSpec[], edges: [number, number][], bindings: string[]): FunctionCfg { + const bindingTable: BindingEntry[] = bindings.map((name, i) => ({ + name, + declLine: i + 1, + declColumn: 0, + kind: 'let', + })); + return { + filePath: 'hand.ts', + functionStartLine: 1, + functionEndLine: 99, + functionStartColumn: 0, + entryIndex: 0, + exitIndex: 1, + blocks: blocks.map((b, index) => ({ + index, + startLine: index + 1, + endLine: index + 1, + text: '', + kind: b.kind ?? (index === 0 ? 'entry' : index === 1 ? 'exit' : 'normal'), + statements: b.stmts ?? [], + })), + edges: edges.map(([from, to]) => ({ from, to, kind: 'seq' }) as CfgEdgeData), + bindings: bindingTable, + }; +} + +const stmt = (line: number, defs: number[] = [], uses: number[] = []): StatementFacts => ({ + line, + defs, + uses, +}); + +/** Compact "defBlock:defStmt->useBlock:useStmt:binding" rendering for asserts. 
*/ +const render = (facts: readonly DefUseFact[]): string[] => + facts.map( + (f) => + `${f.def.blockIndex}:${f.def.stmtIndex}->${f.use.blockIndex}:${f.use.stmtIndex}:${f.bindingIdx}`, + ); + +// ── parser-direct helpers (shadowing / finally acceptance) ────────────────── + +const visitor = createTypeScriptCfgVisitor(); + +function cfgOf(code: string, index = 0): FunctionCfg { + const parser = new Parser(); + parser.setLanguage(TypeScript.typescript); + const root = parser.parse(code).rootNode as SyntaxNode; + const fns: SyntaxNode[] = []; + const stack = [root]; + while (stack.length) { + const n = stack.pop() as SyntaxNode; + if (TS_FUNCTION_TYPES.has(n.type)) fns.push(n); + for (let i = n.namedChildCount - 1; i >= 0; i--) { + const c = n.namedChild(i); + if (c) stack.push(c); + } + } + const cfg = visitor.buildFunctionCfg(fns[index], 'fixture.ts'); + if (!cfg) throw new Error('no cfg'); + return cfg; +} + +const nameIdx = (cfg: FunctionCfg, name: string): number[] => + (cfg.bindings ?? []).map((b, i) => (b.name === name ? 
i : -1)).filter((i) => i >= 0); + +// ── tests ─────────────────────────────────────────────────────────────────── + +describe('computeReachingDefs — kill/gen fundamentals (hand-built)', () => { + it('straight line: reassignment kills the prior def (R6)', () => { + // block 2: x=1; x=2; y=x + const cfg = mkCfg( + [{}, {}, { stmts: [stmt(10, [0]), stmt(11, [0]), stmt(12, [1], [0])] }], + [ + [0, 2], + [2, 1], + ], + ['x', 'y'], + ); + const r = computeReachingDefs(cfg); + expect(r.status).toBe('computed'); + expect(render(r.facts)).toEqual(['2:1->2:2:0']); // ONLY the second def reaches + expect(r.defCount).toBe(3); + expect(r.useCount).toBe(1); + }); + + it('branch merge (diamond): defs from BOTH arms reach the join use', () => { + // 0→2(def x)→{3,4 both def x}→5(use x)→1 + const cfg = mkCfg( + [ + {}, + {}, + { stmts: [stmt(10, [0])] }, + { stmts: [stmt(20, [0])] }, + { stmts: [stmt(30, [0])] }, + { stmts: [stmt(40, [], [0])] }, + ], + [ + [0, 2], + [2, 3], + [2, 4], + [3, 5], + [4, 5], + [5, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + expect(render(r.facts).sort()).toEqual(['3:0->5:0:0', '4:0->5:0:0']); + }); + + it('loop back-edge: pre-loop def AND loop-carried redef both reach the header use', () => { + // 0→2(def x)→3(use x = header)→4(def x, body)→3(back); 3→1(exit) + const cfg = mkCfg( + [ + {}, + {}, + { stmts: [stmt(10, [0])] }, + { stmts: [stmt(20, [], [0])] }, + { stmts: [stmt(30, [0])] }, + ], + [ + [0, 2], + [2, 3], + [3, 4], + [4, 3], + [3, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + expect(render(r.facts).sort()).toEqual(['2:0->3:0:0', '4:0->3:0:0']); + }); + + it('self-loop block converges with the loop-carried def visible to its own use', () => { + // block 2 loops to itself: use x; def x + const cfg = mkCfg( + [{}, {}, { stmts: [stmt(10, [], [0]), stmt(11, [0])] }], + [ + [0, 2], + [2, 2], + [2, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + // the block's own def flows around the self-loop 
into its use + expect(render(r.facts)).toEqual(['2:1->2:0:0']); + }); + + it('unreachable block: its defs reach nothing; reachable uses see only reachable defs', () => { + // 2(def x)→3(use x); 4 is DISCONNECTED and also defs x + const cfg = mkCfg( + [ + {}, + {}, + { stmts: [stmt(10, [0])] }, + { stmts: [stmt(20, [], [0])] }, + { stmts: [stmt(30, [0])] }, + ], + [ + [0, 2], + [2, 3], + [3, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + expect(render(r.facts)).toEqual(['2:0->3:0:0']); + }); + + it('intra-block sweep: a use BEFORE the same-block def sees the incoming def', () => { + // 2: def x. 3: use x (stmt0); def x (stmt1); use x (stmt2) + const cfg = mkCfg( + [ + {}, + {}, + { stmts: [stmt(10, [0])] }, + { stmts: [stmt(20, [], [0]), stmt(21, [0]), stmt(22, [], [0])] }, + ], + [ + [0, 2], + [2, 3], + [3, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + expect(render(r.facts).sort()).toEqual(['2:0->3:0:0', '3:1->3:2:0']); + }); + + it('def+use in one statement: the use sees prior defs AND the same-statement def', () => { + // StatementFacts carries no intra-statement order, so `x += 1` + // (read-then-write) and `if ((m = f()) && m.p)` (write-then-read) are + // indistinguishable — the sweep emits BOTH the prior def and the + // same-statement self-def (sound over-approximation; missing the + // assign-and-test idiom's def→use would be a taint false negative). 
+ const cfg = mkCfg( + [{}, {}, { stmts: [stmt(10, [0]), stmt(11, [0], [0])] }], + [ + [0, 2], + [2, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + expect(render(r.facts).sort()).toEqual(['2:0->2:1:0', '2:1->2:1:0']); + }); +}); + +describe('computeReachingDefs — determinism and convergence', () => { + it('permuted edge order produces byte-identical sorted facts', () => { + const blocks: BlockSpec[] = [ + {}, + {}, + { stmts: [stmt(1, [0]), stmt(2, [1])] }, + { stmts: [stmt(3, [0], [1])] }, + { stmts: [stmt(4, [1], [0])] }, + { stmts: [stmt(5, [], [0, 1])] }, + ]; + const edges: [number, number][] = [ + [0, 2], + [2, 3], + [2, 4], + [3, 5], + [4, 5], + [5, 3], + [5, 1], + ]; + const base = computeReachingDefs(mkCfg(blocks, edges, ['x', 'y'])); + for (let i = 0; i < 5; i++) { + const shuffled = [...edges].reverse(); + shuffled.push(shuffled.shift() as [number, number]); + const r = computeReachingDefs(mkCfg(blocks, shuffled, ['x', 'y'])); + expect(render(r.facts)).toEqual(render(base.facts)); + } + }); + + it('nested loops (depth 3) converge with loop-carried defs intact', () => { + // 2 chains into three nested loop headers 3,4,5; innermost body 6 defs x. 
+ const cfg = mkCfg( + [ + {}, + {}, + { stmts: [stmt(1, [0])] }, + { stmts: [stmt(2, [], [0])] }, + { stmts: [stmt(3, [], [0])] }, + { stmts: [stmt(4, [], [0])] }, + { stmts: [stmt(5, [0])] }, + ], + [ + [0, 2], + [2, 3], + [3, 4], + [4, 5], + [5, 6], + [6, 5], + [5, 4], + [4, 3], + [3, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + // every header use sees both the init def and the innermost redef + for (const useBlock of [3, 4, 5]) { + const defs = r.facts + .filter((f) => f.use.blockIndex === useBlock) + .map((f) => f.def.blockIndex); + expect(new Set(defs)).toEqual(new Set([2, 6])); + } + }); + + it('no-facts fallback: a CFG without statement facts reports no-facts, no throw', () => { + const bare: FunctionCfg = { + filePath: 'hand.ts', + functionStartLine: 1, + functionEndLine: 2, + functionStartColumn: 0, + entryIndex: 0, + exitIndex: 1, + blocks: [ + { index: 0, startLine: 1, endLine: 1, text: '', kind: 'entry' }, + { index: 1, startLine: 2, endLine: 2, text: '', kind: 'exit' }, + ], + edges: [{ from: 0, to: 1, kind: 'seq' }], + }; + const r = computeReachingDefs(bare); + expect(r.status).toBe('no-facts'); + expect(r.facts).toEqual([]); + }); + + it('maxFacts truncation: deterministic prefix + truncated status', () => { + // fan-out: 4 defs of x in parallel arms, then 4 uses → 16 facts + const arms = [2, 3, 4, 5]; + const uses = [6, 7, 8, 9]; + const blocks: BlockSpec[] = [{}, {}]; + for (const a of arms) blocks[a] = { stmts: [stmt(a, [0])] }; + for (const u of uses) blocks[u] = { stmts: [stmt(u, [], [0])] }; + const edges: [number, number][] = []; + for (const a of arms) edges.push([0, a], [a, 6]); + edges.push([6, 7], [7, 8], [8, 9], [9, 1]); + const full = computeReachingDefs(mkCfg(blocks, edges, ['x'])); + expect(full.status).toBe('computed'); + expect(full.facts).toHaveLength(16); + + const capped = computeReachingDefs(mkCfg(blocks, edges, ['x']), { maxFacts: 5 }); + expect(capped.status).toBe('truncated'); + 
expect(capped.facts).toHaveLength(5); + // deterministic prefix: re-running yields the same truncated set + const again = computeReachingDefs(mkCfg(blocks, edges, ['x']), { maxFacts: 5 }); + expect(render(again.facts)).toEqual(render(capped.facts)); + // telemetry counts are truncation-independent + expect(capped.defCount).toBe(full.defCount); + expect(capped.useCount).toBe(full.useCount); + }); +}); + +describe('computeReachingDefs — parser-direct acceptance (with U1/U2)', () => { + it('shadowing: inner let does NOT kill the outer binding across the block (R4)', () => { + const cfg = cfgOf(`function f() { + let x = 1; + { let x = 2; sink(x); } + sink(x); + }`); + const [outer, inner] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const outerUse = r.facts.filter((f) => f.bindingIdx === outer); + const innerUse = r.facts.filter((f) => f.bindingIdx === inner); + expect(innerUse).toHaveLength(1); + expect(outerUse).toHaveLength(1); + // the trailing sink(x) sees the OUTER def — the inner block didn't kill it + expect(outerUse[0].def.line).toBe(2); + expect(outerUse[0].use.line).toBe(4); + }); + + it('try/catch over-approximation: a try-body def reaches a catch-body use (R10)', () => { + const cfg = cfgOf(`function f() { + let x = seed(); + try { x = risky(); } catch (e) { sink(x); } + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const catchUses = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 3); + // BOTH the seed def and the try-body redef may reach the catch use + expect(new Set(catchUses.map((f) => f.def.line))).toEqual(new Set([2, 3])); + }); + + it('finally redefinition on the early-exit/normal paths kills the original (R9 + U2)', () => { + const cfg = cfgOf(`function f(c) { + let x = 1; + try { + if (c) { return probe(x); } + } finally { + x = 2; + } + return sink(x); + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + // the early return's use happens BEFORE finally runs → 
sees x = 1 (line 2) + const probeUse = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 4); + expect(probeUse.map((f) => f.def.line)).toEqual([2]); + // the post-try use sits behind the finally on EVERY path → sees ONLY x = 2 + const sinkUse = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 8); + expect(sinkUse.map((f) => f.def.line)).toEqual([6]); + }); + + it('params reach their uses from the ENTRY record', () => { + const cfg = cfgOf(`function f(a) { return a + 1; }`); + const [a] = nameIdx(cfg, 'a'); + const r = computeReachingDefs(cfg); + const fact = r.facts.find((f) => f.bindingIdx === a); + expect(fact).toBeDefined(); + expect(fact!.def.blockIndex).toBe(cfg.entryIndex); + }); + + it('loop-carried accumulator: both the init and in-loop defs reach the post-loop use', () => { + const cfg = cfgOf(`function f(xs) { + let sum = 0; + for (const x of xs) { sum += x; } + return sum; + }`); + const [sum] = nameIdx(cfg, 'sum'); + const r = computeReachingDefs(cfg); + const retUse = r.facts.filter((f) => f.bindingIdx === sum && f.use.line === 4); + expect(new Set(retUse.map((f) => f.def.line))).toEqual(new Set([2, 3])); + }); +}); + +describe('computeReachingDefs — tri-review soundness fixes (#2160 review)', () => { + it('may-def gen does NOT kill: prior def survives a conditional assignment (hand-built)', () => { + // block 2: def x. block 3: stmt with MAY-def of x. block 4: use x. 
+ const cfg = mkCfg( + [ + {}, + {}, + { stmts: [stmt(10, [0])] }, + { stmts: [{ line: 20, defs: [], uses: [], mayDefs: [0] }] }, + { stmts: [stmt(30, [], [0])] }, + ], + [ + [0, 2], + [2, 3], + [3, 4], + [4, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + // BOTH the unconditional def and the conditional one reach the use + expect(render(r.facts).sort()).toEqual(['2:0->4:0:0', '3:0->4:0:0']); + }); + + it('short-circuit conditional def: the not-taken path keeps the prior def (parser-direct, P1)', () => { + const cfg = cfgOf(`function f(a) { + let x = source(); + if (a && (x = clean())) {} + sink(x); + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const sinkUses = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 4); + // BOTH source (line 2) and clean (line 3) reach sink — pre-fix, source was + // falsely killed (taint false negative on the lazy-init idiom) + expect(new Set(sinkUses.map((f) => f.def.line))).toEqual(new Set([2, 3])); + }); + + it('labeled non-loop block: break keeps the real continuation (parser-direct, P1)', () => { + const cfg = cfgOf(`function f(c) { + let x = 1; + blk: { if (c) break blk; x = 2; } + sink(x); + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const sinkUses = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 4); + // the break path preserves x=1; the fall-through path redefines to x=2 + expect(new Set(sinkUses.map((f) => f.def.line))).toEqual(new Set([2, 3])); + }); + + it('doubly-labeled loop: `break outer` resolves to the loop exit, keeping post-loop facts (P1)', () => { + const cfg = cfgOf(`function f(c) { + let x = 1; + outer: inner: do { if (c) break outer; x = 2; } while (g()); + sink(x); + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const sinkUses = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 4); + expect(new Set(sinkUses.map((f) => f.def.line))).toEqual(new Set([2, 3])); + }); + + 
it('throw edges deliver INTERMEDIATE defs of a coalesced block to the handler (parser-direct, P1)', () => { + const cfg = cfgOf(`function f(a) { + let x = seed(a); + try { + x = parse(a); + x = normalize(x); + } catch (e) { + sink(x); + } + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const sinkUses = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 7); + // seed (pre-try), parse (intermediate — normalize may throw with parse's + // value live), and normalize (its own RHS use may throw) all reach sink + expect(new Set(sinkUses.map((f) => f.def.line))).toEqual(new Set([2, 4, 5])); + }); + + it('a block with ≥ STMT_STRIDE statements reports overflow with zero facts (no aliasing)', () => { + const shared = { line: 1, defs: [], uses: [] }; + const huge = new Array(1 << 21).fill(shared); + const cfg = mkCfg( + [{}, {}, { stmts: huge as StatementFacts[] }], + [ + [0, 2], + [2, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + expect(r.status).toBe('overflow'); + expect(r.facts).toEqual([]); + }); +}); diff --git a/gitnexus/test/unit/cfg/typescript-visitor.test.ts b/gitnexus/test/unit/cfg/typescript-visitor.test.ts index 8452d2d71c..7dc7711a53 100644 --- a/gitnexus/test/unit/cfg/typescript-visitor.test.ts +++ b/gitnexus/test/unit/cfg/typescript-visitor.test.ts @@ -350,7 +350,14 @@ describe('TS/JS CfgVisitor — try/catch/finally (R10)', () => { it('non-empty catch is unchanged by the empty-catch synthesis (F2 regression guard)', () => { const cfg = cfgOf(`function f() { try { a(); } catch (e) { h(); } after(); }`); - expect(throwTargets(cfg).has(block(cfg, 'h();'))).toBe(true); + // The throw lands on the handler ENTRY — since M2 that is the catch-param + // binding block (a facts-only block in front of the body), which flows + // into the body. Assert the path, not block identity. 
+ const handlerEntries = [...throwTargets(cfg)]; + expect(handlerEntries.length).toBeGreaterThan(0); + for (const t of handlerEntries) { + expect(reaches(cfg, t, block(cfg, 'h();'))).toBe(true); + } expect(reaches(cfg, block(cfg, 'h();'), block(cfg, 'after();'))).toBe(true); }); @@ -493,3 +500,211 @@ describe('TS/JS CfgVisitor — AC1: 10-function fixture', () => { } }); }); + +describe('TS/JS CfgVisitor — early exits through finally (#2082 M2 U2)', () => { + const edges = (cfg: FunctionCfg) => cfg.edges; + const edgesFrom = (cfg: FunctionCfg, from: number) => cfg.edges.filter((e) => e.from === from); + + it('return inside try-with-finally routes through finally; no direct try→EXIT return edge', () => { + const cfg = cfgOf(`function f() { try { return 1; } finally { cleanup(); } }`); + const ret = block(cfg, 'return 1'); + const fin = block(cfg, 'cleanup()'); + expect(edges(cfg)).toContainEqual({ from: ret, to: fin, kind: 'return' }); + expect(edges(cfg)).toContainEqual({ from: fin, to: cfg.exitIndex, kind: 'finally-return' }); + expect(edges(cfg)).not.toContainEqual({ from: ret, to: cfg.exitIndex, kind: 'return' }); + }); + + it('break/continue crossing a finally thread through it with finally-* completion kinds', () => { + const cfg = cfgOf(`function f(xs) { + for (const x of xs) { + try { if (x) { break; } else { continue; } } finally { f1(); } + } + }`); + const fin = block(cfg, 'f1()'); + const brk = block(cfg, 'break'); + const cont = block(cfg, 'continue'); + expect(edges(cfg)).toContainEqual({ from: brk, to: fin, kind: 'break' }); + expect(edges(cfg)).toContainEqual({ from: cont, to: fin, kind: 'continue' }); + const fromFin = edgesFrom(cfg, fin).map((e) => e.kind); + expect(fromFin).toContain('finally-break'); + expect(fromFin).toContain('finally-continue'); + }); + + it('nested finallys chain: return threads a() then b() then EXIT', () => { + const cfg = cfgOf(`function f() { try { try { return; } finally { a(); } } finally { b(); } }`); + const ret = 
block(cfg, 'return'); + const finA = block(cfg, 'a()'); + const finB = block(cfg, 'b()'); + expect(edges(cfg)).toContainEqual({ from: ret, to: finA, kind: 'return' }); + expect(edges(cfg)).toContainEqual({ from: finA, to: finB, kind: 'finally-return' }); + expect(edges(cfg)).toContainEqual({ from: finB, to: cfg.exitIndex, kind: 'finally-return' }); + }); + + it('returns in try AND catch share one deduped finally-return completion edge', () => { + const cfg = cfgOf(`function f() { + try { return t(); } catch (e) { return c(); } finally { f1(); } + }`); + const fin = block(cfg, 'f1()'); + const completions = edgesFrom(cfg, fin).filter((e) => e.kind === 'finally-return'); + expect(completions).toEqual([{ from: fin, to: cfg.exitIndex, kind: 'finally-return' }]); + expect(edges(cfg)).toContainEqual({ from: block(cfg, 't()'), to: fin, kind: 'return' }); + expect(edges(cfg)).toContainEqual({ from: block(cfg, 'c()'), to: fin, kind: 'return' }); + }); + + it('return inside catch with NO finally keeps its direct edge to EXIT', () => { + const cfg = cfgOf(`function f() { try { t(); } catch (e) { return 1; } }`); + const ret = block(cfg, 'return 1'); + expect(edges(cfg)).toContainEqual({ from: ret, to: cfg.exitIndex, kind: 'return' }); + expect(edgeKinds(cfg).has('finally-return')).toBe(false); + }); + + it('normal completion still routes through finally exactly once', () => { + const cfg = cfgOf(`function f() { try { work(); } finally { fin(); } done(); }`); + const body = block(cfg, 'work()'); + const fin = block(cfg, 'fin()'); + const seqs = edges(cfg).filter((e) => e.from === body && e.to === fin && e.kind === 'seq'); + expect(seqs).toHaveLength(1); + expect(reaches(cfg, fin, block(cfg, 'done()'))).toBe(true); + }); + + it('kind invariant: no bare jump edge originates from a finally exit block', () => { + const cfg = cfgOf(`function f(xs) { + for (const x of xs) { try { if (x) return 1; break; } finally { f1(); } } + }`); + const fin = block(cfg, 'f1()'); + for (const e of 
edgesFrom(cfg, fin)) { + expect(['return', 'break', 'continue']).not.toContain(e.kind); + } + }); + + it('non-crossing break (loop wholly inside try) keeps its direct edge — no finally threading', () => { + const cfg = cfgOf(`function f(xs) { + try { for (const x of xs) { break; } post(); } finally { f1(); } + }`); + const brk = block(cfg, 'break'); + const fin = block(cfg, 'f1()'); + const brkEdges = edgesFrom(cfg, brk).filter((e) => e.kind === 'break'); + expect(brkEdges).toHaveLength(1); + expect(brkEdges[0].to).not.toBe(fin); + // the break's continuation (post()) is reachable WITHOUT passing the finally + expect(reaches(cfg, brkEdges[0].to, block(cfg, 'post()'))).toBe(true); + expect(edgeKinds(cfg).has('finally-break')).toBe(false); + // normal try completion still routes through finally + expect(edges(cfg)).toContainEqual({ + from: block(cfg, 'post()'), + to: fin, + kind: 'seq', + }); + }); + + it('labeled break crossing the finally DOES thread', () => { + const cfg = cfgOf(`function f(xs) { + outer: for (const x of xs) { + try { break outer; } finally { f1(); } + } + }`); + const brk = block(cfg, 'break outer'); + const fin = block(cfg, 'f1()'); + expect(edges(cfg)).toContainEqual({ from: brk, to: fin, kind: 'break' }); + expect(edgesFrom(cfg, fin).some((e) => e.kind === 'finally-break')).toBe(true); + }); + + it('empty finally: jump keeps its direct edge, no finally-* kinds, no throw', () => { + const cfg = cfgOf(`function f() { try { return 1; } finally {} }`); + const ret = block(cfg, 'return 1'); + expect(edges(cfg)).toContainEqual({ from: ret, to: cfg.exitIndex, kind: 'return' }); + expect(edgeKinds(cfg).has('finally-return')).toBe(false); + }); + + it('finally that itself returns: its return wins; no dangling completion edges', () => { + const cfg = cfgOf(`function f() { try { return 1; } finally { return 2; } }`); + const finRet = block(cfg, 'return 2'); + expect(edges(cfg)).toContainEqual({ from: finRet, to: cfg.exitIndex, kind: 'return' }); + // the 
pending completion had no finally exits to attach to + expect(edgeKinds(cfg).has('finally-return')).toBe(false); + // every edge endpoint is in range (no dangling) + for (const e of edges(cfg)) { + expect(e.to).toBeGreaterThanOrEqual(0); + expect(e.to).toBeLessThan(cfg.blocks.length); + } + }); + + it('single-exit invariant: EXIT reachable, all blocks have a path onward', () => { + const cfg = cfgOf(`function f(xs) { + outer: for (const x of xs) { + try { + try { if (x) { continue outer; } return g(x); } finally { a(); } + } finally { b(); } + } + }`); + expect(reaches(cfg, cfg.entryIndex, cfg.exitIndex)).toBe(true); + for (const b of cfg.blocks) { + if (b.index === cfg.exitIndex) continue; + if (!reachable(cfg, b.index)) continue; // unreachable blocks exempt + expect(reaches(cfg, b.index, cfg.exitIndex)).toBe(true); + } + }); +}); + +describe('TS/JS CfgVisitor — labeled statements modeled generically (#2160 review)', () => { + it('break to a labeled non-loop block targets the synthesized join, not EXIT', () => { + const cfg = cfgOf(`function f(c) { + let x = 1; + blk: { if (c) { break blk; } x = 2; } + sink(x); + }`); + const brk = block(cfg, 'break blk'); + const sink = block(cfg, 'sink(x)'); + const brkEdges = cfg.edges.filter((e) => e.from === brk && e.kind === 'break'); + expect(brkEdges).toHaveLength(1); + expect(brkEdges[0].to).not.toBe(cfg.exitIndex); + // the break's target flows into the post-construct continuation + expect(reaches(cfg, brkEdges[0].to, sink)).toBe(true); + }); + + it('doubly-labeled loop: break to the OUTER label resolves to the loop exit', () => { + const cfg = cfgOf(`function f(c) { + outer: inner: do { if (c) { break outer; } work(); } while (g()); + done(); + }`); + const brk = block(cfg, 'break outer'); + const done = block(cfg, 'done()'); + const brkEdges = cfg.edges.filter((e) => e.from === brk && e.kind === 'break'); + expect(brkEdges).toHaveLength(1); + expect(brkEdges[0].to).not.toBe(cfg.exitIndex); + expect(reaches(cfg, 
brkEdges[0].to, done)).toBe(true); + }); + + it('labeled break crossing a finally still threads it (labels + finalizers compose)', () => { + const cfg = cfgOf(`function f(c) { + blk: { + try { if (c) { break blk; } } finally { f1(); } + rest(); + } + after(); + }`); + const brk = block(cfg, 'break blk'); + const fin = block(cfg, 'f1()'); + expect(cfg.edges).toContainEqual({ from: brk, to: fin, kind: 'break' }); + const completions = cfg.edges.filter((e) => e.from === fin && e.kind === 'finally-break'); + expect(completions).toHaveLength(1); + // the completion resumes at the block's join → after() reachable, rest() skipped on that path + expect(reaches(cfg, completions[0].to, block(cfg, 'after()'))).toBe(true); + expect(completions[0].to).not.toBe(block(cfg, 'rest()')); + }); + + it('an unlabeled break inside a labeled block still targets the enclosing loop', () => { + const cfg = cfgOf(`function f(xs) { + for (const x of xs) { + blk: { if (x) { break; } } + body(); + } + done(); + }`); + const brk = block(cfg, 'break'); + const brkEdges = cfg.edges.filter((e) => e.from === brk && e.kind === 'break'); + expect(brkEdges).toHaveLength(1); + // targets the LOOP exit (reaches done() without re-entering body()) + expect(reaches(cfg, brkEdges[0].to, block(cfg, 'done()'))).toBe(true); + }); +}); diff --git a/gitnexus/test/unit/pdg-mode-flip.test.ts b/gitnexus/test/unit/pdg-mode-flip.test.ts index 1000ef9520..f258ecb8a0 100644 --- a/gitnexus/test/unit/pdg-mode-flip.test.ts +++ b/gitnexus/test/unit/pdg-mode-flip.test.ts @@ -34,6 +34,80 @@ async function countBasicBlocks(repoPath: string): Promise { } } +describe('pdgModeMismatch — M1→M2 stamp upgrade (#2082 M2, pure)', () => { + it('an M1-era stamp (no REACHING_DEF cap) mismatches an M2 request — upgrade forces full writeback', async () => { + const { pdgModeMismatch } = await import('../../src/core/run-analyze.js'); + const m1Stamp = { maxFunctionLines: 2000, maxEdgesPerFunction: 5000 }; + // default M2 request resolves 
maxReachingDefEdgesPerFunction=4000 ≠ undefined + expect(pdgModeMismatch(m1Stamp, { pdg: true })).toBe(true); + }); + + it('an identical resolved M2 config compares equal (steady state keeps incremental)', async () => { + const { pdgModeMismatch, resolvePdgConfig } = await import('../../src/core/run-analyze.js'); + const stamp = resolvePdgConfig({ pdg: true }); + expect(pdgModeMismatch(stamp, { pdg: true })).toBe(false); + }); + + it('a REACHING_DEF cap change alone trips the mismatch', async () => { + const { pdgModeMismatch, resolvePdgConfig } = await import('../../src/core/run-analyze.js'); + const stamp = resolvePdgConfig({ pdg: true }); + expect(pdgModeMismatch(stamp, { pdg: true, pdgMaxReachingDefEdgesPerFunction: 100 })).toBe( + true, + ); + expect(pdgModeMismatch(stamp, { pdg: true, pdgMaxReachingDefEdgesPerFunction: 4000 })).toBe( + false, // explicit default ≡ default (resolution before comparison) + ); + }); +}); + +describe('detect_changes BasicBlock exclusion (#2082 U7)', () => { + it('the symbol-overlap id-prefix filter excludes exactly the BasicBlock rows', async () => { + const repo = await setupMiniRepo(); + try { + const { runFullAnalysis } = await import('../../src/core/run-analyze.js'); + const cb = { onProgress: () => {}, onLog: () => {} }; + await runFullAnalysis(repo.dbPath, { skipAgentsMd: true, pdg: true }, cb); + + const adapter = await import('../../src/core/lbug/lbug-adapter.js'); + const { lbugPath } = getStoragePaths(repo.dbPath); + await adapter.initLbug(lbugPath); + try { + // Counterfactual: WITHOUT the U7 filter, line-bearing BasicBlock rows + // exist on a pdg index (the noise detect_changes used to report). 
+ const blocks = (await adapter.executeQuery( + `MATCH (n) WHERE n.id STARTS WITH 'BasicBlock:' + AND n.startLine IS NOT NULL AND n.endLine IS NOT NULL + RETURN n.id AS id`, + )) as Array<{ id: string }>; + expect(blocks.length).toBeGreaterThan(0); + // With the U7 filter (the exact predicate detectChanges now runs — + // also validates STARTS WITH against the real engine): no BasicBlocks, + // real symbols intact. + const symbols = (await adapter.executeQuery( + `MATCH (n) WHERE NOT n.id STARTS WITH 'BasicBlock:' + AND n.startLine IS NOT NULL AND n.endLine IS NOT NULL + RETURN n.id AS id`, + )) as Array<{ id: string }>; + expect(symbols.length).toBeGreaterThan(0); + for (const row of symbols) { + expect(String(row.id)).not.toMatch(/^BasicBlock:/); + } + // DB-level smoke for the M2 projection itself: REACHING_DEF rows + // persisted with the variable name in `reason` (plan Validation). + const rd = (await adapter.executeQuery( + `MATCH (:BasicBlock)-[r:CodeRelation {type: 'REACHING_DEF'}]->(:BasicBlock) + RETURN count(r) AS c`, + )) as Array<{ c: number | bigint }>; + expect(Number(rd[0]?.c ?? 
0)).toBeGreaterThan(0); + } finally { + await adapter.closeLbug(); + } + } finally { + await repo.cleanup(); + } + }, 600_000); +}); + describe('runFullAnalysis — pdg-mode flip (#2099 F1)', () => { it('off→on flip forces a full writeback that persists the CFG layer; on→off removes it', async () => { const repo = await setupMiniRepo(); @@ -57,7 +131,11 @@ describe('runFullAnalysis — pdg-mode flip (#2099 F1)', () => { expect(logs.some((m) => m.includes('pdg mode changed'))).toBe(true); expect(await countBasicBlocks(repo.dbPath)).toBeGreaterThan(0); const stamped = await loadMeta(storagePath); - expect(stamped!.pdg).toEqual({ maxFunctionLines: 2000, maxEdgesPerFunction: 5000 }); + expect(stamped!.pdg).toEqual({ + maxFunctionLines: 2000, + maxEdgesPerFunction: 5000, + maxReachingDefEdgesPerFunction: 4000, + }); expect(stamped!.incrementalInProgress).toBeUndefined(); // cleared on success // 3. Steady state: a second identical --pdg run takes the fast path — @@ -105,6 +183,7 @@ describe('runFullAnalysis — pdg-mode flip (#2099 F1)', () => { expect((await loadMeta(storagePath))!.pdg).toEqual({ maxFunctionLines: 2000, maxEdgesPerFunction: 1, + maxReachingDefEdgesPerFunction: 4000, }); // The CFG layer survives a rebuild under a tighter edge cap (blocks are // never capped, only edges). diff --git a/gitnexus/test/unit/run-analyze.test.ts b/gitnexus/test/unit/run-analyze.test.ts index ba80bde8fd..9790f337cb 100644 --- a/gitnexus/test/unit/run-analyze.test.ts +++ b/gitnexus/test/unit/run-analyze.test.ts @@ -330,7 +330,14 @@ describe('deriveEmbeddingCap', () => { }); describe('pdgModeMismatch / resolvePdgConfig (#2099 F1)', () => { - const DEFAULTS = { maxFunctionLines: 2000, maxEdgesPerFunction: 5000 }; + // M2 (#2082) added the resolved REACHING_DEF cap to the stamp; these tests + // model M2 STEADY-STATE equality. The M1-era-stamp (field absent) upgrade + // path is pinned in pdg-mode-flip.test.ts. 
+ const DEFAULTS = { + maxFunctionLines: 2000, + maxEdgesPerFunction: 5000, + maxReachingDefEdgesPerFunction: 4000, + }; it('resolvePdgConfig: pdg-off run resolves to undefined (the meta field is omitted)', async () => { const { resolvePdgConfig } = await import('../../src/core/run-analyze.js'); @@ -342,8 +349,13 @@ describe('pdgModeMismatch / resolvePdgConfig (#2099 F1)', () => { const { resolvePdgConfig } = await import('../../src/core/run-analyze.js'); expect(resolvePdgConfig({ pdg: true })).toEqual(DEFAULTS); expect( - resolvePdgConfig({ pdg: true, pdgMaxFunctionLines: 0, pdgMaxEdgesPerFunction: 0 }), - ).toEqual({ maxFunctionLines: 0, maxEdgesPerFunction: 0 }); + resolvePdgConfig({ + pdg: true, + pdgMaxFunctionLines: 0, + pdgMaxEdgesPerFunction: 0, + pdgMaxReachingDefEdgesPerFunction: 0, + }), + ).toEqual({ maxFunctionLines: 0, maxEdgesPerFunction: 0, maxReachingDefEdgesPerFunction: 0 }); }); it('legacy meta (no recorded stamp) + plain run → no mismatch', async () => {