From cb4b4e95fcffb07725fd02df028a86ff8b4b62da Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 10 Jun 2026 20:17:51 +0000 Subject: [PATCH 01/19] fix(cfg): route early exits through finally with target-relative threading (#2082 U2) --- .../ingestion/cfg/control-flow-context.ts | 99 ++++++++++-- gitnexus/src/core/ingestion/cfg/types.ts | 24 ++- .../core/ingestion/cfg/visitors/typescript.ts | 98 +++++++++--- .../test/unit/cfg/typescript-visitor.test.ts | 147 ++++++++++++++++++ 4 files changed, 327 insertions(+), 41 deletions(-) diff --git a/gitnexus/src/core/ingestion/cfg/control-flow-context.ts b/gitnexus/src/core/ingestion/cfg/control-flow-context.ts index 38c7bcbb8a..7b54865a85 100644 --- a/gitnexus/src/core/ingestion/cfg/control-flow-context.ts +++ b/gitnexus/src/core/ingestion/cfg/control-flow-context.ts @@ -1,12 +1,23 @@ /** - * ControlFlowContext (issue #2081, M1). + * ControlFlowContext (issue #2081 M1; finalizer frames added by #2082 M2 U2). * * Resolves the targets of `break`/`continue` (plain and labeled) as the visitor * descends through loops and switches. Loops and switches push a target frame * on entry and pop it on exit; a labeled statement attaches its label to the * frame of the construct it labels, so `break outer` / `continue outer` resolve * against the right enclosing loop/switch rather than the nearest one. + * + * M2 adds FINALIZER frames, interleaved on the SAME stack as loop/switch frames + * — interleaving is load-bearing: a jump must route through exactly the + * `finally` bodies lexically BETWEEN it and its target (target-relative + * threading). A `break` whose loop lives entirely inside the `try` crosses no + * finally and must keep its direct edge; re-routing it anyway would force the + * only path to the in-try continuation through the finally, letting a finally + * redefinition falsely KILL in-loop definitions for the downstream + * reaching-defs pass (a taint false negative). 
A parallel stack cannot express + * that between-ness, which is why the frames live here. */ +import type { CfgEdgeKind } from './types.js'; interface LoopFrame { readonly kind: 'loop'; @@ -24,7 +35,26 @@ interface SwitchFrame { readonly label?: string; } -type Frame = LoopFrame | SwitchFrame; +/** A `finally` whose body any crossing jump must route through. */ +export interface FinalizerFrame { + readonly kind: 'finalizer'; + /** Entry block of the finally body. */ + readonly entry: number; + /** + * Completion legs registered by jumps that crossed this finally: once the + * owning try pops the frame, it wires `finally-exits → to` with `kind` for + * each entry. Mutated by the jump handlers via {@link ControlFlowContext}. + */ + readonly pending: { to: number; kind: CfgEdgeKind }[]; +} + +type Frame = LoopFrame | SwitchFrame | FinalizerFrame; + +/** A resolved jump: its ultimate target + the finallys it crosses (inner→outer). */ +export interface JumpResolution { + readonly target: number; + readonly finalizers: readonly FinalizerFrame[]; +} export class ControlFlowContext { private readonly stack: Frame[] = []; @@ -37,33 +67,72 @@ export class ControlFlowContext { this.stack.push({ kind: 'switch', breakTo, label }); } + /** + * Push a finalizer frame and return it — the owning `visitTry` keeps the + * reference to wire {@link FinalizerFrame.pending} after popping it. + */ + pushFinalizer(entry: number): FinalizerFrame { + const frame: FinalizerFrame = { kind: 'finalizer', entry, pending: [] }; + this.stack.push(frame); + return frame; + } + pop(): void { this.stack.pop(); } /** - * Target block for a `break`. With a label, the nearest enclosing frame - * carrying that label (loop or switch); without, the nearest frame of any - * kind. Returns `undefined` if there is no valid target (malformed input). 
+ * Resolve a `break`: the nearest enclosing loop/switch frame (or, with a + * label, the nearest frame carrying that label) plus every finalizer frame + * stacked ABOVE it — i.e. exactly the finallys the jump crosses, innermost + * first. Returns `undefined` if there is no valid target (malformed input or + * an unmodeled label) — the caller falls back to its conservative routing and + * threads nothing. */ - breakTarget(label?: string): number | undefined { + resolveBreak(label?: string): JumpResolution | undefined { + return this.resolve((f) => label === undefined || f.label === label); + } + + /** Resolve a `continue`: like {@link resolveBreak} but only loop frames match. */ + resolveContinue(label?: string): JumpResolution | undefined { + return this.resolve( + (f) => f.kind === 'loop' && (label === undefined || f.label === label), + (f) => (f as LoopFrame).continueTo, + ); + } + + /** Every active finalizer, innermost first — what a `return` must cross. */ + finalizersForReturn(): readonly FinalizerFrame[] { + const fins: FinalizerFrame[] = []; for (let i = this.stack.length - 1; i >= 0; i--) { const f = this.stack[i]; - if (label === undefined || f.label === label) return f.breakTo; + if (f.kind === 'finalizer') fins.push(f); } - return undefined; + return fins; } - /** - * Target block for a `continue`. With a label, the nearest enclosing **loop** - * carrying that label; without, the nearest loop (switches are skipped — you - * cannot `continue` a switch). Returns `undefined` if there is no valid loop. - */ + /** Target block for a `break` (no finalizer info) — see {@link resolveBreak}. */ + breakTarget(label?: string): number | undefined { + return this.resolveBreak(label)?.target; + } + + /** Target block for a `continue` (no finalizer info) — see {@link resolveContinue}. 
*/ continueTarget(label?: string): number | undefined { + return this.resolveContinue(label)?.target; + } + + private resolve( + matches: (f: LoopFrame | SwitchFrame) => boolean, + targetOf: (f: LoopFrame | SwitchFrame) => number = (f) => f.breakTo, + ): JumpResolution | undefined { + const crossed: FinalizerFrame[] = []; for (let i = this.stack.length - 1; i >= 0; i--) { const f = this.stack[i]; - if (f.kind !== 'loop') continue; - if (label === undefined || f.label === label) return f.continueTo; + if (f.kind === 'finalizer') { + crossed.push(f); + continue; + } + if (matches(f)) return { target: targetOf(f), finalizers: crossed }; } return undefined; } diff --git a/gitnexus/src/core/ingestion/cfg/types.ts b/gitnexus/src/core/ingestion/cfg/types.ts index 0b28c60898..72d2c0a9b8 100644 --- a/gitnexus/src/core/ingestion/cfg/types.ts +++ b/gitnexus/src/core/ingestion/cfg/types.ts @@ -22,18 +22,32 @@ export interface BasicBlockData { readonly kind: 'entry' | 'exit' | 'normal'; } -/** Why one block flows to another — drives the `reason` on the emitted CFG edge. */ +/** + * Why one block flows to another — drives the `reason` on the emitted CFG edge. + * + * Kind invariant (M2): a bare jump kind (`return`/`break`/`continue`) means the + * SOURCE block's terminator is that jump statement. A `finally-*` kind marks a + * COMPLETION edge out of a `finally` body's exit — the leg that resumes a jump + * which was re-routed through the finally (issue #2082 U2). Reusing the bare + * kinds on completion edges would silently break consumers that infer the + * source block's terminator from the kind, and a single generic kind would lose + * WHICH jump each completion edge completes when a shared finally has several + * pending targets. 
+ */ export type CfgEdgeKind = | 'seq' // straight-line fallthrough | 'cond-true' // branch taken (if/while/for condition true) | 'cond-false' // branch not taken / loop exit | 'loop-back' // back-edge to a loop header - | 'break' // break → loop/switch exit - | 'continue' // continue → loop header - | 'return' // return → function EXIT + | 'break' // break → loop/switch exit (or the finally it must cross) + | 'continue' // continue → loop header (or the finally it must cross) + | 'return' // return → function EXIT (or the finally it must cross) | 'throw' // throw → nearest handler / finally / EXIT | 'switch-case' // dispatch to a case - | 'fallthrough'; // switch case → next case (no break) + | 'fallthrough' // switch case → next case (no break) + | 'finally-return' // finally exit → resumed return target (EXIT / outer finally) + | 'finally-break' // finally exit → resumed break target + | 'finally-continue'; // finally exit → resumed continue target export interface CfgEdgeData { readonly from: number; diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts index 79643b8d42..4166f1b168 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts @@ -18,23 +18,28 @@ * - `try/catch/finally` routes both normal completion AND a `throw` in the try * through `finally` (the finally block post-dominates the try/catch); a * `throw` with no catch propagates through finally to the enclosing handler. + * - EARLY EXITS THROUGH FINALLY (#2082 M2 U2, closes the M1 soundness gap): a + * `break`/`continue`/`return` whose jump CROSSES a `finally` is re-routed to + * the finally entry (keeping its bare jump kind), and the finally's exits + * gain a `finally-return`/`finally-break`/`finally-continue` completion edge + * to the resumed target. 
Threading is TARGET-RELATIVE via finalizer frames + * interleaved on the {@link ControlFlowContext} stack: only the finallys + * lexically between the jump and its target thread (a `break` whose loop is + * wholly inside the try keeps its direct edge — re-routing it would let a + * finally redefinition falsely kill in-loop defs for reaching-defs). Nested + * finallys chain inner→outer; finally-as-shared-join conflates exit paths + * (sound over-approximation; duplication-per-exit-path was rejected). An + * empty/comment-only finally pushes no frame — jumps keep direct edges. * - labeled `break`/`continue` resolve against the labeled loop's frame. * - * Known M1 limitations: - * - SOUNDNESS GAP (M2 blocker, not mere precision): a non-local jump - * (`break`/`continue`/`return`) out of a `try` that has a `finally` edges - * directly to its target rather than routing THROUGH the `finally` block - * first. A future taint/PDG pass will therefore MISS flow mediated by a - * `finally` on the early-exit path (e.g. a value the `finally` taints or - * sanitizes before the `return` reaches its target) — a false negative. The - * general fix duplicates `finally` per exit path; deferred past M1 and - * tracked for M2. Normal completion and `throw` DO route through `finally`. + * Known limitations: * - A `break`/`continue` to a label on a non-loop/non-switch block, and the * OUTER label of a doubly-labeled construct (`outer: inner: for (...)`), are * not modeled. The jump is conservatively routed to the function EXIT (a * sound over-approximation that keeps the graph single-exit — see visitBreak) - * rather than left as a dangling sink; only the precise labeled target is - * unmodeled. Single-labeled loops/switches resolve correctly. + * rather than left as a dangling sink, and threads no finallys (target + * unknown ⇒ crossed set unknown). Single-labeled loops/switches resolve + * correctly, including across finallys. 
* * Block/edge accounting and reachability are pinned in * `test/unit/cfg/cfg-builder.test.ts` (core) and @@ -42,9 +47,9 @@ */ import type { SyntaxNode } from '../../utils/ast-helpers.js'; import { CfgBuilder } from '../cfg-builder.js'; -import { ControlFlowContext } from '../control-flow-context.js'; +import { ControlFlowContext, type FinalizerFrame } from '../control-flow-context.js'; import type { TraversalResult } from '../traversal-result.js'; -import type { CfgVisitor, FunctionCfg } from '../types.js'; +import type { CfgEdgeKind, CfgVisitor, FunctionCfg } from '../types.js'; /** TS/JS node types that own a CFG-bearing function body. */ const TS_FUNCTION_TYPES = new Set([ @@ -198,10 +203,37 @@ class TsCfgWalk { private visitReturn(stmt: SyntaxNode): TraversalResult { const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); - this.builder.edge(idx, this.builder.exitIndex, 'return'); + // A return crosses EVERY active finally before reaching EXIT. + this.jumpVia(idx, this.cfc.finalizersForReturn(), this.builder.exitIndex, 'return'); return { entry: idx, exits: [] }; } + /** + * Wire a jump from `from` to `target`, routing through the finallys it + * crosses (innermost first). The first leg keeps the bare jump `kind` + * (preserving the "kind ⟹ source-block terminator" invariant); each + * finally's completion leg is registered as pending on its frame with the + * matching `finally-*` kind and wired by the owning `visitTry` once the + * finally's exits are known to it (#2082 M2 U2 / KTD5). + */ + private jumpVia( + from: number, + finalizers: readonly FinalizerFrame[], + target: number, + kind: 'return' | 'break' | 'continue', + ): void { + if (finalizers.length === 0) { + this.builder.edge(from, target, kind); + return; + } + const completionKind = `finally-${kind}` as CfgEdgeKind; + this.builder.edge(from, finalizers[0].entry, kind); + for (let i = 0; i < finalizers.length; i++) { + const to = i + 1 < finalizers.length ? 
finalizers[i + 1].entry : target; + finalizers[i].pending.push({ to, kind: completionKind }); + } + } + private visitThrow(stmt: SyntaxNode): TraversalResult { const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); this.builder.edge(idx, this.currentHandler(), 'throw'); @@ -210,22 +242,25 @@ class TsCfgWalk { private visitBreak(stmt: SyntaxNode): TraversalResult { const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); - const target = this.cfc.breakTarget(this.labelOf(stmt)); - // An unresolved target — a label this M1 visitor doesn't model (a stacked + const res = this.cfc.resolveBreak(this.labelOf(stmt)); + // An unresolved target — a label this visitor doesn't model (a stacked // outer label like `outer: inner: for`, or a labeled non-loop block) — // would otherwise leave this block with NO out-edge, stranding it and // breaking the single-exit invariant a downstream post-dominator / PDG pass // relies on. Conservatively route an unresolved jump to the function EXIT - // ("escapes the function"): sound over-approximation, keeps single-exit. - this.builder.edge(idx, target ?? this.builder.exitIndex, 'break'); + // ("escapes the function"): sound over-approximation, keeps single-exit; + // no finallys thread (the target is unknown, so the crossed set is too). + if (res) this.jumpVia(idx, res.finalizers, res.target, 'break'); + else this.builder.edge(idx, this.builder.exitIndex, 'break'); return { entry: idx, exits: [] }; } private visitContinue(stmt: SyntaxNode): TraversalResult { const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); - const target = this.cfc.continueTarget(this.labelOf(stmt)); + const res = this.cfc.resolveContinue(this.labelOf(stmt)); // See visitBreak: an unresolved label routes to EXIT to preserve single-exit. - this.builder.edge(idx, target ?? 
this.builder.exitIndex, 'continue'); + if (res) this.jumpVia(idx, res.finalizers, res.target, 'continue'); + else this.builder.edge(idx, this.builder.exitIndex, 'continue'); return { entry: idx, exits: [] }; } @@ -453,11 +488,20 @@ class TsCfgWalk { } // Build finally first so its entry is known as both a normal join and a - // handler target. The finally body runs in the OUTER handler context. + // handler target. The finally body runs in the OUTER handler context — and + // OUTSIDE this try's finalizer frame: a return inside the finally must not + // thread itself (it threads only outer finallys, matching JS semantics). const finallyRes = finallyClause ? this.visitSeq(this.statementsOf(this.bodyBlockOf(finallyClause) as SyntaxNode)) : null; + // Finalizer frame for early-exit threading (#2082 M2 U2): active while the + // catch and protected bodies are walked, so a crossing `return`/`break`/ + // `continue` inside either routes through the finally. An empty/comment-only + // finally (`finallyRes` null — the #2099-F2 empty-catch bug shape) pushes + // NO frame: it can define nothing, so jumps soundly keep direct edges. + const finFrame = finallyRes ? this.cfc.pushFinalizer(finallyRes.entry) : null; + // A throw inside catch propagates to finally (if any), else the outer handler. let catchRes: SeqResult = null; if (catchClause) { @@ -500,6 +544,18 @@ class TsCfgWalk { } } + // The finalizer frame closes once the protected/catch walks are done; any + // jumps that crossed it left their completion legs on `pending`, wired + // here from the finally's exits. A finally that itself always jumps + // (`finally { return 2; }`) has no exits — pending legs wire nowhere, + // matching JS's finally-override semantics. 
+ if (finFrame && finallyRes) { + this.cfc.pop(); + for (const p of finFrame.pending) { + this.builder.connect(finallyRes.exits, p.to, p.kind); + } + } + const exits: number[] = []; if (finallyRes) { // Normal completion of try AND catch both flow through finally. diff --git a/gitnexus/test/unit/cfg/typescript-visitor.test.ts b/gitnexus/test/unit/cfg/typescript-visitor.test.ts index 8452d2d71c..ef97bc44a2 100644 --- a/gitnexus/test/unit/cfg/typescript-visitor.test.ts +++ b/gitnexus/test/unit/cfg/typescript-visitor.test.ts @@ -493,3 +493,150 @@ describe('TS/JS CfgVisitor — AC1: 10-function fixture', () => { } }); }); + +describe('TS/JS CfgVisitor — early exits through finally (#2082 M2 U2)', () => { + const edges = (cfg: FunctionCfg) => cfg.edges; + const edgesFrom = (cfg: FunctionCfg, from: number) => cfg.edges.filter((e) => e.from === from); + + it('return inside try-with-finally routes through finally; no direct try→EXIT return edge', () => { + const cfg = cfgOf(`function f() { try { return 1; } finally { cleanup(); } }`); + const ret = block(cfg, 'return 1'); + const fin = block(cfg, 'cleanup()'); + expect(edges(cfg)).toContainEqual({ from: ret, to: fin, kind: 'return' }); + expect(edges(cfg)).toContainEqual({ from: fin, to: cfg.exitIndex, kind: 'finally-return' }); + expect(edges(cfg)).not.toContainEqual({ from: ret, to: cfg.exitIndex, kind: 'return' }); + }); + + it('break/continue crossing a finally thread through it with finally-* completion kinds', () => { + const cfg = cfgOf(`function f(xs) { + for (const x of xs) { + try { if (x) { break; } else { continue; } } finally { f1(); } + } + }`); + const fin = block(cfg, 'f1()'); + const brk = block(cfg, 'break'); + const cont = block(cfg, 'continue'); + expect(edges(cfg)).toContainEqual({ from: brk, to: fin, kind: 'break' }); + expect(edges(cfg)).toContainEqual({ from: cont, to: fin, kind: 'continue' }); + const fromFin = edgesFrom(cfg, fin).map((e) => e.kind); + expect(fromFin).toContain('finally-break'); + 
expect(fromFin).toContain('finally-continue'); + }); + + it('nested finallys chain: return threads a() then b() then EXIT', () => { + const cfg = cfgOf( + `function f() { try { try { return; } finally { a(); } } finally { b(); } }`, + ); + const ret = block(cfg, 'return'); + const finA = block(cfg, 'a()'); + const finB = block(cfg, 'b()'); + expect(edges(cfg)).toContainEqual({ from: ret, to: finA, kind: 'return' }); + expect(edges(cfg)).toContainEqual({ from: finA, to: finB, kind: 'finally-return' }); + expect(edges(cfg)).toContainEqual({ from: finB, to: cfg.exitIndex, kind: 'finally-return' }); + }); + + it('returns in try AND catch share one deduped finally-return completion edge', () => { + const cfg = cfgOf(`function f() { + try { return t(); } catch (e) { return c(); } finally { f1(); } + }`); + const fin = block(cfg, 'f1()'); + const completions = edgesFrom(cfg, fin).filter((e) => e.kind === 'finally-return'); + expect(completions).toEqual([{ from: fin, to: cfg.exitIndex, kind: 'finally-return' }]); + expect(edges(cfg)).toContainEqual({ from: block(cfg, 't()'), to: fin, kind: 'return' }); + expect(edges(cfg)).toContainEqual({ from: block(cfg, 'c()'), to: fin, kind: 'return' }); + }); + + it('return inside catch with NO finally keeps its direct edge to EXIT', () => { + const cfg = cfgOf(`function f() { try { t(); } catch (e) { return 1; } }`); + const ret = block(cfg, 'return 1'); + expect(edges(cfg)).toContainEqual({ from: ret, to: cfg.exitIndex, kind: 'return' }); + expect(edgeKinds(cfg).has('finally-return')).toBe(false); + }); + + it('normal completion still routes through finally exactly once', () => { + const cfg = cfgOf(`function f() { try { work(); } finally { fin(); } done(); }`); + const body = block(cfg, 'work()'); + const fin = block(cfg, 'fin()'); + const seqs = edges(cfg).filter((e) => e.from === body && e.to === fin && e.kind === 'seq'); + expect(seqs).toHaveLength(1); + expect(reaches(cfg, fin, block(cfg, 'done()'))).toBe(true); + }); + + 
it('kind invariant: no bare jump edge originates from a finally exit block', () => { + const cfg = cfgOf(`function f(xs) { + for (const x of xs) { try { if (x) return 1; break; } finally { f1(); } } + }`); + const fin = block(cfg, 'f1()'); + for (const e of edgesFrom(cfg, fin)) { + expect(['return', 'break', 'continue']).not.toContain(e.kind); + } + }); + + it('non-crossing break (loop wholly inside try) keeps its direct edge — no finally threading', () => { + const cfg = cfgOf(`function f(xs) { + try { for (const x of xs) { break; } post(); } finally { f1(); } + }`); + const brk = block(cfg, 'break'); + const fin = block(cfg, 'f1()'); + const brkEdges = edgesFrom(cfg, brk).filter((e) => e.kind === 'break'); + expect(brkEdges).toHaveLength(1); + expect(brkEdges[0].to).not.toBe(fin); + // the break's continuation (post()) is reachable WITHOUT passing the finally + expect(reaches(cfg, brkEdges[0].to, block(cfg, 'post()'))).toBe(true); + expect(edgeKinds(cfg).has('finally-break')).toBe(false); + // normal try completion still routes through finally + expect(edges(cfg)).toContainEqual({ + from: block(cfg, 'post()'), + to: fin, + kind: 'seq', + }); + }); + + it('labeled break crossing the finally DOES thread', () => { + const cfg = cfgOf(`function f(xs) { + outer: for (const x of xs) { + try { break outer; } finally { f1(); } + } + }`); + const brk = block(cfg, 'break outer'); + const fin = block(cfg, 'f1()'); + expect(edges(cfg)).toContainEqual({ from: brk, to: fin, kind: 'break' }); + expect(edgesFrom(cfg, fin).some((e) => e.kind === 'finally-break')).toBe(true); + }); + + it('empty finally: jump keeps its direct edge, no finally-* kinds, no throw', () => { + const cfg = cfgOf(`function f() { try { return 1; } finally {} }`); + const ret = block(cfg, 'return 1'); + expect(edges(cfg)).toContainEqual({ from: ret, to: cfg.exitIndex, kind: 'return' }); + expect(edgeKinds(cfg).has('finally-return')).toBe(false); + }); + + it('finally that itself returns: its return wins; 
no dangling completion edges', () => { + const cfg = cfgOf(`function f() { try { return 1; } finally { return 2; } }`); + const finRet = block(cfg, 'return 2'); + expect(edges(cfg)).toContainEqual({ from: finRet, to: cfg.exitIndex, kind: 'return' }); + // the pending completion had no finally exits to attach to + expect(edgeKinds(cfg).has('finally-return')).toBe(false); + // every edge endpoint is in range (no dangling) + for (const e of edges(cfg)) { + expect(e.to).toBeGreaterThanOrEqual(0); + expect(e.to).toBeLessThan(cfg.blocks.length); + } + }); + + it('single-exit invariant: EXIT reachable, all blocks have a path onward', () => { + const cfg = cfgOf(`function f(xs) { + outer: for (const x of xs) { + try { + try { if (x) { continue outer; } return g(x); } finally { a(); } + } finally { b(); } + } + }`); + expect(reaches(cfg, cfg.entryIndex, cfg.exitIndex)).toBe(true); + for (const b of cfg.blocks) { + if (b.index === cfg.exitIndex) continue; + if (!reachable(cfg, b.index)) continue; // unreachable blocks exempt + expect(reaches(cfg, b.index, cfg.exitIndex)).toBe(true); + } + }); +}); From d72c8bd2572213ac4792c914715eef844030080b Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 10 Jun 2026 20:29:24 +0000 Subject: [PATCH 02/19] feat(cfg): harvest per-statement def/use facts into the side channel (#2082 U1) --- .../src/core/ingestion/cfg/cfg-builder.ts | 55 +- gitnexus/src/core/ingestion/cfg/types.ts | 56 ++ .../cfg/visitors/typescript-harvest.ts | 512 ++++++++++++++++++ .../core/ingestion/cfg/visitors/typescript.ts | 139 ++++- gitnexus/src/storage/parse-cache.ts | 2 +- .../integration/cfg/worker-roundtrip.test.ts | 10 + gitnexus/test/unit/cfg/harvest.test.ts | 372 +++++++++++++ 7 files changed, 1123 insertions(+), 23 deletions(-) create mode 100644 gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts create mode 100644 gitnexus/test/unit/cfg/harvest.test.ts diff --git a/gitnexus/src/core/ingestion/cfg/cfg-builder.ts 
b/gitnexus/src/core/ingestion/cfg/cfg-builder.ts index 976b468246..adf76a89b7 100644 --- a/gitnexus/src/core/ingestion/cfg/cfg-builder.ts +++ b/gitnexus/src/core/ingestion/cfg/cfg-builder.ts @@ -12,7 +12,14 @@ * hand-built block sequences, which is how the classic CFG hazards are pinned * before the tree-sitter visitor (U2) drives it. */ -import type { BasicBlockData, CfgEdgeData, CfgEdgeKind, FunctionCfg } from './types.js'; +import type { + BasicBlockData, + BindingEntry, + CfgEdgeData, + CfgEdgeKind, + FunctionCfg, + StatementFacts, +} from './types.js'; interface MutableBlock { startLine: number; @@ -26,6 +33,13 @@ interface MutableBlock { */ textParts: string[]; kind: BasicBlockData['kind']; + /** + * Per-statement def/use facts in execution order (#2082 M2 U1). Parallel to + * the statements that accrued to this block — but self-describing (each + * record carries its line): facts-only attaches (ENTRY params, catch params) + * mean fact index ≠ text-fragment index. + */ + statements: StatementFacts[]; } export class CfgBuilder { @@ -54,8 +68,15 @@ export class CfgBuilder { endLine: number, text: string, kind: BasicBlockData['kind'] = 'normal', + facts?: StatementFacts, ): number { - this.blocks.push({ startLine, endLine, textParts: text ? [text] : [], kind }); + this.blocks.push({ + startLine, + endLine, + textParts: text ? [text] : [], + kind, + statements: facts ? [facts] : [], + }); return this.blocks.length - 1; } @@ -73,11 +94,27 @@ export class CfgBuilder { } /** Extend a block's end line as more statements accrue to it. 
*/ - extendBlock(index: number, endLine: number, appendText?: string): void { + extendBlock(index: number, endLine: number, appendText?: string, facts?: StatementFacts): void { const b = this.blocks[index]; if (!b) return; if (endLine > b.endLine) b.endLine = endLine; if (appendText) b.textParts.push(appendText); + if (facts) b.statements.push(facts); + } + + /** + * Attach a facts-only statement record to a block WITHOUT touching its text + * or line span (#2082 M2 U1) — bench fingerprints and CFG snapshots include + * block text, so harvesting must never perturb it. `prepend` is for records + * that lexically precede the block's statements: a `catch (e)` param def must + * sit at statement index 0 of the handler entry block or an in-block + * `use(e)` at index 0 would see no reaching def in the in-order sweep. + */ + attachFacts(index: number, facts: StatementFacts, position: 'append' | 'prepend' = 'append'): void { + const b = this.blocks[index]; + if (!b) return; + if (position === 'prepend') b.statements.unshift(facts); + else b.statements.push(facts); } get blockCount(): number { @@ -85,8 +122,14 @@ export class CfgBuilder { } /** Produce the serializable CFG. Caller is responsible for having wired the - * function's dangling exits to {@link exitIndex} before calling. */ - finish(): FunctionCfg { + * function's dangling exits to {@link exitIndex} before calling. + * + * Pass `bindings` (the function's binding table, possibly empty) to emit + * statement facts (#2082 M2 U1) — every block then carries a `statements` + * array. Omit it (hand-built test CFGs, pre-M2 producers) and both fields + * are absent, which the reaching-defs solver reports as `no-facts`. 
*/ + finish(bindings?: readonly BindingEntry[]): FunctionCfg { + const withFacts = bindings !== undefined; return { filePath: this.filePath, functionStartLine: this.functionStartLine, @@ -100,8 +143,10 @@ export class CfgBuilder { endLine: b.endLine, text: b.textParts.join('\n'), kind: b.kind, + ...(withFacts ? { statements: b.statements } : {}), })), edges: [...this.edges], + ...(withFacts ? { bindings } : {}), }; } } diff --git a/gitnexus/src/core/ingestion/cfg/types.ts b/gitnexus/src/core/ingestion/cfg/types.ts index 72d2c0a9b8..bf6e47a538 100644 --- a/gitnexus/src/core/ingestion/cfg/types.ts +++ b/gitnexus/src/core/ingestion/cfg/types.ts @@ -11,6 +11,51 @@ * array of them is what rides on `ParsedFile.cfgSideChannel`. */ +/** + * One distinct declared variable (binding) within a function (#2082 M2 U1). + * + * Statement facts reference bindings by integer index into + * {@link FunctionCfg.bindings} — names appear once per binding instead of once + * per occurrence (measured ~4× smaller serialized payload than named records). + * Distinct bindings of the same name (shadowing) get distinct entries, which is + * what keeps an inner `let x` from falsely killing the outer `x`'s definitions + * in the reaching-defs solver. NOTE: no field here may be named `nodeId` — the + * durable parsedfile-store reviver dedups objects keyed on that field name. + */ +export interface BindingEntry { + /** Source-level variable name (what the persisted edge's `reason` carries). */ + readonly name: string; + /** + * 1-based line/0-based column of the canonical declaration site — `var` + * multi-declarations canonicalize to the FIRST declaration in source order. + * Both 0 for synthetic bindings. + */ + readonly declLine: number; + readonly declColumn: number; + /** How the binding was introduced (param/catch matter to the M3 taint pass). 
*/ + readonly kind: 'var' | 'let' | 'const' | 'param' | 'catch' | 'function' | 'class' | 'module'; + /** + * True when the name has no in-function declaration site (implicit global, + * import, or a variable captured from an enclosing function) — keyed + * `name@module` in edge ids instead of `name:line:col`. + */ + readonly synthetic?: boolean; +} + +/** + * Def/use facts for one harvested statement (or construct header), in + * execution order within its block (#2082 M2 U1). `defs`/`uses` are indices + * into {@link FunctionCfg.bindings}. A compound assignment / update expression + * lists its binding in BOTH. Self-describing — `line` is carried here, never + * inferred from the block's text fragments (facts-only records exist, e.g. + * params on ENTRY and catch params). + */ +export interface StatementFacts { + readonly line: number; + readonly defs: readonly number[]; + readonly uses: readonly number[]; +} + /** A basic block: a maximal straight-line run of statements between leaders. */ export interface BasicBlockData { /** Block index within its function. The synthetic ENTRY is always 0. */ @@ -20,6 +65,12 @@ export interface BasicBlockData { /** Source snippet for the block (empty for synthetic ENTRY/EXIT). */ readonly text: string; readonly kind: 'entry' | 'exit' | 'normal'; + /** + * Per-statement def/use facts in execution order (#2082 M2 U1). Present only + * when the producing visitor harvests (TS/JS under `--pdg`); absent on + * hand-built or pre-M2 CFGs — the reaching-defs solver reports `no-facts`. + */ + readonly statements?: readonly StatementFacts[]; } /** @@ -74,6 +125,11 @@ export interface FunctionCfg { readonly exitIndex: number; readonly blocks: readonly BasicBlockData[]; readonly edges: readonly CfgEdgeData[]; + /** + * The function's binding table (#2082 M2 U1) — referenced by index from + * {@link BasicBlockData.statements}. Present iff statement facts are. 
+ */ + readonly bindings?: readonly BindingEntry[]; } /** diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts new file mode 100644 index 0000000000..b175f668f1 --- /dev/null +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts @@ -0,0 +1,512 @@ +/** + * TS/JS def/use harvester (#2082 M2 U1). + * + * Runs in the parse worker next to the CFG visitor, extracting per-statement + * variable definition/use facts that ride the side channel for the + * reaching-defs solver (`cfg/reaching-defs.ts`). Output is the per-function + * binding table ({@link BindingEntry}[]) plus {@link StatementFacts} records + * the visitor attaches to blocks as it walks. + * + * TWO-PHASE, ORDER-INDEPENDENT (load-bearing): the CFG walk is NOT source-order + * — `visitTry` builds the finally body before the protected body, `visitFor` + * creates the init block after walking the body, `visitDoWhile` the condition + * before the body. Resolving names against a scope stack populated *during* + * that walk would mis-resolve common code (`try { var v = 1; } finally + * { use(v); }` keys the use synthetically while the def gets the real binding — + * the def→use fact silently never forms, a taint false negative). So phase 1 + * pre-scans the whole function subtree once, collecting every declaration into + * a completed lexical scope tree (also resolving `var` hoisting and multi-decl + * canonicalization order-independently, eslint-scope style); phase 2 resolves + * defs/uses against that finished tree from any walk order. + * + * v1 def-semantics scope (plan KTD4): var/let/const declarations, assignments + * (plain/compound/destructuring), update expressions, function/class + * declarations, parameters (incl. defaults/rest/destructured), catch params, + * for-in/of heads. 
EXCLUDED, deliberately: property/member writes (`this.x=`, + * `obj.p=` — TypeScript-CFA precedent), and BOTH directions of nested-function + * capture — writes to outer variables from nested bodies AND reads of captured + * variables inside nested bodies are invisible (nested functions are opaque + * blocks in the enclosing CFG; callback flows like `arr.forEach(() => sink(y))` + * register no use of `y` — closure/callback dataflow is M4 territory and the + * M3 consumer contract must name it). + * + * Identifiers with no in-function declaration (implicit globals, imports, + * variables captured from an enclosing function) resolve to a SYNTHETIC + * module-level binding (`name@module`), applied identically by def and use + * harvesting so `notDeclared = 1; use(notDeclared)` still forms a fact. + * + * NOTE: nothing serialized here may carry a field named `nodeId` — the durable + * parsedfile-store reviver dedups objects keyed on that field name. + */ +import type { SyntaxNode } from '../../utils/ast-helpers.js'; +import type { BindingEntry, StatementFacts } from '../types.js'; + +/** Node types that own a nested CFG — their subtrees are opaque to harvesting. */ +const NESTED_FUNCTION_TYPES = new Set([ + 'function_declaration', + 'function_expression', + 'arrow_function', + 'method_definition', + 'generator_function_declaration', + 'generator_function', + 'async_function_declaration', + 'async_arrow_function', +]); + +/** Function-ish declaration statements whose NAME still binds in the enclosing scope. */ +const FUNCTION_DECL_TYPES = new Set([ + 'function_declaration', + 'generator_function_declaration', + 'async_function_declaration', +]); + +/** + * Nodes that open a lexical scope for `let`/`const`/`class`/catch bindings. + * A `switch` BODY is deliberately ONE scope shared by all case arms (JS + * semantics: `case 1: let x = 1; case 2: use(x)` is the same binding). 
+ */ +const SCOPE_TYPES = new Set([ + 'statement_block', + 'for_statement', + 'for_in_statement', + 'for_of_statement', + 'catch_clause', + 'switch_body', +]); + +/** Type-position subtrees — identifiers inside them are not value uses. */ +const TYPE_CONTEXT_TYPES = new Set([ + 'type_annotation', + 'type_arguments', + 'type_parameters', + 'type_predicate_annotation', + 'asserts_annotation', +]); + +interface Scope { + readonly parent: Scope | null; + /** name → binding index */ + readonly table: Map<string, number>; +} + +export class TsHarvester { + private readonly bindings: BindingEntry[] = []; + /** Scope-opening node id → its scope. */ + private readonly scopeByNode = new Map<number, Scope>(); + private readonly root: Scope = { parent: null, table: new Map() }; + /** name → synthetic binding index (implicit global / import / captured). */ + private readonly synthetic = new Map<string, number>(); + private readonly fnId: number; + + constructor(private readonly fnNode: SyntaxNode) { + this.fnId = fnNode.id; + this.scopeByNode.set(fnNode.id, this.root); + this.declareParams(fnNode); + const body = fnNode.childForFieldName('body'); + if (body) this.prescan(body, body.type === 'statement_block' ? this.openScope(body) : this.root); + } + + /** The completed binding table — pass to `CfgBuilder.finish`. 
*/ + table(): readonly BindingEntry[] { + return this.bindings; + } + + // ── phase 1: declaration pre-scan ──────────────────────────────────────── + + private openScope(node: SyntaxNode): Scope { + const existing = this.scopeByNode.get(node.id); + if (existing) return existing; + const scope: Scope = { parent: this.nearestScopeOf(node), table: new Map() }; + this.scopeByNode.set(node.id, scope); + return scope; + } + + private nearestScopeOf(node: SyntaxNode): Scope { + for (let p = node.parent; p; p = p.parent) { + const s = this.scopeByNode.get(p.id); + if (s) return s; + if (p.id === this.fnId) break; + } + return this.root; + } + + private declare( + nameNode: SyntaxNode, + kind: BindingEntry['kind'], + scope: Scope, + hoistToRoot: boolean, + ): void { + const target = hoistToRoot ? this.root : scope; + const name = nameNode.text; + // `var` multi-declaration (and a param + `var` of the same name) is ONE + // binding — first declaration in source order is canonical. The dedup is + // scoped to the single target table, so an inner `let x` shadowing a root + // `var x` still gets its own entry in its own scope. + if (target.table.has(name)) return; + target.table.set(name, this.bindings.length); + this.bindings.push({ + name, + declLine: nameNode.startPosition.row + 1, + declColumn: nameNode.startPosition.column, + kind, + }); + } + + private declareParams(fnNode: SyntaxNode): void { + const params = fnNode.childForFieldName('parameters') ?? fnNode.childForFieldName('parameter'); + if (!params) return; + if (params.type === 'identifier') { + this.declare(params, 'param', this.root, true); // `x => …` single-param arrow + return; + } + for (let i = 0; i < params.namedChildCount; i++) { + const p = params.namedChild(i); + if (!p) continue; + // TS wraps each param (required_parameter/optional_parameter, field + // `pattern`); plain JS puts the pattern directly in formal_parameters. + const pattern = p.childForFieldName('pattern') ?? 
p; + this.declarePattern(pattern, 'param', this.root, true); + } + } + + /** Declare every name bound by a (possibly destructuring) pattern. */ + private declarePattern( + node: SyntaxNode, + kind: BindingEntry['kind'], + scope: Scope, + hoistToRoot: boolean, + ): void { + switch (node.type) { + case 'identifier': + case 'shorthand_property_identifier_pattern': + this.declare(node, kind, scope, hoistToRoot); + return; + case 'rest_pattern': + case 'object_pattern': + case 'array_pattern': + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c) this.declarePattern(c, kind, scope, hoistToRoot); + } + return; + case 'pair_pattern': { + const value = node.childForFieldName('value'); + if (value) this.declarePattern(value, kind, scope, hoistToRoot); + return; + } + case 'assignment_pattern': + case 'object_assignment_pattern': { + const left = node.childForFieldName('left'); + if (left) this.declarePattern(left, kind, scope, hoistToRoot); + return; + } + default: + // Type annotations / unknown wrappers — descend defensively. + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c && !TYPE_CONTEXT_TYPES.has(c.type)) { + this.declarePattern(c, kind, scope, hoistToRoot); + } + } + } + } + + private prescan(node: SyntaxNode, scope: Scope): void { + const t = node.type; + if (NESTED_FUNCTION_TYPES.has(t) && node.id !== this.fnId) { + // A nested function's NAME binds in the enclosing scope; its body is opaque. + if (FUNCTION_DECL_TYPES.has(t)) { + const name = node.childForFieldName('name'); + if (name) this.declare(name, 'function', scope, false); + } + return; + } + + let childScope = scope; + if (SCOPE_TYPES.has(t)) childScope = this.openScope(node); + + switch (t) { + case 'lexical_declaration': { + const kind = node.child(0)?.type === 'const' ? 
'const' : 'let'; + this.declareDeclarators(node, kind, childScope, false); + break; + } + case 'variable_declaration': + this.declareDeclarators(node, 'var', childScope, true); + break; + case 'class_declaration': { + const name = node.childForFieldName('name'); + if (name) this.declare(name, 'class', childScope, false); + break; + } + case 'catch_clause': { + const param = node.childForFieldName('parameter'); + if (param) this.declarePattern(param, 'catch', childScope, false); + break; + } + case 'for_in_statement': + case 'for_of_statement': { + // `for (const x of xs)` — the `kind` keyword marks a declaration; a bare + // `for (x of xs)` left is an assignment, resolved at use time instead. + const kindNode = node.childForFieldName('kind'); + const left = node.childForFieldName('left'); + if (kindNode && left) { + const k = kindNode.type === 'var' ? 'var' : kindNode.type === 'const' ? 'const' : 'let'; + this.declarePattern(left, k, childScope, k === 'var'); + } + break; + } + default: + break; + } + + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c) this.prescan(c, childScope); + } + } + + private declareDeclarators( + declNode: SyntaxNode, + kind: 'var' | 'let' | 'const', + scope: Scope, + hoistToRoot: boolean, + ): void { + for (let i = 0; i < declNode.namedChildCount; i++) { + const d = declNode.namedChild(i); + if (d?.type !== 'variable_declarator') continue; + const name = d.childForFieldName('name'); + if (name) this.declarePattern(name, kind, scope, hoistToRoot); + } + } + + // ── phase 2: per-statement fact extraction ─────────────────────────────── + + /** + * Def/use facts for one statement (or construct-header expression) node. + * Safe from any walk order — resolution consults the completed scope tree. 
+ */ + facts(node: SyntaxNode): StatementFacts { + const acc = new FactAccumulator(node.startPosition.row + 1); + this.walkValue(node, acc); + return acc.finish(); + } + + /** Facts for a `for (left in/of right)` head: left binds/assigns, right is used. */ + forInHeadFacts(stmt: SyntaxNode): StatementFacts { + const acc = new FactAccumulator(stmt.startPosition.row + 1); + const left = stmt.childForFieldName('left'); + const right = stmt.childForFieldName('right'); + if (left) this.walkDefPattern(left, acc); + if (right) this.walkValue(right, acc); + return acc.finish(); + } + + /** ENTRY-block facts for the function's parameters (defs + default-value uses). */ + paramFacts(): StatementFacts | undefined { + const fnNode = this.fnNode; + const params = fnNode.childForFieldName('parameters') ?? fnNode.childForFieldName('parameter'); + if (!params) return undefined; + const acc = new FactAccumulator(fnNode.startPosition.row + 1); + if (params.type === 'identifier') { + this.def(params, acc); + } else { + for (let i = 0; i < params.namedChildCount; i++) { + const p = params.namedChild(i); + if (!p) continue; + const pattern = p.childForFieldName('pattern') ?? p; + this.walkDefPattern(pattern, acc); + const dflt = p.childForFieldName('value'); + if (dflt) this.walkValue(dflt, acc); + } + } + return acc.defCount() || acc.useCount() ? acc.finish() : undefined; + } + + /** Def fact for a `catch (e)` parameter — prepend to the handler entry block. */ + catchParamFacts(catchClause: SyntaxNode): StatementFacts | undefined { + const param = catchClause.childForFieldName('parameter'); + if (!param) return undefined; + const acc = new FactAccumulator(catchClause.startPosition.row + 1); + this.walkDefPattern(param, acc); + return acc.defCount() ? 
acc.finish() : undefined; + } + + private resolve(nameNode: SyntaxNode): number { + const name = nameNode.text; + for (let p: SyntaxNode | null = nameNode; p; p = p.parent) { + const scope = this.scopeByNode.get(p.id); + if (scope) { + for (let s: Scope | null = scope; s; s = s.parent) { + const idx = s.table.get(name); + if (idx !== undefined) return idx; + } + break; // reached the root scope without a hit + } + if (p.id === this.fnId) break; + } + // No in-function declaration — synthetic module-level binding, shared by + // defs and uses so `notDeclared = 1; use(notDeclared)` still forms a fact. + let idx = this.synthetic.get(name); + if (idx === undefined) { + idx = this.bindings.length; + this.synthetic.set(name, idx); + this.bindings.push({ name, declLine: 0, declColumn: 0, kind: 'module', synthetic: true }); + } + return idx; + } + + private def(nameNode: SyntaxNode, acc: FactAccumulator): void { + acc.addDef(this.resolve(nameNode)); + } + + private use(nameNode: SyntaxNode, acc: FactAccumulator): void { + acc.addUse(this.resolve(nameNode)); + } + + /** Value-position walk: collect uses; route def positions to the pattern walk. */ + private walkValue(node: SyntaxNode, acc: FactAccumulator): void { + const t = node.type; + if (TYPE_CONTEXT_TYPES.has(t)) return; + if (NESTED_FUNCTION_TYPES.has(t) && node.id !== this.fnId) { + // Opaque nested function: its NAME (function declaration) is a def in + // the enclosing scope; captured reads/writes inside are invisible (KTD4). 
+ if (FUNCTION_DECL_TYPES.has(t)) { + const name = node.childForFieldName('name'); + if (name) this.def(name, acc); + } + return; + } + + switch (t) { + case 'identifier': + case 'shorthand_property_identifier': + this.use(node, acc); + return; + case 'lexical_declaration': + case 'variable_declaration': + for (let i = 0; i < node.namedChildCount; i++) { + const d = node.namedChild(i); + if (d?.type !== 'variable_declarator') continue; + const name = d.childForFieldName('name'); + const value = d.childForFieldName('value'); + if (name) this.walkDefPattern(name, acc); + if (value) this.walkValue(value, acc); + } + return; + case 'assignment_expression': { + const left = node.childForFieldName('left'); + const right = node.childForFieldName('right'); + if (left) this.walkDefPattern(left, acc); + if (right) this.walkValue(right, acc); + return; + } + case 'augmented_assignment_expression': { + // `x += y` both defines and uses x. + const left = node.childForFieldName('left'); + const right = node.childForFieldName('right'); + if (left?.type === 'identifier') { + this.def(left, acc); + this.use(left, acc); + } else if (left) { + this.walkValue(left, acc); // member/subscript target — uses only + } + if (right) this.walkValue(right, acc); + return; + } + case 'update_expression': { + const arg = node.childForFieldName('argument'); + if (arg?.type === 'identifier') { + this.def(arg, acc); + this.use(arg, acc); + } else if (arg) { + this.walkValue(arg, acc); + } + return; + } + default: + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c) this.walkValue(c, acc); + } + } + } + + /** Assignment-target walk: identifiers bind; member/subscript targets are uses. 
*/ + private walkDefPattern(node: SyntaxNode, acc: FactAccumulator): void { + switch (node.type) { + case 'identifier': + case 'shorthand_property_identifier_pattern': + this.def(node, acc); + return; + case 'rest_pattern': + case 'object_pattern': + case 'array_pattern': + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c) this.walkDefPattern(c, acc); + } + return; + case 'pair_pattern': { + const key = node.childForFieldName('key'); + const value = node.childForFieldName('value'); + if (key?.type === 'computed_property_name') this.walkValue(key, acc); + if (value) this.walkDefPattern(value, acc); + return; + } + case 'assignment_pattern': + case 'object_assignment_pattern': { + const left = node.childForFieldName('left'); + const right = node.childForFieldName('right'); + if (left) this.walkDefPattern(left, acc); + if (right) this.walkValue(right, acc); + return; + } + case 'member_expression': + case 'subscript_expression': + // Property/element write — NOT a scalar def (KTD4); its identifiers + // (object, computed key) are uses. + this.walkValue(node, acc); + return; + default: + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c && !TYPE_CONTEXT_TYPES.has(c.type)) this.walkDefPattern(c, acc); + } + } + } +} + +/** Ordered, deduplicating def/use collector for one statement record. 
*/ +class FactAccumulator { + private readonly defs: number[] = []; + private readonly uses: number[] = []; + private readonly defSeen = new Set(); + private readonly useSeen = new Set(); + + constructor(private readonly line: number) {} + + addDef(idx: number): void { + if (this.defSeen.has(idx)) return; + this.defSeen.add(idx); + this.defs.push(idx); + } + + addUse(idx: number): void { + if (this.useSeen.has(idx)) return; + this.useSeen.add(idx); + this.uses.push(idx); + } + + defCount(): number { + return this.defs.length; + } + + useCount(): number { + return this.uses.length; + } + + finish(): StatementFacts { + return { line: this.line, defs: this.defs, uses: this.uses }; + } +} diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts index 4166f1b168..ec85433915 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts @@ -50,6 +50,7 @@ import { CfgBuilder } from '../cfg-builder.js'; import { ControlFlowContext, type FinalizerFrame } from '../control-flow-context.js'; import type { TraversalResult } from '../traversal-result.js'; import type { CfgEdgeKind, CfgVisitor, FunctionCfg } from '../types.js'; +import { TsHarvester } from './typescript-harvest.js'; /** TS/JS node types that own a CFG-bearing function body. */ const TS_FUNCTION_TYPES = new Set([ @@ -108,7 +109,12 @@ class TsCfgWalk { /** Label awaiting the loop/switch it immediately precedes (labeled_statement). */ private pendingLabel: string | undefined; - constructor(private readonly builder: CfgBuilder) {} + constructor( + private readonly builder: CfgBuilder, + /** Def/use fact extractor (#2082 M2 U1) — phase-2 only; its scope tree is + * already complete, so any walk order resolves names correctly. */ + private readonly harvest: TsHarvester, + ) {} /** Statements of a block node, ignoring comments. 
*/ private statementsOf(block: SyntaxNode): SyntaxNode[] { @@ -146,13 +152,19 @@ class TsCfgWalk { } else { // Simple statement — coalesce into the current straight-line block. if (openSimple === undefined) { - const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); + const idx = this.builder.newBlock( + startLineOf(stmt), + endLineOf(stmt), + stmt.text, + 'normal', + this.harvest.facts(stmt), + ); if (entry === undefined) entry = idx; else this.builder.connect(dangling, idx, 'seq'); openSimple = idx; dangling = [idx]; } else { - this.builder.extendBlock(openSimple, endLineOf(stmt), stmt.text); + this.builder.extendBlock(openSimple, endLineOf(stmt), stmt.text, this.harvest.facts(stmt)); } } } @@ -197,12 +209,26 @@ class TsCfgWalk { } private visitSimple(stmt: SyntaxNode): TraversalResult { - const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); + const idx = this.builder.newBlock( + startLineOf(stmt), + endLineOf(stmt), + stmt.text, + 'normal', + this.harvest.facts(stmt), + ); return { entry: idx, exits: [idx] }; } private visitReturn(stmt: SyntaxNode): TraversalResult { - const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); + // Harvest the argument expression's uses — `return x` blocks live in this + // dedicated handler, not visitSeq, and were a silently-missed site once. + const idx = this.builder.newBlock( + startLineOf(stmt), + endLineOf(stmt), + stmt.text, + 'normal', + this.harvest.facts(stmt), + ); // A return crosses EVERY active finally before reaching EXIT. 
this.jumpVia(idx, this.cfc.finalizersForReturn(), this.builder.exitIndex, 'return'); return { entry: idx, exits: [] }; @@ -235,7 +261,13 @@ class TsCfgWalk { } private visitThrow(stmt: SyntaxNode): TraversalResult { - const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); + const idx = this.builder.newBlock( + startLineOf(stmt), + endLineOf(stmt), + stmt.text, + 'normal', + this.harvest.facts(stmt), + ); this.builder.edge(idx, this.currentHandler(), 'throw'); return { entry: idx, exits: [] }; } @@ -279,7 +311,13 @@ class TsCfgWalk { private visitIf(stmt: SyntaxNode): TraversalResult { const cond = stmt.childForFieldName('condition') ?? stmt; - const condBlock = this.builder.newBlock(startLineOf(stmt), endLineOf(cond), cond.text); + const condBlock = this.builder.newBlock( + startLineOf(stmt), + endLineOf(cond), + cond.text, + 'normal', + this.harvest.facts(cond), + ); const exits: number[] = []; @@ -320,7 +358,13 @@ class TsCfgWalk { private visitWhile(stmt: SyntaxNode): TraversalResult { const label = this.takeLabel(); const cond = stmt.childForFieldName('condition') ?? stmt; - const header = this.builder.newBlock(startLineOf(stmt), endLineOf(cond), cond.text); + const header = this.builder.newBlock( + startLineOf(stmt), + endLineOf(cond), + cond.text, + 'normal', + this.harvest.facts(cond), + ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); this.cfc.pushLoop(header, loopExit, label); @@ -340,7 +384,13 @@ class TsCfgWalk { private visitDoWhile(stmt: SyntaxNode): TraversalResult { const label = this.takeLabel(); const cond = stmt.childForFieldName('condition') ?? 
stmt; - const condBlock = this.builder.newBlock(startLineOf(cond), endLineOf(cond), cond.text); + const condBlock = this.builder.newBlock( + startLineOf(cond), + endLineOf(cond), + cond.text, + 'normal', + this.harvest.facts(cond), + ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); this.cfc.pushLoop(condBlock, loopExit, label); @@ -364,12 +414,20 @@ class TsCfgWalk { startLineOf(stmt), cond ? endLineOf(cond) : startLineOf(stmt), cond ? cond.text : 'for(;;)', + 'normal', + cond ? this.harvest.facts(cond) : undefined, ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); let incrBlock = header; if (incr) { - incrBlock = this.builder.newBlock(startLineOf(incr), endLineOf(incr), incr.text); + incrBlock = this.builder.newBlock( + startLineOf(incr), + endLineOf(incr), + incr.text, + 'normal', + this.harvest.facts(incr), + ); this.builder.edge(incrBlock, header, 'loop-back'); } @@ -394,7 +452,13 @@ class TsCfgWalk { let entry = header; if (init) { - const initBlock = this.builder.newBlock(startLineOf(init), endLineOf(init), init.text); + const initBlock = this.builder.newBlock( + startLineOf(init), + endLineOf(init), + init.text, + 'normal', + this.harvest.facts(init), + ); this.builder.edge(initBlock, header, 'seq'); entry = initBlock; } @@ -403,10 +467,14 @@ class TsCfgWalk { private visitForIn(stmt: SyntaxNode): TraversalResult { const label = this.takeLabel(); + // Header text is SYNTHESIZED, so facts come from the left/right AST nodes + // directly (the loop variable is a def, the iterated expression a use). 
const header = this.builder.newBlock( startLineOf(stmt), startLineOf(stmt), this.forInHeaderText(stmt), + 'normal', + this.harvest.forInHeadFacts(stmt), ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); @@ -433,7 +501,13 @@ class TsCfgWalk { private visitSwitch(stmt: SyntaxNode): TraversalResult { const label = this.takeLabel(); const value = stmt.childForFieldName('value') ?? stmt; - const dispatch = this.builder.newBlock(startLineOf(stmt), endLineOf(value), value.text); + const dispatch = this.builder.newBlock( + startLineOf(stmt), + endLineOf(value), + value.text, + 'normal', + this.harvest.facts(value), + ); const switchExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); this.cfc.pushSwitch(switchExit, label); @@ -442,6 +516,15 @@ class TsCfgWalk { ? body.namedChildren.filter((c) => c.type === 'switch_case' || c.type === 'switch_default') : []; + // `case x:` test expressions live in no block (caseStatements filters the + // value node out) — harvest their uses onto the dispatch block, one record + // per case in source order (a sound over-approximation of JS's in-order + // case evaluation). + for (const c of cases) { + const caseValue = c.childForFieldName('value'); + if (caseValue) this.builder.attachFacts(dispatch, this.harvest.facts(caseValue)); + } + const caseResults = cases.map((c) => this.visitSeq(this.caseStatements(c))); const hasDefault = cases.some((c) => c.type === 'switch_default'); @@ -521,6 +604,12 @@ class TsCfgWalk { const idx = this.builder.newBlock(startLineOf(catchClause), endLineOf(catchClause), ''); catchRes = { entry: idx, exits: [idx] }; } + // `catch (e)` has no header block — the param def fact PREPENDS onto the + // handler entry (it lexically precedes the body, and the body was walked + // first, so appending would put the def AFTER the body's facts and an + // in-block `use(e)` at statement index 0 would see no reaching def). 
+ const paramFacts = this.harvest.catchParamFacts(catchClause); + if (paramFacts) this.builder.attachFacts(catchRes.entry, paramFacts, 'prepend'); } // Handler for the try body: catch if present, else finally, else outer. @@ -605,23 +694,39 @@ function buildFunctionCfg(fnNode: SyntaxNode, filePath: string): FunctionCfg | u const body = fnNode.childForFieldName('body'); if (!body) return undefined; // overload signature / abstract method — no body + // Phase-1 declaration pre-scan (#2082 M2 U1) — must complete before any + // facts are extracted; the CFG walk below is not source-order. + const harvest = new TsHarvester(fnNode); + + // Parameters define at ENTRY (facts only — never touch the entry block's + // text or span: bench fingerprints and CFG snapshots include block text). + const paramFacts = harvest.paramFacts(); + if (paramFacts) builder.attachFacts(builder.entryIndex, paramFacts); + if (body.type !== 'statement_block') { // Expression-bodied arrow: `() => expr` — one block whose value is returned. - const blk = builder.newBlock(startLineOf(body), endLineOf(body), body.text); + // Lives outside the walk class, so it harvests explicitly. 
+ const blk = builder.newBlock( + startLineOf(body), + endLineOf(body), + body.text, + 'normal', + harvest.facts(body), + ); builder.edge(builder.entryIndex, blk, 'seq'); builder.edge(blk, builder.exitIndex, 'return'); - return builder.finish(); + return builder.finish(harvest.table()); } - const walk = new TsCfgWalk(builder); + const walk = new TsCfgWalk(builder, harvest); const res = walk.visitSeq(body.namedChildren.filter((c) => c.type !== 'comment')); if (!res) { builder.edge(builder.entryIndex, builder.exitIndex, 'seq'); // empty body - return builder.finish(); + return builder.finish(harvest.table()); } builder.edge(builder.entryIndex, res.entry, 'seq'); builder.connect(res.exits, builder.exitIndex, 'seq'); // normal fall-off → EXIT - return builder.finish(); + return builder.finish(harvest.table()); } /** Whether a node is a TS/JS function this visitor builds a CFG for. */ diff --git a/gitnexus/src/storage/parse-cache.ts b/gitnexus/src/storage/parse-cache.ts index 744748e4b7..837144c5cd 100644 --- a/gitnexus/src/storage/parse-cache.ts +++ b/gitnexus/src/storage/parse-cache.ts @@ -55,7 +55,7 @@ import type { ParseWorkerResult } from '../core/ingestion/workers/parse-worker.j // the main thread (the #1983 OOM). Because the two stores share this version, // any future change to the `ParsedFile` serialization shape MUST bump // SCHEMA_BUMP so both invalidate in lockstep. 
-const SCHEMA_BUMP = 5; // #2081 M1: ParsedFile gained `cfgSideChannel` +const SCHEMA_BUMP = 6; // #2082 M2: cfgSideChannel gained bindings + per-block statement facts const GITNEXUS_PKG_VERSION = (() => { try { // package.json sits at gitnexus/package.json — two levels up from diff --git a/gitnexus/test/integration/cfg/worker-roundtrip.test.ts b/gitnexus/test/integration/cfg/worker-roundtrip.test.ts index 6589b3b025..a35c953d11 100644 --- a/gitnexus/test/integration/cfg/worker-roundtrip.test.ts +++ b/gitnexus/test/integration/cfg/worker-roundtrip.test.ts @@ -94,6 +94,16 @@ describe('U3 — CFG side-channel JSON round-trip (no AST leakage, no field loss for (const b of c.blocks) expect(typeof b.text).toBe('string'); for (const e of c.edges) expect(typeof e.from).toBe('number'); } + // M2 (#2082 U1): the binding table + statement facts must survive the + // boundary — a future cache-slimming field list that drops them would + // silently break reaching-defs (the #2038 mergeChunkResults lesson). 
+ for (const c of round) { + expect(Array.isArray(c.bindings)).toBe(true); + expect(c.blocks.every((b: { statements?: unknown }) => Array.isArray(b.statements))).toBe( + true, + ); + } + expect(round.some((c: { bindings: unknown[] }) => c.bindings.length > 0)).toBe(true); }); }); diff --git a/gitnexus/test/unit/cfg/harvest.test.ts b/gitnexus/test/unit/cfg/harvest.test.ts new file mode 100644 index 0000000000..2030234785 --- /dev/null +++ b/gitnexus/test/unit/cfg/harvest.test.ts @@ -0,0 +1,372 @@ +import { describe, it, expect } from 'vitest'; +import Parser from 'tree-sitter'; +import TypeScript from 'tree-sitter-typescript'; +import type { SyntaxNode } from '../../../src/core/ingestion/utils/ast-helpers.js'; +import { + createTypeScriptCfgVisitor, + TS_FUNCTION_TYPES, +} from '../../../src/core/ingestion/cfg/visitors/typescript.js'; +import type { FunctionCfg, StatementFacts } from '../../../src/core/ingestion/cfg/types.js'; + +// U1 (#2082 M2) — per-statement def/use harvesting. The two-phase design +// (declaration pre-scan → resolve during the CFG walk) is what makes the +// walk-order traps pass: the visitor walks finally-before-try, for-init-last, +// and do-while-condition-first, so declare-as-you-walk would mis-key common +// code. Each test pins names→binding-index agreement, not just presence. 
+ +const visitor = createTypeScriptCfgVisitor(); + +function parse(code: string): SyntaxNode { + const parser = new Parser(); + parser.setLanguage(TypeScript.typescript); + return parser.parse(code).rootNode; +} + +function collectFunctions(root: SyntaxNode): SyntaxNode[] { + const out: SyntaxNode[] = []; + const stack = [root]; + while (stack.length) { + const n = stack.pop() as SyntaxNode; + if (TS_FUNCTION_TYPES.has(n.type)) out.push(n); + for (let i = n.namedChildCount - 1; i >= 0; i--) { + const c = n.namedChild(i); + if (c) stack.push(c); + } + } + return out; +} + +function cfgOf(code: string, index = 0): FunctionCfg { + const fns = collectFunctions(parse(code)); + const fn = fns[index]; + if (!fn) throw new Error(`no function at index ${index}`); + const cfg = visitor.buildFunctionCfg(fn, 'fixture.ts'); + if (!cfg) throw new Error('buildFunctionCfg returned undefined'); + return cfg; +} + +/** All statement facts of the CFG, flattened in (block, statement) order. */ +function allFacts(cfg: FunctionCfg): StatementFacts[] { + return cfg.blocks.flatMap((b) => [...(b.statements ?? [])]); +} + +/** Binding indices of every entry named `name`. */ +function bindingIdxs(cfg: FunctionCfg, name: string): number[] { + return (cfg.bindings ?? []) + .map((b, i) => (b.name === name ? i : -1)) + .filter((i) => i >= 0); +} + +/** The single binding index for `name` (throws when shadowed/ambiguous). 
*/ +function bindingIdx(cfg: FunctionCfg, name: string): number { + const idxs = bindingIdxs(cfg, name); + if (idxs.length !== 1) throw new Error(`expected 1 binding for ${name}, got ${idxs.length}`); + return idxs[0]; +} + +const defsOf = (cfg: FunctionCfg): Set<number> => + new Set(allFacts(cfg).flatMap((f) => [...f.defs])); +const usesOf = (cfg: FunctionCfg): Set<number> => + new Set(allFacts(cfg).flatMap((f) => [...f.uses])); + +describe('TS/JS def/use harvest — basics', () => { + it('declaration, reassignment, and read produce per-statement def/use facts', () => { + const cfg = cfgOf(`function f() { let x = 1; x = 2; const y = x; }`); + const x = bindingIdx(cfg, 'x'); + const y = bindingIdx(cfg, 'y'); + // x and y are the only declared (non-synthetic) bindings + expect((cfg.bindings ?? []).filter((b) => !b.synthetic)).toHaveLength(2); + // the three statements coalesce into ONE block with three fact records + const body = cfg.blocks.find((b) => b.text.includes('let x = 1')); + expect(body?.statements).toHaveLength(3); + const [s0, s1, s2] = body!.statements!; + expect([...s0.defs]).toEqual([x]); + expect([...s1.defs]).toEqual([x]); + expect([...s2.defs]).toEqual([y]); + expect([...s2.uses]).toEqual([x]); + }); + + it('compound assignment and update expressions are def+use of the same binding', () => { + const cfg = cfgOf(`function f(x, y, i) { x += y; i++; }`); + const x = bindingIdx(cfg, 'x'); + const y = bindingIdx(cfg, 'y'); + const i = bindingIdx(cfg, 'i'); + const body = cfg.blocks.find((b) => b.text.includes('x += y')); + const [s0, s1] = body!.statements!; + expect([...s0.defs]).toEqual([x]); + expect([...s0.uses]).toEqual(expect.arrayContaining([x, y])); + expect([...s1.defs]).toEqual([i]); + expect([...s1.uses]).toEqual([i]); + }); + + it('destructuring flattens to one def per bound name; sources are uses', () => { + const cfg = cfgOf(`function f(obj, arr) { + const { a, b: c, ...rest } = obj; + let d, e; + [d = 1, ...e] = arr; + }`); + const defs = defsOf(cfg); +
for (const name of ['a', 'c', 'rest', 'd', 'e']) { + expect(defs).toContain(bindingIdx(cfg, name)); + } + const uses = usesOf(cfg); + expect(uses).toContain(bindingIdx(cfg, 'obj')); + expect(uses).toContain(bindingIdx(cfg, 'arr')); + // no spurious binding for the renamed pattern key `b` + expect(bindingIdxs(cfg, 'b')).toHaveLength(0); + }); + + it('shadowing: inner let is a DISTINCT binding from the outer one', () => { + const cfg = cfgOf(`function f() { + let x = 1; + { let x = 2; use(x); } + use(x); + }`); + const xs = bindingIdxs(cfg, 'x'); + expect(xs).toHaveLength(2); + const [outer, inner] = xs; // pre-scan is source-order: outer declared first + const facts = allFacts(cfg); + const useFacts = facts.filter((f) => f.uses.includes(outer) || f.uses.includes(inner)); + // inner use(x) sees the inner binding; trailing use(x) sees the outer + expect(useFacts.some((f) => f.uses.includes(inner))).toBe(true); + expect(useFacts.some((f) => f.uses.includes(outer))).toBe(true); + const defFacts = facts.filter((f) => f.defs.length > 0); + expect(defFacts.find((f) => f.defs.includes(outer))?.line).toBeLessThan( + defFacts.find((f) => f.defs.includes(inner))!.line, + ); + }); + + it('var hoisting + multi-declaration canonicalize to ONE function-rooted binding', () => { + const cfg = cfgOf(`function f(c) { + use(v); + if (c) { var v = 1; } + var v; + }`); + expect(bindingIdxs(cfg, 'v')).toHaveLength(1); + const v = bindingIdx(cfg, 'v'); + expect(usesOf(cfg)).toContain(v); + expect(defsOf(cfg)).toContain(v); + // canonical decl site is the FIRST declaration in source order + expect(cfg.bindings![v].declLine).toBe(3); + }); + + it('undeclared assignment targets get one deterministic synthetic binding', () => { + const cfg = cfgOf(`function f() { notDeclared = 1; use(notDeclared); }`); + const idxs = bindingIdxs(cfg, 'notDeclared'); + expect(idxs).toHaveLength(1); + const b = cfg.bindings![idxs[0]]; + expect(b.synthetic).toBe(true); + expect(defsOf(cfg)).toContain(idxs[0]); + 
expect(usesOf(cfg)).toContain(idxs[0]); + }); +}); + +describe('TS/JS def/use harvest — harvest sites beyond visitSeq', () => { + it('parameters define at the ENTRY block (incl. destructured/default/rest)', () => { + const cfg = cfgOf(`function f(a, { b }, c = a, ...rest) { body(); }`); + const entry = cfg.blocks[cfg.entryIndex]; + expect(entry.text).toBe(''); // facts-only attach — never perturbs block text + const entryFacts = entry.statements ?? []; + expect(entryFacts).toHaveLength(1); + const defs = new Set(entryFacts[0].defs); + for (const name of ['a', 'b', 'c', 'rest']) { + expect(defs).toContain(bindingIdx(cfg, name)); + } + expect(entryFacts[0].uses).toContain(bindingIdx(cfg, 'a')); // default-value use + expect(cfg.bindings![bindingIdx(cfg, 'a')].kind).toBe('param'); + }); + + it('return and throw argument expressions are harvested (dedicated handler blocks)', () => { + const cfg = cfgOf(`function f(x, y, err) { + if (x) { return x + y; } + throw err; + }`); + const retBlock = cfg.blocks.find((b) => b.text.includes('return x + y')); + const retUses = new Set(retBlock!.statements!.flatMap((f) => [...f.uses])); + expect(retUses).toContain(bindingIdx(cfg, 'x')); + expect(retUses).toContain(bindingIdx(cfg, 'y')); + const throwBlock = cfg.blocks.find((b) => b.text.includes('throw err')); + const throwUses = new Set(throwBlock!.statements!.flatMap((f) => [...f.uses])); + expect(throwUses).toContain(bindingIdx(cfg, 'err')); + }); + + it('expression-bodied arrow harvests params at ENTRY and body uses', () => { + const cfg = cfgOf(`const f = (p) => p + q;`); + const entryFacts = cfg.blocks[cfg.entryIndex].statements ?? 
[]; + expect(entryFacts[0]?.defs).toContain(bindingIdx(cfg, 'p')); + const body = cfg.blocks.find((b) => b.text.includes('p + q')); + const uses = new Set(body!.statements!.flatMap((f) => [...f.uses])); + expect(uses).toContain(bindingIdx(cfg, 'p')); + expect(uses).toContain(bindingIdx(cfg, 'q')); // synthetic capture + expect(cfg.bindings![bindingIdx(cfg, 'q')].synthetic).toBe(true); + }); + + it('construct headers harvest: if/while conditions, for init/cond/incr, for-of head', () => { + const cfg = cfgOf(`function f(n, list) { + for (let i = 0; i < n; i++) { work(i); } + for (const item of list) { work(item); } + while (n > 0) { n--; } + }`); + const i = bindingIdx(cfg, 'i'); + const item = bindingIdx(cfg, 'item'); + const n = bindingIdx(cfg, 'n'); + const initBlock = cfg.blocks.find((b) => b.text === 'let i = 0;'); + expect(initBlock!.statements![0].defs).toContain(i); + const condBlock = cfg.blocks.find((b) => b.text === 'i < n'); + expect(new Set(condBlock!.statements![0].uses)).toEqual(new Set([i, n])); + const incrBlock = cfg.blocks.find((b) => b.text === 'i++'); + expect(incrBlock!.statements![0].defs).toContain(i); + const forOfHead = cfg.blocks.find((b) => b.text.includes('item'))!; + expect(forOfHead.statements!.some((f) => f.defs.includes(item))).toBe(true); + expect(forOfHead.statements!.some((f) => f.uses.includes(bindingIdx(cfg, 'list')))).toBe(true); + }); + + it('catch param defines at statement index 0 of the handler entry block', () => { + const cfg = cfgOf(`function f() { + try { risky(); } catch (e) { use(e); } + }`); + const e = bindingIdx(cfg, 'e'); + expect(cfg.bindings![e].kind).toBe('catch'); + const handler = cfg.blocks.find((b) => b.text.includes('use(e)')); + const facts = handler!.statements!; + // def of e PRECEDES the body's use of e — index 0, so the in-order sweep + // gives `use(e)` a reaching def + expect([...facts[0].defs]).toEqual([e]); + expect(facts.findIndex((f) => f.uses.includes(e))).toBeGreaterThan(0); + }); + + it('empty 
catch: param def lands on the synthetic handler block', () => { + const cfg = cfgOf(`function f() { try { risky(); } catch (e) {} }`); + const e = bindingIdx(cfg, 'e'); + const withDef = cfg.blocks.filter((b) => (b.statements ?? []).some((f) => f.defs.includes(e))); + expect(withDef).toHaveLength(1); + expect(withDef[0].text).toBe(''); // the synthetic empty-catch block + }); + + it('switch: discriminant and case-test uses harvest onto the dispatch block', () => { + const cfg = cfgOf(`function f(s, sel) { + switch (s) { + case sel: a(); break; + default: b(); + } + }`); + const dispatch = cfg.blocks.find((b) => b.text === '(s)'); + const uses = new Set(dispatch!.statements!.flatMap((f) => [...f.uses])); + expect(uses).toContain(bindingIdx(cfg, 's')); + expect(uses).toContain(bindingIdx(cfg, 'sel')); + }); +}); + +describe('TS/JS def/use harvest — exclusions (KTD4)', () => { + it('nested function bodies are opaque: no defs/uses of captured names harvested', () => { + const cfg = cfgOf(`function f() { + let outer = 1; + const g = () => { outer = 2; use(outer); }; + }`); + const outer = bindingIdx(cfg, 'outer'); + const g = bindingIdx(cfg, 'g'); + const facts = allFacts(cfg); + // exactly ONE def of outer (its declaration) — the nested write is invisible + expect(facts.filter((f) => f.defs.includes(outer))).toHaveLength(1); + expect(facts.some((f) => f.uses.includes(outer))).toBe(false); + // the declaration of g IS a def + expect(facts.some((f) => f.defs.includes(g))).toBe(true); + }); + + it('member/property writes are not defs; their identifiers are uses', () => { + const cfg = cfgOf(`function f(obj, q) { + this.x = 1; + obj.p = q; + }`); + const facts = allFacts(cfg); + const nonParamDefs = facts + .flatMap((f) => [...f.defs]) + .filter((d) => cfg.bindings![d].kind !== 'param'); + expect(nonParamDefs).toHaveLength(0); + const uses = usesOf(cfg); + expect(uses).toContain(bindingIdx(cfg, 'obj')); + expect(uses).toContain(bindingIdx(cfg, 'q')); + 
expect(bindingIdxs(cfg, 'x')).toHaveLength(0); // property name never binds + expect(bindingIdxs(cfg, 'p')).toHaveLength(0); + }); + + it('type annotations do not produce uses', () => { + const cfg = cfgOf(`function f(v: SomeType): OtherType { const x: Wide = v; return x; }`); + expect(bindingIdxs(cfg, 'SomeType')).toHaveLength(0); + expect(bindingIdxs(cfg, 'OtherType')).toHaveLength(0); + expect(bindingIdxs(cfg, 'Wide')).toHaveLength(0); + }); +}); + +describe('TS/JS def/use harvest — walk-order traps (two-phase pre-scan)', () => { + it('finally walked before try body: var def and finally use share one binding', () => { + const cfg = cfgOf(`function f() { + try { var v = 1; } finally { use(v); } + }`); + expect(bindingIdxs(cfg, 'v')).toHaveLength(1); + const v = bindingIdx(cfg, 'v'); + expect(cfg.bindings![v].synthetic).toBeUndefined(); + expect(defsOf(cfg)).toContain(v); + expect(usesOf(cfg)).toContain(v); + }); + + it('for-init block created after body walk: init def and body use share one binding', () => { + const cfg = cfgOf(`function f(n) { + for (let i = 0; i < n; i++) { use(i); } + }`); + expect(bindingIdxs(cfg, 'i')).toHaveLength(1); + const i = bindingIdx(cfg, 'i'); + expect(defsOf(cfg)).toContain(i); + const bodyBlock = cfg.blocks.find((b) => b.text.includes('use(i)')); + expect(bodyBlock!.statements!.some((f) => f.uses.includes(i))).toBe(true); + }); + + it('do-while condition created before body: body var def and condition use share one binding', () => { + const cfg = cfgOf(`function f() { + do { var x = step(); } while (x); + }`); + expect(bindingIdxs(cfg, 'x')).toHaveLength(1); + const x = bindingIdx(cfg, 'x'); + const condBlock = cfg.blocks.find((b) => b.text === 'x' || b.text === '(x)'); + expect(condBlock!.statements!.some((f) => f.uses.includes(x))).toBe(true); + }); + + it('switch body is ONE scope: let in one case resolves in a later case', () => { + const cfg = cfgOf(`function f(s) { + switch (s) { + case 1: let shared = 1; break; + case 2: 
use(shared); break; + } + }`); + expect(bindingIdxs(cfg, 'shared')).toHaveLength(1); + const shared = bindingIdx(cfg, 'shared'); + expect(defsOf(cfg)).toContain(shared); + expect(usesOf(cfg)).toContain(shared); + }); +}); + +describe('TS/JS def/use harvest — serialization', () => { + it('facts survive a JSON round-trip deep-equal (worker boundary shape)', () => { + const cfg = cfgOf(`function f(a) { + let x = a; + try { x += 1; } catch (e) { use(e); } finally { done(x); } + return x; + }`); + const trip = JSON.parse(JSON.stringify(cfg)) as FunctionCfg; + expect(trip).toEqual(cfg); + expect(trip.bindings).toBeDefined(); + expect(trip.blocks.every((b) => Array.isArray(b.statements))).toBe(true); + }); + + it('binding indices in facts are always in range of the binding table', () => { + const cfg = cfgOf(`function f(a, b) { + const c = a + b; + for (const k in a) { sink(k, c); } + }`); + const n = cfg.bindings!.length; + for (const f of allFacts(cfg)) { + for (const d of f.defs) expect(d).toBeGreaterThanOrEqual(0), expect(d).toBeLessThan(n); + for (const u of f.uses) expect(u).toBeGreaterThanOrEqual(0), expect(u).toBeLessThan(n); + } + }); +}); From a49d32db75b8b18d802ee7381a7fe34f6066eedb Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 10 Jun 2026 20:36:29 +0000 Subject: [PATCH 03/19] feat(cfg): add reaching-definitions solver with GEN/KILL fixpoint + statement sweep (#2082 U3) --- .../src/core/ingestion/cfg/reaching-defs.ts | 338 +++++++++++++++ gitnexus/test/unit/cfg/reaching-defs.test.ts | 398 ++++++++++++++++++ 2 files changed, 736 insertions(+) create mode 100644 gitnexus/src/core/ingestion/cfg/reaching-defs.ts create mode 100644 gitnexus/test/unit/cfg/reaching-defs.test.ts diff --git a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts new file mode 100644 index 0000000000..7225a0e31c --- /dev/null +++ b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts @@ -0,0 +1,338 @@ +/** + * Reaching definitions (#2082 
M2 U3) — classic GEN/KILL monotone fixpoint over + * one function's CFG, plus the canonical intra-block statement sweep that + * recovers statement-granular def→use facts from M1's coalesced blocks + * WITHOUT re-splitting the CFG. + * + * PURE AND DETERMINISTIC (load-bearing contract): + * - Pure function of its inputs — no graph, no logger (warnings are the + * caller's job), importable outside the worker. The M3 taint engine calls + * this same function in-phase (facts are recomputed on demand, never + * retained run-wide — the persisted REACHING_DEF edges are a bounded + * projection, never the taint substrate). + * - Deterministic — predecessors merge in sorted block-index order, + * insertion-ordered Maps/Sets throughout, and the output fact array is + * explicitly sorted. Snapshot tests and content-derived edge ids rely on it. + * + * COMPLEXITY DISCIPLINE (the four-times-repeated repo bug shape is per-item + * re-derivation inside the loop): def-sets are SHARED BY REFERENCE, never + * deep-copied — RD's kill is total per binding, so a transfer either aliases + * the incoming set or replaces it with a fresh singleton. Single-predecessor + * blocks alias the predecessor's OUT map outright; multi-pred merges union + * only bindings whose incoming sets differ by reference. Iteration is reverse + * post-order, seeded with every block (unreachable blocks keep ⊥ IN — correct, + * their defs reach nothing). Convergence: sets grow monotonically within the + * finite def-site universe ⇒ ≤ loop-depth+1 passes in practice. + * + * `limits.maxFacts` bounds materialization: facts are O(defs×uses) BY SPEC in + * merge-heavy code (N branch-arm defs × N later uses = N² facts), and a + * 2000-line function can spike 100k+ fact objects on the main thread. The + * emit path passes DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION (emit.ts); + * M3 passes its own large-but-finite limit and treats `status: 'truncated'` + * as a per-function taint-coverage gap. 
import type { BindingEntry, FunctionCfg } from './types.js';

/** A statement-granular program point within one function's CFG. */
export interface ProgramPoint {
  readonly blockIndex: number;
  /** Statement index within the block's `statements` array. */
  readonly stmtIndex: number;
  readonly line: number;
}

/** One def→use fact: the definition at `def` reaches the use at `use`. */
export interface DefUseFact {
  /** Index into {@link FunctionDefUse.bindings}. */
  readonly bindingIdx: number;
  readonly def: ProgramPoint;
  readonly use: ProgramPoint;
}

export interface ReachingDefsLimits {
  /**
   * Maximum number of facts to materialize; the sweep stops early and reports
   * `status: 'truncated'`. `undefined`/0 ⇒ unlimited.
   */
  readonly maxFacts?: number;
}

export interface FunctionDefUse {
  /**
   * `computed`  — full facts.
   * `no-facts`  — the CFG carries no binding table (hand-built or pre-M2
   *               side channel); empty facts, NOT an error.
   * `truncated` — `limits.maxFacts` hit; `facts` holds the first `maxFacts`
   *               facts found in sweep order (use sites in block/statement
   *               order), then sorted. Deterministic, but NOT a prefix of the
   *               full sorted fact list.
   */
  readonly status: 'computed' | 'no-facts' | 'truncated';
  /** Pass-through of the CFG's binding table (empty for `no-facts`). */
  readonly bindings: readonly BindingEntry[];
  /** Sorted by (def block, def stmt, use block, use stmt, binding). */
  readonly facts: readonly DefUseFact[];
  /** Total def / use sites seen (telemetry; independent of truncation). */
  readonly defCount: number;
  readonly useCount: number;
}

/** def-site key: packs (blockIndex, stmtIndex) into one number. */
const STMT_STRIDE = 1 << 16; // maxFunctionLines caps statements far below 65536
const defKey = (blockIndex: number, stmtIndex: number): number =>
  blockIndex * STMT_STRIDE + stmtIndex;

type DefSet = Set<number>;
/** bindingIdx → def-site keys reaching this program point. */
type Lattice = Map<number, DefSet>;

/** Shared bottom element. Never mutated — every writer copies first. */
const EMPTY_LATTICE: Lattice = new Map<number, DefSet>();

/** One incoming CFG edge, as seen from its target block. */
interface PredEdge {
  readonly from: number;
  /** True for `throw` edges, which also contribute IN(from) to the target. */
  readonly viaThrow: boolean;
}

/**
 * Compute reaching definitions for one function.
 *
 * Pure and deterministic: the result depends only on `cfg` and `limits`, and
 * the returned facts are explicitly sorted. Def-sets are shared by reference
 * between lattice maps and are never mutated in place.
 *
 * @param cfg    The function CFG. Without a `bindings` table the result is
 *               `status: 'no-facts'`. Edges with an out-of-range endpoint and
 *               an out-of-range `entryIndex` are tolerated (hand-built CFGs).
 * @param limits Optional cap on the number of materialized facts.
 * @returns The def→use facts plus def/use telemetry counts.
 */
export function computeReachingDefs(
  cfg: FunctionCfg,
  limits?: ReachingDefsLimits,
): FunctionDefUse {
  if (!cfg.bindings) {
    return { status: 'no-facts', bindings: [], facts: [], defCount: 0, useCount: 0 };
  }

  const blocks = cfg.blocks;
  const n = blocks.length;
  const inRange = (i: number): boolean => Number.isInteger(i) && i >= 0 && i < n;

  // ── adjacency (sorted for deterministic merges) ─────────────────────────
  // A `throw` edge contributes IN(from) ∪ OUT(from) to its handler, not just
  // OUT: an exception can fire BEFORE the faulting block's defs complete, so
  // OUT-only would falsely kill the pre-block defs on the exceptional path —
  // `let x = seed(); try { x = risky(); } catch { sink(x) }` must let the
  // seed def reach the sink (risky() may throw before assigning). Sound
  // over-approximation; monotone, so the fixpoint absorbs it.
  const preds: PredEdge[][] = Array.from({ length: n }, (): PredEdge[] => []);
  const succs: number[][] = Array.from({ length: n }, (): number[] => []);
  // Handlers whose IN depends on this block's IN (throw edges) — requeued on
  // IN change, since a genned binding can absorb IN growth without changing
  // OUT, which would otherwise leave the handler stale.
  const throwSuccs: number[][] = Array.from({ length: n }, (): number[] => []);
  for (const e of cfg.edges) {
    // Drop an edge entirely when EITHER endpoint is out of range. Dropping
    // only one side leaves a dangling successor/predecessor that crashes the
    // traversal below; this pure function also runs on hand-built CFGs that
    // never went through the emit path's validation.
    if (!inRange(e.from) || !inRange(e.to)) continue;
    const viaThrow = e.kind === 'throw';
    succs[e.from].push(e.to);
    preds[e.to].push({ from: e.from, viaThrow });
    if (viaThrow) throwSuccs[e.from].push(e.to);
  }
  for (const list of preds) {
    // Duplicate (from, throw + non-throw) pairs both survive — the throw leg
    // adds IN(from); the merge dedups set-wise.
    list.sort((a, b) => a.from - b.from || Number(a.viaThrow) - Number(b.viaThrow));
  }
  for (const list of succs) list.sort((a, b) => a - b);

  // ── per-block GEN (last def per binding) + def/use telemetry ────────────
  // gen[b]: bindingIdx → singleton def-set of the block's LAST def of it.
  const gen: (Lattice | null)[] = new Array<Lattice | null>(n).fill(null);
  const defLine = new Map<number, number>(); // defKey → source line
  let defCount = 0;
  let useCount = 0;
  for (const b of blocks) {
    const stmts = b.statements;
    if (!stmts || stmts.length === 0) continue;
    let g: Lattice | null = null;
    for (let i = 0; i < stmts.length; i++) {
      const s = stmts[i];
      useCount += s.uses.length;
      for (const d of s.defs) {
        defCount += 1;
        const key = defKey(b.index, i);
        defLine.set(key, s.line);
        if (!g) g = new Map<number, DefSet>();
        g.set(d, new Set<number>([key])); // later defs overwrite — kill is total
      }
    }
    gen[b.index] = g;
  }

  // ── iteration order: RPO over reachable blocks, then the rest by index ──
  const order = reversePostOrder(cfg.entryIndex, succs, n);

  // ── fixpoint ────────────────────────────────────────────────────────────
  const inSets: Lattice[] = new Array<Lattice>(n).fill(EMPTY_LATTICE);
  const outSets: Lattice[] = new Array<Lattice>(n).fill(EMPTY_LATTICE);
  const inWorklist: boolean[] = new Array<boolean>(n).fill(true);
  let pending = n;
  // Hoisted out of the sweep: one closure for the whole fixpoint instead of a
  // fresh allocation per visited block.
  const requeue = (s: number): void => {
    if (!inWorklist[s]) {
      inWorklist[s] = true;
      pending += 1;
    }
  };

  while (pending > 0) {
    for (const b of order) {
      if (!inWorklist[b]) continue;
      inWorklist[b] = false;
      pending -= 1;

      const p = preds[b];
      let inB: Lattice;
      if (p.length === 0) {
        inB = EMPTY_LATTICE;
      } else if (p.length === 1 && !p[0].viaThrow) {
        inB = outSets[p[0].from]; // alias — zero allocation on straight-line chains
      } else {
        inB = mergePreds(p, inSets, outSets);
      }
      const inChanged = !latticeEquals(inSets[b], inB);
      inSets[b] = inB;

      // OUT = overlay(IN) replacing only GEN'd bindings with singletons. When
      // nothing is genned, OUT aliases IN outright.
      const g = gen[b];
      let outB: Lattice;
      if (!g) {
        outB = inB;
      } else {
        outB = new Map<number, DefSet>(inB); // copies REFERENCES, never set contents
        for (const [bindingIdx, set] of g) outB.set(bindingIdx, set);
      }

      if (!latticeEquals(outSets[b], outB)) {
        outSets[b] = outB;
        for (const s of succs[b]) requeue(s);
      }
      if (inChanged) for (const s of throwSuccs[b]) requeue(s);
    }
  }

  // ── statement sweep: recover statement-granular def→use facts ───────────
  const maxFacts = limits?.maxFacts && limits.maxFacts > 0 ? limits.maxFacts : Infinity;
  const facts: DefUseFact[] = [];
  let truncated = false;

  outer: for (const b of blocks) {
    const stmts = b.statements;
    if (!stmts || stmts.length === 0) continue;
    // Lazy overlay of IN — entries are replaced (never mutated) on def, so the
    // shared sets stay intact.
    let reach: Lattice | null = null;
    for (let i = 0; i < stmts.length; i++) {
      const s = stmts[i];
      // Uses are resolved BEFORE this statement's own defs are applied, so
      // `x += 1` sees the prior defs of x, not itself.
      for (const u of s.uses) {
        const reaching = (reach ?? inSets[b.index]).get(u);
        if (!reaching) continue;
        for (const key of reaching) {
          if (facts.length >= maxFacts) {
            truncated = true;
            break outer;
          }
          facts.push({
            bindingIdx: u,
            def: {
              blockIndex: Math.floor(key / STMT_STRIDE),
              stmtIndex: key % STMT_STRIDE,
              line: defLine.get(key) ?? 0,
            },
            use: { blockIndex: b.index, stmtIndex: i, line: s.line },
          });
        }
      }
      if (s.defs.length > 0) {
        if (!reach) reach = new Map<number, DefSet>(inSets[b.index]);
        for (const d of s.defs) reach.set(d, new Set<number>([defKey(b.index, i)])); // kill + gen
      }
    }
  }

  facts.sort(
    (a, b) =>
      a.def.blockIndex - b.def.blockIndex ||
      a.def.stmtIndex - b.def.stmtIndex ||
      a.use.blockIndex - b.use.blockIndex ||
      a.use.stmtIndex - b.use.stmtIndex ||
      a.bindingIdx - b.bindingIdx,
  );

  return {
    status: truncated ? 'truncated' : 'computed',
    bindings: cfg.bindings,
    facts,
    defCount,
    useCount,
  };
}

/**
 * Reverse post-order over the blocks reachable from `entry`; blocks that are
 * not reachable are appended afterwards in ascending index order. An
 * out-of-range `entry` reaches nothing, so the result is simply `0..n-1`.
 */
function reversePostOrder(entry: number, succs: readonly number[][], n: number): number[] {
  const visited: boolean[] = new Array<boolean>(n).fill(false);
  const post: number[] = [];
  if (Number.isInteger(entry) && entry >= 0 && entry < n) {
    // Iterative DFS with an explicit (node, next-child cursor) stack. Each
    // successor list is already sorted, so children are explored in ascending
    // index order — determinism.
    const stack: { node: number; childIdx: number }[] = [{ node: entry, childIdx: 0 }];
    visited[entry] = true;
    while (stack.length > 0) {
      const top = stack[stack.length - 1];
      const children = succs[top.node];
      if (top.childIdx < children.length) {
        const next = children[top.childIdx];
        top.childIdx += 1;
        if (!visited[next]) {
          visited[next] = true;
          stack.push({ node: next, childIdx: 0 });
        }
      } else {
        post.push(top.node);
        stack.pop();
      }
    }
  }
  const order = post.reverse();
  for (let b = 0; b < n; b++) if (!visited[b]) order.push(b);
  return order;
}

/** Union predecessor lattices (OUT; plus IN for throw edges), sharing sets. */
function mergePreds(
  preds: readonly { from: number; viaThrow: boolean }[],
  inSets: readonly Lattice[],
  outSets: readonly Lattice[],
): Lattice {
  const merged: Lattice = new Map<number, DefSet>();
  const mergeOne = (source: Lattice): void => {
    for (const [bindingIdx, set] of source) {
      const existing = merged.get(bindingIdx);
      if (!existing) {
        merged.set(bindingIdx, set); // share the first contributor's set
      } else if (existing !== set) {
        // Union only when the references differ. Copy-on-extend: `existing`
        // may be a shared set from another block — never mutate it.
        let target = existing;
        let copied = false;
        for (const key of set) {
          if (!target.has(key)) {
            if (!copied) {
              target = new Set<number>(existing);
              copied = true;
            }
            target.add(key);
          }
        }
        if (copied) merged.set(bindingIdx, target);
      }
    }
  };
  for (const p of preds) {
    mergeOne(outSets[p.from]);
    if (p.viaThrow) mergeOne(inSets[p.from]); // exception may fire pre-defs
  }
  return merged;
}

/** Per-binding set equality of two lattices, with a reference fast path. */
function latticeEquals(a: Lattice, b: Lattice): boolean {
  if (a === b) return true;
  if (a.size !== b.size) return false;
  for (const [k, bSet] of b) {
    const aSet = a.get(k);
    if (aSet === bSet) continue;
    if (!aSet || aSet.size !== bSet.size) return false;
    for (const v of bSet) if (!aSet.has(v)) return false;
  }
  return true;
}
/** Shape of one hand-built block: an optional kind plus its statement facts. */
interface BlockSpec {
  readonly kind?: BasicBlockData['kind'];
  readonly stmts?: StatementFacts[];
}

/**
 * Build a minimal `FunctionCfg` literal for lattice tests.
 *
 * Block 0 defaults to the entry and block 1 to the exit; every other block
 * defaults to `normal`. All edges are `seq` edges, and every binding is a
 * `let` declared on line `index + 1`.
 */
function mkCfg(blocks: BlockSpec[], edges: [number, number][], bindings: string[]): FunctionCfg {
  const defaultKind = (index: number): BasicBlockData['kind'] => {
    if (index === 0) return 'entry';
    if (index === 1) return 'exit';
    return 'normal';
  };
  const bindingTable: BindingEntry[] = bindings.map((name, i) => ({
    name,
    declLine: i + 1,
    declColumn: 0,
    kind: 'let',
  }));
  return {
    filePath: 'hand.ts',
    functionStartLine: 1,
    functionEndLine: 99,
    functionStartColumn: 0,
    entryIndex: 0,
    exitIndex: 1,
    blocks: blocks.map((spec, index) => ({
      index,
      startLine: index + 1,
      endLine: index + 1,
      text: '',
      kind: spec.kind ?? defaultKind(index),
      statements: spec.stmts ?? [],
    })),
    edges: edges.map(([from, to]) => ({ from, to, kind: 'seq' }) as CfgEdgeData),
    bindings: bindingTable,
  };
}

/** One statement-facts record; `defs`/`uses` are binding indices. */
const stmt = (line: number, defs: number[] = [], uses: number[] = []): StatementFacts => ({
  line,
  defs,
  uses,
});

/** Compact "defBlock:defStmt->useBlock:useStmt:binding" rendering for asserts. */
const render = (facts: readonly DefUseFact[]): string[] =>
  facts.map(
    (f) =>
      `${f.def.blockIndex}:${f.def.stmtIndex}->${f.use.blockIndex}:${f.use.stmtIndex}:${f.bindingIdx}`,
  );

// ── parser-direct helpers (shadowing / finally acceptance) ──────────────────

/**
 * Parse `code` and build the CFG of the `index`-th function found in it
 * (source order).
 *
 * @throws Error when there is no function at `index` or the visitor returns
 *   no CFG.
 */
function cfgOf(code: string, index = 0): FunctionCfg {
  const parser = new Parser();
  parser.setLanguage(TypeScript.typescript);
  const root: SyntaxNode = parser.parse(code).rootNode;
  const fns: SyntaxNode[] = [];
  const pending: SyntaxNode[] = [root];
  // Pre-order walk; children are pushed last-to-first so they pop in source order.
  for (let node = pending.pop(); node !== undefined; node = pending.pop()) {
    if (TS_FUNCTION_TYPES.has(node.type)) fns.push(node);
    for (let i = node.namedChildCount - 1; i >= 0; i--) {
      const child = node.namedChild(i);
      if (child) pending.push(child);
    }
  }
  const fn = fns[index];
  // Fail with a clear message rather than handing `undefined` to the visitor.
  if (!fn) throw new Error(`no function at index ${index}`);
  const cfg = visitor.buildFunctionCfg(fn, 'fixture.ts');
  if (!cfg) throw new Error('no cfg');
  return cfg;
}

/** Binding indices of every entry named `name`, in binding-table order. */
const nameIdx = (cfg: FunctionCfg, name: string): number[] =>
  (cfg.bindings ?? []).map((b, i) => (b.name === name ? i : -1)).filter((i) => i >= 0);
expect(render(r.facts)).toEqual(['2:1->2:0:0']); + }); + + it('unreachable block: its defs reach nothing; reachable uses see only reachable defs', () => { + // 2(def x)→3(use x); 4 is DISCONNECTED and also defs x + const cfg = mkCfg( + [{}, {}, { stmts: [stmt(10, [0])] }, { stmts: [stmt(20, [], [0])] }, { stmts: [stmt(30, [0])] }], + [ + [0, 2], + [2, 3], + [3, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + expect(render(r.facts)).toEqual(['2:0->3:0:0']); + }); + + it('intra-block sweep: a use BEFORE the same-block def sees the incoming def', () => { + // 2: def x. 3: use x (stmt0); def x (stmt1); use x (stmt2) + const cfg = mkCfg( + [{}, {}, { stmts: [stmt(10, [0])] }, { stmts: [stmt(20, [], [0]), stmt(21, [0]), stmt(22, [], [0])] }], + [ + [0, 2], + [2, 3], + [3, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + expect(render(r.facts).sort()).toEqual(['2:0->3:0:0', '3:1->3:2:0']); + }); + + it('def+use in one statement (x += 1): the use sees PRIOR defs, not its own', () => { + const cfg = mkCfg( + [{}, {}, { stmts: [stmt(10, [0]), stmt(11, [0], [0])] }], + [ + [0, 2], + [2, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + expect(render(r.facts)).toEqual(['2:0->2:1:0']); + }); +}); + +describe('computeReachingDefs — determinism and convergence', () => { + it('permuted edge order produces byte-identical sorted facts', () => { + const blocks: BlockSpec[] = [ + {}, + {}, + { stmts: [stmt(1, [0]), stmt(2, [1])] }, + { stmts: [stmt(3, [0], [1])] }, + { stmts: [stmt(4, [1], [0])] }, + { stmts: [stmt(5, [], [0, 1])] }, + ]; + const edges: [number, number][] = [ + [0, 2], + [2, 3], + [2, 4], + [3, 5], + [4, 5], + [5, 3], + [5, 1], + ]; + const base = computeReachingDefs(mkCfg(blocks, edges, ['x', 'y'])); + for (let i = 0; i < 5; i++) { + const shuffled = [...edges].reverse(); + shuffled.push(shuffled.shift() as [number, number]); + const r = computeReachingDefs(mkCfg(blocks, shuffled, ['x', 'y'])); + 
expect(render(r.facts)).toEqual(render(base.facts)); + } + }); + + it('nested loops (depth 3) converge with loop-carried defs intact', () => { + // 2 chains into three nested loop headers 3,4,5; innermost body 6 defs x. + const cfg = mkCfg( + [ + {}, + {}, + { stmts: [stmt(1, [0])] }, + { stmts: [stmt(2, [], [0])] }, + { stmts: [stmt(3, [], [0])] }, + { stmts: [stmt(4, [], [0])] }, + { stmts: [stmt(5, [0])] }, + ], + [ + [0, 2], + [2, 3], + [3, 4], + [4, 5], + [5, 6], + [6, 5], + [5, 4], + [4, 3], + [3, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + // every header use sees both the init def and the innermost redef + for (const useBlock of [3, 4, 5]) { + const defs = r.facts.filter((f) => f.use.blockIndex === useBlock).map((f) => f.def.blockIndex); + expect(new Set(defs)).toEqual(new Set([2, 6])); + } + }); + + it('no-facts fallback: a CFG without statement facts reports no-facts, no throw', () => { + const bare: FunctionCfg = { + filePath: 'hand.ts', + functionStartLine: 1, + functionEndLine: 2, + functionStartColumn: 0, + entryIndex: 0, + exitIndex: 1, + blocks: [ + { index: 0, startLine: 1, endLine: 1, text: '', kind: 'entry' }, + { index: 1, startLine: 2, endLine: 2, text: '', kind: 'exit' }, + ], + edges: [{ from: 0, to: 1, kind: 'seq' }], + }; + const r = computeReachingDefs(bare); + expect(r.status).toBe('no-facts'); + expect(r.facts).toEqual([]); + }); + + it('maxFacts truncation: deterministic prefix + truncated status', () => { + // fan-out: 4 defs of x in parallel arms, then 4 uses → 16 facts + const arms = [2, 3, 4, 5]; + const uses = [6, 7, 8, 9]; + const blocks: BlockSpec[] = [{}, {}]; + for (const a of arms) blocks[a] = { stmts: [stmt(a, [0])] }; + for (const u of uses) blocks[u] = { stmts: [stmt(u, [], [0])] }; + const edges: [number, number][] = []; + for (const a of arms) edges.push([0, a], [a, 6]); + edges.push([6, 7], [7, 8], [8, 9], [9, 1]); + const full = computeReachingDefs(mkCfg(blocks, edges, ['x'])); + 
expect(full.status).toBe('computed'); + expect(full.facts).toHaveLength(16); + + const capped = computeReachingDefs(mkCfg(blocks, edges, ['x']), { maxFacts: 5 }); + expect(capped.status).toBe('truncated'); + expect(capped.facts).toHaveLength(5); + // deterministic prefix: re-running yields the same truncated set + const again = computeReachingDefs(mkCfg(blocks, edges, ['x']), { maxFacts: 5 }); + expect(render(again.facts)).toEqual(render(capped.facts)); + // telemetry counts are truncation-independent + expect(capped.defCount).toBe(full.defCount); + expect(capped.useCount).toBe(full.useCount); + }); +}); + +describe('computeReachingDefs — parser-direct acceptance (with U1/U2)', () => { + it('shadowing: inner let does NOT kill the outer binding across the block (R4)', () => { + const cfg = cfgOf(`function f() { + let x = 1; + { let x = 2; sink(x); } + sink(x); + }`); + const [outer, inner] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const outerUse = r.facts.filter((f) => f.bindingIdx === outer); + const innerUse = r.facts.filter((f) => f.bindingIdx === inner); + expect(innerUse).toHaveLength(1); + expect(outerUse).toHaveLength(1); + // the trailing sink(x) sees the OUTER def — the inner block didn't kill it + expect(outerUse[0].def.line).toBe(2); + expect(outerUse[0].use.line).toBe(4); + }); + + it('try/catch over-approximation: a try-body def reaches a catch-body use (R10)', () => { + const cfg = cfgOf(`function f() { + let x = seed(); + try { x = risky(); } catch (e) { sink(x); } + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const catchUses = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 3); + // BOTH the seed def and the try-body redef may reach the catch use + expect(new Set(catchUses.map((f) => f.def.line))).toEqual(new Set([2, 3])); + }); + + it('finally redefinition on the early-exit/normal paths kills the original (R9 + U2)', () => { + const cfg = cfgOf(`function f(c) { + let x = 1; + try { + if 
(c) { return probe(x); } + } finally { + x = 2; + } + return sink(x); + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + // the early return's use happens BEFORE finally runs → sees x = 1 (line 2) + const probeUse = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 4); + expect(probeUse.map((f) => f.def.line)).toEqual([2]); + // the post-try use sits behind the finally on EVERY path → sees ONLY x = 2 + const sinkUse = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 8); + expect(sinkUse.map((f) => f.def.line)).toEqual([6]); + }); + + it('params reach their uses from the ENTRY record', () => { + const cfg = cfgOf(`function f(a) { return a + 1; }`); + const [a] = nameIdx(cfg, 'a'); + const r = computeReachingDefs(cfg); + const fact = r.facts.find((f) => f.bindingIdx === a); + expect(fact).toBeDefined(); + expect(fact!.def.blockIndex).toBe(cfg.entryIndex); + }); + + it('loop-carried accumulator: both the init and in-loop defs reach the post-loop use', () => { + const cfg = cfgOf(`function f(xs) { + let sum = 0; + for (const x of xs) { sum += x; } + return sum; + }`); + const [sum] = nameIdx(cfg, 'sum'); + const r = computeReachingDefs(cfg); + const retUse = r.facts.filter((f) => f.bindingIdx === sum && f.use.line === 4); + expect(new Set(retUse.map((f) => f.def.line))).toEqual(new Set([2, 3])); + }); +}); From 8f2ffb8a5d896aa8e7e5f9fdcd91676e160a73c7 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 10 Jun 2026 20:53:42 +0000 Subject: [PATCH 04/19] feat(cfg): persist budgeted REACHING_DEF projection with RepoMeta coherence (#2082 U4) --- gitnexus/src/core/ingestion/cfg/emit.ts | 195 +++++++++++++++++- gitnexus/src/core/ingestion/pipeline.ts | 9 + .../scope-resolution/pipeline/phase.ts | 1 + .../scope-resolution/pipeline/run.ts | 46 ++++- gitnexus/src/core/run-analyze.ts | 26 ++- gitnexus/src/storage/repo-manager.ts | 8 + .../test/integration/cfg/cfg-emit.test.ts | 161 ++++++++++++++- 
.../integration/cfg/worker-roundtrip.test.ts | 23 +++ gitnexus/test/unit/cfg/emit-guard.test.ts | 122 +++++++++++ gitnexus/test/unit/pdg-mode-flip.test.ts | 33 ++- 10 files changed, 615 insertions(+), 9 deletions(-) diff --git a/gitnexus/src/core/ingestion/cfg/emit.ts b/gitnexus/src/core/ingestion/cfg/emit.ts index 6531b723ed..18afd897b8 100644 --- a/gitnexus/src/core/ingestion/cfg/emit.ts +++ b/gitnexus/src/core/ingestion/cfg/emit.ts @@ -20,7 +20,8 @@ */ import type { KnowledgeGraph } from '../../graph/types.js'; import { generateId } from '../../../lib/utils.js'; -import type { FunctionCfg } from './types.js'; +import { computeReachingDefs } from './reaching-defs.js'; +import type { BindingEntry, FunctionCfg } from './types.js'; /** * Default per-function CFG edge cap. A pathological generated function could @@ -31,6 +32,25 @@ import type { FunctionCfg } from './types.js'; */ export const DEFAULT_MAX_CFG_EDGES_PER_FUNCTION = 5000; +/** + * Default per-function REACHING_DEF edge cap (#2082 M2 KTD9). 4000 mirrors + * Joern's per-method `maxNumberOfDefinitions` — the closest production prior + * art — but truncates-and-warns instead of silently skipping the function. + * Counts (defBlock, useBlock, binding) DEDUPED edges, not statement-level + * facts. `0` ⇒ unlimited; `undefined` ⇒ this default. + */ +export const DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION = 4000; + +/** + * Fact-materialization limit handed to {@link computeReachingDefs} on the + * emit path (#2082 M2 U3/F3): facts are O(defs×uses) BY SPEC in merge-heavy + * code, and the edge cap alone bounds the GRAPH, not the per-function memory + * spike of materializing facts before dedup. 4× the edge cap leaves dedup + * headroom. Scales with a custom edge cap; unlimited when the edge cap is 0. 
+ */ +export const DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION = + 4 * DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION; + export interface CfgEmitResult { blocks: number; edges: number; @@ -78,7 +98,41 @@ export const isEmitSafeCfg = (cfg: FunctionCfg | undefined | null): cfg is Funct if (!Number.isInteger(b?.index)) return false; blockIndices.add(b.index); } - return cfg.edges.every((e) => blockIndices.has(e?.from) && blockIndices.has(e?.to)); + if (!cfg.edges.every((e) => blockIndices.has(e?.from) && blockIndices.has(e?.to))) return false; + + // M2 (#2082 U1/U4): the binding table + statement facts join the + // REACHING_DEF id path (binding name/declLine/declColumn template into edge + // ids; statement defs/uses index into the table). A statement record whose + // index escapes the table would silently fabricate `undefined`-keyed edge + // ids, so indices are checked for RANGE, not just integer-ness. A channel + // with statements but NO binding table is malformed by construction. + const bindings = cfg.bindings; + if (bindings !== undefined) { + if (!Array.isArray(bindings)) return false; + for (const b of bindings) { + if ( + typeof b?.name !== 'string' || + !Number.isInteger(b.declLine) || + !Number.isInteger(b.declColumn) + ) { + return false; + } + } + } + const bindingCount = bindings?.length ?? 
0; + for (const b of cfg.blocks) { + const stmts = b.statements; + if (stmts === undefined) continue; + if (bindings === undefined || !Array.isArray(stmts)) return false; + for (const s of stmts) { + if (!Number.isInteger(s?.line) || !Array.isArray(s.defs) || !Array.isArray(s.uses)) { + return false; + } + const inRange = (i: number): boolean => Number.isInteger(i) && i >= 0 && i < bindingCount; + if (!s.defs.every(inRange) || !s.uses.every(inRange)) return false; + } + } + return true; }; /** @@ -145,3 +199,140 @@ export function emitFileCfgs( return result; } + +export interface ReachingDefEmitResult { + /** Deduped (defBlock, useBlock, binding) edges persisted. */ + edges: number; + /** Deduped edges dropped by the per-function edge cap. */ + droppedEdges: number; + cappedFunctions: number; + /** Functions whose FACT materialization hit the solver's maxFacts limit. */ + truncatedFunctions: number; + /** Total statement-level facts the solver produced (pre-dedup telemetry). */ + facts: number; + /** Aggregate solve+dedup time in ms (PROF support). */ + solveMs: number; +} + +/** + * Stable identity for a binding inside edge ids (#2082 M2 KTD3/KTD9): + * `name:declLine:declCol` for declared bindings, `name@module` for synthetic + * ones. Distinct same-name bindings never share a key; identifier characters + * cannot contain the id separators. + */ +const bindingKey = (b: BindingEntry): string => + b.synthetic ? `${b.name}@module` : `${b.name}:${b.declLine}:${b.declColumn}`; + +/** + * Compute reaching definitions per function and persist the bounded + * REACHING_DEF projection (#2082 M2 U4). + * + * Facts are DEDUPED to (defBlock, useBlock, binding) before budgeting — the + * persisted columns (`from,to,type,confidence,reason,step`; relationship ids + * are in-memory-only, the CodeRelation table has no id column) cannot + * distinguish finer rows, so statement-indexed ids would only manufacture + * byte-identical duplicate rows that burn budget. 
Statement granularity lives + * in the in-memory {@link computeReachingDefs} result, which the M3 taint + * engine recomputes on demand — the budget here governs only this projection + * and can never drop a taint fact. + * + * R7 (no silent truncation) covers BOTH layers: the per-function edge cap AND + * the solver's fact-materialization limit (which can fire without the edge + * cap ever being reached, since dedup is many-to-one) each produce one + * unconditional `onWarn`. The edge-cap warn names the top bindings by fact + * count — overflow is almost always one variable, which is exactly the datum + * M3 tuning wants. + */ +export function emitFileReachingDefs( + graph: KnowledgeGraph, + cfgs: readonly FunctionCfg[], + maxEdgesPerFunction: number = DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, + onWarn?: (message: string) => void, +): ReachingDefEmitResult { + const result: ReachingDefEmitResult = { + edges: 0, + droppedEdges: 0, + cappedFunctions: 0, + truncatedFunctions: 0, + facts: 0, + solveMs: 0, + }; + const cap = maxEdgesPerFunction > 0 ? maxEdgesPerFunction : Infinity; + const maxFacts = Number.isFinite(cap) ? (cap as number) * 4 : 0; // 0 ⇒ unlimited + + for (const cfg of cfgs) { + const t0 = performance.now(); + const r = computeReachingDefs(cfg, { maxFacts }); + if (r.status === 'no-facts') { + result.solveMs += performance.now() - t0; + continue; + } + result.facts += r.facts.length; + + const { filePath, functionStartLine, functionStartColumn } = cfg; + if (r.status === 'truncated') { + result.truncatedFunctions++; + onWarn?.( + `[reaching-defs] ${filePath}:${functionStartLine}: fact materialization ` + + `limit (${maxFacts}) reached — facts beyond it were not computed; ` + + `the persisted REACHING_DEF projection for this function is sparse`, + ); + } + + // Dedup to (defBlock, useBlock, binding) — facts arrive sorted, so the + // deduped order (and therefore cap truncation) is deterministic. 
+ const seen = new Set<string>(); + const deduped: { defBlock: number; useBlock: number; bindingIdx: number }[] = []; + const factsPerBinding = new Map<number, number>(); + for (const f of r.facts) { + factsPerBinding.set(f.bindingIdx, (factsPerBinding.get(f.bindingIdx) ?? 0) + 1); + const key = `${f.def.blockIndex}:${f.use.blockIndex}:${f.bindingIdx}`; + if (seen.has(key)) continue; + seen.add(key); + deduped.push({ defBlock: f.def.blockIndex, useBlock: f.use.blockIndex, bindingIdx: f.bindingIdx }); + } + result.solveMs += performance.now() - t0; + + let emittedForFn = 0; + for (const edge of deduped) { + if (emittedForFn >= cap) { + const dropped = deduped.length - emittedForFn; + result.droppedEdges += dropped; + result.cappedFunctions++; + const top = [...factsPerBinding.entries()] + .sort((a, b) => b[1] - a[1] || a[0] - b[0]) + .slice(0, 2) + .map(([idx, count]) => `${r.bindings[idx]?.name ?? `#${idx}`}(${count} facts)`) + .join(', '); + onWarn?.( + `[reaching-defs] ${filePath}:${functionStartLine}: per-function ` + + `REACHING_DEF edge cap (${maxEdgesPerFunction}) reached — dropped ` + + `${dropped} of ${deduped.length} edges; top bindings: ${top}`, + ); + break; + } + const binding = r.bindings[edge.bindingIdx]; + const sourceId = basicBlockId(filePath, functionStartLine, functionStartColumn, edge.defBlock); + const targetId = basicBlockId(filePath, functionStartLine, functionStartColumn, edge.useBlock); + graph.addRelationship({ + // Single function anchor — the two block ids share it, so templating + // it once halves the id size (ids are in-memory-only but ~4000 of + // them per capped function is real transient heap).
+ id: generateId( + 'REACHING_DEF', + `${filePath}:${functionStartLine}:${functionStartColumn}:` + + `${edge.defBlock}->${edge.useBlock}:${bindingKey(binding)}`, + ), + type: 'REACHING_DEF', + sourceId, + targetId, + confidence: 1.0, + reason: binding.name, // plain source-level name (M0/S1 verdict) — queryable + }); + result.edges++; + emittedForFn++; + } + } + + return result; +} diff --git a/gitnexus/src/core/ingestion/pipeline.ts b/gitnexus/src/core/ingestion/pipeline.ts index 32216645e8..f1f8326321 100644 --- a/gitnexus/src/core/ingestion/pipeline.ts +++ b/gitnexus/src/core/ingestion/pipeline.ts @@ -73,6 +73,15 @@ export interface PipelineOptions { * silent truncation). No CLI flag in M1 — programmatic / server path only. */ pdgMaxEdgesPerFunction?: number; + /** + * Per-function REACHING_DEF edge cap for the scope-resolution emit step + * (#2082 M2). `undefined` ⇒ `DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION` + * (4000); `0` ⇒ no cap (unlimited). Emit-time-only — NOT folded into the + * parse-cache chunk key (the worker never sees it); recorded in + * `RepoMeta.pdg` so a cap change forces a full writeback. No CLI flag — + * programmatic / server path only, like the M1 caps. + */ + pdgMaxReachingDefEdgesPerFunction?: number; /** * Request parsing with the worker pool disabled. The sequential parser was * removed — the worker pool is the sole parse path — so setting this now diff --git a/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts b/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts index ec72c76ee7..7d0fd811e0 100644 --- a/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts +++ b/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts @@ -353,6 +353,7 @@ export const scopeResolutionPhase: PipelinePhase = { // CFG/PDG emission (#2081 M1) — opt-in; off ⇒ byte-identical graph. 
pdg: ctx.options?.pdg === true, pdgMaxEdgesPerFunction: ctx.options?.pdgMaxEdgesPerFunction, + pdgMaxReachingDefEdgesPerFunction: ctx.options?.pdgMaxReachingDefEdgesPerFunction, recordResolutionOutcome: (outcome) => { resolutionOutcomes.push(outcome); }, diff --git a/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts b/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts index d937b05b98..a0ea40a1f7 100644 --- a/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts +++ b/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts @@ -34,7 +34,13 @@ import { extractParsedFile } from '../../scope-extractor-bridge.js'; import { finalizeScopeModel } from '../../finalize-orchestrator.js'; import { resolveReferenceSites, type ResolveStats } from '../../resolve-references.js'; import { buildGraphNodeLookup } from '../graph-bridge/node-lookup.js'; -import { emitFileCfgs, isEmitSafeCfg, DEFAULT_MAX_CFG_EDGES_PER_FUNCTION } from '../../cfg/emit.js'; +import { + emitFileCfgs, + emitFileReachingDefs, + isEmitSafeCfg, + DEFAULT_MAX_CFG_EDGES_PER_FUNCTION, + DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, +} from '../../cfg/emit.js'; import type { FunctionCfg } from '../../cfg/types.js'; import { resolveDefGraphId } from '../graph-bridge/ids.js'; import { buildPopulatedMethodDispatch } from '../graph-bridge/method-dispatch.js'; @@ -264,6 +270,9 @@ interface RunScopeResolutionInput { /** Per-function CFG edge cap. `undefined` ⇒ {@link DEFAULT_MAX_CFG_EDGES_PER_FUNCTION}; * `0` ⇒ no cap (unlimited). */ readonly pdgMaxEdgesPerFunction?: number; + /** Per-function REACHING_DEF edge cap (#2082 M2). `undefined` ⇒ + * {@link DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION}; `0` ⇒ no cap. */ + readonly pdgMaxReachingDefEdgesPerFunction?: number; /** * Optional graph-node lookup built ONCE by the caller and shared across * every language pass. 
`buildGraphNodeLookup` scans the whole graph and is @@ -698,10 +707,20 @@ export function runScopeResolution( // disk store is cleared right after this orchestrator returns, see phase.ts). // A post-`mro` phase would read empty data (KTD1). Off by default ⇒ zero // BasicBlock/CFG nodes/edges and a byte-identical graph. + // Accumulated M2 reaching-defs time (solve + dedup + REACHING_DEF emit), + // reported as the PROF `pdg=` segment. It is a SUBSET of `emit=` — the M1 + // CFG emit and the M2 solve interleave per file, so a separate checkpoint + // pair can't bracket them; without this accumulator the M2 cost would + // silently disappear into `emit=` and field regressions would be invisible. + let pdgMs = 0; if (input.pdg === true) { let cfgBlocks = 0; let cfgEdges = 0; let cfgDroppedEdges = 0; + let rdEdges = 0; + let rdDropped = 0; + let rdFacts = 0; + let rdTruncated = 0; for (const pf of emitParsedFiles) { const cfgs = pf.cfgSideChannel; // Defensive: cfgSideChannel is opaque (`unknown`) and crosses the cache / @@ -739,6 +758,24 @@ export function runScopeResolution( cfgBlocks += emitted.blocks; cfgEdges += emitted.edges; cfgDroppedEdges += emitted.droppedEdges; + + // M2 (#2082 U4): reaching definitions over the same validated CFGs. + // In-memory facts are computed per function and dropped after the + // bounded (defBlock, useBlock, binding) projection is persisted — + // M3 recomputes via the same pure solver in-phase (KTD8). + const t0 = performance.now(); + const rd = emitFileReachingDefs( + graph, + wellFormed, + input.pdgMaxReachingDefEdgesPerFunction ?? 
+ DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, + (message) => logger.warn(message), // unconditional — R7, both layers + ); + pdgMs += performance.now() - t0; + rdEdges += rd.edges; + rdDropped += rd.droppedEdges; + rdFacts += rd.facts; + rdTruncated += rd.truncatedFunctions; } catch (err) { // Last-resort isolation, mirroring the worker-side per-file try/catch: // a shape the predicate misses must cost this one file's CFG, not @@ -757,7 +794,10 @@ export function runScopeResolution( logger.debug( `[scope-resolution] CFG emit (lang=${provider.language}): ` + `${cfgBlocks} BasicBlock nodes, ${cfgEdges} CFG edges` + - (cfgDroppedEdges > 0 ? `, ${cfgDroppedEdges} edges dropped (per-function cap)` : ''), + (cfgDroppedEdges > 0 ? `, ${cfgDroppedEdges} edges dropped (per-function cap)` : '') + + `; ${rdEdges} REACHING_DEF edges (${rdFacts} facts)` + + (rdDropped > 0 ? `, ${rdDropped} REACHING_DEF edges dropped (per-function cap)` : '') + + (rdTruncated > 0 ? `, ${rdTruncated} function(s) hit the fact limit` : ''), ); } } @@ -771,6 +811,8 @@ export function runScopeResolution( ` propagate=${ns(tFinalize, tPropagate).toFixed(0)}ms` + ` resolve=${ns(tPropagate, tResolve).toFixed(0)}ms` + ` emit=${ns(tResolve, tEnd).toFixed(0)}ms` + + // pdg ⊆ emit: the M2 reaching-defs share of the emit bucket (#2082 U4). + (input.pdg === true ? 
`pdg=${pdgMs.toFixed(0)}ms` : '') + ` total=${ns(tStart, tEnd).toFixed(0)}ms` + ` (${parsedFiles.length} files)`, ); diff --git a/gitnexus/src/core/run-analyze.ts b/gitnexus/src/core/run-analyze.ts index f8fe4163e3..38740f7083 100644 --- a/gitnexus/src/core/run-analyze.ts +++ b/gitnexus/src/core/run-analyze.ts @@ -45,7 +45,10 @@ import { type RepoMeta, } from '../storage/repo-manager.js'; import { DEFAULT_PDG_MAX_FUNCTION_LINES } from './ingestion/cfg/collect.js'; -import { DEFAULT_MAX_CFG_EDGES_PER_FUNCTION } from './ingestion/cfg/emit.js'; +import { + DEFAULT_MAX_CFG_EDGES_PER_FUNCTION, + DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, +} from './ingestion/cfg/emit.js'; import { computeFileHashes, diffFileHashes } from '../storage/file-hash.js'; import { extractChangedSubgraph, @@ -135,6 +138,9 @@ export interface AnalyzeOptions { pdgMaxFunctionLines?: number; /** Per-function CFG edge cap. Forwarded to `PipelineOptions.pdgMaxEdgesPerFunction`. */ pdgMaxEdgesPerFunction?: number; + /** Per-function REACHING_DEF edge cap (#2082 M2). Forwarded to + * `PipelineOptions.pdgMaxReachingDefEdgesPerFunction`. */ + pdgMaxReachingDefEdgesPerFunction?: number; /** * Default branch threaded into generated AGENTS.md / CLAUDE.md so the * regression-compare example uses the configured branch instead of a @@ -335,13 +341,19 @@ export const collectBranchCacheKeys = async ( * defaults so an explicit-default run compares equal to a default run * (`0` = unlimited is preserved as `0`). Pure + exported for testing. */ -type PdgOptions = Pick<AnalyzeOptions, 'pdg' | 'pdgMaxFunctionLines' | 'pdgMaxEdgesPerFunction'>; +type PdgOptions = Pick< + AnalyzeOptions, + 'pdg' | 'pdgMaxFunctionLines' | 'pdgMaxEdgesPerFunction' | 'pdgMaxReachingDefEdgesPerFunction' +>; export const resolvePdgConfig = (options: PdgOptions): RepoMeta['pdg'] => options.pdg === true ? { maxFunctionLines: options.pdgMaxFunctionLines ?? DEFAULT_PDG_MAX_FUNCTION_LINES, maxEdgesPerFunction: options.pdgMaxEdgesPerFunction ??
DEFAULT_MAX_CFG_EDGES_PER_FUNCTION, + maxReachingDefEdgesPerFunction: + options.pdgMaxReachingDefEdgesPerFunction ?? + DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, } : undefined; @@ -360,7 +372,14 @@ export const pdgModeMismatch = (recorded: RepoMeta['pdg'], options: PdgOptions): if (!requested || !recorded) return true; return ( requested.maxFunctionLines !== recorded.maxFunctionLines || - requested.maxEdgesPerFunction !== recorded.maxEdgesPerFunction + requested.maxEdgesPerFunction !== recorded.maxEdgesPerFunction || + // M2 (#2082): an M1-era stamp has NO maxReachingDefEdgesPerFunction — + // `4000 !== undefined` trips here, which is what makes an M1→M2 upgrade + // force the full writeback that populates REACHING_DEF rows without + // `--force`. The comparator is field-wise on purpose; new emit-affecting + // knobs MUST join it (a knob the comparator misses silently strands a + // stale projection). + requested.maxReachingDefEdgesPerFunction !== recorded.maxReachingDefEdgesPerFunction ); }; @@ -730,6 +749,7 @@ export async function runFullAnalysis( pdg: options.pdg === true, pdgMaxFunctionLines: options.pdgMaxFunctionLines, pdgMaxEdgesPerFunction: options.pdgMaxEdgesPerFunction, + pdgMaxReachingDefEdgesPerFunction: options.pdgMaxReachingDefEdgesPerFunction, fetchWrappers: options.fetchWrappers, }, ); diff --git a/gitnexus/src/storage/repo-manager.ts b/gitnexus/src/storage/repo-manager.ts index 80e815ebfa..5e9fd972f8 100644 --- a/gitnexus/src/storage/repo-manager.ts +++ b/gitnexus/src/storage/repo-manager.ts @@ -150,6 +150,14 @@ export interface RepoMeta { maxFunctionLines: number; /** Emit-side per-function CFG edge cap, resolved (0 = unlimited). */ maxEdgesPerFunction: number; + /** + * Emit-side per-function REACHING_DEF edge cap, resolved (0 = unlimited; + * #2082 M2). ABSENT on an M1-era stamp — which is exactly what makes + * `pdgModeMismatch` trip on the first M2 run over an M1 index and force + * the full writeback that populates REACHING_DEF rows. 
Optional in the + * type for that reason; resolved (always present) on every M2+ write. + */ + maxReachingDefEdgesPerFunction?: number; }; } diff --git a/gitnexus/test/integration/cfg/cfg-emit.test.ts b/gitnexus/test/integration/cfg/cfg-emit.test.ts index d90c21210e..98efe908ae 100644 --- a/gitnexus/test/integration/cfg/cfg-emit.test.ts +++ b/gitnexus/test/integration/cfg/cfg-emit.test.ts @@ -2,7 +2,7 @@ import { describe, it, expect, vi } from 'vitest'; import Parser from 'tree-sitter'; import TypeScript from 'tree-sitter-typescript'; import { collectFunctionCfgs } from '../../../src/core/ingestion/cfg/collect.js'; -import { emitFileCfgs } from '../../../src/core/ingestion/cfg/emit.js'; +import { emitFileCfgs, emitFileReachingDefs } from '../../../src/core/ingestion/cfg/emit.js'; import { getProvider } from '../../../src/core/ingestion/languages/index.js'; import { SupportedLanguages } from '../../../src/config/supported-languages.js'; import type { CfgVisitor, FunctionCfg } from '../../../src/core/ingestion/cfg/types.js'; @@ -179,3 +179,162 @@ describe('U4 — flag-off / empty input emits nothing', () => { expect(r.edges).toBe(0); }); }); + +describe('U4 (#2082 M2) — emitFileReachingDefs', () => { + it('persists deduped (blockPair, binding) edges with reason = plain variable name', () => { + const cfgs = cfgsOf( + `function f(a) { + let x = a; + x = x + 1; + return sink(x); + }`, + 'src/rd.ts', + ); + const { graph, rels } = recordingGraph(); + const r = emitFileReachingDefs(graph, cfgs); + expect(r.edges).toBe(rels.length); + expect(rels.length).toBeGreaterThan(0); + for (const e of rels) { + expect(e.type).toBe('REACHING_DEF'); + expect(e.sourceId).toMatch(/^BasicBlock:src\/rd\.ts:\d+:\d+:\d+$/); + expect(e.targetId).toMatch(/^BasicBlock:src\/rd\.ts:\d+:\d+:\d+$/); + } + // reason carries the plain source-level name (M0/S1 verdict) + const reasons = new Set(rels.map((e) => e.reason)); + expect(reasons.has('x')).toBe(true); + expect(reasons.has('a')).toBe(true); + 
}); + + it('same block pair, two bindings → two distinct edges (id collision-proofing)', () => { + const cfgs = cfgsOf(`function f(a, b) { const c = a + b; use(c); }`, 'two.ts'); + const { graph, rels } = recordingGraph(); + emitFileReachingDefs(graph, cfgs); + const ids = rels.map((e) => e.id); + expect(new Set(ids).size).toBe(ids.length); + // a and b both flow ENTRY→body: same block pair, distinct edges by binding + const entryToBody = rels.filter((e) => e.reason === 'a' || e.reason === 'b'); + expect(entryToBody.length).toBeGreaterThanOrEqual(2); + }); + + it('N statement-level facts on one (blockPair, binding) collapse to ONE edge', () => { + // x defined once, used three times in the same straight-line block: three + // facts, one persisted edge (the persisted columns cannot distinguish). + const cfgs = cfgsOf( + `function f() { + let x = seed(); + a(x); b(x); c(x); + }`, + 'dedup.ts', + ); + const { graph, rels } = recordingGraph(); + const r = emitFileReachingDefs(graph, cfgs); + const xEdges = rels.filter((e) => e.reason === 'x'); + expect(xEdges).toHaveLength(1); // self-pair within the single body block + expect(r.facts).toBeGreaterThan(rels.length); // facts > deduped edges + }); + + it('per-function edge cap: truncates deterministically, warns with top bindings (R7)', () => { + const cfgs = cfgsOf( + `function f(p, q) { + let x = p; + if (p) { x = q; } else { x = p + q; } + s1(x); s2(p); s3(q); + }`, + 'cap.ts', + ); + const full = recordingGraph(); + const rFull = emitFileReachingDefs(full.graph, cfgs); + expect(rFull.edges).toBeGreaterThan(2); + + const capped = recordingGraph(); + const onWarn = vi.fn(); + const r = emitFileReachingDefs(capped.graph, cfgs, 2, onWarn); + expect(capped.rels).toHaveLength(2); + // NOTE: not comparable to rFull.edges — the cap also scales maxFacts (4×), + // so the capped run may dedup fewer facts. 
Within-run consistency only: + expect(r.droppedEdges).toBeGreaterThan(0); + expect(r.cappedFunctions).toBe(1); + // cap=2 also tightens maxFacts (8) below this function's fact count, so + // BOTH R7 layers may warn — assert on the edge-cap warn specifically. + const capWarns = onWarn.mock.calls + .map((c) => c[0] as string) + .filter((m) => m.includes('REACHING_DEF edge cap')); + expect(capWarns).toHaveLength(1); + expect(capWarns[0]).toContain('top bindings'); + // deterministic truncation: same prefix on a second run + const again = recordingGraph(); + emitFileReachingDefs(again.graph, cfgs, 2, vi.fn()); + expect(again.rels.map((e) => e.id)).toEqual(capped.rels.map((e) => e.id)); + }); + + it('cap of 0 means unlimited (no warn)', () => { + const cfgs = cfgsOf(`function f(a) { use(a); }`, 'u.ts'); + const { graph, rels } = recordingGraph(); + const onWarn = vi.fn(); + emitFileReachingDefs(graph, cfgs, 0, onWarn); + expect(rels.length).toBeGreaterThan(0); + expect(onWarn).not.toHaveBeenCalled(); + }); + + it('fact-layer truncation warns even when the edge cap is never reached (R7 both layers)', () => { + // 3 parallel arms defining x + several later uses → facts >> deduped edges. + // maxFacts is not a separate argument: emitFileReachingDefs derives it from + // the edge cap (4×), so the fact limit is squeezed through the cap itself — + // cap=3 ⇒ maxFacts=12, below this function's fact count (probed below).
+ const cfgs = cfgsOf( + `function f(c) { + let x = 0; + if (c === 1) { x = 1; } else if (c === 2) { x = 2; } else { x = 3; } + u1(x); u2(x); u3(x); u4(x); u5(x); + }`, + 'trunc.ts', + ); + const probe = recordingGraph(); + const rProbe = emitFileReachingDefs(probe.graph, cfgs); + expect(rProbe.facts).toBeGreaterThan(12); // 3 defs × 5 uses of x alone = 15+ + + const { graph } = recordingGraph(); + const onWarn = vi.fn(); + const r = emitFileReachingDefs(graph, cfgs, 1000, onWarn); + // edge cap (1000) never reached… + expect(r.cappedFunctions).toBe(0); + // …but with cap=3 ⇒ maxFacts=12 < total facts, truncation warns: + const tight = recordingGraph(); + const onWarnTight = vi.fn(); + const rTight = emitFileReachingDefs(tight.graph, cfgs, 3, onWarnTight); + expect(rTight.truncatedFunctions).toBe(1); + const messages = onWarnTight.mock.calls.map((c) => c[0] as string); + expect(messages.some((m) => m.includes('fact materialization'))).toBe(true); + }); + + it('no-facts CFGs (pre-M2 side channel) emit nothing and do not throw', () => { + const bare = { + filePath: 'old.ts', + functionStartLine: 1, + functionEndLine: 2, + functionStartColumn: 0, + entryIndex: 0, + exitIndex: 1, + blocks: [ + { index: 0, startLine: 1, endLine: 1, text: '', kind: 'entry' }, + { index: 1, startLine: 2, endLine: 2, text: '', kind: 'exit' }, + ], + edges: [{ from: 0, to: 1, kind: 'seq' }], + } as unknown as FunctionCfg; + const { graph, rels } = recordingGraph(); + const r = emitFileReachingDefs(graph, [bare]); + expect(rels).toHaveLength(0); + expect(r.edges).toBe(0); + }); + + it('emitting the same function twice is idempotent by id (first-writer-wins safe)', () => { + const cfgs = cfgsOf(`function f(a) { return a; }`, 'i.ts'); + const { graph, rels } = recordingGraph(); + emitFileReachingDefs(graph, cfgs); + const firstIds = rels.map((e) => e.id); + emitFileReachingDefs(graph, cfgs); + // ids deterministic ⇒ the second pass produces the SAME ids (the real + // KnowledgeGraph would 
no-op them; the recorder shows them duplicated) + expect(rels.slice(firstIds.length).map((e) => e.id)).toEqual(firstIds); + }); +}); diff --git a/gitnexus/test/integration/cfg/worker-roundtrip.test.ts b/gitnexus/test/integration/cfg/worker-roundtrip.test.ts index a35c953d11..c46a425aec 100644 --- a/gitnexus/test/integration/cfg/worker-roundtrip.test.ts +++ b/gitnexus/test/integration/cfg/worker-roundtrip.test.ts @@ -160,3 +160,26 @@ describe('U3 — parse-cache key folds the --pdg flag (R4, #2038-class guard)', ).toBe(base); }); }); + +describe('#2082 M2 — the REACHING_DEF emit cap does NOT perturb the chunk key', () => { + const entries = [ + { filePath: 'b.ts', contentHash: 'h2' }, + { filePath: 'a.ts', contentHash: 'h1' }, + ]; + + it('pdgMaxReachingDefEdgesPerFunction is emit-time-only — same key across values (F3 discipline)', () => { + // The worker never sees the REACHING_DEF edge cap (solve + emit happen in + // scope-resolution on the main thread), so the cached shard is identical + // across cap values. Folding it in would be the #2099-F3 over-correction: + // a spurious full re-parse on every cap change. PdgCacheKey simply has no + // field for it — this test pins that the key API surface stays that way + // (the object form ignores unknown extras rather than hashing them). 
+ const base = computeChunkHash(entries, { pdg: true }); + const withExtra = computeChunkHash(entries, { + pdg: true, + // @ts-expect-error — deliberately passing an unknown field: the key must ignore it + maxReachingDefEdgesPerFunction: 1, + }); + expect(withExtra).toBe(base); + }); +}); diff --git a/gitnexus/test/unit/cfg/emit-guard.test.ts b/gitnexus/test/unit/cfg/emit-guard.test.ts index da71def1ab..a117e2730d 100644 --- a/gitnexus/test/unit/cfg/emit-guard.test.ts +++ b/gitnexus/test/unit/cfg/emit-guard.test.ts @@ -191,3 +191,125 @@ describe('cfgSideChannel emit guard (#2099 F4)', () => { expect(warns()).toHaveLength(0); }); }); + +describe('#2082 M2 — statement-fact emit guard (isEmitSafeCfg extension)', () => { + let cap: ReturnType<typeof _captureLogger>; + beforeEach(() => { + cap = _captureLogger(); + }); + afterEach(() => { + cap.restore(); + }); + const warns = (): string[] => + cap + .records() + .filter((r) => r.level >= 40) + .map((r) => String(r.msg)); + + const rdEdges = (graph: KnowledgeGraph): number => { + let n = 0; + graph.forEachRelationship((r) => { + if (r.type === 'REACHING_DEF') n++; + }); + return n; + }; + const cfgEdgeCount = (graph: KnowledgeGraph): number => { + let n = 0; + graph.forEachRelationship((r) => { + if (r.type === 'CFG') n++; + }); + return n; + }; + + /** Valid facts-bearing CFG: def at stmt 0 reaches the use at stmt 1. */ + const factCfg = (blocks?: unknown): unknown => ({ + ...validCfg, + bindings: [{ name: 'x', declLine: 1, declColumn: 0, kind: 'let' }], + blocks: blocks ??
[ + { index: 0, startLine: 1, endLine: 1, text: '', kind: 'entry', statements: [] }, + { + index: 1, + startLine: 3, + endLine: 3, + text: '', + kind: 'exit', + statements: [ + { line: 2, defs: [0], uses: [] }, + { line: 3, defs: [], uses: [0] }, + ], + }, + ], + }); + + it('a well-formed facts-bearing CFG passes the guard and emits REACHING_DEF', () => { + const graph = emitWith([factCfg()]); + expect(rdEdges(graph)).toBeGreaterThan(0); + expect(warns()).toHaveLength(0); + }); + + it('an OUT-OF-RANGE binding index is rejected per element (would template undefined into ids)', () => { + const bad = factCfg([ + { index: 0, startLine: 1, endLine: 1, text: '', kind: 'entry', statements: [] }, + { + index: 1, + startLine: 3, + endLine: 3, + text: '', + kind: 'exit', + statements: [{ line: 2, defs: [7], uses: [0] }], // 7 escapes the 1-entry table + }, + ]); + const graph = emitWith([bad, validCfg]); + // the malformed element is skipped with a warn; the valid sibling emits CFG + expect(rdEdges(graph)).toBe(0); + expect(cfgEdgeCount(graph)).toBeGreaterThan(0); + expect(warns().some((m) => m.includes('malformed'))).toBe(true); + }); + + it('statements WITHOUT a binding table are rejected (malformed by construction)', () => { + const noTable = { + ...(factCfg() as Record), + bindings: undefined, + }; + const graph = emitWith([noTable]); + expect(rdEdges(graph)).toBe(0); + expect(warns().some((m) => m.includes('malformed'))).toBe(true); + }); + + it('non-integer statement line / non-array defs are rejected per element', () => { + const badLine = factCfg([ + { + index: 0, + startLine: 1, + endLine: 1, + text: '', + kind: 'entry', + statements: [{ line: 'x', defs: [], uses: [] }], + }, + { index: 1, startLine: 3, endLine: 3, text: '', kind: 'exit', statements: [] }, + ]); + const badDefs = factCfg([ + { + index: 0, + startLine: 1, + endLine: 1, + text: '', + kind: 'entry', + statements: [{ line: 2, defs: 'nope', uses: [] }], + }, + { index: 1, startLine: 3, endLine: 3, text: 
'', kind: 'exit', statements: [] }, + ]); + for (const bad of [badLine, badDefs]) { + const graph = emitWith([bad]); + expect(rdEdges(graph)).toBe(0); + } + expect(warns().some((m) => m.includes('malformed'))).toBe(true); + }); + + it('a pre-M2 channel (no bindings, no statements) still passes — CFG emits, REACHING_DEF skips', () => { + const graph = emitWith([validCfg]); + expect(cfgEdgeCount(graph)).toBeGreaterThan(0); + expect(rdEdges(graph)).toBe(0); + expect(warns()).toHaveLength(0); + }); +}); diff --git a/gitnexus/test/unit/pdg-mode-flip.test.ts b/gitnexus/test/unit/pdg-mode-flip.test.ts index 1000ef9520..086ac1864b 100644 --- a/gitnexus/test/unit/pdg-mode-flip.test.ts +++ b/gitnexus/test/unit/pdg-mode-flip.test.ts @@ -34,6 +34,32 @@ async function countBasicBlocks(repoPath: string): Promise { } } +describe('pdgModeMismatch — M1→M2 stamp upgrade (#2082 M2, pure)', () => { + it('an M1-era stamp (no REACHING_DEF cap) mismatches an M2 request — upgrade forces full writeback', async () => { + const { pdgModeMismatch } = await import('../../src/core/run-analyze.js'); + const m1Stamp = { maxFunctionLines: 2000, maxEdgesPerFunction: 5000 }; + // default M2 request resolves maxReachingDefEdgesPerFunction=4000 ≠ undefined + expect(pdgModeMismatch(m1Stamp, { pdg: true })).toBe(true); + }); + + it('an identical resolved M2 config compares equal (steady state keeps incremental)', async () => { + const { pdgModeMismatch, resolvePdgConfig } = await import('../../src/core/run-analyze.js'); + const stamp = resolvePdgConfig({ pdg: true }); + expect(pdgModeMismatch(stamp, { pdg: true })).toBe(false); + }); + + it('a REACHING_DEF cap change alone trips the mismatch', async () => { + const { pdgModeMismatch, resolvePdgConfig } = await import('../../src/core/run-analyze.js'); + const stamp = resolvePdgConfig({ pdg: true }); + expect(pdgModeMismatch(stamp, { pdg: true, pdgMaxReachingDefEdgesPerFunction: 100 })).toBe( + true, + ); + expect(pdgModeMismatch(stamp, { pdg: true, 
pdgMaxReachingDefEdgesPerFunction: 4000 })).toBe( + false, // explicit default ≡ default (resolution before comparison) + ); + }); +}); + describe('runFullAnalysis — pdg-mode flip (#2099 F1)', () => { it('off→on flip forces a full writeback that persists the CFG layer; on→off removes it', async () => { const repo = await setupMiniRepo(); @@ -57,7 +83,11 @@ describe('runFullAnalysis — pdg-mode flip (#2099 F1)', () => { expect(logs.some((m) => m.includes('pdg mode changed'))).toBe(true); expect(await countBasicBlocks(repo.dbPath)).toBeGreaterThan(0); const stamped = await loadMeta(storagePath); - expect(stamped!.pdg).toEqual({ maxFunctionLines: 2000, maxEdgesPerFunction: 5000 }); + expect(stamped!.pdg).toEqual({ + maxFunctionLines: 2000, + maxEdgesPerFunction: 5000, + maxReachingDefEdgesPerFunction: 4000, + }); expect(stamped!.incrementalInProgress).toBeUndefined(); // cleared on success // 3. Steady state: a second identical --pdg run takes the fast path — @@ -105,6 +135,7 @@ describe('runFullAnalysis — pdg-mode flip (#2099 F1)', () => { expect((await loadMeta(storagePath))!.pdg).toEqual({ maxFunctionLines: 2000, maxEdgesPerFunction: 1, + maxReachingDefEdgesPerFunction: 4000, }); // The CFG layer survives a rebuild under a tighter edge cap (blocks are // never capped, only edges). 
From f9a1443d25a2a027f61cc0108ba78fb13c3cb15a Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 10 Jun 2026 21:01:21 +0000 Subject: [PATCH 05/19] test(cfg): REACHING_DEF snapshot, pipeline both-sinks, and cache-seam coverage (#2082 U5) --- .../__snapshots__/cfg-snapshot.test.ts.snap | 35 ++++- .../reaching-defs-snapshot.test.ts.snap | 120 +++++++++++++++++ .../test/integration/cfg/cfg-snapshot.test.ts | 7 +- .../integration/cfg/fixtures/ten-functions.ts | 28 ++++ .../integration/cfg/parse-cache-mixed.test.ts | 121 ++++++++++++++++++ .../test/integration/cfg/pipeline-pdg.test.ts | 25 +++- .../cfg/reaching-defs-snapshot.test.ts | 99 ++++++++++++++ 7 files changed, 426 insertions(+), 9 deletions(-) create mode 100644 gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap create mode 100644 gitnexus/test/integration/cfg/parse-cache-mixed.test.ts create mode 100644 gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts diff --git a/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap b/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap index 092da4de90..2c0393cb75 100644 --- a/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap +++ b/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap @@ -1,6 +1,6 @@ // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html -exports[`U7 — AC1: 10-function fixture CFG snapshot > matches the committed CFG node/edge set 1`] = ` +exports[`U7 — AC1: ten-functions fixture CFG snapshot > matches the committed CFG node/edge set 1`] = ` [ { "blocks": 3, @@ -150,5 +150,38 @@ exports[`U7 — AC1: 10-function fixture CFG snapshot > matches the committed CF "exit": 1, "startLine": 87, }, + { + "blocks": 8, + "edges": [ + "0->2:seq", + "2->4:seq", + "3->1:finally-return", + "3->1:throw", + "3->7:seq", + "4->3:throw", + "4->5:cond-true", + "4->6:seq", + "5->3:return", + "5->3:throw", + "6->3:seq", + "6->3:throw", + "7->1:return", + ], + 
"entry": 0, + "exit": 1, + "startLine": 102, + }, + { + "blocks": 5, + "edges": [ + "0->2:seq", + "2->3:seq", + "3->4:seq", + "4->1:seq", + ], + "entry": 0, + "exit": 1, + "startLine": 115, + }, ] `; diff --git a/gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap b/gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap new file mode 100644 index 0000000000..22ae17cf5f --- /dev/null +++ b/gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap @@ -0,0 +1,120 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`R5 — REACHING_DEF facts snapshot on the M1 fixture > matches the committed fact set for every fixture function 1`] = ` +[ + { + "defs": 0, + "facts": [], + "startLine": 9, + "status": "computed", + "uses": 2, + }, + { + "defs": 1, + "facts": [ + "0:0->2:0:x:14:23", + ], + "startLine": 14, + "status": "computed", + "uses": 4, + }, + { + "defs": 1, + "facts": [ + "0:0->2:0:x:23:27", + "0:0->4:0:x:23:27", + ], + "startLine": 23, + "status": "computed", + "uses": 6, + }, + { + "defs": 1, + "facts": [ + "0:0->2:0:x:34:26", + ], + "startLine": 34, + "status": "computed", + "uses": 3, + }, + { + "defs": 3, + "facts": [ + "0:0->2:0:n:41:24", + "4:0->2:0:i:42:11", + "4:0->4:0:i:42:11", + "6:0->2:0:i:42:11", + "6:0->4:0:i:42:11", + ], + "startLine": 41, + "status": "computed", + "uses": 5, + }, + { + "defs": 2, + "facts": [ + "0:0->2:0:xs:48:26", + "2:0->4:0:x:49:13", + ], + "startLine": 48, + "status": "computed", + "uses": 4, + }, + { + "defs": 1, + "facts": [ + "0:0->2:0:x:55:27", + ], + "startLine": 55, + "status": "computed", + "uses": 5, + }, + { + "defs": 1, + "facts": [], + "startLine": 69, + "status": "computed", + "uses": 4, + }, + { + "defs": 1, + "facts": [ + "0:0->2:0:x:80:27", + ], + "startLine": 80, + "status": "computed", + "uses": 1, + }, + { + "defs": 2, + "facts": [ + "0:0->2:0:xs:87:27", + "2:0->4:0:x:88:13", + ], + "startLine": 87, + "status": 
"computed", + "uses": 5, + }, + { + "defs": 3, + "facts": [ + "0:0->4:0:flag:102:37", + "2:0->5:0:val:103:6", + "3:0->7:0:val:103:6", + ], + "startLine": 102, + "status": "computed", + "uses": 5, + }, + { + "defs": 2, + "facts": [ + "2:0->4:0:s:116:6", + "3:0->3:1:s:118:8", + ], + "startLine": 115, + "status": "computed", + "uses": 4, + }, +] +`; diff --git a/gitnexus/test/integration/cfg/cfg-snapshot.test.ts b/gitnexus/test/integration/cfg/cfg-snapshot.test.ts index 138fe4f301..116a884a70 100644 --- a/gitnexus/test/integration/cfg/cfg-snapshot.test.ts +++ b/gitnexus/test/integration/cfg/cfg-snapshot.test.ts @@ -69,16 +69,17 @@ function reaches(adj: Map, from: string, to: string): boolean return seen.has(to); } -describe('U7 — AC1: 10-function fixture CFG snapshot', () => { +describe('U7 — AC1: ten-functions fixture CFG snapshot', () => { it('matches the committed CFG node/edge set', () => { const cfgs = cfgsOfFile('ten-functions.ts'); - expect(cfgs).toHaveLength(10); + // 10 M1 functions + 2 M2 additions (early-exit finally, shadowing — #2082 U5) + expect(cfgs).toHaveLength(12); expect(cfgs.map(serialize)).toMatchSnapshot(); }); }); describe('U7 — AC2: every BasicBlock reachable from its function ENTRY', () => { - it('holds for all ten functions (no dead code in the fixture)', () => { + it('holds for all fixture functions (no dead code in the fixture)', () => { const cfgs = cfgsOfFile('ten-functions.ts'); const { graph, nodeIds, rels } = recordingGraph(); emitFileCfgs(graph, cfgs); diff --git a/gitnexus/test/integration/cfg/fixtures/ten-functions.ts b/gitnexus/test/integration/cfg/fixtures/ten-functions.ts index 5a7a6db2a6..c799ff466a 100644 --- a/gitnexus/test/integration/cfg/fixtures/ten-functions.ts +++ b/gitnexus/test/integration/cfg/fixtures/ten-functions.ts @@ -95,6 +95,32 @@ export function withNested(xs: number[]): void { end(); } +// M2 additions (#2082 U5): an early-exit-through-finally and a shadowing case — +// the two reaching-defs acceptance shapes 
the original ten functions lacked. +// Their CFG topology exercises U2's finally threading; their facts pin R4/R9. + +export function withEarlyExitFinally(flag: boolean): number { + let val = 1; + try { + if (flag) { + return probe(val); + } + work(); + } finally { + val = 2; + } + return val; +} + +export function withShadowing(): void { + let s = 1; + { + let s = 2; + use(s); + } + done2(s); +} + declare function a(): void; declare function b(): void; declare function c(): void; @@ -113,3 +139,5 @@ declare function after(): void; declare function p(): void; declare function q(): void; declare function end(): void; +declare function probe(n: number): number; +declare function done2(n: number): void; diff --git a/gitnexus/test/integration/cfg/parse-cache-mixed.test.ts b/gitnexus/test/integration/cfg/parse-cache-mixed.test.ts new file mode 100644 index 0000000000..b02806f01b --- /dev/null +++ b/gitnexus/test/integration/cfg/parse-cache-mixed.test.ts @@ -0,0 +1,121 @@ +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { + getDurableParsedFileDir, + persistDurableParsedFileShardSync, + restoreDurableParsedFileShard, + loadParsedFilesForPaths, +} from '../../../src/storage/parsedfile-store.js'; +import type { ParsedFile } from 'gitnexus-shared'; +import type { FunctionCfg } from '../../../src/core/ingestion/cfg/types.js'; + +// #2082 M2 U5 — the warm/mixed cache seam for statement facts. On a warm (or +// mixed) run the unchanged chunk's ParsedFiles are BYTE-COPIED from the +// durable store instead of re-parsed (#2038); if that copy (or the store's +// interning reviver) dropped or aliased the new `bindings`/`statements` +// fields, reaching-defs would silently degrade to `no-facts` for every cached +// file — exactly the field-loss class the #2038 mergeChunkResults lesson +// warns about. 
This pins the persist → restore → load round-trip at the exact +// seam scope-resolution consumes. + +const factCfg: FunctionCfg = { + filePath: 'src/a.ts', + functionStartLine: 1, + functionEndLine: 5, + functionStartColumn: 0, + entryIndex: 0, + exitIndex: 1, + blocks: [ + { index: 0, startLine: 1, endLine: 1, text: '', kind: 'entry', statements: [] }, + { + index: 1, + startLine: 5, + endLine: 5, + text: '', + kind: 'exit', + statements: [ + { line: 2, defs: [0], uses: [] }, + { line: 3, defs: [1], uses: [0] }, + ], + }, + ], + edges: [{ from: 0, to: 1, kind: 'seq' }], + bindings: [ + { name: 'x', declLine: 2, declColumn: 6, kind: 'let' }, + { name: 'y', declLine: 3, declColumn: 6, kind: 'const' }, + ], +}; + +const mkParsedFile = (filePath: string): ParsedFile => + ({ + filePath, + moduleScope: '', + scopes: [], + parsedImports: [], + localDefs: [], + referenceSites: [], + cfgSideChannel: [factCfg], + }) as unknown as ParsedFile; + +describe('durable ParsedFile store carries M2 statement facts (#2082 U5)', () => { + let tempDir = ''; + beforeEach(() => { + tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'm2-facts-store-')); + }); + afterEach(() => { + if (tempDir) fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it('persist → restore → loadParsedFilesForPaths preserves bindings + statements deep-equal', async () => { + const durableDir = getDurableParsedFileDir(tempDir); + const chunkHash = 'c'.repeat(64); + const files = ['src/a.ts', 'src/b.ts']; + + // What a worker writes at flush on a cache MISS (the cold half of a + // mixed-mode run)… + persistDurableParsedFileShardSync(durableDir, chunkHash, 7, 0, files.map(mkParsedFile)); + // …and what a warm HIT byte-copies into the run-scoped store. 
+ await restoreDurableParsedFileShard(durableDir, tempDir, chunkHash); + + const loaded = await loadParsedFilesForPaths(tempDir, new Set(files)); + expect(loaded.size).toBe(2); + for (const filePath of files) { + const pf = loaded.get(filePath); + expect(pf).toBeDefined(); + const channel = (pf as { cfgSideChannel?: unknown }).cfgSideChannel; + expect(Array.isArray(channel)).toBe(true); + const cfg = (channel as FunctionCfg[])[0]; + // deep-equal: the interning reviver may dedup strings/objects but the + // VALUES must be intact — and no aliasing may merge the two files' + // distinct fact arrays into wrong shapes. + expect(cfg.bindings).toEqual(factCfg.bindings); + expect(cfg.blocks.map((b) => b.statements)).toEqual( + factCfg.blocks.map((b) => b.statements), + ); + } + }); + + it('facts survive even when two files share identical binding tables (reviver dedup safety)', async () => { + // The store reviver interns strings and dedups objects keyed on `nodeId` + // presence — BindingEntry/StatementFacts deliberately carry no such field, + // so dedup must never alias-then-mutate across files. Two files with + // byte-identical channels is the worst case. 
+ const durableDir = getDurableParsedFileDir(tempDir); + const chunkHash = 'd'.repeat(64); + persistDurableParsedFileShardSync(durableDir, chunkHash, 7, 0, [ + mkParsedFile('src/same1.ts'), + mkParsedFile('src/same2.ts'), + ]); + await restoreDurableParsedFileShard(durableDir, tempDir, chunkHash); + const loaded = await loadParsedFilesForPaths( + tempDir, + new Set(['src/same1.ts', 'src/same2.ts']), + ); + const c1 = (loaded.get('src/same1.ts') as { cfgSideChannel?: FunctionCfg[] }).cfgSideChannel; + const c2 = (loaded.get('src/same2.ts') as { cfgSideChannel?: FunctionCfg[] }).cfgSideChannel; + expect(c1?.[0].bindings).toEqual(factCfg.bindings); + expect(c2?.[0].bindings).toEqual(factCfg.bindings); + }); +}); diff --git a/gitnexus/test/integration/cfg/pipeline-pdg.test.ts b/gitnexus/test/integration/cfg/pipeline-pdg.test.ts index 35666fe7dd..69ff6677f1 100644 --- a/gitnexus/test/integration/cfg/pipeline-pdg.test.ts +++ b/gitnexus/test/integration/cfg/pipeline-pdg.test.ts @@ -13,16 +13,22 @@ import type { PipelineResult } from '../../../src/types/pipeline.js'; const FIXTURE = path.join(__dirname, 'fixtures', 'pdg-repo'); -function counts(result: PipelineResult): { basicBlocks: number; cfgEdges: number } { +function counts(result: PipelineResult): { + basicBlocks: number; + cfgEdges: number; + reachingDefs: number; +} { let basicBlocks = 0; result.graph.forEachNode((n) => { if (n.label === 'BasicBlock') basicBlocks++; }); let cfgEdges = 0; + let reachingDefs = 0; for (const rel of result.graph.iterRelationships()) { if (rel.type === 'CFG') cfgEdges++; + if (rel.type === 'REACHING_DEF') reachingDefs++; } - return { basicBlocks, cfgEdges }; + return { basicBlocks, cfgEdges, reachingDefs }; } const tmpDirs: string[] = []; @@ -40,25 +46,34 @@ describe('U7 — end-to-end --pdg pipeline', () => { it('with --pdg on: emits BasicBlock nodes + CFG edges into the graph', async () => { const result = await runPipelineFromRepo(freshRepo(), () => {}, { pdg: true }); - const { 
basicBlocks, cfgEdges } = counts(result); + const { basicBlocks, cfgEdges, reachingDefs } = counts(result); expect(basicBlocks).toBeGreaterThan(0); expect(cfgEdges).toBeGreaterThan(0); + // M2 (#2082 U5): the def→use projection rides the same gate — the fixture + // has a loop-carried accumulator (`sum`), so facts must exist. + expect(reachingDefs).toBeGreaterThan(0); // CFG edges connect BasicBlocks to BasicBlocks — both endpoints exist. const blockIds = new Set(); result.graph.forEachNode((n) => { if (n.label === 'BasicBlock') blockIds.add(n.id); }); for (const rel of result.graph.iterRelationships()) { - if (rel.type !== 'CFG') continue; + if (rel.type !== 'CFG' && rel.type !== 'REACHING_DEF') continue; expect(blockIds.has(rel.sourceId)).toBe(true); expect(blockIds.has(rel.targetId)).toBe(true); + if (rel.type === 'REACHING_DEF') { + // reason carries the plain variable name (M0/S1 verdict) + expect(typeof rel.reason).toBe('string'); + expect(rel.reason.length).toBeGreaterThan(0); + } } }, 60000); it('with --pdg off (default): emits zero BasicBlock nodes and zero CFG edges', async () => { const result = await runPipelineFromRepo(freshRepo(), () => {}); - const { basicBlocks, cfgEdges } = counts(result); + const { basicBlocks, cfgEdges, reachingDefs } = counts(result); expect(basicBlocks).toBe(0); expect(cfgEdges).toBe(0); + expect(reachingDefs).toBe(0); }, 60000); }); diff --git a/gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts b/gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts new file mode 100644 index 0000000000..406f4fb735 --- /dev/null +++ b/gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts @@ -0,0 +1,99 @@ +import { describe, it, expect } from 'vitest'; +import fs from 'fs'; +import path from 'path'; +import Parser from 'tree-sitter'; +import TypeScript from 'tree-sitter-typescript'; +import { collectFunctionCfgs } from '../../../src/core/ingestion/cfg/collect.js'; +import { computeReachingDefs } from 
'../../../src/core/ingestion/cfg/reaching-defs.js'; +import { getProvider } from '../../../src/core/ingestion/languages/index.js'; +import { SupportedLanguages } from '../../../src/config/supported-languages.js'; +import type { FunctionCfg } from '../../../src/core/ingestion/cfg/types.js'; + +// #2082 M2 R5 acceptance: a committed snapshot of the REACHING_DEF facts on +// the M1 fixture (extended in U5 with the early-exit-finally + shadowing +// functions). The serialization is deterministic — sorted fact strings keyed +// by program points + binding identity — so any solver/harvest behavior +// change shows as a reviewable snapshot diff, never silent drift. + +const FIXTURES = path.join(__dirname, 'fixtures'); + +function cfgsOfFile(file: string): readonly FunctionCfg[] { + const visitor = getProvider(SupportedLanguages.TypeScript).cfgVisitor; + if (!visitor) throw new Error('no cfgVisitor'); + const source = fs.readFileSync(path.join(FIXTURES, file), 'utf8'); + const parser = new Parser(); + parser.setLanguage(TypeScript.typescript); + return collectFunctionCfgs(parser.parse(source).rootNode, visitor, file).cfgs; +} + +/** Deterministic rendering: defBlock:stmt->useBlock:stmt:bindingKey */ +function serialize(cfg: FunctionCfg): Record { + const r = computeReachingDefs(cfg); + const key = (idx: number): string => { + const b = r.bindings[idx]; + return b.synthetic ? 
`${b.name}@module` : `${b.name}:${b.declLine}:${b.declColumn}`; + }; + return { + startLine: cfg.functionStartLine, + status: r.status, + defs: r.defCount, + uses: r.useCount, + facts: r.facts.map( + (f) => + `${f.def.blockIndex}:${f.def.stmtIndex}->${f.use.blockIndex}:${f.use.stmtIndex}:${key(f.bindingIdx)}`, + ), + }; +} + +describe('R5 — REACHING_DEF facts snapshot on the M1 fixture', () => { + it('matches the committed fact set for every fixture function', () => { + const cfgs = cfgsOfFile('ten-functions.ts'); + expect(cfgs).toHaveLength(12); + expect(cfgs.map(serialize)).toMatchSnapshot(); + }); + + it('every fixture function computes (no truncation at default limits, no no-facts)', () => { + for (const cfg of cfgsOfFile('ten-functions.ts')) { + const r = computeReachingDefs(cfg); + expect(r.status).toBe('computed'); + } + }); + + it('acceptance shapes: the finally redefinition and the shadowed binding behave per R4/R9', () => { + const cfgs = cfgsOfFile('ten-functions.ts'); + const byLine = new Map(cfgs.map((c) => [c.functionStartLine, c])); + + // withEarlyExitFinally — `val = 2` (finally) is the ONLY def reaching the + // post-try return; the early return's use sees the original `val = 1`. 
+ const early = [...byLine.values()].find((c) => + c.blocks.some((b) => b.text.includes('return probe(val)')), + )!; + const re = computeReachingDefs(early); + const val = re.bindings.findIndex((b) => b.name === 'val'); + const probeUses = re.facts.filter( + (f) => f.bindingIdx === val && early.blocks[f.use.blockIndex].text.includes('probe'), + ); + const finalUses = re.facts.filter( + (f) => f.bindingIdx === val && early.blocks[f.use.blockIndex].text.includes('return val'), + ); + expect(probeUses).toHaveLength(1); + expect(early.blocks[probeUses[0].def.blockIndex].text).toContain('let val = 1'); + expect(finalUses).toHaveLength(1); + expect(early.blocks[finalUses[0].def.blockIndex].text).toContain('val = 2'); + + // withShadowing — two distinct `s` bindings; each use resolves to its own. + const shadow = [...byLine.values()].find((c) => + c.blocks.some((b) => b.text.includes('done2(s)')), + )!; + const rs = computeReachingDefs(shadow); + const sBindings = rs.bindings.filter((b) => b.name === 's'); + expect(sBindings).toHaveLength(2); + const factsByBinding = new Map(); + for (const f of rs.facts) { + factsByBinding.set(f.bindingIdx, (factsByBinding.get(f.bindingIdx) ?? 0) + 1); + } + // each s binding has exactly one use fact (no cross-kill, no cross-reach) + const sIdxs = rs.bindings.map((b, i) => (b.name === 's' ? 
i : -1)).filter((i) => i >= 0); + for (const idx of sIdxs) expect(factsByBinding.get(idx)).toBe(1); + }); +}); From ac8968c7798432562f477d75ba23fc9fa0948368 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 10 Jun 2026 21:07:38 +0000 Subject: [PATCH 06/19] =?UTF-8?q?bench(cfg):=20reaching-defs=20scaling=20g?= =?UTF-8?q?ates=20=E2=80=94=20dense-bindings=20+=20fact-fanout=20scenarios?= =?UTF-8?q?=20(#2082=20U6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- gitnexus/bench/cfg/baselines.json | 35 ++++-- gitnexus/bench/cfg/measure.mjs | 104 +++++++++++++++++- .../core/ingestion/cfg/visitors/typescript.ts | 10 ++ 3 files changed, 140 insertions(+), 9 deletions(-) diff --git a/gitnexus/bench/cfg/baselines.json b/gitnexus/bench/cfg/baselines.json index f830cc6805..fbbe267961 100644 --- a/gitnexus/bench/cfg/baselines.json +++ b/gitnexus/bench/cfg/baselines.json @@ -1,23 +1,44 @@ { "straight-line": { - "fingerprint": "f5524690b5b7d484573710938c5e9a28e08ef0882fea95111f01575c71f4a66a", + "fingerprint": "792229965a726d2c6b527f9ee65440a2b3023839ee71cb51522fc30e2f2cb454", "scaling_budget": 1.5, "disk_bytes_budget": 1.2, "heap_budget": 1.3, - "_note": "#2081 M1: ONE function, N coalescing statements (extendBlock text accumulation). Runs at 2000->8000 (larger than the other scenarios — output is constant 4 blocks, so disk/heap can't see this path; the TIME ratio is the sole guard). Verified at this N: the array-join impl is ~1.0, a V8-rope-optimized `+=` is also ~1.0 (correctly NOT a real regression — ropes keep naive concat linear), but a genuine O(n²) accumulation (e.g. re-join-the-array-every-append) is ~3.8 — so budget 1.5 catches a true superlinear regression while passing linear concat. disk ~1.03, retained heap ~0.98. Re-baseline the fingerprint only on an intentional CFG-shape change." 
+ "rd_scaling_budget": 2.0, + "disk_bytes_large_max": 1309481, + "_note": "#2081 M1 / #2082 M2: ONE function, N coalescing statements (extendBlock text accumulation + per-statement fact harvest). Runs at 2000->8000. M2 REWROTE the old 'output is constant 4 blocks' note: statement facts make disk/heap LINEAR in N (a free gate on the harvest payload); TIME still guards the concat path (array-join ~1.0; a genuine O(n^2) re-join accumulation is ~3.8). M2 adds rd_scaling_budget (measured ~0.74) and disk_bytes_large_max -- an ABSOLUTE ceiling ~1.35x the measured indexed-encoding bytes (969,986 at N=8000, ~121 B/stmt); a named-record encoding regression (~4x facts bytes) blows it. Re-baseline the fingerprint only on an intentional CFG/harvest-shape change (the canon now includes statements+bindings)." }, "many-functions": { - "fingerprint": "c167ccd83086254e2b71eca153ca4a833be14b2d2a3827ab76b49f643aad13d5", + "fingerprint": "f3bcc5e6ef4cf58aefe4e7d801a8fea0215494b9688833e501c2afc6df029c1b", "scaling_budget": 1.5, "disk_bytes_budget": 1.2, "heap_budget": 1.3, - "_note": "#2081 M1: N small branchy functions (collect walk + per-function build). Time ~1.0, disk ~1.01, retained heap ~1.0 (~1KB/function; ~2MB at 2000 fns)." + "rd_scaling_budget": 2.0, + "_note": "#2081 M1 / #2082 M2: N small branchy functions (collect walk + per-function build + per-function solve). Time ~1.0, disk ~1.01, heap ~1.0, rd ~0.86 (solver is per-function; N functions scale linearly)." }, "branchy": { - "fingerprint": "944ab56ffc70e195f74d8533a8aadf4930d37d13bcfa47cc4feff29e74ddca5c", + "fingerprint": "5b5886521ab21604df8f78af98c8c28a6be8e64c24f3d67b165c2d96ba2a3d52", "scaling_budget": 1.8, "disk_bytes_budget": 1.2, "heap_budget": 1.3, - "_note": "#2081 M1: ONE function, N sequential ifs (block/edge growth in one CFG). Time ~1.1-1.25 (REPS=15 median; noisiest scenario), disk ~1.04, retained heap ~1.0. Time budget 1.8 absorbs noise while catching ~4.0 quadratic." 
+ "rd_scaling_budget": 2.0, + "_note": "#2081 M1 / #2082 M2: ONE function, N sequential ifs (block/edge growth in one CFG). Time ~1.1-1.25 (noisiest scenario; budget 1.8 absorbs noise, catches ~4.0 quadratic), disk ~1.03, heap ~1.0, rd ~0.7." + }, + "dense-bindings": { + "fingerprint": "e4d7eb3c7e8b3772423af25cef391e0e6b68067b554819e81b543439a487403f", + "scaling_budget": 1.8, + "disk_bytes_budget": 1.2, + "heap_budget": 1.3, + "rd_scaling_budget": 10.0, + "_note": "#2082 M2: N bindings live across ~N blocks in one loop -- bindings x blocks scale JOINTLY (the solver-lattice stressor). The overlay design measures rd ~5.2 normalized: the OUT spine copy on genning blocks is O(V) per block, which is quadratic when V scales with B (bounded in prod by maxFunctionLines; real functions have V~10-40). Budget 10 deliberately tolerates that known shape and exists to catch the repo's recurring per-item-rescan class (a per-use scan over all defs is O(n^3) here, ratio >=16). If rd drops well below 5, tighten." + }, + "fact-fanout": { + "fingerprint": "488e63e072d514a9229e21872615e32c7b099ccbd65ec8c045ba517568fd3e5d", + "scaling_budget": 1.8, + "disk_bytes_budget": 1.2, + "heap_budget": 1.3, + "rd_scaling_budget": 3.0, + "facts_large_max": 16000, + "_note": "#2082 M2: N switch-arm defs of one variable + N later uses -- facts are O(defs x uses) BY SPEC, so the gate is BOUNDEDNESS, not linearity: with the production fact limit engaged (DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION=16000) the materialized fact count stays pinned at the limit as N grows (facts_large_max), and rd time stays bounded (measured ~1.4). Losing the maxFacts early-stop shows as facts_large exploding quadratically." 
} -} +} \ No newline at end of file diff --git a/gitnexus/bench/cfg/measure.mjs b/gitnexus/bench/cfg/measure.mjs index 115c878a8e..fdeda62b72 100644 --- a/gitnexus/bench/cfg/measure.mjs +++ b/gitnexus/bench/cfg/measure.mjs @@ -44,6 +44,10 @@ import { fileURLToPath } from 'node:url'; import Parser from 'tree-sitter'; import TypeScript from 'tree-sitter-typescript'; import { collectFunctionCfgs } from '../../src/core/ingestion/cfg/collect.ts'; +import { computeReachingDefs } from '../../src/core/ingestion/cfg/reaching-defs.ts'; +import { + DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION, +} from '../../src/core/ingestion/cfg/emit.ts'; import { createTypeScriptCfgVisitor } from '../../src/core/ingestion/cfg/visitors/typescript.ts'; import { getTreeSitterBufferSize } from '../../src/core/ingestion/constants.ts'; @@ -102,6 +106,43 @@ const SCENARIOS = [ return s + '}\n'; }, }, + { + name: 'dense-bindings', + // #2082 M2: N bindings live across ~N blocks inside one loop — bindings × + // blocks scale JOINTLY, the discriminator for solver-lattice quadratics. + // The overlay design (KTD2: sets shared by reference, OUT spine-copied + // only on gen) is expected to scale ~linearly-with-a-spine-copy here + // (normalized ratio low single digits); the regression this scenario + // exists to catch is the repo's recurring per-item-rescan shape — a + // per-use scan over all defs (O(n³) here) blows the ratio past ~16. + // rd time is the gated metric (rd_scaling_budget). 
+ rdMaxFacts: 0, // measure the algorithm, not the cap + gen: (n) => { + let s = 'function f(c: number) {\n'; + for (let i = 0; i < n; i++) s += ` let v${i} = ${i};\n`; + s += ' while (c > 0) {\n'; + for (let i = 0; i < n; i++) s += ` if (c > ${i}) { v${i} = v${(i + 1) % n} + 1; }\n`; + return s + ' c = c - 1;\n }\n return v0;\n}\n'; + }, + }, + { + name: 'fact-fanout', + // #2082 M2: N parallel case-arm defs of one variable + N later uses — + // facts are O(defs×uses) BY SPEC, so a linearity ratio gate is the wrong + // shape. The gate here is BOUNDEDNESS: with the production fact limit + // engaged, the materialized fact count stays FLAT (== limit) as N grows + // past it (facts_large_max), and rd time stays bounded. An unbounded + // materialization regression (losing the maxFacts early-stop) shows as + // facts_large exploding quadratically. + rdMaxFacts: DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION, + gen: (n) => { + let s = 'function f(c: number) {\n let x = 0;\n switch (c) {\n'; + for (let i = 0; i < n; i++) s += ` case ${i}: x = ${i}; break;\n`; + s += ' }\n'; + for (let i = 0; i < n; i++) s += ` u${i}(x);\n`; + return s + '}\n'; + }, + }, ]; const SMALL = 500; @@ -130,6 +171,7 @@ function measureCollect(src, file, reps) { } return { ms: median(samples), + cfgs: out.cfgs, blockCount: out.cfgs.reduce((a, c) => a + c.blocks.length, 0), // DISK growth: utf8 byte size of the serialized cfgSideChannel — exactly // what a --pdg run writes onto every ParsedFile shard in the durable store @@ -140,6 +182,25 @@ function measureCollect(src, file, reps) { }; } +// ---- reaching-defs solve cost (#2082 M2) ---- + +// Times computeReachingDefs over a scenario's collected CFGs (the exact work +// the scope-resolution emit loop adds per file on a --pdg run). `maxFacts` +// mirrors the per-scenario production posture: 0 (unlimited) measures the +// algorithm; the production default exercises the boundedness contract. 
+function measureReachingDefs(cfgs, reps, maxFacts) { + for (const c of cfgs) computeReachingDefs(c, { maxFacts }); // warm JIT + const samples = []; + let facts = 0; + for (let i = 0; i < reps; i++) { + const start = process.hrtime.bigint(); + facts = 0; + for (const c of cfgs) facts += computeReachingDefs(c, { maxFacts }).facts.length; + samples.push(Number(process.hrtime.bigint() - start) / 1e6); + } + return { ms: median(samples), facts }; +} + // ---- memory growth: retained heap of the cfgSideChannel payload ---- // Needs `node --expose-gc` to force collection for a clean delta; without it the @@ -169,10 +230,17 @@ function retainedHeapBytes(src, file) { function canonicalizeCfg(cfg) { const blocks = cfg.blocks - .map((b) => `B|${b.index}|${b.startLine}-${b.endLine}|${b.kind}|${b.text}`) + .map( + (b) => + `B|${b.index}|${b.startLine}-${b.endLine}|${b.kind}|${b.text}|` + + // #2082 M2: statement facts join the canon so harvest drift (lost + // defs/uses, changed binding resolution) trips the fingerprint gate. + JSON.stringify(b.statements ?? null), + ) .sort(); const edges = cfg.edges.map((e) => `E|${e.from}->${e.to}|${e.kind}`).sort(); - return `${cfg.functionStartLine}:${cfg.functionStartColumn}\n${blocks.join('\n')}\n${edges.join('\n')}`; + const bindings = JSON.stringify(cfg.bindings ?? null); + return `${cfg.functionStartLine}:${cfg.functionStartColumn}\n${bindings}\n${blocks.join('\n')}\n${edges.join('\n')}`; } function fingerprint(scenario) { @@ -205,6 +273,12 @@ function measureScenario(scenario) { ? heapLarge / heapSmall / sizeRatio : null; + // #2082 M2: reaching-defs solve cost over the same CFGs. + const rdMaxFacts = scenario.rdMaxFacts ?? 0; + const rdSmall = measureReachingDefs(small.cfgs, REPS, rdMaxFacts); + const rdLarge = measureReachingDefs(large.cfgs, REPS, rdMaxFacts); + const rdRatio = rdSmall.ms > 0 ? 
rdLarge.ms / rdSmall.ms / sizeRatio : 0; + return { scenario: scenario.name, elapsed_ms_small: Number(small.ms.toFixed(3)), @@ -218,6 +292,11 @@ function measureScenario(scenario) { heap_ratio: heapRatio === null ? null : Number(heapRatio.toFixed(3)), blocks_small: small.blockCount, blocks_large: large.blockCount, + rd_ms_small: Number(rdSmall.ms.toFixed(3)), + rd_ms_large: Number(rdLarge.ms.toFixed(3)), + rd_scaling_ratio: Number(rdRatio.toFixed(3)), + facts_small: rdSmall.facts, + facts_large: rdLarge.facts, ...fingerprint(scenario), }; } @@ -267,6 +346,27 @@ if (!CHECK) { `${base.disk_bytes_budget} (bytes ${r.disk_bytes_small}->${r.disk_bytes_large})`, ); } + // #2082 M2 gates — rd solve-time scaling, fact-count boundedness, and an + // ABSOLUTE side-channel size ceiling (a ratio gate is blind to a + // constant-factor encoding bloat like named records vs indexed facts). + if (base.rd_scaling_budget !== undefined && r.rd_scaling_ratio >= base.rd_scaling_budget) { + failures.push( + `${r.scenario}: reaching-defs scaling ratio ${r.rd_scaling_ratio} >= budget ` + + `${base.rd_scaling_budget} (ms ${r.rd_ms_small}->${r.rd_ms_large})`, + ); + } + if (base.facts_large_max !== undefined && r.facts_large > base.facts_large_max) { + failures.push( + `${r.scenario}: fact materialization ${r.facts_large} > bound ${base.facts_large_max} ` + + `(the maxFacts early-stop is the boundedness contract)`, + ); + } + if (base.disk_bytes_large_max !== undefined && r.disk_bytes_large > base.disk_bytes_large_max) { + failures.push( + `${r.scenario}: cfgSideChannel absolute size ${r.disk_bytes_large} > ceiling ` + + `${base.disk_bytes_large_max} bytes (constant-factor encoding bloat)`, + ); + } // Heap gate only when measured (--expose-gc present) AND a budget exists. 
if ( base.heap_budget !== undefined && diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts index ec85433915..b4b030d84d 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts @@ -40,6 +40,16 @@ * rather than left as a dangling sink, and threads no finallys (target * unknown ⇒ crossed set unknown). Single-labeled loops/switches resolve * correctly, including across finallys. + * - Exceptional flow stays the sound over-approximation: EVERY protected-region + * block edges to the handler (an exception may fire mid-block), which + * over-supplies reaching-defs facts into `catch` — extra facts, never false + * kills. Per-leader throw precision is deliberately deferred (M3 decides). + * - Def/use harvest scope (#2082 M2, see typescript-harvest.ts for the full + * v1 semantics table): member/property writes are not scalar defs; nested + * function bodies are opaque in BOTH directions (writes to and reads of + * captured outer variables are invisible — callback flows are M4 territory); + * `case x:` test uses attach to the switch dispatch block (sound + * over-approximation of in-order case evaluation). 
* * Block/edge accounting and reachability are pinned in * `test/unit/cfg/cfg-builder.test.ts` (core) and From 157de5ebbbf21ac89579da09e92e4283a47be569 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 10 Jun 2026 21:10:59 +0000 Subject: [PATCH 07/19] fix(mcp): exclude BasicBlock pseudo-symbols from detect_changes on pdg indexes (#2082 U7) --- gitnexus/src/mcp/local/local-backend.ts | 12 +++++++ gitnexus/test/unit/pdg-mode-flip.test.ts | 44 ++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/gitnexus/src/mcp/local/local-backend.ts b/gitnexus/src/mcp/local/local-backend.ts index 2abfdfbc9f..314ec73c27 100644 --- a/gitnexus/src/mcp/local/local-backend.ts +++ b/gitnexus/src/mcp/local/local-backend.ts @@ -2922,8 +2922,20 @@ export class LocalBackend { queryParams[`hunkEnd${i}`] = hunk.endLine; }); + // `n.name IS NOT NULL` excludes BasicBlock rows: on a --pdg index every + // edited function otherwise contributes N nameless BasicBlock + // pseudo-"symbols" (they carry filePath/start/end but the table has no + // name column), inflating changed_count and risk level with rows no + // consumer can act on (#2082 U7). Blocks are implementation substrate, + // not symbols — the owning Function row already represents the change. + // Filtering on the name column beats a label predicate here because + // `labels(n)[0]` is known to come back empty for several node types + // (see enrichCandidateLabels), and BasicBlock is the only line-bearing + // table without `name` (Community/Process lack it too but carry no + // startLine, so the existing filter already drops them). 
const symbolQuery = ` MATCH (n) WHERE n.filePath ENDS WITH $filePath + AND n.name IS NOT NULL AND n.startLine IS NOT NULL AND n.endLine IS NOT NULL AND (${overlapConditions}) RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, diff --git a/gitnexus/test/unit/pdg-mode-flip.test.ts b/gitnexus/test/unit/pdg-mode-flip.test.ts index 086ac1864b..ae591a705a 100644 --- a/gitnexus/test/unit/pdg-mode-flip.test.ts +++ b/gitnexus/test/unit/pdg-mode-flip.test.ts @@ -60,6 +60,50 @@ describe('pdgModeMismatch — M1→M2 stamp upgrade (#2082 M2, pure)', () => { }); }); +describe('detect_changes BasicBlock exclusion (#2082 U7)', () => { + it('the symbol-overlap filter (name IS NOT NULL) excludes exactly the BasicBlock rows', async () => { + const repo = await setupMiniRepo(); + try { + const { runFullAnalysis } = await import('../../src/core/run-analyze.js'); + const cb = { onProgress: () => {}, onLog: () => {} }; + await runFullAnalysis(repo.dbPath, { skipAgentsMd: true, pdg: true }, cb); + + const adapter = await import('../../src/core/lbug/lbug-adapter.js'); + const { lbugPath } = getStoragePaths(repo.dbPath); + await adapter.initLbug(lbugPath); + try { + // Counterfactual: WITHOUT the U7 name filter, line-bearing nameless + // rows exist on a pdg index (the noise detect_changes used to report). + const nameless = (await adapter.executeQuery( + `MATCH (n) WHERE n.name IS NULL + AND n.startLine IS NOT NULL AND n.endLine IS NOT NULL + RETURN n.id AS id`, + )) as Array<{ id: string }>; + expect(nameless.length).toBeGreaterThan(0); + // …and every one of them is a BasicBlock — the filter excludes exactly + // the substrate rows, never a real symbol. + for (const row of nameless) { + expect(String(row.id)).toMatch(/^BasicBlock:/); + } + // With the U7 filter (what detectChanges now runs): zero BasicBlocks. 
+ const symbols = (await adapter.executeQuery( + `MATCH (n) WHERE n.name IS NOT NULL + AND n.startLine IS NOT NULL AND n.endLine IS NOT NULL + RETURN n.id AS id`, + )) as Array<{ id: string }>; + expect(symbols.length).toBeGreaterThan(0); + for (const row of symbols) { + expect(String(row.id)).not.toMatch(/^BasicBlock:/); + } + } finally { + await adapter.closeLbug(); + } + } finally { + await repo.cleanup(); + } + }, 600_000); +}); + describe('runFullAnalysis — pdg-mode flip (#2099 F1)', () => { it('off→on flip forces a full writeback that persists the CFG layer; on→off removes it', async () => { const repo = await setupMiniRepo(); From cf360d4c8316fcf39f35033ebbaf60fd4ce8268c Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 10 Jun 2026 21:15:05 +0000 Subject: [PATCH 08/19] style: prettier pass over M2 files --- gitnexus/bench/cfg/baselines.json | 2 +- gitnexus/bench/cfg/measure.mjs | 4 +-- .../src/core/ingestion/cfg/cfg-builder.ts | 6 ++++- gitnexus/src/core/ingestion/cfg/emit.ts | 20 +++++++++++--- .../src/core/ingestion/cfg/reaching-defs.ts | 5 +--- .../cfg/visitors/typescript-harvest.ts | 3 ++- .../core/ingestion/cfg/visitors/typescript.ts | 7 ++++- .../integration/cfg/parse-cache-mixed.test.ts | 4 +-- gitnexus/test/unit/cfg/harvest.test.ts | 8 +++--- gitnexus/test/unit/cfg/reaching-defs.test.ts | 27 ++++++++++++++++--- .../test/unit/cfg/typescript-visitor.test.ts | 4 +-- gitnexus/test/unit/pdg-mode-flip.test.ts | 7 +++++ 12 files changed, 68 insertions(+), 29 deletions(-) diff --git a/gitnexus/bench/cfg/baselines.json b/gitnexus/bench/cfg/baselines.json index fbbe267961..7e3d718d83 100644 --- a/gitnexus/bench/cfg/baselines.json +++ b/gitnexus/bench/cfg/baselines.json @@ -41,4 +41,4 @@ "facts_large_max": 16000, "_note": "#2082 M2: N switch-arm defs of one variable + N later uses -- facts are O(defs x uses) BY SPEC, so the gate is BOUNDEDNESS, not linearity: with the production fact limit engaged (DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION=16000) the 
materialized fact count stays pinned at the limit as N grows (facts_large_max), and rd time stays bounded (measured ~1.4). Losing the maxFacts early-stop shows as facts_large exploding quadratically." } -} \ No newline at end of file +} diff --git a/gitnexus/bench/cfg/measure.mjs b/gitnexus/bench/cfg/measure.mjs index fdeda62b72..379fd2c382 100644 --- a/gitnexus/bench/cfg/measure.mjs +++ b/gitnexus/bench/cfg/measure.mjs @@ -45,9 +45,7 @@ import Parser from 'tree-sitter'; import TypeScript from 'tree-sitter-typescript'; import { collectFunctionCfgs } from '../../src/core/ingestion/cfg/collect.ts'; import { computeReachingDefs } from '../../src/core/ingestion/cfg/reaching-defs.ts'; -import { - DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION, -} from '../../src/core/ingestion/cfg/emit.ts'; +import { DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION } from '../../src/core/ingestion/cfg/emit.ts'; import { createTypeScriptCfgVisitor } from '../../src/core/ingestion/cfg/visitors/typescript.ts'; import { getTreeSitterBufferSize } from '../../src/core/ingestion/constants.ts'; diff --git a/gitnexus/src/core/ingestion/cfg/cfg-builder.ts b/gitnexus/src/core/ingestion/cfg/cfg-builder.ts index adf76a89b7..c8bdd2de1c 100644 --- a/gitnexus/src/core/ingestion/cfg/cfg-builder.ts +++ b/gitnexus/src/core/ingestion/cfg/cfg-builder.ts @@ -110,7 +110,11 @@ export class CfgBuilder { * sit at statement index 0 of the handler entry block or an in-block * `use(e)` at index 0 would see no reaching def in the in-order sweep. 
*/ - attachFacts(index: number, facts: StatementFacts, position: 'append' | 'prepend' = 'append'): void { + attachFacts( + index: number, + facts: StatementFacts, + position: 'append' | 'prepend' = 'append', + ): void { const b = this.blocks[index]; if (!b) return; if (position === 'prepend') b.statements.unshift(facts); diff --git a/gitnexus/src/core/ingestion/cfg/emit.ts b/gitnexus/src/core/ingestion/cfg/emit.ts index 18afd897b8..2c619c16b8 100644 --- a/gitnexus/src/core/ingestion/cfg/emit.ts +++ b/gitnexus/src/core/ingestion/cfg/emit.ts @@ -289,7 +289,11 @@ export function emitFileReachingDefs( const key = `${f.def.blockIndex}:${f.use.blockIndex}:${f.bindingIdx}`; if (seen.has(key)) continue; seen.add(key); - deduped.push({ defBlock: f.def.blockIndex, useBlock: f.use.blockIndex, bindingIdx: f.bindingIdx }); + deduped.push({ + defBlock: f.def.blockIndex, + useBlock: f.use.blockIndex, + bindingIdx: f.bindingIdx, + }); } result.solveMs += performance.now() - t0; @@ -312,8 +316,18 @@ export function emitFileReachingDefs( break; } const binding = r.bindings[edge.bindingIdx]; - const sourceId = basicBlockId(filePath, functionStartLine, functionStartColumn, edge.defBlock); - const targetId = basicBlockId(filePath, functionStartLine, functionStartColumn, edge.useBlock); + const sourceId = basicBlockId( + filePath, + functionStartLine, + functionStartColumn, + edge.defBlock, + ); + const targetId = basicBlockId( + filePath, + functionStartLine, + functionStartColumn, + edge.useBlock, + ); graph.addRelationship({ // Single function anchor — the two block ids share it, so templating // it once halves the id size (ids are in-memory-only but ~4000 of diff --git a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts index 7225a0e31c..cdf2f61ff7 100644 --- a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts +++ b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts @@ -89,10 +89,7 @@ const EMPTY_LATTICE: Lattice = new Map(); * Compute 
reaching definitions for one function. See the module doc for the * purity/determinism/sharing contract. */ -export function computeReachingDefs( - cfg: FunctionCfg, - limits?: ReachingDefsLimits, -): FunctionDefUse { +export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimits): FunctionDefUse { if (!cfg.bindings) { return { status: 'no-facts', bindings: [], facts: [], defCount: 0, useCount: 0 }; } diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts index b175f668f1..157a10c22b 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts @@ -103,7 +103,8 @@ export class TsHarvester { this.scopeByNode.set(fnNode.id, this.root); this.declareParams(fnNode); const body = fnNode.childForFieldName('body'); - if (body) this.prescan(body, body.type === 'statement_block' ? this.openScope(body) : this.root); + if (body) + this.prescan(body, body.type === 'statement_block' ? this.openScope(body) : this.root); } /** The completed binding table — pass to `CfgBuilder.finish`. 
*/ diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts index b4b030d84d..d4c26c5a52 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts @@ -174,7 +174,12 @@ class TsCfgWalk { openSimple = idx; dangling = [idx]; } else { - this.builder.extendBlock(openSimple, endLineOf(stmt), stmt.text, this.harvest.facts(stmt)); + this.builder.extendBlock( + openSimple, + endLineOf(stmt), + stmt.text, + this.harvest.facts(stmt), + ); } } } diff --git a/gitnexus/test/integration/cfg/parse-cache-mixed.test.ts b/gitnexus/test/integration/cfg/parse-cache-mixed.test.ts index b02806f01b..58f1837829 100644 --- a/gitnexus/test/integration/cfg/parse-cache-mixed.test.ts +++ b/gitnexus/test/integration/cfg/parse-cache-mixed.test.ts @@ -91,9 +91,7 @@ describe('durable ParsedFile store carries M2 statement facts (#2082 U5)', () => // VALUES must be intact — and no aliasing may merge the two files' // distinct fact arrays into wrong shapes. expect(cfg.bindings).toEqual(factCfg.bindings); - expect(cfg.blocks.map((b) => b.statements)).toEqual( - factCfg.blocks.map((b) => b.statements), - ); + expect(cfg.blocks.map((b) => b.statements)).toEqual(factCfg.blocks.map((b) => b.statements)); } }); diff --git a/gitnexus/test/unit/cfg/harvest.test.ts b/gitnexus/test/unit/cfg/harvest.test.ts index 2030234785..4a11791e0a 100644 --- a/gitnexus/test/unit/cfg/harvest.test.ts +++ b/gitnexus/test/unit/cfg/harvest.test.ts @@ -52,9 +52,7 @@ function allFacts(cfg: FunctionCfg): StatementFacts[] { /** Binding indices of every entry named `name`. */ function bindingIdxs(cfg: FunctionCfg, name: string): number[] { - return (cfg.bindings ?? []) - .map((b, i) => (b.name === name ? i : -1)) - .filter((i) => i >= 0); + return (cfg.bindings ?? []).map((b, i) => (b.name === name ? 
i : -1)).filter((i) => i >= 0); } /** The single binding index for `name` (throws when shadowed/ambiguous). */ @@ -365,8 +363,8 @@ describe('TS/JS def/use harvest — serialization', () => { }`); const n = cfg.bindings!.length; for (const f of allFacts(cfg)) { - for (const d of f.defs) expect(d).toBeGreaterThanOrEqual(0), expect(d).toBeLessThan(n); - for (const u of f.uses) expect(u).toBeGreaterThanOrEqual(0), expect(u).toBeLessThan(n); + for (const d of f.defs) (expect(d).toBeGreaterThanOrEqual(0), expect(d).toBeLessThan(n)); + for (const u of f.uses) (expect(u).toBeGreaterThanOrEqual(0), expect(u).toBeLessThan(n)); } }); }); diff --git a/gitnexus/test/unit/cfg/reaching-defs.test.ts b/gitnexus/test/unit/cfg/reaching-defs.test.ts index 1c5000a6ae..73014d76aa 100644 --- a/gitnexus/test/unit/cfg/reaching-defs.test.ts +++ b/gitnexus/test/unit/cfg/reaching-defs.test.ts @@ -145,7 +145,13 @@ describe('computeReachingDefs — kill/gen fundamentals (hand-built)', () => { it('loop back-edge: pre-loop def AND loop-carried redef both reach the header use', () => { // 0→2(def x)→3(use x = header)→4(def x, body)→3(back); 3→1(exit) const cfg = mkCfg( - [{}, {}, { stmts: [stmt(10, [0])] }, { stmts: [stmt(20, [], [0])] }, { stmts: [stmt(30, [0])] }], + [ + {}, + {}, + { stmts: [stmt(10, [0])] }, + { stmts: [stmt(20, [], [0])] }, + { stmts: [stmt(30, [0])] }, + ], [ [0, 2], [2, 3], @@ -178,7 +184,13 @@ describe('computeReachingDefs — kill/gen fundamentals (hand-built)', () => { it('unreachable block: its defs reach nothing; reachable uses see only reachable defs', () => { // 2(def x)→3(use x); 4 is DISCONNECTED and also defs x const cfg = mkCfg( - [{}, {}, { stmts: [stmt(10, [0])] }, { stmts: [stmt(20, [], [0])] }, { stmts: [stmt(30, [0])] }], + [ + {}, + {}, + { stmts: [stmt(10, [0])] }, + { stmts: [stmt(20, [], [0])] }, + { stmts: [stmt(30, [0])] }, + ], [ [0, 2], [2, 3], @@ -193,7 +205,12 @@ describe('computeReachingDefs — kill/gen fundamentals (hand-built)', () => { it('intra-block 
sweep: a use BEFORE the same-block def sees the incoming def', () => { // 2: def x. 3: use x (stmt0); def x (stmt1); use x (stmt2) const cfg = mkCfg( - [{}, {}, { stmts: [stmt(10, [0])] }, { stmts: [stmt(20, [], [0]), stmt(21, [0]), stmt(22, [], [0])] }], + [ + {}, + {}, + { stmts: [stmt(10, [0])] }, + { stmts: [stmt(20, [], [0]), stmt(21, [0]), stmt(22, [], [0])] }, + ], [ [0, 2], [2, 3], @@ -275,7 +292,9 @@ describe('computeReachingDefs — determinism and convergence', () => { const r = computeReachingDefs(cfg); // every header use sees both the init def and the innermost redef for (const useBlock of [3, 4, 5]) { - const defs = r.facts.filter((f) => f.use.blockIndex === useBlock).map((f) => f.def.blockIndex); + const defs = r.facts + .filter((f) => f.use.blockIndex === useBlock) + .map((f) => f.def.blockIndex); expect(new Set(defs)).toEqual(new Set([2, 6])); } }); diff --git a/gitnexus/test/unit/cfg/typescript-visitor.test.ts b/gitnexus/test/unit/cfg/typescript-visitor.test.ts index ef97bc44a2..6ff251b989 100644 --- a/gitnexus/test/unit/cfg/typescript-visitor.test.ts +++ b/gitnexus/test/unit/cfg/typescript-visitor.test.ts @@ -524,9 +524,7 @@ describe('TS/JS CfgVisitor — early exits through finally (#2082 M2 U2)', () => }); it('nested finallys chain: return threads a() then b() then EXIT', () => { - const cfg = cfgOf( - `function f() { try { try { return; } finally { a(); } } finally { b(); } }`, - ); + const cfg = cfgOf(`function f() { try { try { return; } finally { a(); } } finally { b(); } }`); const ret = block(cfg, 'return'); const finA = block(cfg, 'a()'); const finB = block(cfg, 'b()'); diff --git a/gitnexus/test/unit/pdg-mode-flip.test.ts b/gitnexus/test/unit/pdg-mode-flip.test.ts index ae591a705a..652895cfa3 100644 --- a/gitnexus/test/unit/pdg-mode-flip.test.ts +++ b/gitnexus/test/unit/pdg-mode-flip.test.ts @@ -95,6 +95,13 @@ describe('detect_changes BasicBlock exclusion (#2082 U7)', () => { for (const row of symbols) { 
expect(String(row.id)).not.toMatch(/^BasicBlock:/); } + // DB-level smoke for the M2 projection itself: REACHING_DEF rows + // persisted with the variable name in `reason` (plan Validation). + const rd = (await adapter.executeQuery( + `MATCH (:BasicBlock)-[r:CodeRelation {type: 'REACHING_DEF'}]->(:BasicBlock) + RETURN count(r) AS c`, + )) as Array<{ c: number | bigint }>; + expect(Number(rd[0]?.c ?? 0)).toBeGreaterThan(0); } finally { await adapter.closeLbug(); } From cebc8fb71f8a89f9ebaadb0b11e58dc2d2aea793 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Wed, 10 Jun 2026 21:42:27 +0000 Subject: [PATCH 09/19] =?UTF-8?q?fix(cfg):=20review-pass=20fixes=20?= =?UTF-8?q?=E2=80=94=20defKey=20overflow=20guard,=20catch-param=20block,?= =?UTF-8?q?=20class=20defs,=20intra-statement=20reads,=20graceful=20fact?= =?UTF-8?q?=20degradation=20(#2082)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - reaching-defs: STMT_STRIDE 2^16→2^21 + upfront aliasing bail-out; a use that shares its statement with a def now also sees the same-statement def (assign-and-test idiom was a taint false negative); drop dead posInOrder - visitor: catch-param def gets its own once-executed block (prepending into a loop-header entry re-genned per iteration and killed loop-carried redefs); unresolved-label jumps now thread all active finallys; the finalizer-threading protocol moved to control-flow-context as shared helpers for future language visitors - harvest: class declarations def their name (was a bogus use in JS, silent skip in TS); class-expression names stay internal - emit: isEmitSafeCfg adds index==position contiguity; fact validation split into hasEmitSafeFacts so malformed facts degrade to CFG-only instead of dropping the function's whole CFG layer; facts-per-edge multiplier single source; lazy top-binding tally; dead solveMs removed - run-analyze: pdgModeMismatch compares the key union structurally — new resolved knobs join the comparison 
automatically - mcp: BasicBlock exclusion via id prefix (NULL-name rows of real symbols are no longer dropped) + same filter on the BM25 filePath fallback - bench: rd ratio denominator clamped (gate no longer self-disables at fast small-N); PROF-gated pdg timing in run.ts --- gitnexus/bench/cfg/measure.mjs | 4 +- .../src/core/ingestion/cfg/cfg-builder.ts | 16 +-- .../ingestion/cfg/control-flow-context.ts | 57 ++++++++- gitnexus/src/core/ingestion/cfg/emit.ts | 117 ++++++++++++------ .../src/core/ingestion/cfg/reaching-defs.ts | 45 +++++-- .../cfg/visitors/typescript-harvest.ts | 19 +++ .../core/ingestion/cfg/visitors/typescript.ts | 100 ++++++++------- .../scope-resolution/pipeline/run.ts | 7 +- gitnexus/src/core/run-analyze.ts | 25 ++-- gitnexus/src/mcp/local/local-backend.ts | 29 +++-- .../__snapshots__/cfg-snapshot.test.ts.snap | 13 +- gitnexus/test/unit/cfg/harvest.test.ts | 65 ++++++++-- gitnexus/test/unit/cfg/reaching-defs.test.ts | 9 +- .../test/unit/cfg/typescript-visitor.test.ts | 9 +- gitnexus/test/unit/pdg-mode-flip.test.ts | 23 ++-- 15 files changed, 374 insertions(+), 164 deletions(-) diff --git a/gitnexus/bench/cfg/measure.mjs b/gitnexus/bench/cfg/measure.mjs index 379fd2c382..2451b5d62f 100644 --- a/gitnexus/bench/cfg/measure.mjs +++ b/gitnexus/bench/cfg/measure.mjs @@ -275,7 +275,9 @@ function measureScenario(scenario) { const rdMaxFacts = scenario.rdMaxFacts ?? 0; const rdSmall = measureReachingDefs(small.cfgs, REPS, rdMaxFacts); const rdLarge = measureReachingDefs(large.cfgs, REPS, rdMaxFacts); - const rdRatio = rdSmall.ms > 0 ? rdLarge.ms / rdSmall.ms / sizeRatio : 0; + // Clamp the denominator: a 0.000ms small-N median would otherwise yield + // ratio 0 and the gate would self-disable exactly when the solver is fast. 
+ const rdRatio = rdLarge.ms / Math.max(rdSmall.ms, 0.001) / sizeRatio; return { scenario: scenario.name, diff --git a/gitnexus/src/core/ingestion/cfg/cfg-builder.ts b/gitnexus/src/core/ingestion/cfg/cfg-builder.ts index c8bdd2de1c..b6e69126bf 100644 --- a/gitnexus/src/core/ingestion/cfg/cfg-builder.ts +++ b/gitnexus/src/core/ingestion/cfg/cfg-builder.ts @@ -105,20 +105,14 @@ export class CfgBuilder { /** * Attach a facts-only statement record to a block WITHOUT touching its text * or line span (#2082 M2 U1) — bench fingerprints and CFG snapshots include - * block text, so harvesting must never perturb it. `prepend` is for records - * that lexically precede the block's statements: a `catch (e)` param def must - * sit at statement index 0 of the handler entry block or an in-block - * `use(e)` at index 0 would see no reaching def in the in-order sweep. + * block text, so harvesting must never perturb it (ENTRY-block param defs + * are the canonical use; records that must precede a walked body get their + * own facts-only block instead, see the catch-param handling in visitTry). */ - attachFacts( - index: number, - facts: StatementFacts, - position: 'append' | 'prepend' = 'append', - ): void { + attachFacts(index: number, facts: StatementFacts): void { const b = this.blocks[index]; if (!b) return; - if (position === 'prepend') b.statements.unshift(facts); - else b.statements.push(facts); + b.statements.push(facts); } get blockCount(): number { diff --git a/gitnexus/src/core/ingestion/cfg/control-flow-context.ts b/gitnexus/src/core/ingestion/cfg/control-flow-context.ts index 7b54865a85..8b1aefc588 100644 --- a/gitnexus/src/core/ingestion/cfg/control-flow-context.ts +++ b/gitnexus/src/core/ingestion/cfg/control-flow-context.ts @@ -17,6 +17,7 @@ * reaching-defs pass (a taint false negative). A parallel stack cannot express * that between-ness, which is why the frames live here. 
*/ +import type { CfgBuilder } from './cfg-builder.js'; import type { CfgEdgeKind } from './types.js'; interface LoopFrame { @@ -111,12 +112,17 @@ export class ControlFlowContext { return fins; } - /** Target block for a `break` (no finalizer info) — see {@link resolveBreak}. */ + /** + * Target block for a `break` (no finalizer info) — see {@link resolveBreak}. + * Prefer `resolveBreak` + {@link wireJumpThroughFinalizers} in visitors: a + * target-only lookup silently loses finalizer threading (the M2 soundness + * fix). Kept for target-shape assertions in tests. + */ breakTarget(label?: string): number | undefined { return this.resolveBreak(label)?.target; } - /** Target block for a `continue` (no finalizer info) — see {@link resolveContinue}. */ + /** Target block for a `continue` — same caveat as {@link breakTarget}. */ continueTarget(label?: string): number | undefined { return this.resolveContinue(label)?.target; } @@ -137,3 +143,50 @@ export class ControlFlowContext { return undefined; } } + +/** + * Wire a jump from `from` to `target`, routing through the finallys it + * crosses (innermost first). The first leg keeps the bare jump `kind` + * (preserving the "kind ⟹ source-block terminator" invariant in types.ts); + * each finally's completion leg is registered as pending on its frame with the + * matching `finally-*` kind and wired by the owning try via + * {@link drainFinalizerPending} once the finally's exits are known. + * + * Language-agnostic on purpose (#2082 M2): the threading protocol encodes + * three subtle invariants every future language visitor needs identically — + * keeping it here means a new visitor cannot drift on any of them. 
+ */ +export function wireJumpThroughFinalizers( + builder: CfgBuilder, + from: number, + finalizers: readonly FinalizerFrame[], + target: number, + kind: 'return' | 'break' | 'continue', +): void { + if (finalizers.length === 0) { + builder.edge(from, target, kind); + return; + } + const completionKind = `finally-${kind}` as CfgEdgeKind; + builder.edge(from, finalizers[0].entry, kind); + for (let i = 0; i < finalizers.length; i++) { + const to = i + 1 < finalizers.length ? finalizers[i + 1].entry : target; + finalizers[i].pending.push({ to, kind: completionKind }); + } +} + +/** + * Wire a popped finalizer frame's pending completion legs from the finally's + * exit blocks. A finally that itself always jumps (`finally { return 2; }`) + * has no exits — its pending legs wire nowhere, matching JS's + * finally-override semantics. + */ +export function drainFinalizerPending( + builder: CfgBuilder, + frame: FinalizerFrame, + finallyExits: readonly number[], +): void { + for (const p of frame.pending) { + builder.connect(finallyExits, p.to, p.kind); + } +} diff --git a/gitnexus/src/core/ingestion/cfg/emit.ts b/gitnexus/src/core/ingestion/cfg/emit.ts index 2c619c16b8..14befc2a31 100644 --- a/gitnexus/src/core/ingestion/cfg/emit.ts +++ b/gitnexus/src/core/ingestion/cfg/emit.ts @@ -42,14 +42,19 @@ export const DEFAULT_MAX_CFG_EDGES_PER_FUNCTION = 5000; export const DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION = 4000; /** - * Fact-materialization limit handed to {@link computeReachingDefs} on the - * emit path (#2082 M2 U3/F3): facts are O(defs×uses) BY SPEC in merge-heavy - * code, and the edge cap alone bounds the GRAPH, not the per-function memory - * spike of materializing facts before dedup. 4× the edge cap leaves dedup - * headroom. Scales with a custom edge cap; unlimited when the edge cap is 0. 
+ * Fact-materialization headroom over the edge cap (#2082 M2 U3/F3): facts are + * O(defs×uses) BY SPEC in merge-heavy code, and the edge cap alone bounds the + * GRAPH, not the per-function memory spike of materializing facts before + * dedup. {@link emitFileReachingDefs} hands `edgeCap × this` to + * `computeReachingDefs` as `maxFacts` (unlimited when the edge cap is 0) — + * single source of truth; the DEFAULT constant below is derived, never the + * mechanism. */ +export const REACHING_DEF_FACTS_PER_EDGE_CAP = 4; + +/** Derived emit-path fact limit at the default edge cap (bench/doc anchor). */ export const DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION = - 4 * DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION; + REACHING_DEF_FACTS_PER_EDGE_CAP * DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION; export interface CfgEmitResult { blocks: number; @@ -93,42 +98,62 @@ export const isEmitSafeCfg = (cfg: FunctionCfg | undefined | null): cfg is Funct ) { return false; } - const blockIndices = new Set(); - for (const b of cfg.blocks) { - if (!Number.isInteger(b?.index)) return false; - blockIndices.add(b.index); + // Contiguity (index === position), not just integer-ness: every consumer — + // this module's id templating AND the reaching-defs solver's + // position-indexed adjacency arrays — assumes blocks[i].index === i. A + // membership-only check would admit a compacted channel ({index:0},{index:5}) + // whose edge 0→5 passes membership but indexes past the arrays downstream. 
+ for (let i = 0; i < cfg.blocks.length; i++) { + if (cfg.blocks[i]?.index !== i) return false; } - if (!cfg.edges.every((e) => blockIndices.has(e?.from) && blockIndices.has(e?.to))) return false; + const n = cfg.blocks.length; + return cfg.edges.every( + (e) => + Number.isInteger(e?.from) && + Number.isInteger(e?.to) && + e.from >= 0 && + e.from < n && + e.to >= 0 && + e.to < n, + ); +}; - // M2 (#2082 U1/U4): the binding table + statement facts join the - // REACHING_DEF id path (binding name/declLine/declColumn template into edge - // ids; statement defs/uses index into the table). A statement record whose - // index escapes the table would silently fabricate `undefined`-keyed edge - // ids, so indices are checked for RANGE, not just integer-ness. A channel - // with statements but NO binding table is malformed by construction. +/** + * Whether a structurally-valid CFG's M2 statement facts are safe to feed to + * the reaching-defs solver + REACHING_DEF id templating (#2082 U1/U4): the + * binding table's name/declLine/declColumn template into edge ids, and + * statement def/use indices must stay IN RANGE of the table (an escaping + * index would fabricate `undefined`-keyed ids). Deliberately SEPARATE from + * {@link isEmitSafeCfg}: malformed facts must cost only the function's + * REACHING_DEF projection — degrading to M1 behavior (CFG emitted, no facts) + * — never the BasicBlock/CFG layer itself. + */ +export const hasEmitSafeFacts = (cfg: FunctionCfg): boolean => { const bindings = cfg.bindings; - if (bindings !== undefined) { - if (!Array.isArray(bindings)) return false; - for (const b of bindings) { - if ( - typeof b?.name !== 'string' || - !Number.isInteger(b.declLine) || - !Number.isInteger(b.declColumn) - ) { - return false; - } + if (bindings === undefined) { + // Pre-M2 channel — statements must be absent too. 
+ return cfg.blocks.every((b) => b.statements === undefined); + } + if (!Array.isArray(bindings)) return false; + for (const b of bindings) { + if ( + typeof b?.name !== 'string' || + !Number.isInteger(b.declLine) || + !Number.isInteger(b.declColumn) + ) { + return false; } } - const bindingCount = bindings?.length ?? 0; + const bindingCount = bindings.length; + const inRange = (i: number): boolean => Number.isInteger(i) && i >= 0 && i < bindingCount; for (const b of cfg.blocks) { const stmts = b.statements; if (stmts === undefined) continue; - if (bindings === undefined || !Array.isArray(stmts)) return false; + if (!Array.isArray(stmts)) return false; for (const s of stmts) { if (!Number.isInteger(s?.line) || !Array.isArray(s.defs) || !Array.isArray(s.uses)) { return false; } - const inRange = (i: number): boolean => Number.isInteger(i) && i >= 0 && i < bindingCount; if (!s.defs.every(inRange) || !s.uses.every(inRange)) return false; } } @@ -208,10 +233,10 @@ export interface ReachingDefEmitResult { cappedFunctions: number; /** Functions whose FACT materialization hit the solver's maxFacts limit. */ truncatedFunctions: number; + /** Functions whose facts failed {@link hasEmitSafeFacts} (CFG kept, facts skipped). */ + malformedFactFunctions: number; /** Total statement-level facts the solver produced (pre-dedup telemetry). */ facts: number; - /** Aggregate solve+dedup time in ms (PROF support). */ - solveMs: number; } /** @@ -254,19 +279,26 @@ export function emitFileReachingDefs( droppedEdges: 0, cappedFunctions: 0, truncatedFunctions: 0, + malformedFactFunctions: 0, facts: 0, - solveMs: 0, }; const cap = maxEdgesPerFunction > 0 ? maxEdgesPerFunction : Infinity; - const maxFacts = Number.isFinite(cap) ? (cap as number) * 4 : 0; // 0 ⇒ unlimited + const maxFacts = Number.isFinite(cap) ? 
(cap as number) * REACHING_DEF_FACTS_PER_EDGE_CAP : 0; // 0 ⇒ unlimited for (const cfg of cfgs) { - const t0 = performance.now(); - const r = computeReachingDefs(cfg, { maxFacts }); - if (r.status === 'no-facts') { - result.solveMs += performance.now() - t0; + // Graceful degradation: malformed M2 facts cost only this function's + // REACHING_DEF projection — its BasicBlock/CFG layer was already emitted. + if (!hasEmitSafeFacts(cfg)) { + result.malformedFactFunctions++; + onWarn?.( + `[reaching-defs] ${cfg.filePath}:${cfg.functionStartLine}: malformed ` + + `statement facts (bad binding table or out-of-range fact indices) — ` + + `REACHING_DEF skipped for this function; its CFG is unaffected`, + ); continue; } + const r = computeReachingDefs(cfg, { maxFacts }); + if (r.status === 'no-facts') continue; result.facts += r.facts.length; const { filePath, functionStartLine, functionStartColumn } = cfg; @@ -283,9 +315,7 @@ export function emitFileReachingDefs( // deduped order (and therefore cap truncation) is deterministic. const seen = new Set(); const deduped: { defBlock: number; useBlock: number; bindingIdx: number }[] = []; - const factsPerBinding = new Map(); for (const f of r.facts) { - factsPerBinding.set(f.bindingIdx, (factsPerBinding.get(f.bindingIdx) ?? 0) + 1); const key = `${f.def.blockIndex}:${f.use.blockIndex}:${f.bindingIdx}`; if (seen.has(key)) continue; seen.add(key); @@ -295,7 +325,6 @@ export function emitFileReachingDefs( bindingIdx: f.bindingIdx, }); } - result.solveMs += performance.now() - t0; let emittedForFn = 0; for (const edge of deduped) { @@ -303,6 +332,12 @@ export function emitFileReachingDefs( const dropped = deduped.length - emittedForFn; result.droppedEdges += dropped; result.cappedFunctions++; + // Tallied lazily — cap overflow is the rare path; the common uncapped + // case must not pay a per-fact counting pass. 
+ const factsPerBinding = new Map(); + for (const f of r.facts) { + factsPerBinding.set(f.bindingIdx, (factsPerBinding.get(f.bindingIdx) ?? 0) + 1); + } const top = [...factsPerBinding.entries()] .sort((a, b) => b[1] - a[1] || a[0] - b[0]) .slice(0, 2) diff --git a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts index cdf2f61ff7..0255103e39 100644 --- a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts +++ b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts @@ -74,8 +74,17 @@ export interface FunctionDefUse { readonly useCount: number; } -/** def-site key: packs (blockIndex, stmtIndex) into one number. */ -const STMT_STRIDE = 1 << 16; // maxFunctionLines caps statements far below 65536 +/** + * def-site key: packs (blockIndex, stmtIndex) into one number. The stride is + * a per-BLOCK statement bound, and `maxFunctionLines` caps LINES, not + * statements — a minified one-line function coalesces arbitrarily many + * statements into one block, so an overflow would silently alias + * (block b, stmt STRIDE+k) with (block b+1, stmt k) and fabricate wrong-block + * facts. computeReachingDefs therefore range-checks up front and bails to a + * sound empty `truncated` result instead of ever letting a key alias. + * 2^21 statements per block × blocks ≤ 2^32 stays inside Number's 2^53. + */ +const STMT_STRIDE = 1 << 21; const defKey = (blockIndex: number, stmtIndex: number): number => blockIndex * STMT_STRIDE + stmtIndex; @@ -97,6 +106,16 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit const blocks = cfg.blocks; const n = blocks.length; + // Key-aliasing guard (see STMT_STRIDE): a block with ≥ STRIDE statements + // cannot be keyed without aliasing into the next block's def sites, which + // would fabricate wrong-block facts — strictly worse than producing none. + // Bail to a sound empty `truncated` result (the emit path warns). + for (const b of blocks) { + if ((b.statements?.length ?? 
0) >= STMT_STRIDE) { + return { status: 'truncated', bindings: cfg.bindings, facts: [], defCount: 0, useCount: 0 }; + } + } + // ── adjacency (sorted for deterministic merges) ───────────────────────── // A `throw` edge contributes IN(from) ∪ OUT(from) to its handler, not just // OUT: an exception can fire BEFORE the faulting block's defs complete, so @@ -155,8 +174,6 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit // ── fixpoint ──────────────────────────────────────────────────────────── const inSets: Lattice[] = new Array(n).fill(EMPTY_LATTICE); const outSets: Lattice[] = new Array(n).fill(EMPTY_LATTICE); - const posInOrder = new Map(); - order.forEach((b, i) => posInOrder.set(b, i)); const inWorklist = new Array(n).fill(true); let pending = n; @@ -214,10 +231,24 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit let reach: Lattice | null = null; for (let i = 0; i < stmts.length; i++) { const s = stmts[i]; + // A use's binding that the SAME statement also defines could be a + // read-then-write (`x += 1` — sees prior defs) OR a write-then-read + // (`if ((m = re.exec(s)) && m[1])` — sees the same-statement def). + // StatementFacts carries no intra-statement order, so emit BOTH: prior + // defs ∪ the same-statement def. Sound over-approximation — the extra + // self-fact on compound assignments is harmless; missing the + // assign-and-test def→use (the most common JS idiom) would be a taint + // false negative. + const sameStmtDefs = s.defs.length > 0 ? new Set(s.defs) : null; for (const u of s.uses) { const reaching = (reach ?? inSets[b.index]).get(u); - if (!reaching) continue; - for (const key of reaching) { + const selfKey = sameStmtDefs?.has(u) ? defKey(b.index, i) : undefined; + if (!reaching && selfKey === undefined) continue; + const keys = + selfKey !== undefined && !reaching?.has(selfKey) + ? [...(reaching ?? []), selfKey] + : [...(reaching ?? 
[])]; + for (const key of keys) { if (facts.length >= maxFacts) { truncated = true; break outer; @@ -226,7 +257,7 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit const defStmt = key % STMT_STRIDE; facts.push({ bindingIdx: u, - def: { blockIndex: defBlock, stmtIndex: defStmt, line: defLine.get(key) ?? 0 }, + def: { blockIndex: defBlock, stmtIndex: defStmt, line: defLine.get(key) ?? s.line }, use: { blockIndex: b.index, stmtIndex: i, line: s.line }, }); } diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts index 157a10c22b..f15c35f32c 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts @@ -425,6 +425,25 @@ export class TsHarvester { } return; } + case 'class_declaration': { + // The class NAME is a def (prescan declared the binding) — without + // this case the default walk would record it as a bogus USE in plain + // JS (the name is an `identifier` there; in TS it's a type_identifier + // and would be silently skipped, losing the def either way). The body + // walk picks up field-initializer uses; methods are opaque nested fns. + const name = node.childForFieldName('name'); + if (name) this.def(name, acc); + const body = node.childForFieldName('body'); + if (body) this.walkValue(body, acc); + return; + } + case 'class': { + // Class EXPRESSION: its name (if any) binds only inside the class — + // not a def in the enclosing function. Walk only the body. 
+ const body = node.childForFieldName('body'); + if (body) this.walkValue(body, acc); + return; + } default: for (let i = 0; i < node.namedChildCount; i++) { const c = node.namedChild(i); diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts index d4c26c5a52..be14fa78cc 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts @@ -57,9 +57,13 @@ */ import type { SyntaxNode } from '../../utils/ast-helpers.js'; import { CfgBuilder } from '../cfg-builder.js'; -import { ControlFlowContext, type FinalizerFrame } from '../control-flow-context.js'; +import { + ControlFlowContext, + drainFinalizerPending, + wireJumpThroughFinalizers, +} from '../control-flow-context.js'; import type { TraversalResult } from '../traversal-result.js'; -import type { CfgEdgeKind, CfgVisitor, FunctionCfg } from '../types.js'; +import type { CfgVisitor, FunctionCfg } from '../types.js'; import { TsHarvester } from './typescript-harvest.js'; /** TS/JS node types that own a CFG-bearing function body. */ @@ -245,36 +249,16 @@ class TsCfgWalk { this.harvest.facts(stmt), ); // A return crosses EVERY active finally before reaching EXIT. - this.jumpVia(idx, this.cfc.finalizersForReturn(), this.builder.exitIndex, 'return'); + wireJumpThroughFinalizers( + this.builder, + idx, + this.cfc.finalizersForReturn(), + this.builder.exitIndex, + 'return', + ); return { entry: idx, exits: [] }; } - /** - * Wire a jump from `from` to `target`, routing through the finallys it - * crosses (innermost first). The first leg keeps the bare jump `kind` - * (preserving the "kind ⟹ source-block terminator" invariant); each - * finally's completion leg is registered as pending on its frame with the - * matching `finally-*` kind and wired by the owning `visitTry` once the - * finally's exits are known to it (#2082 M2 U2 / KTD5). 
- */ - private jumpVia( - from: number, - finalizers: readonly FinalizerFrame[], - target: number, - kind: 'return' | 'break' | 'continue', - ): void { - if (finalizers.length === 0) { - this.builder.edge(from, target, kind); - return; - } - const completionKind = `finally-${kind}` as CfgEdgeKind; - this.builder.edge(from, finalizers[0].entry, kind); - for (let i = 0; i < finalizers.length; i++) { - const to = i + 1 < finalizers.length ? finalizers[i + 1].entry : target; - finalizers[i].pending.push({ to, kind: completionKind }); - } - } - private visitThrow(stmt: SyntaxNode): TraversalResult { const idx = this.builder.newBlock( startLineOf(stmt), @@ -295,19 +279,29 @@ class TsCfgWalk { // would otherwise leave this block with NO out-edge, stranding it and // breaking the single-exit invariant a downstream post-dominator / PDG pass // relies on. Conservatively route an unresolved jump to the function EXIT - // ("escapes the function"): sound over-approximation, keeps single-exit; - // no finallys thread (the target is unknown, so the crossed set is too). - if (res) this.jumpVia(idx, res.finalizers, res.target, 'break'); - else this.builder.edge(idx, this.builder.exitIndex, 'break'); + // ("escapes the function") and thread ALL active finallys — a superset of + // the truly-crossed set (the real target is somewhere in the function, so + // execution provably runs every finally between the jump and wherever it + // lands... up to the ones the conservative EXIT routing over-includes). + // Sound for dataflow either way: extra paths, never a bypassed finally. + const { target, finalizers } = res ?? 
{ + target: this.builder.exitIndex, + finalizers: this.cfc.finalizersForReturn(), + }; + wireJumpThroughFinalizers(this.builder, idx, finalizers, target, 'break'); return { entry: idx, exits: [] }; } private visitContinue(stmt: SyntaxNode): TraversalResult { const idx = this.builder.newBlock(startLineOf(stmt), endLineOf(stmt), stmt.text); const res = this.cfc.resolveContinue(this.labelOf(stmt)); - // See visitBreak: an unresolved label routes to EXIT to preserve single-exit. - if (res) this.jumpVia(idx, res.finalizers, res.target, 'continue'); - else this.builder.edge(idx, this.builder.exitIndex, 'continue'); + // See visitBreak: an unresolved label routes to EXIT (threading all + // active finallys) to preserve single-exit without bypassing a finally. + const { target, finalizers } = res ?? { + target: this.builder.exitIndex, + finalizers: this.cfc.finalizersForReturn(), + }; + wireJumpThroughFinalizers(this.builder, idx, finalizers, target, 'continue'); return { entry: idx, exits: [] }; } @@ -619,12 +613,27 @@ class TsCfgWalk { const idx = this.builder.newBlock(startLineOf(catchClause), endLineOf(catchClause), ''); catchRes = { entry: idx, exits: [idx] }; } - // `catch (e)` has no header block — the param def fact PREPENDS onto the - // handler entry (it lexically precedes the body, and the body was walked - // first, so appending would put the def AFTER the body's facts and an - // in-block `use(e)` at statement index 0 would see no reaching def). + // `catch (e)` has no header block — the param def gets its OWN + // facts-only block in front of the body entry. It must NOT be prepended + // into the body's entry block: when the catch body STARTS with a loop, + // that entry is the loop HEADER, re-entered on every iteration — the + // param def would re-gen there and falsely KILL loop-carried + // redefinitions of the param (`catch (e) { while (c) { e = fix(e); } + // sink(e); }` would lose the fix→sink fact, a taint false negative). 
+ // The param block becomes the handler entry, which is also semantically + // right: the binding happens exactly once, on handler entry. const paramFacts = this.harvest.catchParamFacts(catchClause); - if (paramFacts) this.builder.attachFacts(catchRes.entry, paramFacts, 'prepend'); + if (paramFacts) { + const paramBlock = this.builder.newBlock( + startLineOf(catchClause), + startLineOf(catchClause), + '', + 'normal', + paramFacts, + ); + this.builder.edge(paramBlock, catchRes.entry, 'seq'); + catchRes = { entry: paramBlock, exits: catchRes.exits }; + } } // Handler for the try body: catch if present, else finally, else outer. @@ -650,14 +659,11 @@ class TsCfgWalk { // The finalizer frame closes once the protected/catch walks are done; any // jumps that crossed it left their completion legs on `pending`, wired - // here from the finally's exits. A finally that itself always jumps - // (`finally { return 2; }`) has no exits — pending legs wire nowhere, - // matching JS's finally-override semantics. + // here from the finally's exits (see drainFinalizerPending for the + // finally-override semantics of an always-jumping finally). if (finFrame && finallyRes) { this.cfc.pop(); - for (const p of finFrame.pending) { - this.builder.connect(finallyRes.exits, p.to, p.kind); - } + drainFinalizerPending(this.builder, finFrame, finallyRes.exits); } const exits: number[] = []; diff --git a/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts b/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts index a0ea40a1f7..e8d5fe6b57 100644 --- a/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts +++ b/gitnexus/src/core/ingestion/scope-resolution/pipeline/run.ts @@ -762,8 +762,9 @@ export function runScopeResolution( // M2 (#2082 U4): reaching definitions over the same validated CFGs. 
// In-memory facts are computed per function and dropped after the // bounded (defBlock, useBlock, binding) projection is persisted — - // M3 recomputes via the same pure solver in-phase (KTD8). - const t0 = performance.now(); + // M3 recomputes via the same pure solver in-phase (KTD8). Timing is + // PROF-gated like every other checkpoint here (zero cost when off). + const t0 = PROF ? performance.now() : 0; const rd = emitFileReachingDefs( graph, wellFormed, @@ -771,7 +772,7 @@ export function runScopeResolution( DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION, (message) => logger.warn(message), // unconditional — R7, both layers ); - pdgMs += performance.now() - t0; + if (PROF) pdgMs += performance.now() - t0; rdEdges += rd.edges; rdDropped += rd.droppedEdges; rdFacts += rd.facts; diff --git a/gitnexus/src/core/run-analyze.ts b/gitnexus/src/core/run-analyze.ts index 38740f7083..66300b8b31 100644 --- a/gitnexus/src/core/run-analyze.ts +++ b/gitnexus/src/core/run-analyze.ts @@ -370,17 +370,20 @@ export const pdgModeMismatch = (recorded: RepoMeta['pdg'], options: PdgOptions): const requested = resolvePdgConfig(options); if (!requested && !recorded) return false; if (!requested || !recorded) return true; - return ( - requested.maxFunctionLines !== recorded.maxFunctionLines || - requested.maxEdgesPerFunction !== recorded.maxEdgesPerFunction || - // M2 (#2082): an M1-era stamp has NO maxReachingDefEdgesPerFunction — - // `4000 !== undefined` trips here, which is what makes an M1→M2 upgrade - // force the full writeback that populates REACHING_DEF rows without - // `--force`. The comparator is field-wise on purpose; new emit-affecting - // knobs MUST join it (a knob the comparator misses silently strands a - // stale projection). - requested.maxReachingDefEdgesPerFunction !== recorded.maxReachingDefEdgesPerFunction - ); + // Structural comparison over the KEY UNION of both resolved records — not a + // hand-maintained field list. 
Both sides come fully resolved from + // resolvePdgConfig, so any new emit-affecting knob added there joins the + // comparison automatically (M1's hand-extended comparator was the trap this + // closes: a knob it missed would silently strand a stale projection). It is + // also what makes the M1→M2 upgrade work with zero extra code: an M1-era + // stamp lacks maxReachingDefEdgesPerFunction, so `4000 !== undefined` trips + // a full writeback that populates REACHING_DEF rows without `--force`. + const reqRecord = requested as Record; + const recRecord = recorded as Record; + for (const key of new Set([...Object.keys(reqRecord), ...Object.keys(recRecord)])) { + if (reqRecord[key] !== recRecord[key]) return true; + } + return false; }; export async function runFullAnalysis( diff --git a/gitnexus/src/mcp/local/local-backend.ts b/gitnexus/src/mcp/local/local-backend.ts index 314ec73c27..f2d17b3bd0 100644 --- a/gitnexus/src/mcp/local/local-backend.ts +++ b/gitnexus/src/mcp/local/local-backend.ts @@ -1677,9 +1677,14 @@ export class LocalBackend { ) : await executeParameterized( repo.lbugPath, + // Same BasicBlock exclusion as detect_changes (#2082 U7): on a + // --pdg index a function-heavy file has far more BasicBlock rows + // than symbols, so an unfiltered LIMIT 3 would surface nameless + // substrate rows and displace the real symbols. ` MATCH (n) WHERE n.filePath = $filePath + AND NOT n.id STARTS WITH 'BasicBlock:' RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, n.filePath AS filePath, n.startLine AS startLine, n.endLine AS endLine LIMIT 3 `, @@ -2922,20 +2927,20 @@ export class LocalBackend { queryParams[`hunkEnd${i}`] = hunk.endLine; }); - // `n.name IS NOT NULL` excludes BasicBlock rows: on a --pdg index every - // edited function otherwise contributes N nameless BasicBlock - // pseudo-"symbols" (they carry filePath/start/end but the table has no - // name column), inflating changed_count and risk level with rows no - // consumer can act on (#2082 U7). 
Blocks are implementation substrate, - // not symbols — the owning Function row already represents the change. - // Filtering on the name column beats a label predicate here because - // `labels(n)[0]` is known to come back empty for several node types - // (see enrichCandidateLabels), and BasicBlock is the only line-bearing - // table without `name` (Community/Process lack it too but carry no - // startLine, so the existing filter already drops them). + // Exclude BasicBlock rows by id prefix: on a --pdg index every edited + // function otherwise contributes N nameless BasicBlock pseudo-"symbols" + // (they carry filePath/start/end but no name), inflating changed_count + // and risk level with rows no consumer can act on (#2082 U7). Blocks + // are implementation substrate, not symbols — the owning Function row + // already represents the change. The id prefix (`BasicBlock::…`, + // cfg/emit.ts basicBlockId) beats a label predicate (`labels(n)[0]` is + // known to come back empty for several node types — see + // enrichCandidateLabels) AND beats `n.name IS NOT NULL` (which would + // also drop legitimate symbols whose name loaded as NULL, e.g. + // quoted-empty CSV fields for anonymous constructs). 
const symbolQuery = ` MATCH (n) WHERE n.filePath ENDS WITH $filePath - AND n.name IS NOT NULL + AND NOT n.id STARTS WITH 'BasicBlock:' AND n.startLine IS NOT NULL AND n.endLine IS NOT NULL AND (${overlapConditions}) RETURN n.id AS id, n.name AS name, labels(n)[0] AS type, diff --git a/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap b/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap index 2c0393cb75..04e592d1ef 100644 --- a/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap +++ b/gitnexus/test/integration/cfg/__snapshots__/cfg-snapshot.test.ts.snap @@ -107,14 +107,15 @@ exports[`U7 — AC1: ten-functions fixture CFG snapshot > matches the committed "startLine": 55, }, { - "blocks": 6, + "blocks": 7, "edges": [ - "0->4:seq", - "2->5:seq", + "0->5:seq", + "2->6:seq", "3->2:seq", - "4->2:seq", - "4->3:throw", - "5->1:seq", + "4->3:seq", + "5->2:seq", + "5->4:throw", + "6->1:seq", ], "entry": 0, "exit": 1, diff --git a/gitnexus/test/unit/cfg/harvest.test.ts b/gitnexus/test/unit/cfg/harvest.test.ts index 4a11791e0a..88ca809f34 100644 --- a/gitnexus/test/unit/cfg/harvest.test.ts +++ b/gitnexus/test/unit/cfg/harvest.test.ts @@ -219,18 +219,34 @@ describe('TS/JS def/use harvest — harvest sites beyond visitSeq', () => { expect(forOfHead.statements!.some((f) => f.uses.includes(bindingIdx(cfg, 'list')))).toBe(true); }); - it('catch param defines at statement index 0 of the handler entry block', () => { + it('catch param defines in its own facts-only block preceding the body', () => { const cfg = cfgOf(`function f() { try { risky(); } catch (e) { use(e); } }`); const e = bindingIdx(cfg, 'e'); expect(cfg.bindings![e].kind).toBe('catch'); - const handler = cfg.blocks.find((b) => b.text.includes('use(e)')); - const facts = handler!.statements!; - // def of e PRECEDES the body's use of e — index 0, so the in-order sweep - // gives `use(e)` a reaching def - expect([...facts[0].defs]).toEqual([e]); - 
expect(facts.findIndex((f) => f.uses.includes(e))).toBeGreaterThan(0); + // The param def gets a DEDICATED once-executed block in front of the body + // entry — NOT prepended into the body's entry block, which can be a loop + // header that would re-gen the def per iteration and falsely kill + // loop-carried redefinitions of the param. + const paramBlock = cfg.blocks.find( + (b) => b.text === '' && (b.statements ?? []).some((f) => f.defs.includes(e)), + ); + expect(paramBlock).toBeDefined(); + const body = cfg.blocks.find((b) => b.text.includes('use(e)'))!; + expect(cfg.edges.some((ed) => ed.from === paramBlock!.index && ed.to === body.index)).toBe( + true, + ); + }); + + it('catch body starting with a loop: param def does NOT re-gen on the loop header', () => { + const cfg = cfgOf(`function f(c) { + try { risky(); } catch (e) { while (c) { e = fix(e); } sink(e); } + }`); + const e = bindingIdx(cfg, 'e'); + const header = cfg.blocks.find((b) => b.text === '(c)' || b.text === 'c')!; + // the loop header carries NO def of e — only the dedicated param block does + expect((header.statements ?? 
[]).some((f) => f.defs.includes(e))).toBe(false); }); it('empty catch: param def lands on the synthetic handler block', () => { @@ -368,3 +384,38 @@ describe('TS/JS def/use harvest — serialization', () => { } }); }); + +describe('TS/JS def/use harvest — review-pass regressions (#2082)', () => { + it('class declarations harvest the name as a DEF (JS identifier and TS type_identifier)', () => { + const cfg = cfgOf(`function f() { + class A {} + return new A(); + }`); + const a = bindingIdx(cfg, 'A'); + expect(cfg.bindings![a].kind).toBe('class'); + const facts = allFacts(cfg); + expect(facts.some((fa) => fa.defs.includes(a))).toBe(true); + // the `new A()` use resolves to the same binding + expect(facts.some((fa) => fa.uses.includes(a))).toBe(true); + // and the declaration statement records NO bogus use of A + const declFact = facts.find((fa) => fa.defs.includes(a)); + expect(declFact!.uses).not.toContain(a); + }); + + it('write-then-read in one statement (assign-and-test idiom) forms the def→use fact', async () => { + const { computeReachingDefs } = + await import('../../../src/core/ingestion/cfg/reaching-defs.js'); + const cfg = cfgOf(`function f(re, s) { + let m = null; + if ((m = re.exec(s)) && m) { sink(m); } + }`); + const m = bindingIdx(cfg, 'm'); + const r = computeReachingDefs(cfg); + // the `m` read in the condition gets a fact from the SAME-statement + // assignment (write-then-read), not only from the dead `m = null` init + const condUses = r.facts.filter( + (fa) => fa.bindingIdx === m && fa.def.line === fa.use.line && fa.use.line === 3, + ); + expect(condUses.length).toBeGreaterThan(0); + }); +}); diff --git a/gitnexus/test/unit/cfg/reaching-defs.test.ts b/gitnexus/test/unit/cfg/reaching-defs.test.ts index 73014d76aa..27b6784b39 100644 --- a/gitnexus/test/unit/cfg/reaching-defs.test.ts +++ b/gitnexus/test/unit/cfg/reaching-defs.test.ts @@ -222,7 +222,12 @@ describe('computeReachingDefs — kill/gen fundamentals (hand-built)', () => { 
expect(render(r.facts).sort()).toEqual(['2:0->3:0:0', '3:1->3:2:0']); }); - it('def+use in one statement (x += 1): the use sees PRIOR defs, not its own', () => { + it('def+use in one statement: the use sees prior defs AND the same-statement def', () => { + // StatementFacts carries no intra-statement order, so `x += 1` + // (read-then-write) and `if ((m = f()) && m.p)` (write-then-read) are + // indistinguishable — the sweep emits BOTH the prior def and the + // same-statement self-def (sound over-approximation; missing the + // assign-and-test idiom's def→use would be a taint false negative). const cfg = mkCfg( [{}, {}, { stmts: [stmt(10, [0]), stmt(11, [0], [0])] }], [ @@ -232,7 +237,7 @@ describe('computeReachingDefs — kill/gen fundamentals (hand-built)', () => { ['x'], ); const r = computeReachingDefs(cfg); - expect(render(r.facts)).toEqual(['2:0->2:1:0']); + expect(render(r.facts).sort()).toEqual(['2:0->2:1:0', '2:1->2:1:0']); }); }); diff --git a/gitnexus/test/unit/cfg/typescript-visitor.test.ts b/gitnexus/test/unit/cfg/typescript-visitor.test.ts index 6ff251b989..2d49f5fd13 100644 --- a/gitnexus/test/unit/cfg/typescript-visitor.test.ts +++ b/gitnexus/test/unit/cfg/typescript-visitor.test.ts @@ -350,7 +350,14 @@ describe('TS/JS CfgVisitor — try/catch/finally (R10)', () => { it('non-empty catch is unchanged by the empty-catch synthesis (F2 regression guard)', () => { const cfg = cfgOf(`function f() { try { a(); } catch (e) { h(); } after(); }`); - expect(throwTargets(cfg).has(block(cfg, 'h();'))).toBe(true); + // The throw lands on the handler ENTRY — since M2 that is the catch-param + // binding block (a facts-only block in front of the body), which flows + // into the body. Assert the path, not block identity. 
+ const handlerEntries = [...throwTargets(cfg)]; + expect(handlerEntries.length).toBeGreaterThan(0); + for (const t of handlerEntries) { + expect(reaches(cfg, t, block(cfg, 'h();'))).toBe(true); + } expect(reaches(cfg, block(cfg, 'h();'), block(cfg, 'after();'))).toBe(true); }); diff --git a/gitnexus/test/unit/pdg-mode-flip.test.ts b/gitnexus/test/unit/pdg-mode-flip.test.ts index 652895cfa3..f258ecb8a0 100644 --- a/gitnexus/test/unit/pdg-mode-flip.test.ts +++ b/gitnexus/test/unit/pdg-mode-flip.test.ts @@ -61,7 +61,7 @@ describe('pdgModeMismatch — M1→M2 stamp upgrade (#2082 M2, pure)', () => { }); describe('detect_changes BasicBlock exclusion (#2082 U7)', () => { - it('the symbol-overlap filter (name IS NOT NULL) excludes exactly the BasicBlock rows', async () => { + it('the symbol-overlap id-prefix filter excludes exactly the BasicBlock rows', async () => { const repo = await setupMiniRepo(); try { const { runFullAnalysis } = await import('../../src/core/run-analyze.js'); @@ -72,22 +72,19 @@ describe('detect_changes BasicBlock exclusion (#2082 U7)', () => { const { lbugPath } = getStoragePaths(repo.dbPath); await adapter.initLbug(lbugPath); try { - // Counterfactual: WITHOUT the U7 name filter, line-bearing nameless - // rows exist on a pdg index (the noise detect_changes used to report). - const nameless = (await adapter.executeQuery( - `MATCH (n) WHERE n.name IS NULL + // Counterfactual: WITHOUT the U7 filter, line-bearing BasicBlock rows + // exist on a pdg index (the noise detect_changes used to report). + const blocks = (await adapter.executeQuery( + `MATCH (n) WHERE n.id STARTS WITH 'BasicBlock:' AND n.startLine IS NOT NULL AND n.endLine IS NOT NULL RETURN n.id AS id`, )) as Array<{ id: string }>; - expect(nameless.length).toBeGreaterThan(0); - // …and every one of them is a BasicBlock — the filter excludes exactly - // the substrate rows, never a real symbol. 
- for (const row of nameless) { - expect(String(row.id)).toMatch(/^BasicBlock:/); - } - // With the U7 filter (what detectChanges now runs): zero BasicBlocks. + expect(blocks.length).toBeGreaterThan(0); + // With the U7 filter (the exact predicate detectChanges now runs — + // also validates STARTS WITH against the real engine): no BasicBlocks, + // real symbols intact. const symbols = (await adapter.executeQuery( - `MATCH (n) WHERE n.name IS NOT NULL + `MATCH (n) WHERE NOT n.id STARTS WITH 'BasicBlock:' AND n.startLine IS NOT NULL AND n.endLine IS NOT NULL RETURN n.id AS id`, )) as Array<{ id: string }>; From e356803ac5fa746722692e9e9b590e0c62cf1d29 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Thu, 11 Jun 2026 04:23:23 +0000 Subject: [PATCH 10/19] test(run-analyze): model the M2 RepoMeta.pdg stamp in resolvePdgConfig defaults The DEFAULTS constant lacked the maxReachingDefEdgesPerFunction field that resolvePdgConfig resolves since the M2 stamp landed, failing two strict toEqual expectations (the CI 'tests' job failures). Models M2 steady-state equality; the M1-era-stamp upgrade path stays pinned in pdg-mode-flip.test.ts. Finding P1-4 of review 4471987625 (#2160). --- gitnexus/test/unit/run-analyze.test.ts | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/gitnexus/test/unit/run-analyze.test.ts b/gitnexus/test/unit/run-analyze.test.ts index ba80bde8fd..9790f337cb 100644 --- a/gitnexus/test/unit/run-analyze.test.ts +++ b/gitnexus/test/unit/run-analyze.test.ts @@ -330,7 +330,14 @@ describe('deriveEmbeddingCap', () => { }); describe('pdgModeMismatch / resolvePdgConfig (#2099 F1)', () => { - const DEFAULTS = { maxFunctionLines: 2000, maxEdgesPerFunction: 5000 }; + // M2 (#2082) added the resolved REACHING_DEF cap to the stamp; these tests + // model M2 STEADY-STATE equality. The M1-era-stamp (field absent) upgrade + // path is pinned in pdg-mode-flip.test.ts. 
+ const DEFAULTS = { + maxFunctionLines: 2000, + maxEdgesPerFunction: 5000, + maxReachingDefEdgesPerFunction: 4000, + }; it('resolvePdgConfig: pdg-off run resolves to undefined (the meta field is omitted)', async () => { const { resolvePdgConfig } = await import('../../src/core/run-analyze.js'); @@ -342,8 +349,13 @@ describe('pdgModeMismatch / resolvePdgConfig (#2099 F1)', () => { const { resolvePdgConfig } = await import('../../src/core/run-analyze.js'); expect(resolvePdgConfig({ pdg: true })).toEqual(DEFAULTS); expect( - resolvePdgConfig({ pdg: true, pdgMaxFunctionLines: 0, pdgMaxEdgesPerFunction: 0 }), - ).toEqual({ maxFunctionLines: 0, maxEdgesPerFunction: 0 }); + resolvePdgConfig({ + pdg: true, + pdgMaxFunctionLines: 0, + pdgMaxEdgesPerFunction: 0, + pdgMaxReachingDefEdgesPerFunction: 0, + }), + ).toEqual({ maxFunctionLines: 0, maxEdgesPerFunction: 0, maxReachingDefEdgesPerFunction: 0 }); }); it('legacy meta (no recorded stamp) + plain run → no mismatch', async () => { From a32958c871eb0a9897178630dd8e1db2881eec17 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Thu, 11 Jun 2026 04:23:54 +0000 Subject: [PATCH 11/19] =?UTF-8?q?test(cfg):=20reassign=20the=20shadowing?= =?UTF-8?q?=20fixture's=20bindings=20=E2=80=94=20fixes=20prefer-const=20CI?= =?UTF-8?q?=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both withShadowing let bindings now genuinely reassign (s = s + 1 per scope), clearing the two prefer-const errors that failed quality/lint. Plain const would change the binding kind the harvest test exercises; reassignment keeps the let semantics and enriches the reaching-defs facts the snapshot pins (snapshot + per-binding assertion updated accordingly). Finding P2-6 of review 4471987625 (#2160). 
--- .../cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap | 8 ++++++-- gitnexus/test/integration/cfg/fixtures/ten-functions.ts | 2 ++ .../test/integration/cfg/reaching-defs-snapshot.test.ts | 8 ++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap b/gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap index 22ae17cf5f..7ad8b7afb3 100644 --- a/gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap +++ b/gitnexus/test/integration/cfg/__snapshots__/reaching-defs-snapshot.test.ts.snap @@ -107,14 +107,18 @@ exports[`R5 — REACHING_DEF facts snapshot on the M1 fixture > matches the comm "uses": 5, }, { - "defs": 2, + "defs": 4, "facts": [ "2:0->4:0:s:116:6", "3:0->3:1:s:118:8", + "3:1->3:1:s:118:8", + "3:1->3:2:s:118:8", + "4:0->4:0:s:116:6", + "4:0->4:1:s:116:6", ], "startLine": 115, "status": "computed", - "uses": 4, + "uses": 6, }, ] `; diff --git a/gitnexus/test/integration/cfg/fixtures/ten-functions.ts b/gitnexus/test/integration/cfg/fixtures/ten-functions.ts index c799ff466a..f58bd9c8b8 100644 --- a/gitnexus/test/integration/cfg/fixtures/ten-functions.ts +++ b/gitnexus/test/integration/cfg/fixtures/ten-functions.ts @@ -116,8 +116,10 @@ export function withShadowing(): void { let s = 1; { let s = 2; + s = s + 1; use(s); } + s = s + 1; done2(s); } diff --git a/gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts b/gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts index 406f4fb735..aa0fb7f0cd 100644 --- a/gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts +++ b/gitnexus/test/integration/cfg/reaching-defs-snapshot.test.ts @@ -92,8 +92,12 @@ describe('R5 — REACHING_DEF facts snapshot on the M1 fixture', () => { for (const f of rs.facts) { factsByBinding.set(f.bindingIdx, (factsByBinding.get(f.bindingIdx) ?? 
0) + 1); } - // each s binding has exactly one use fact (no cross-kill, no cross-reach) + // each s binding forms its own facts (no cross-kill, no cross-reach): the + // inner block's reassign+use never references the outer binding and vice + // versa — both have facts, and every fact's def and use share the binding + // by construction of DefUseFact, so distinct counts per binding prove the + // bindings never conflated. const sIdxs = rs.bindings.map((b, i) => (b.name === 's' ? i : -1)).filter((i) => i >= 0); - for (const idx of sIdxs) expect(factsByBinding.get(idx)).toBe(1); + for (const idx of sIdxs) expect(factsByBinding.get(idx) ?? 0).toBeGreaterThanOrEqual(2); }); }); From a271f255e261995bc4a8a93313af2874dfc15499 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Thu, 11 Jun 2026 04:25:01 +0000 Subject: [PATCH 12/19] fix(cfg): validate entry/exit indices in the emit-safety guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A corrupted side-channel element with an out-of-range entryIndex passed isEmitSafeCfg and threw inside the reaching-defs RPO walk — caught by the per-FILE try/catch, costing every sibling function's REACHING_DEF projection instead of the one element (and logging a misleading message). entry/exit join the guard's id-anchor checks. Finding P3 (entryIndex) of review 4471987625 (#2160). 
--- gitnexus/src/core/ingestion/cfg/emit.ts | 13 ++++++++++++ gitnexus/test/unit/cfg/emit-guard.test.ts | 24 +++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/gitnexus/src/core/ingestion/cfg/emit.ts b/gitnexus/src/core/ingestion/cfg/emit.ts index 14befc2a31..8962c9595c 100644 --- a/gitnexus/src/core/ingestion/cfg/emit.ts +++ b/gitnexus/src/core/ingestion/cfg/emit.ts @@ -107,6 +107,19 @@ export const isEmitSafeCfg = (cfg: FunctionCfg | undefined | null): cfg is Funct if (cfg.blocks[i]?.index !== i) return false; } const n = cfg.blocks.length; + // entry/exit must land on real blocks — the solver feeds entryIndex straight + // into its RPO walk, where an out-of-range index throws and (worse than this + // one element) costs the whole FILE's REACHING_DEF pass (tri-review P3). + if ( + !Number.isInteger(cfg.entryIndex) || + cfg.entryIndex < 0 || + cfg.entryIndex >= n || + !Number.isInteger(cfg.exitIndex) || + cfg.exitIndex < 0 || + cfg.exitIndex >= n + ) { + return false; + } return cfg.edges.every( (e) => Number.isInteger(e?.from) && diff --git a/gitnexus/test/unit/cfg/emit-guard.test.ts b/gitnexus/test/unit/cfg/emit-guard.test.ts index a117e2730d..2a141ae936 100644 --- a/gitnexus/test/unit/cfg/emit-guard.test.ts +++ b/gitnexus/test/unit/cfg/emit-guard.test.ts @@ -313,3 +313,27 @@ describe('#2082 M2 — statement-fact emit guard (isEmitSafeCfg extension)', () expect(warns()).toHaveLength(0); }); }); + +describe('#2160 review — entry/exit index validation', () => { + it('an out-of-range entryIndex is rejected per element (would crash the solver mid-file)', () => { + const bad = { ...validCfg, entryIndex: 99 }; + const logs = _captureLogger(); + try { + const graph = emitWith([bad, validCfg]); + // the malformed element is skipped; the valid sibling still emits + let cfgEdges = 0; + graph.forEachRelationship((r) => { + if (r.type === 'CFG') cfgEdges++; + }); + expect(cfgEdges).toBeGreaterThan(0); + expect( + logs + .records() + .filter((r) => r.level 
>= 40) + .some((r) => String(r.msg).includes('malformed')), + ).toBe(true); + } finally { + logs.restore(); + } + }); +}); From 04da731f1d4f2648025d8df51e8523d6a9e758c3 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Thu, 11 Jun 2026 04:26:22 +0000 Subject: [PATCH 13/19] fix(cfg): report the def-key stride bail-out as a distinct 'overflow' status The STMT_STRIDE aliasing guard reused status 'truncated', so the emit warn misnamed it as the fact-materialization limit (printing an unrelated maxFacts value, including '(0)' when unlimited) and telemetry conflated the two. A distinct 'overflow' status gets its own warn naming the actual cause; the function's CFG layer is explicitly unaffected. Finding P3 (stride-bail diagnosis) of review 4471987625 (#2160). --- gitnexus/src/core/ingestion/cfg/emit.ts | 9 +++++++++ .../src/core/ingestion/cfg/reaching-defs.ts | 11 ++++++++--- gitnexus/test/unit/cfg/reaching-defs.test.ts | 18 ++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/gitnexus/src/core/ingestion/cfg/emit.ts b/gitnexus/src/core/ingestion/cfg/emit.ts index 8962c9595c..6e642a51f6 100644 --- a/gitnexus/src/core/ingestion/cfg/emit.ts +++ b/gitnexus/src/core/ingestion/cfg/emit.ts @@ -322,6 +322,15 @@ export function emitFileReachingDefs( `limit (${maxFacts}) reached — facts beyond it were not computed; ` + `the persisted REACHING_DEF projection for this function is sparse`, ); + } else if (r.status === 'overflow') { + result.truncatedFunctions++; + onWarn?.( + `[reaching-defs] ${filePath}:${functionStartLine}: a basic block exceeds ` + + `the def-key stride (≥2^21 coalesced statements — minified/generated ` + + `code) — REACHING_DEF skipped for this function (computing any facts ` + + `would risk wrong-block aliasing); its CFG is unaffected`, + ); + continue; } // Dedup to (defBlock, useBlock, binding) — facts arrive sorted, so the diff --git a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts 
b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts index 0255103e39..7bafa9609e 100644 --- a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts +++ b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts @@ -63,8 +63,13 @@ export interface FunctionDefUse { * `no-facts` — the CFG carries no statement facts (hand-built or pre-M2 * side channel); empty facts, NOT an error. * `truncated` — `limits.maxFacts` hit; `facts` is a deterministic prefix. + * `overflow` — a block's statement count breaches the def-key stride; no + * facts at all (computing any would risk key aliasing — + * wrong-block facts are strictly worse than none). Distinct + * from `truncated` so the caller's diagnostic doesn't + * misname it as the fact-materialization limit. */ - readonly status: 'computed' | 'no-facts' | 'truncated'; + readonly status: 'computed' | 'no-facts' | 'truncated' | 'overflow'; /** Pass-through of the CFG's binding table (empty for `no-facts`). */ readonly bindings: readonly BindingEntry[]; /** Sorted by (def block, def stmt, use block, use stmt, binding). */ @@ -109,10 +114,10 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit // Key-aliasing guard (see STMT_STRIDE): a block with ≥ STRIDE statements // cannot be keyed without aliasing into the next block's def sites, which // would fabricate wrong-block facts — strictly worse than producing none. - // Bail to a sound empty `truncated` result (the emit path warns). + // Bail to a sound empty `overflow` result (the emit path warns distinctly). for (const b of blocks) { if ((b.statements?.length ?? 
0) >= STMT_STRIDE) { - return { status: 'truncated', bindings: cfg.bindings, facts: [], defCount: 0, useCount: 0 }; + return { status: 'overflow', bindings: cfg.bindings, facts: [], defCount: 0, useCount: 0 }; } } diff --git a/gitnexus/test/unit/cfg/reaching-defs.test.ts b/gitnexus/test/unit/cfg/reaching-defs.test.ts index 27b6784b39..68ba62a3b9 100644 --- a/gitnexus/test/unit/cfg/reaching-defs.test.ts +++ b/gitnexus/test/unit/cfg/reaching-defs.test.ts @@ -420,3 +420,21 @@ describe('computeReachingDefs — parser-direct acceptance (with U1/U2)', () => expect(new Set(retUse.map((f) => f.def.line))).toEqual(new Set([2, 3])); }); }); + +describe('computeReachingDefs — tri-review soundness fixes (#2160 review)', () => { + it('a block with ≥ STMT_STRIDE statements reports overflow with zero facts (no aliasing)', () => { + const shared = { line: 1, defs: [], uses: [] }; + const huge = new Array(1 << 21).fill(shared); + const cfg = mkCfg( + [{}, {}, { stmts: huge as StatementFacts[] }], + [ + [0, 2], + [2, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + expect(r.status).toBe('overflow'); + expect(r.facts).toEqual([]); + }); +}); From e8fa2435b9c74dd9e6c86dfa206d5a551ffdd986 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Thu, 11 Jun 2026 04:26:57 +0000 Subject: [PATCH 14/19] perf(cfg): cache the nearest enclosing scope per node during the prescan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit resolve() walked the AST parent chain per identifier — O(expression nesting depth), quadratic on deeply-chained single-statement expressions in generated code (not caught by any bench scenario, which scale blocks/bindings, not expression depth). The prescan already visits every node once, so caching its innermost scope makes phase-2 resolution O(scope-chain). Behavior-identical; the parent-chain walk survives as fallback for prescan-unvisited nodes. Finding P2 (resolve depth walk) of review 4471987625 (#2160). 
--- .../cfg/visitors/typescript-harvest.ts | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts index f15c35f32c..70e5bd67dc 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts @@ -97,6 +97,14 @@ export class TsHarvester { /** name → synthetic binding index (implicit global / import / captured). */ private readonly synthetic = new Map(); private readonly fnId: number; + /** + * Innermost enclosing scope per visited node id, filled during the prescan + * (which already touches every named node once). Makes phase-2 resolution + * O(scope-chain) instead of O(AST-depth) per identifier — a deeply-chained + * single-statement expression (generated code) otherwise turns the + * parent-chain walk quadratic (tri-review perf finding). + */ + private readonly nearestScopeCache = new Map(); constructor(private readonly fnNode: SyntaxNode) { this.fnId = fnNode.id; @@ -213,6 +221,7 @@ export class TsHarvester { } private prescan(node: SyntaxNode, scope: Scope): void { + this.nearestScopeCache.set(node.id, scope); const t = node.type; if (NESTED_FUNCTION_TYPES.has(t) && node.id !== this.fnId) { // A nested function's NAME binds in the enclosing scope; its body is opaque. @@ -335,16 +344,29 @@ export class TsHarvester { private resolve(nameNode: SyntaxNode): number { const name = nameNode.text; - for (let p: SyntaxNode | null = nameNode; p; p = p.parent) { - const scope = this.scopeByNode.get(p.id); - if (scope) { - for (let s: Scope | null = scope; s; s = s.parent) { - const idx = s.table.get(name); - if (idx !== undefined) return idx; + // Fast path: the prescan cached every visited node's innermost scope, so + // resolution walks the SCOPE chain (shallow), not the AST parent chain + // (arbitrarily deep in chained expressions). 
The parent-chain walk remains + // as fallback for the few nodes the prescan never visits (e.g. a nested + // function declaration's own name node). + const cached = this.nearestScopeCache.get(nameNode.id); + let startScope: Scope | null = cached ?? null; + if (!startScope) { + for (let p: SyntaxNode | null = nameNode; p; p = p.parent) { + const scope = this.scopeByNode.get(p.id) ?? this.nearestScopeCache.get(p.id); + if (scope) { + startScope = scope; + break; + } + if (p.id === this.fnId) { + startScope = this.root; + break; } - break; // reached the root scope without a hit } - if (p.id === this.fnId) break; + } + for (let s: Scope | null = startScope; s; s = s.parent) { + const idx = s.table.get(name); + if (idx !== undefined) return idx; } // No in-function declaration — synthetic module-level binding, shared by // defs and uses so `notDeclared = 1; use(notDeclared)` still forms a fact. From a951459cdcaac9392cbda44ae18bedc70b01353e Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Thu, 11 Jun 2026 04:27:30 +0000 Subject: [PATCH 15/19] fix(cfg): stop harvesting initializer-less var declarators as defs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A bare `var x;` mid-function is hoisted and writes nothing at runtime, but the harvester recorded a def — fabricating a kill of the live def in the same block: `x = source(); var x; sink(x)` lost the source→sink fact (a reaching-defs false negative). Defs now require an initializer for variable_declaration declarators; let/const genuinely initialize and keep their def. Finding P2-5 of review 4471987625 (#2160). 
--- .../core/ingestion/cfg/visitors/typescript-harvest.ts | 8 +++++++- gitnexus/test/unit/cfg/harvest.test.ts | 11 +++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts index 70e5bd67dc..ff8e669ffd 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts @@ -413,7 +413,13 @@ export class TsHarvester { if (d?.type !== 'variable_declarator') continue; const name = d.childForFieldName('name'); const value = d.childForFieldName('value'); - if (name) this.walkDefPattern(name, acc); + // A bare `var x;` mid-function is hoisted and writes NOTHING at + // runtime — harvesting it as a def would fabricate a kill of the + // live def (`x = source(); var x; sink(x)` must keep source→sink; + // tri-review P2). `let`/`const` declarators genuinely initialize. + if (name && (value || t === 'lexical_declaration')) { + this.walkDefPattern(name, acc); + } if (value) this.walkValue(value, acc); } return; diff --git a/gitnexus/test/unit/cfg/harvest.test.ts b/gitnexus/test/unit/cfg/harvest.test.ts index 88ca809f34..b25596ac6e 100644 --- a/gitnexus/test/unit/cfg/harvest.test.ts +++ b/gitnexus/test/unit/cfg/harvest.test.ts @@ -419,3 +419,14 @@ describe('TS/JS def/use harvest — review-pass regressions (#2082)', () => { expect(condUses.length).toBeGreaterThan(0); }); }); + +describe('TS/JS def/use harvest — tri-review harvest fixes (#2160 review)', () => { + it('bare `var x;` is a runtime no-op — no def fact (initialized var still defs)', () => { + const cfg = cfgOf(`function f() { x = source(); var x; var y = 1; sink(x, y); }`); + const x = bindingIdx(cfg, 'x'); + const y = bindingIdx(cfg, 'y'); + const defFacts = allFacts(cfg).filter((s) => s.defs.includes(x)); + expect(defFacts).toHaveLength(1); // only the assignment, never the bare declarator + 
expect(allFacts(cfg).some((s) => s.defs.includes(y))).toBe(true); + }); +}); From 3d8102a4c9c53e8bc8162a391e0e0327bf702e40 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Thu, 11 Jun 2026 04:28:10 +0000 Subject: [PATCH 16/19] fix(cfg): unwrap parenthesized/non-null lvalue wrappers before def detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `(x) += 1` and `(x)++` gated the def on the node type being exactly 'identifier', so the parenthesized form fell to the uses-only branch — the def (and its kill) silently vanished. Wrappers that don't change the lvalue (parenthesized_expression, TS non_null_expression) now unwrap at all three lvalue sites. Finding P3 (parenthesized lvalues) of review 4471987625 (#2160). --- .../cfg/visitors/typescript-harvest.ts | 20 ++++++++++++++++--- gitnexus/test/unit/cfg/harvest.test.ts | 7 +++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts index ff8e669ffd..aef286daea 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts @@ -387,6 +387,17 @@ export class TsHarvester { acc.addUse(this.resolve(nameNode)); } + /** Strip wrappers that don't change the lvalue (`(x) += 1`, `x! ++`). */ + private unwrapLvalue(node: SyntaxNode): SyntaxNode { + let n = node; + while (n.type === 'parenthesized_expression' || n.type === 'non_null_expression') { + const inner = n.namedChild(0); + if (!inner) break; + n = inner; + } + return n; + } + /** Value-position walk: collect uses; route def positions to the pattern walk. 
*/ private walkValue(node: SyntaxNode, acc: FactAccumulator): void { const t = node.type; @@ -426,13 +437,15 @@ export class TsHarvester { case 'assignment_expression': { const left = node.childForFieldName('left'); const right = node.childForFieldName('right'); - if (left) this.walkDefPattern(left, acc); + if (left) this.walkDefPattern(this.unwrapLvalue(left), acc); if (right) this.walkValue(right, acc); return; } case 'augmented_assignment_expression': { // `x += y` both defines and uses x. - const left = node.childForFieldName('left'); + const left = node.childForFieldName('left') + ? this.unwrapLvalue(node.childForFieldName('left') as SyntaxNode) + : null; const right = node.childForFieldName('right'); if (left?.type === 'identifier') { this.def(left, acc); @@ -444,7 +457,8 @@ export class TsHarvester { return; } case 'update_expression': { - const arg = node.childForFieldName('argument'); + const rawArg = node.childForFieldName('argument'); + const arg = rawArg ? this.unwrapLvalue(rawArg) : null; if (arg?.type === 'identifier') { this.def(arg, acc); this.use(arg, acc); diff --git a/gitnexus/test/unit/cfg/harvest.test.ts b/gitnexus/test/unit/cfg/harvest.test.ts index b25596ac6e..ecacaac2da 100644 --- a/gitnexus/test/unit/cfg/harvest.test.ts +++ b/gitnexus/test/unit/cfg/harvest.test.ts @@ -429,4 +429,11 @@ describe('TS/JS def/use harvest — tri-review harvest fixes (#2160 review)', () expect(defFacts).toHaveLength(1); // only the assignment, never the bare declarator expect(allFacts(cfg).some((s) => s.defs.includes(y))).toBe(true); }); + + it('parenthesized lvalues unwrap: `(x) += 1` and `(x)++` def+use x', () => { + const cfg = cfgOf(`function f(x) { (x) += 1; (x)++; }`); + const x = bindingIdx(cfg, 'x'); + const withDef = allFacts(cfg).filter((s) => s.defs.includes(x)); + expect(withDef.length).toBeGreaterThanOrEqual(2); + }); }); From a67a566d6c3d4ff4a34347a820084d6a7e56c40f Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Thu, 11 Jun 2026 04:29:57 +0000 
Subject: [PATCH 17/19] =?UTF-8?q?fix(cfg):=20conditionally-evaluated=20def?= =?UTF-8?q?s=20are=20MAY-defs=20=E2=80=94=20gen=20without=20kill?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A def inside a short-circuit right operand, ternary arm, logical assignment, or switch case test was harvested as a must-def; the solver's total kill then erased the prior def on the not-taken path — a taint false negative on core idioms (`if (a && (x = clean())) {} sink(x)` lost source→sink; `cached ?? (cached = load())` likewise). StatementFacts gains an optional mayDefs field (conditional-context tracking in the harvester); the solver's per-block GEN carries {set, kills} so a may-def UNIONS into the binding's set instead of replacing it, in both the transfer and the statement sweep; the emit fact-guard validates mayDefs indices; switch case tests harvest via the conditional path. Finding P1-1 of review 4471987625 (#2160). --- gitnexus/src/core/ingestion/cfg/emit.ts | 3 + .../src/core/ingestion/cfg/reaching-defs.ts | 88 +++++++++++++---- gitnexus/src/core/ingestion/cfg/types.ts | 9 ++ .../cfg/visitors/typescript-harvest.ts | 98 +++++++++++++++++-- .../core/ingestion/cfg/visitors/typescript.ts | 7 +- gitnexus/test/unit/cfg/harvest.test.ts | 56 ++++++++++- gitnexus/test/unit/cfg/reaching-defs.test.ts | 37 +++++++ 7 files changed, 268 insertions(+), 30 deletions(-) diff --git a/gitnexus/src/core/ingestion/cfg/emit.ts b/gitnexus/src/core/ingestion/cfg/emit.ts index 6e642a51f6..3b1a1e50d0 100644 --- a/gitnexus/src/core/ingestion/cfg/emit.ts +++ b/gitnexus/src/core/ingestion/cfg/emit.ts @@ -168,6 +168,9 @@ export const hasEmitSafeFacts = (cfg: FunctionCfg): boolean => { return false; } if (!s.defs.every(inRange) || !s.uses.every(inRange)) return false; + if (s.mayDefs !== undefined) { + if (!Array.isArray(s.mayDefs) || !s.mayDefs.every(inRange)) return false; + } } } return true; diff --git a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts 
b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts index 7bafa9609e..8cc8a1dee6 100644 --- a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts +++ b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts @@ -16,13 +16,15 @@ * * COMPLEXITY DISCIPLINE (the four-times-repeated repo bug shape is per-item * re-derivation inside the loop): def-sets are SHARED BY REFERENCE, never - * deep-copied — RD's kill is total per binding, so a transfer either aliases - * the incoming set or replaces it with a fresh singleton. Single-predecessor - * blocks alias the predecessor's OUT map outright; multi-pred merges union - * only bindings whose incoming sets differ by reference. Iteration is reverse - * post-order, seeded with every block (unreachable blocks keep ⊥ IN — correct, - * their defs reach nothing). Convergence: sets grow monotonically within the - * finite def-site universe ⇒ ≤ loop-depth+1 passes in practice. + * deep-copied — a MUST def's kill is total per binding, so a transfer either + * aliases the incoming set or replaces it; a MAY def (conditional context — + * see StatementFacts.mayDefs) unions WITHOUT killing via a copy-on-extend. + * Single-predecessor blocks alias the predecessor's OUT map outright; + * multi-pred merges union only bindings whose incoming sets differ by + * reference. Iteration is reverse post-order, seeded with every block + * (unreachable blocks keep ⊥ IN — correct, their defs reach nothing). + * Convergence: sets grow monotonically within the finite def-site universe ⇒ + * ≤ loop-depth+1 passes in practice. 
* * `limits.maxFacts` bounds materialization: facts are O(defs×uses) BY SPEC in * merge-heavy code (N branch-arm defs × N later uses = N² facts), and a @@ -149,26 +151,40 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit } for (const list of succs) list.sort((a, b) => a - b); - // ── per-block GEN (last def per binding) + def/use telemetry ──────────── - // gen[b]: bindingIdx → singleton def-set of the block's LAST def of it. - const gen: (Map | null)[] = new Array(n).fill(null); + // ── per-block GEN + def/use telemetry ──────────────────────────────────── + // gen[b]: bindingIdx → { set, kills }. A MUST def resets the accumulated + // set (kill is total); a MAY def (conditionally-evaluated context — see + // StatementFacts.mayDefs) only ADDS: the binding's incoming defs survive, + // so the transfer is out[x] = kills ? set : in[x] ∪ set. + interface GenEntry { + set: DefSet; + kills: boolean; + } + const gen: (Map | null)[] = new Array(n).fill(null); const defLine = new Map(); // defKey → source line let defCount = 0; let useCount = 0; for (const b of blocks) { const stmts = b.statements; if (!stmts || stmts.length === 0) continue; - let g: Map | null = null; + let g: Map | null = null; for (let i = 0; i < stmts.length; i++) { const s = stmts[i]; useCount += s.uses.length; - for (const d of s.defs) { + const key = defKey(b.index, i); + const record = (d: number, kills: boolean): void => { defCount += 1; - const key = defKey(b.index, i); defLine.set(key, s.line); if (!g) g = new Map(); - g.set(d, new Set([key])); // later defs overwrite — kill is total - } + const entry = g.get(d); + if (kills || !entry) { + g.set(d, { set: new Set([key]), kills: kills || (entry?.kills ?? 
false) }); + } else { + entry.set.add(key); // may-def accumulates; never clears + } + }; + if (s.mayDefs) for (const d of s.mayDefs) record(d, false); + for (const d of s.defs) record(d, true); } gen[b.index] = g; } @@ -199,14 +215,22 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit inSets[b] = inB; const g = gen[b]; - // OUT = overlay(IN) replacing only GEN'd bindings with singletons. When + // OUT = overlay(IN): a KILLING gen entry replaces the binding's set; a + // may-def-only entry unions with the incoming set (never kills). When // nothing is genned, OUT aliases IN outright. let outB: Lattice; if (!g) { outB = inB; } else { outB = new Map(inB); // copies REFERENCES, never set contents - for (const [bindingIdx, set] of g) outB.set(bindingIdx, set); + for (const [bindingIdx, entry] of g) { + if (entry.kills) { + outB.set(bindingIdx, entry.set); + } else { + const incoming = inB.get(bindingIdx); + outB.set(bindingIdx, incoming ? unionSets(incoming, entry.set) : entry.set); + } + } } const requeue = (s: number): void => { @@ -243,8 +267,9 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit // defs ∪ the same-statement def. Sound over-approximation — the extra // self-fact on compound assignments is harmless; missing the // assign-and-test def→use (the most common JS idiom) would be a taint - // false negative. - const sameStmtDefs = s.defs.length > 0 ? new Set(s.defs) : null; + // false negative. May-defs join the self-key set the same way. + const sameStmtDefs = + s.defs.length > 0 || s.mayDefs?.length ? new Set([...s.defs, ...(s.mayDefs ?? [])]) : null; for (const u of s.uses) { const reaching = (reach ?? inSets[b.index]).get(u); const selfKey = sameStmtDefs?.has(u) ? 
defKey(b.index, i) : undefined; @@ -267,6 +292,15 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit }); } } + if (s.mayDefs?.length) { + // Gen WITHOUT kill: the conditional def joins the binding's set. + if (!reach) reach = new Map(inSets[b.index]); + const key = defKey(b.index, i); + for (const d of s.mayDefs) { + const prior = reach.get(d); + reach.set(d, prior ? unionSets(prior, new Set([key])) : new Set([key])); + } + } if (s.defs.length > 0) { if (!reach) reach = new Map(inSets[b.index]); for (const d of s.defs) reach.set(d, new Set([defKey(b.index, i)])); // kill + gen @@ -357,6 +391,22 @@ function mergePreds( return merged; } +/** Order-stable union of two def-sets (shares `a` when `b` adds nothing). */ +function unionSets(a: DefSet, b: DefSet): DefSet { + let target = a; + let copied = false; + for (const key of b) { + if (!target.has(key)) { + if (!copied) { + target = new Set(a); + copied = true; + } + target.add(key); + } + } + return target; +} + /** Per-binding equality with a reference fast path (sets only ever grow). */ function latticeEquals(a: Lattice, b: Lattice): boolean { if (a === b) return true; diff --git a/gitnexus/src/core/ingestion/cfg/types.ts b/gitnexus/src/core/ingestion/cfg/types.ts index bf6e47a538..ed789ec30c 100644 --- a/gitnexus/src/core/ingestion/cfg/types.ts +++ b/gitnexus/src/core/ingestion/cfg/types.ts @@ -49,11 +49,20 @@ export interface BindingEntry { * lists its binding in BOTH. Self-describing — `line` is carried here, never * inferred from the block's text fragments (facts-only records exist, e.g. * params on ENTRY and catch params). + * + * `mayDefs` (tri-review P1): defs harvested inside CONDITIONALLY-EVALUATED + * subexpressions — short-circuit right operands (`a && (x = v)`, + * `c ?? (c = load())`), ternary arms, logical-assignment operators, and + * switch case-test expressions. 
The solver treats them as GEN WITHOUT KILL: + * treating them as must-defs would falsely kill the prior def on the + * not-taken path (a taint false negative on core JS idioms). Optional — + * absent means none. */ export interface StatementFacts { readonly line: number; readonly defs: readonly number[]; readonly uses: readonly number[]; + readonly mayDefs?: readonly number[]; } /** A basic block: a maximal straight-line run of statements between leaders. */ diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts index aef286daea..81823c97a9 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript-harvest.ts @@ -105,6 +105,13 @@ export class TsHarvester { * parent-chain walk quadratic (tri-review perf finding). */ private readonly nearestScopeCache = new Map(); + /** + * >0 while walking a conditionally-evaluated subexpression (short-circuit + * right operand, ternary arm, logical-assignment target, case test). Defs + * found there are MAY-defs — gen without kill (tri-review P1: a must-def + * here falsely kills the prior def on the not-taken path). + */ + private conditionalDepth = 0; constructor(private readonly fnNode: SyntaxNode) { this.fnId = fnNode.id; @@ -302,6 +309,17 @@ export class TsHarvester { return acc.finish(); } + /** + * Facts for an expression whose WHOLE evaluation is conditional (switch + * case tests, which only run when earlier cases didn't match) — every def + * inside becomes a may-def. + */ + factsConditional(node: SyntaxNode): StatementFacts { + const acc = new FactAccumulator(node.startPosition.row + 1); + this.conditional(() => this.walkValue(node, acc)); + return acc.finish(); + } + /** Facts for a `for (left in/of right)` head: left binds/assigns, right is used. 
*/ forInHeadFacts(stmt: SyntaxNode): StatementFacts { const acc = new FactAccumulator(stmt.startPosition.row + 1); @@ -380,11 +398,18 @@ export class TsHarvester { } private def(nameNode: SyntaxNode, acc: FactAccumulator): void { - acc.addDef(this.resolve(nameNode)); + if (this.conditionalDepth > 0) acc.addMayDef(this.resolve(nameNode)); + else acc.addDef(this.resolve(nameNode)); } - private use(nameNode: SyntaxNode, acc: FactAccumulator): void { - acc.addUse(this.resolve(nameNode)); + /** Run `fn` with defs demoted to may-defs (conditionally-evaluated context). */ + private conditional(fn: () => void): void { + this.conditionalDepth++; + try { + fn(); + } finally { + this.conditionalDepth--; + } } /** Strip wrappers that don't change the lvalue (`(x) += 1`, `x! ++`). */ @@ -398,6 +423,10 @@ export class TsHarvester { return n; } + private use(nameNode: SyntaxNode, acc: FactAccumulator): void { + acc.addUse(this.resolve(nameNode)); + } + /** Value-position walk: collect uses; route def positions to the pattern walk. */ private walkValue(node: SyntaxNode, acc: FactAccumulator): void { const t = node.type; @@ -442,18 +471,27 @@ export class TsHarvester { return; } case 'augmented_assignment_expression': { - // `x += y` both defines and uses x. + // `x += y` both defines and uses x. The logical-assignment operators + // (`||=`, `&&=`, `??=`) only WRITE conditionally — their def is a + // may-def (the read always happens). const left = node.childForFieldName('left') ? this.unwrapLvalue(node.childForFieldName('left') as SyntaxNode) : null; const right = node.childForFieldName('right'); + const op = node.childForFieldName('operator')?.type ?? 
''; + const logical = op === '||=' || op === '&&=' || op === '??='; if (left?.type === 'identifier') { - this.def(left, acc); + if (logical) this.conditional(() => this.def(left, acc)); + else this.def(left, acc); this.use(left, acc); } else if (left) { this.walkValue(left, acc); // member/subscript target — uses only } - if (right) this.walkValue(right, acc); + // The RHS of a logical assignment is itself conditionally evaluated. + if (right) { + if (logical) this.conditional(() => this.walkValue(right, acc)); + else this.walkValue(right, acc); + } return; } case 'update_expression': { @@ -467,6 +505,34 @@ export class TsHarvester { } return; } + case 'binary_expression': { + // Short-circuit operators evaluate their RIGHT operand conditionally: + // a def inside it (`a && (x = clean())`, `c ?? (c = load())`) must be + // a may-def or the not-taken path's prior def is falsely killed + // (tri-review P1). Other binary operators evaluate both sides. + const left = node.childForFieldName('left'); + const right = node.childForFieldName('right'); + const op = node.childForFieldName('operator')?.type ?? ''; + if (left) this.walkValue(left, acc); + if (right) { + if (op === '&&' || op === '||' || op === '??') { + this.conditional(() => this.walkValue(right, acc)); + } else { + this.walkValue(right, acc); + } + } + return; + } + case 'ternary_expression': { + // Each arm is conditionally evaluated — defs inside are may-defs. 
+ const cond = node.childForFieldName('condition'); + const consequence = node.childForFieldName('consequence'); + const alternative = node.childForFieldName('alternative'); + if (cond) this.walkValue(cond, acc); + if (consequence) this.conditional(() => this.walkValue(consequence, acc)); + if (alternative) this.conditional(() => this.walkValue(alternative, acc)); + return; + } case 'class_declaration': { // The class NAME is a def (prescan declared the binding) — without // this case the default walk would record it as a bogus USE in plain @@ -543,8 +609,10 @@ export class TsHarvester { class FactAccumulator { private readonly defs: number[] = []; private readonly uses: number[] = []; + private readonly mayDefs: number[] = []; private readonly defSeen = new Set(); private readonly useSeen = new Set(); + private readonly mayDefSeen = new Set(); constructor(private readonly line: number) {} @@ -554,6 +622,13 @@ class FactAccumulator { this.defs.push(idx); } + /** A def that may not execute (conditional context) — gen without kill. */ + addMayDef(idx: number): void { + if (this.mayDefSeen.has(idx)) return; + this.mayDefSeen.add(idx); + this.mayDefs.push(idx); + } + addUse(idx: number): void { if (this.useSeen.has(idx)) return; this.useSeen.add(idx); @@ -561,7 +636,7 @@ class FactAccumulator { } defCount(): number { - return this.defs.length; + return this.defs.length + this.mayDefs.length; } useCount(): number { @@ -569,6 +644,13 @@ class FactAccumulator { } finish(): StatementFacts { - return { line: this.line, defs: this.defs, uses: this.uses }; + return { + line: this.line, + defs: this.defs, + uses: this.uses, + // Optional field stays absent when empty — keeps the serialized + // side-channel payload lean (most statements have no may-defs). + ...(this.mayDefs.length > 0 ? 
{ mayDefs: this.mayDefs } : {}), + }; } } diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts index be14fa78cc..288e06daec 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts @@ -528,10 +528,13 @@ class TsCfgWalk { // `case x:` test expressions live in no block (caseStatements filters the // value node out) — harvest their uses onto the dispatch block, one record // per case in source order (a sound over-approximation of JS's in-order - // case evaluation). + // case evaluation). Conditionally: a later case test only evaluates when + // earlier cases didn't match, so any def inside one is a may-def — as a + // must-def on the always-executed dispatch block it would falsely kill + // prior defs for earlier-matching arms (tri-review). for (const c of cases) { const caseValue = c.childForFieldName('value'); - if (caseValue) this.builder.attachFacts(dispatch, this.harvest.facts(caseValue)); + if (caseValue) this.builder.attachFacts(dispatch, this.harvest.factsConditional(caseValue)); } const caseResults = cases.map((c) => this.visitSeq(this.caseStatements(c))); diff --git a/gitnexus/test/unit/cfg/harvest.test.ts b/gitnexus/test/unit/cfg/harvest.test.ts index ecacaac2da..6fccf95bb1 100644 --- a/gitnexus/test/unit/cfg/harvest.test.ts +++ b/gitnexus/test/unit/cfg/harvest.test.ts @@ -420,7 +420,61 @@ describe('TS/JS def/use harvest — review-pass regressions (#2082)', () => { }); }); -describe('TS/JS def/use harvest — tri-review harvest fixes (#2160 review)', () => { +describe('TS/JS def/use harvest — conditional contexts are MAY-defs (tri-review P1)', () => { + it('short-circuit RHS def lands in mayDefs, not defs', () => { + const cfg = cfgOf(`function f(a) { let x = source(); if (a && (x = clean())) {} sink(x); }`); + const x = bindingIdx(cfg, 'x'); + const cond = cfg.blocks.find((b) => b.text.includes('a && (x = 
clean())'))!; + const fact = cond.statements!.find((s) => (s.mayDefs ?? []).includes(x)); + expect(fact).toBeDefined(); + expect(fact!.defs).not.toContain(x); + }); + + it('nullish lazy-init (`c ?? (c = load())`) and ternary-arm defs are may-defs', () => { + const cfg = cfgOf(`function f(c, k) { + const v = c ?? (c = load()); + const w = k ? (c = a()) : b(); + use(v, w, c); + }`); + const c = bindingIdx(cfg, 'c'); + const all = allFacts(cfg); + expect(all.filter((s) => (s.mayDefs ?? []).includes(c))).toHaveLength(2); + // the only MUST def of c is its ENTRY param record — neither conditional + // assignment is a must-def + const mustDefs = all.filter((s) => s.defs.includes(c)); + expect(mustDefs).toHaveLength(1); + expect(mustDefs[0].line).toBe(1); // the param record + }); + + it('switch case-test defs are may-defs on the dispatch block', () => { + const cfg = cfgOf(`function f(v) { + let y = taint(); + switch (v) { + case probe(): sinkA(y); break; + case (y = 1): sinkB(); break; + } + }`); + const y = bindingIdx(cfg, 'y'); + const dispatch = cfg.blocks.find((b) => b.text === '(v)')!; + expect(dispatch.statements!.some((s) => (s.mayDefs ?? []).includes(y))).toBe(true); + expect(dispatch.statements!.some((s) => s.defs.includes(y))).toBe(false); + }); + + it('logical-assignment operators (`x ||= v`) write conditionally — may-def, but the read is a use', () => { + const cfg = cfgOf(`function f(x) { x ||= fallback(); use(x); }`); + const x = bindingIdx(cfg, 'x'); + const stmt = allFacts(cfg).find((s) => (s.mayDefs ?? 
[]).includes(x)); + expect(stmt).toBeDefined(); + expect(stmt!.defs).not.toContain(x); + expect(stmt!.uses).toContain(x); + }); + + it('plain compound assignment (`x += 1`) stays a MUST def', () => { + const cfg = cfgOf(`function f(x) { x += 1; }`); + const x = bindingIdx(cfg, 'x'); + expect(allFacts(cfg).some((s) => s.defs.includes(x))).toBe(true); + }); + it('bare `var x;` is a runtime no-op — no def fact (initialized var still defs)', () => { const cfg = cfgOf(`function f() { x = source(); var x; var y = 1; sink(x, y); }`); const x = bindingIdx(cfg, 'x'); diff --git a/gitnexus/test/unit/cfg/reaching-defs.test.ts b/gitnexus/test/unit/cfg/reaching-defs.test.ts index 68ba62a3b9..4c153783fc 100644 --- a/gitnexus/test/unit/cfg/reaching-defs.test.ts +++ b/gitnexus/test/unit/cfg/reaching-defs.test.ts @@ -422,6 +422,43 @@ describe('computeReachingDefs — parser-direct acceptance (with U1/U2)', () => }); describe('computeReachingDefs — tri-review soundness fixes (#2160 review)', () => { + it('may-def gen does NOT kill: prior def survives a conditional assignment (hand-built)', () => { + // block 2: def x. block 3: stmt with MAY-def of x. block 4: use x. 
+ const cfg = mkCfg( + [ + {}, + {}, + { stmts: [stmt(10, [0])] }, + { stmts: [{ line: 20, defs: [], uses: [], mayDefs: [0] }] }, + { stmts: [stmt(30, [], [0])] }, + ], + [ + [0, 2], + [2, 3], + [3, 4], + [4, 1], + ], + ['x'], + ); + const r = computeReachingDefs(cfg); + // BOTH the unconditional def and the conditional one reach the use + expect(render(r.facts).sort()).toEqual(['2:0->4:0:0', '3:0->4:0:0']); + }); + + it('short-circuit conditional def: the not-taken path keeps the prior def (parser-direct, P1)', () => { + const cfg = cfgOf(`function f(a) { + let x = source(); + if (a && (x = clean())) {} + sink(x); + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const sinkUses = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 4); + // BOTH source (line 2) and clean (line 3) reach sink — pre-fix, source was + // falsely killed (taint false negative on the lazy-init idiom) + expect(new Set(sinkUses.map((f) => f.def.line))).toEqual(new Set([2, 3])); + }); + it('a block with ≥ STMT_STRIDE statements reports overflow with zero facts (no aliasing)', () => { const shared = { line: 1, defs: [], uses: [] }; const huge = new Array(1 << 21).fill(shared); From 142624729bf8fa030f65f1b6c016d5051f346a41 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Thu, 11 Jun 2026 04:30:32 +0000 Subject: [PATCH 18/19] =?UTF-8?q?fix(cfg):=20model=20labeled=20statements?= =?UTF-8?q?=20generically=20=E2=80=94=20break=20keeps=20its=20real=20conti?= =?UTF-8?q?nuation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A break to a label the visitor didn't model (labeled non-loop block, the OUTER label of a doubly-labeled construct) routed to EXIT, REMOVING the only path that kept the pre-jump def live — a reaching-defs false kill the in-code comment wrongly called sound. 
Loop/switch frames now carry their full label LIST (`outer: inner: for` resolves both); a labeled non-loop statement gets a break-target frame whose target is a synthesized join after the body; an unlabeled break never matches a block frame; labels compose with finalizer threading (a labeled break crossing a finally still threads it). Finding P1-2 of review 4471987625 (#2160). --- .../ingestion/cfg/control-flow-context.ts | 46 ++++++++--- .../core/ingestion/cfg/visitors/typescript.ts | 76 +++++++++++-------- gitnexus/test/unit/cfg/reaching-defs.test.ts | 25 ++++++ .../test/unit/cfg/typescript-visitor.test.ts | 63 +++++++++++++++ 4 files changed, 169 insertions(+), 41 deletions(-) diff --git a/gitnexus/src/core/ingestion/cfg/control-flow-context.ts b/gitnexus/src/core/ingestion/cfg/control-flow-context.ts index 8b1aefc588..6b9bf6c1c2 100644 --- a/gitnexus/src/core/ingestion/cfg/control-flow-context.ts +++ b/gitnexus/src/core/ingestion/cfg/control-flow-context.ts @@ -26,14 +26,28 @@ interface LoopFrame { readonly continueTo: number; /** Block a `break` jumps to (the loop exit / join). */ readonly breakTo: number; - readonly label?: string; + /** All labels naming this construct (`outer: inner: for` carries both). */ + readonly labels: readonly string[]; } interface SwitchFrame { readonly kind: 'switch'; /** Block a `break` jumps to (after the switch). `continue` is invalid here. */ readonly breakTo: number; - readonly label?: string; + readonly labels: readonly string[]; +} + +/** + * A labeled NON-loop statement (`blk: { … break blk; … }`) — break-to-label + * targets the synthesized join after the body (tri-review P1: routing such a + * break to EXIT removed the real continuation and falsely killed every def + * live at the jump for post-construct uses). Matched ONLY by a labeled break + * naming it; unlabeled breaks and continues skip it. 
+ */ +interface BlockFrame { + readonly kind: 'block'; + readonly breakTo: number; + readonly labels: readonly string[]; } /** A `finally` whose body any crossing jump must route through. */ @@ -49,7 +63,8 @@ export interface FinalizerFrame { readonly pending: { to: number; kind: CfgEdgeKind }[]; } -type Frame = LoopFrame | SwitchFrame | FinalizerFrame; +type Frame = LoopFrame | SwitchFrame | BlockFrame | FinalizerFrame; +type TargetFrame = LoopFrame | SwitchFrame | BlockFrame; /** A resolved jump: its ultimate target + the finallys it crosses (inner→outer). */ export interface JumpResolution { @@ -60,12 +75,17 @@ export interface JumpResolution { export class ControlFlowContext { private readonly stack: Frame[] = []; - pushLoop(continueTo: number, breakTo: number, label?: string): void { - this.stack.push({ kind: 'loop', continueTo, breakTo, label }); + pushLoop(continueTo: number, breakTo: number, labels: readonly string[] = []): void { + this.stack.push({ kind: 'loop', continueTo, breakTo, labels }); + } + + pushSwitch(breakTo: number, labels: readonly string[] = []): void { + this.stack.push({ kind: 'switch', breakTo, labels }); } - pushSwitch(breakTo: number, label?: string): void { - this.stack.push({ kind: 'switch', breakTo, label }); + /** Push a labeled non-loop statement's break-target frame. */ + pushLabeledBlock(breakTo: number, labels: readonly string[]): void { + this.stack.push({ kind: 'block', breakTo, labels }); } /** @@ -91,13 +111,17 @@ export class ControlFlowContext { * threads nothing. */ resolveBreak(label?: string): JumpResolution | undefined { - return this.resolve((f) => label === undefined || f.label === label); + return this.resolve((f) => + label === undefined + ? f.kind !== 'block' // an unlabeled break never targets a labeled block + : f.labels.includes(label), + ); } /** Resolve a `continue`: like {@link resolveBreak} but only loop frames match. 
*/ resolveContinue(label?: string): JumpResolution | undefined { return this.resolve( - (f) => f.kind === 'loop' && (label === undefined || f.label === label), + (f) => f.kind === 'loop' && (label === undefined || f.labels.includes(label)), (f) => (f as LoopFrame).continueTo, ); } @@ -128,8 +152,8 @@ export class ControlFlowContext { } private resolve( - matches: (f: LoopFrame | SwitchFrame) => boolean, - targetOf: (f: LoopFrame | SwitchFrame) => number = (f) => f.breakTo, + matches: (f: TargetFrame) => boolean, + targetOf: (f: TargetFrame) => number = (f) => f.breakTo, ): JumpResolution | undefined { const crossed: FinalizerFrame[] = []; for (let i = this.stack.length - 1; i >= 0; i--) { diff --git a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts index 288e06daec..1382fdb668 100644 --- a/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts +++ b/gitnexus/src/core/ingestion/cfg/visitors/typescript.ts @@ -30,16 +30,18 @@ * finallys chain inner→outer; finally-as-shared-join conflates exit paths * (sound over-approximation; duplication-per-exit-path was rejected). An * empty/comment-only finally pushes no frame — jumps keep direct edges. - * - labeled `break`/`continue` resolve against the labeled loop's frame. + * - labeled `break`/`continue` resolve against the labeled construct's frame: + * loops/switches carry their full label LIST (`outer: inner: for` resolves + * both), and a labeled NON-loop statement (`blk: { … break blk; … }`) gets + * a break-target frame whose target is a synthesized join after the body — + * the M1 route-to-EXIT fallback removed the real continuation and falsely + * killed defs for reaching-defs (tri-review P1). * * Known limitations: - * - A `break`/`continue` to a label on a non-loop/non-switch block, and the - * OUTER label of a doubly-labeled construct (`outer: inner: for (...)`), are - * not modeled. 
The jump is conservatively routed to the function EXIT (a - * sound over-approximation that keeps the graph single-exit — see visitBreak) - * rather than left as a dangling sink, and threads no finallys (target - * unknown ⇒ crossed set unknown). Single-labeled loops/switches resolve - * correctly, including across finallys. + * - A jump whose label STILL fails to resolve (malformed source) keeps the + * conservative route-to-EXIT + thread-all-finallys fallback in + * visitBreak/visitContinue — single-exit preserved, no finally bypassed, + * but the continuation path is approximate. * - Exceptional flow stays the sound over-approximation: EVERY protected-region * block edges to the handler (an exception may fire mid-block), which * over-supplies reaching-defs facts into `catch` — extra facts, never false @@ -120,8 +122,8 @@ class TsCfgWalk { private readonly cfc = new ControlFlowContext(); /** Stack of exception-handler entry blocks (catch/finally) a `throw` jumps to. */ private readonly handlers: number[] = []; - /** Label awaiting the loop/switch it immediately precedes (labeled_statement). */ - private pendingLabel: string | undefined; + /** Labels awaiting the construct they precede (`outer: inner: for` = both). */ + private pendingLabels: string[] = []; constructor( private readonly builder: CfgBuilder, @@ -308,14 +310,28 @@ class TsCfgWalk { private visitLabeled(stmt: SyntaxNode): SeqResult { const body = stmt.childForFieldName('body') ?? stmt.namedChildren[stmt.namedChildren.length - 1]; - if (body && LOOP_OR_SWITCH_TYPES.has(body.type)) { - this.pendingLabel = this.labelOf(stmt); + const label = this.labelOf(stmt); + if (body && (LOOP_OR_SWITCH_TYPES.has(body.type) || body.type === 'labeled_statement')) { + // Loop/switch consumes the accumulated labels via takeLabels(); a nested + // labeled_statement keeps accumulating (`outer: inner: for` → both + // labels land on the loop frame). 
+ if (label) this.pendingLabels.push(label); const res = this.visitStmt(body); - this.pendingLabel = undefined; // clear even if the construct didn't consume it + this.pendingLabels = []; // clear leftovers if the construct didn't consume return res; } - // Labeled non-loop blocks (break-to-block-label) are not modeled in M1. - return this.visitBody(body); + // Labeled NON-loop statement (`blk: { … break blk; … }`): break-to-label + // targets a synthesized join after the body. Routing it to EXIT instead + // (the M1 behavior) removed the real continuation and falsely killed + // every def live at the jump for post-construct uses (tri-review P1). + const labels = [...this.pendingLabels, ...(label ? [label] : [])]; + this.pendingLabels = []; + const join = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); + this.cfc.pushLabeledBlock(join, labels); + const res = this.visitBody(body); + this.cfc.pop(); + if (res) this.builder.connect(res.exits, join, 'seq'); + return { entry: res?.entry ?? join, exits: [join] }; } private visitIf(stmt: SyntaxNode): TraversalResult { @@ -365,7 +381,7 @@ class TsCfgWalk { } private visitWhile(stmt: SyntaxNode): TraversalResult { - const label = this.takeLabel(); + const labels = this.takeLabels(); const cond = stmt.childForFieldName('condition') ?? stmt; const header = this.builder.newBlock( startLineOf(stmt), @@ -376,7 +392,7 @@ class TsCfgWalk { ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); - this.cfc.pushLoop(header, loopExit, label); + this.cfc.pushLoop(header, loopExit, labels); const body = this.visitBody(this.bodyBlockOf(stmt)); this.cfc.pop(); @@ -391,7 +407,7 @@ class TsCfgWalk { } private visitDoWhile(stmt: SyntaxNode): TraversalResult { - const label = this.takeLabel(); + const labels = this.takeLabels(); const cond = stmt.childForFieldName('condition') ?? 
stmt; const condBlock = this.builder.newBlock( startLineOf(cond), @@ -402,7 +418,7 @@ class TsCfgWalk { ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); - this.cfc.pushLoop(condBlock, loopExit, label); + this.cfc.pushLoop(condBlock, loopExit, labels); const body = this.visitBody(this.bodyBlockOf(stmt)); this.cfc.pop(); @@ -414,7 +430,7 @@ class TsCfgWalk { } private visitFor(stmt: SyntaxNode): TraversalResult { - const label = this.takeLabel(); + const labels = this.takeLabels(); const init = stmt.childForFieldName('initializer'); const cond = stmt.childForFieldName('condition'); const incr = stmt.childForFieldName('increment'); @@ -440,7 +456,7 @@ class TsCfgWalk { this.builder.edge(incrBlock, header, 'loop-back'); } - this.cfc.pushLoop(incrBlock, loopExit, label); + this.cfc.pushLoop(incrBlock, loopExit, labels); const body = this.visitBody(this.bodyBlockOf(stmt)); this.cfc.pop(); @@ -475,7 +491,7 @@ class TsCfgWalk { } private visitForIn(stmt: SyntaxNode): TraversalResult { - const label = this.takeLabel(); + const labels = this.takeLabels(); // Header text is SYNTHESIZED, so facts come from the left/right AST nodes // directly (the loop variable is a def, the iterated expression a use). const header = this.builder.newBlock( @@ -487,7 +503,7 @@ class TsCfgWalk { ); const loopExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); - this.cfc.pushLoop(header, loopExit, label); + this.cfc.pushLoop(header, loopExit, labels); const body = this.visitBody(this.bodyBlockOf(stmt)); this.cfc.pop(); @@ -508,7 +524,7 @@ class TsCfgWalk { } private visitSwitch(stmt: SyntaxNode): TraversalResult { - const label = this.takeLabel(); + const labels = this.takeLabels(); const value = stmt.childForFieldName('value') ?? 
stmt; const dispatch = this.builder.newBlock( startLineOf(stmt), @@ -519,7 +535,7 @@ class TsCfgWalk { ); const switchExit = this.builder.newBlock(endLineOf(stmt), endLineOf(stmt), ''); - this.cfc.pushSwitch(switchExit, label); + this.cfc.pushSwitch(switchExit, labels); const body = stmt.childForFieldName('body'); const cases = body ? body.namedChildren.filter((c) => c.type === 'switch_case' || c.type === 'switch_default') @@ -692,11 +708,11 @@ class TsCfgWalk { return this.handlers.length ? this.handlers[this.handlers.length - 1] : this.builder.exitIndex; } - /** Consume the label awaiting the loop/switch this call is building. */ - private takeLabel(): string | undefined { - const label = this.pendingLabel; - this.pendingLabel = undefined; - return label; + /** Consume the labels awaiting the loop/switch this call is building. */ + private takeLabels(): string[] { + const labels = this.pendingLabels; + this.pendingLabels = []; + return labels; } private labelOf(stmt: SyntaxNode): string | undefined { diff --git a/gitnexus/test/unit/cfg/reaching-defs.test.ts b/gitnexus/test/unit/cfg/reaching-defs.test.ts index 4c153783fc..c9be3ff830 100644 --- a/gitnexus/test/unit/cfg/reaching-defs.test.ts +++ b/gitnexus/test/unit/cfg/reaching-defs.test.ts @@ -459,6 +459,31 @@ describe('computeReachingDefs — tri-review soundness fixes (#2160 review)', () expect(new Set(sinkUses.map((f) => f.def.line))).toEqual(new Set([2, 3])); }); + it('labeled non-loop block: break keeps the real continuation (parser-direct, P1)', () => { + const cfg = cfgOf(`function f(c) { + let x = 1; + blk: { if (c) break blk; x = 2; } + sink(x); + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const sinkUses = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 4); + // the break path preserves x=1; the fall-through path redefines to x=2 + expect(new Set(sinkUses.map((f) => f.def.line))).toEqual(new Set([2, 3])); + }); + + it('doubly-labeled loop: `break outer` resolves 
to the loop exit, keeping post-loop facts (P1)', () => { + const cfg = cfgOf(`function f(c) { + let x = 1; + outer: inner: do { if (c) break outer; x = 2; } while (g()); + sink(x); + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const sinkUses = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 4); + expect(new Set(sinkUses.map((f) => f.def.line))).toEqual(new Set([2, 3])); + }); + it('a block with ≥ STMT_STRIDE statements reports overflow with zero facts (no aliasing)', () => { const shared = { line: 1, defs: [], uses: [] }; const huge = new Array(1 << 21).fill(shared); diff --git a/gitnexus/test/unit/cfg/typescript-visitor.test.ts b/gitnexus/test/unit/cfg/typescript-visitor.test.ts index 2d49f5fd13..7dc7711a53 100644 --- a/gitnexus/test/unit/cfg/typescript-visitor.test.ts +++ b/gitnexus/test/unit/cfg/typescript-visitor.test.ts @@ -645,3 +645,66 @@ describe('TS/JS CfgVisitor — early exits through finally (#2082 M2 U2)', () => } }); }); + +describe('TS/JS CfgVisitor — labeled statements modeled generically (#2160 review)', () => { + it('break to a labeled non-loop block targets the synthesized join, not EXIT', () => { + const cfg = cfgOf(`function f(c) { + let x = 1; + blk: { if (c) { break blk; } x = 2; } + sink(x); + }`); + const brk = block(cfg, 'break blk'); + const sink = block(cfg, 'sink(x)'); + const brkEdges = cfg.edges.filter((e) => e.from === brk && e.kind === 'break'); + expect(brkEdges).toHaveLength(1); + expect(brkEdges[0].to).not.toBe(cfg.exitIndex); + // the break's target flows into the post-construct continuation + expect(reaches(cfg, brkEdges[0].to, sink)).toBe(true); + }); + + it('doubly-labeled loop: break to the OUTER label resolves to the loop exit', () => { + const cfg = cfgOf(`function f(c) { + outer: inner: do { if (c) { break outer; } work(); } while (g()); + done(); + }`); + const brk = block(cfg, 'break outer'); + const done = block(cfg, 'done()'); + const brkEdges = cfg.edges.filter((e) => e.from 
=== brk && e.kind === 'break'); + expect(brkEdges).toHaveLength(1); + expect(brkEdges[0].to).not.toBe(cfg.exitIndex); + expect(reaches(cfg, brkEdges[0].to, done)).toBe(true); + }); + + it('labeled break crossing a finally still threads it (labels + finalizers compose)', () => { + const cfg = cfgOf(`function f(c) { + blk: { + try { if (c) { break blk; } } finally { f1(); } + rest(); + } + after(); + }`); + const brk = block(cfg, 'break blk'); + const fin = block(cfg, 'f1()'); + expect(cfg.edges).toContainEqual({ from: brk, to: fin, kind: 'break' }); + const completions = cfg.edges.filter((e) => e.from === fin && e.kind === 'finally-break'); + expect(completions).toHaveLength(1); + // the completion resumes at the block's join → after() reachable, rest() skipped on that path + expect(reaches(cfg, completions[0].to, block(cfg, 'after()'))).toBe(true); + expect(completions[0].to).not.toBe(block(cfg, 'rest()')); + }); + + it('an unlabeled break inside a labeled block still targets the enclosing loop', () => { + const cfg = cfgOf(`function f(xs) { + for (const x of xs) { + blk: { if (x) { break; } } + body(); + } + done(); + }`); + const brk = block(cfg, 'break'); + const brkEdges = cfg.edges.filter((e) => e.from === brk && e.kind === 'break'); + expect(brkEdges).toHaveLength(1); + // targets the LOOP exit (reaches done() without re-entering body()) + expect(reaches(cfg, brkEdges[0].to, block(cfg, 'done()'))).toBe(true); + }); +}); From 98f19c3c4cd7526960cd3c9ac9a0f5618383b487 Mon Sep 17 00:00:00 2001 From: Gergo Magyar Date: Thu, 11 Jun 2026 04:31:00 +0000 Subject: [PATCH 19/19] fix(cfg): throw edges deliver ALL of a block's defs to the handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The throw contribution was IN ∪ OUT — entry and final states only. 
The intermediate defs of a multi-def coalesced block were invisible to the handler, though they are exactly what the catch observes when a later statement throws: `try { x = parse(a); x = normalize(x); } catch { sink(x) }` lost the parse→sink fact (normalize throwing delivers parse's value). Throw predecessors now contribute IN(from) ∪ allDefs(from) — a static per-block all-def-sites map — which subsumes OUT; monotone and deterministic. Finding P1-3 of review 4471987625 (#2160). --- .../src/core/ingestion/cfg/reaching-defs.ts | 47 +++++++++++++++---- gitnexus/test/unit/cfg/reaching-defs.test.ts | 18 +++++++ 2 files changed, 55 insertions(+), 10 deletions(-) diff --git a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts index 8cc8a1dee6..14c7b3745e 100644 --- a/gitnexus/src/core/ingestion/cfg/reaching-defs.ts +++ b/gitnexus/src/core/ingestion/cfg/reaching-defs.ts @@ -124,12 +124,13 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit } // ── adjacency (sorted for deterministic merges) ───────────────────────── - // A `throw` edge contributes IN(from) ∪ OUT(from) to its handler, not just - // OUT: an exception can fire BEFORE the faulting block's defs complete, so - // OUT-only would falsely kill the pre-block defs on the exceptional path — - // `let x = seed(); try { x = risky(); } catch { sink(x) }` must let the - // seed def reach the sink (risky() may throw before assigning). Sound - // over-approximation; monotone, so the fixpoint absorbs it. + // A `throw` edge contributes IN(from) ∪ allDefs(from) to its handler, not + // OUT: an exception can fire BEFORE the block's defs complete (the seed def + // in `let x = seed(); try { x = risky(); } catch { sink(x) }` must reach the + // sink) AND between any two defs of a multi-def coalesced block (the parse + // def in `x = parse(a); x = normalize(x);` is live exactly when normalize + // throws — OUT's last-def-wins misses it). 
Sound over-approximation; + // monotone, so the fixpoint absorbs it. See mergePreds. const preds: { from: number; viaThrow: boolean }[][] = Array.from({ length: n }, () => []); const succs: number[][] = Array.from({ length: n }, () => []); // Handlers whose IN depends on this block's IN (throw edges) — requeued on @@ -161,6 +162,13 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit kills: boolean; } const gen: (Map | null)[] = new Array(n).fill(null); + // allDefsGen[b]: bindingIdx → EVERY def-site key in the block (must + may). + // This is what a throw edge delivers to its handler: an exception can fire + // between any two statements, so every intermediate def may be the live one + // at the handler — IN∪OUT alone misses defs overwritten later in the same + // coalesced block (`try { x = parse(a); x = normalize(x); } catch { sink(x) }` + // — parse's value is exactly what sink sees when normalize throws). + const allDefsGen: (Lattice | null)[] = new Array(n).fill(null); const defLine = new Map(); // defKey → source line let defCount = 0; let useCount = 0; @@ -168,6 +176,7 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit const stmts = b.statements; if (!stmts || stmts.length === 0) continue; let g: Map | null = null; + let all: Lattice | null = null; for (let i = 0; i < stmts.length; i++) { const s = stmts[i]; useCount += s.uses.length; @@ -182,11 +191,16 @@ export function computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit } else { entry.set.add(key); // may-def accumulates; never clears } + if (!all) all = new Map(); + const allSet = all.get(d); + if (allSet) allSet.add(key); + else all.set(d, new Set([key])); }; if (s.mayDefs) for (const d of s.mayDefs) record(d, false); for (const d of s.defs) record(d, true); } gen[b.index] = g; + allDefsGen[b.index] = all; } // ── iteration order: RPO over reachable blocks, then the rest by index ── @@ -210,7 +224,7 @@ export function 
computeReachingDefs(cfg: FunctionCfg, limits?: ReachingDefsLimit ? EMPTY_LATTICE : p.length === 1 && !p[0].viaThrow ? outSets[p[0].from] // alias — zero allocation on straight-line chains - : mergePreds(p, inSets, outSets); + : mergePreds(p, inSets, outSets, allDefsGen); const inChanged = !latticeEquals(inSets[b], inB); inSets[b] = inB; @@ -354,11 +368,19 @@ function reversePostOrder(entry: number, succs: readonly number[][], n: number): return order; } -/** Union predecessor lattices (OUT; plus IN for throw edges), sharing sets. */ +/** + * Union predecessor lattices, sharing sets where possible. A normal edge + * contributes OUT(from). A THROW edge contributes IN(from) ∪ allDefs(from): + * an exception may fire before, between, or after any of the block's defs, so + * the handler can observe the incoming state OR any intermediate def — OUT + * alone (last-def-wins) misses defs overwritten later in the same block. + * IN ∪ allDefs ⊇ OUT, so the throw contribution subsumes it. + */ function mergePreds( preds: readonly { from: number; viaThrow: boolean }[], inSets: readonly Lattice[], outSets: readonly Lattice[], + allDefsGen: readonly (Lattice | null)[], ): Lattice { const merged: Lattice = new Map(); const mergeOne = (source: Lattice): void => { @@ -385,8 +407,13 @@ function mergePreds( } }; for (const p of preds) { - mergeOne(outSets[p.from]); - if (p.viaThrow) mergeOne(inSets[p.from]); // exception may fire pre-defs + if (p.viaThrow) { + mergeOne(inSets[p.from]); // exception may fire pre-defs… + const all = allDefsGen[p.from]; + if (all) mergeOne(all); // …or after ANY of the block's defs + } else { + mergeOne(outSets[p.from]); + } } return merged; } diff --git a/gitnexus/test/unit/cfg/reaching-defs.test.ts b/gitnexus/test/unit/cfg/reaching-defs.test.ts index c9be3ff830..01dc598edc 100644 --- a/gitnexus/test/unit/cfg/reaching-defs.test.ts +++ b/gitnexus/test/unit/cfg/reaching-defs.test.ts @@ -484,6 +484,24 @@ describe('computeReachingDefs — tri-review soundness 
fixes (#2160 review)', () expect(new Set(sinkUses.map((f) => f.def.line))).toEqual(new Set([2, 3])); }); + it('throw edges deliver INTERMEDIATE defs of a coalesced block to the handler (parser-direct, P1)', () => { + const cfg = cfgOf(`function f(a) { + let x = seed(a); + try { + x = parse(a); + x = normalize(x); + } catch (e) { + sink(x); + } + }`); + const [x] = nameIdx(cfg, 'x'); + const r = computeReachingDefs(cfg); + const sinkUses = r.facts.filter((f) => f.bindingIdx === x && f.use.line === 7); + // seed (pre-try), parse (intermediate — normalize may throw with parse's + // value live), and normalize (its own RHS use may throw) all reach sink + expect(new Set(sinkUses.map((f) => f.def.line))).toEqual(new Set([2, 4, 5])); + }); + it('a block with ≥ STMT_STRIDE statements reports overflow with zero facts (no aliasing)', () => { const shared = { line: 1, defs: [], uses: [] }; const huge = new Array(1 << 21).fill(shared);