diff --git a/gitnexus/bench/scope-capture/baselines.json b/gitnexus/bench/scope-capture/baselines.json index 96fa1430d1..e65799f803 100644 --- a/gitnexus/bench/scope-capture/baselines.json +++ b/gitnexus/bench/scope-capture/baselines.json @@ -16,11 +16,11 @@ "_added": "#1956: c added to the scope-capture bench (was UNBENCHED). C has no inheritance \u2014 flat scale source. Adding it exposed + fixed a pre-existing O(n^2) findNodeAtRange root-walk in c/captures.ts (threaded c.node, byte-identical over c-* fixtures); scaling 3.475 -> 0.96." }, "cpp": { - "fingerprint": "931bf7af55dc1480d1a5d3c479ea3803003a6a2e2c4406447bd96f3e312e88de", + "fingerprint": "e21e05c92870b82468b5d73f04d205b6aafad4143331cf718131f0517ba34e0a", "scaling_budget": 1.5, "_added": "#1956: cpp added to the scope-capture bench (was UNBENCHED). Heritage-bearing scale source (: public Base, public Mixin) drives emitCppInheritanceCaptures at scale. Adding it exposed + fixed a pre-existing O(n^2) findNodeAtRange root-walk in cpp/captures.ts (~12 sites, threaded c.node, byte-identical over 263 cpp-* fixtures); scaling 2.30 -> 1.12.", "_rebaselined": "#1965 / #1923 F4: uninitialized non-leading multi-declarators now emit @declaration.variable captures; cpp-adl-inner-callable-outer-noncallable data::Pair a, b adds the legitimate fixture drift. Linear (~1.06).", - "_note": "#1975: + cpp-out-of-line-class fixture (out-of-line struct Outer::Inner / Other::Inner). Pure fixture-corpus drift — the fix is the legacy structure-query qualified_identifier arm, NOT the cpp scope-extractor; existing fixtures' captures byte-identical. fixture_count 263->265." + "_note": "#1975: + cpp-out-of-line-class fixture, fixture_count 263->265. #1990: + cpp-adl-ns-plus-hidden-friend-same-name fixture (ADL hidden-friend + namespace-callable merge parity test). Pure fixture-corpus drift — no scope-extractor change; existing fixtures' captures byte-identical. fixture_count 265->267." }, "csharp": { "_rebaselined": "#1956 synth-widening: + csharp-qualified-base fixture; the synth now walks record_declaration + struct_declaration base_lists and handles alias_qualified_name (matching the #1940 legacy leg), so record/struct heritage now emits. csharp-record-base gains a record inherits capture. (record->record SAME-namespace EXTENDS is a separate registry resolution gap, tracked as follow-up.) Linear (~1.00). (Earlier #1956: heritage-bearing scale source.)", diff --git a/gitnexus/src/core/ingestion/languages/cpp/adl.ts b/gitnexus/src/core/ingestion/languages/cpp/adl.ts index 9c23c73cc1..565c231254 100644 --- a/gitnexus/src/core/ingestion/languages/cpp/adl.ts +++ b/gitnexus/src/core/ingestion/languages/cpp/adl.ts @@ -49,13 +49,18 @@ * * ## State lifecycle * - * Three module-level maps populated per pipeline invocation, cleared via - * `clearCppAdlState()` (called from `clearFileLocalNames`): + * Five pieces of module-level state populated per pipeline invocation, all + * reset together by `clearCppAdlState()` (called from + * `cppScopeResolver.loadResolutionConfig`, alongside `clearFileLocalNames` — + * NOT from `clearFileLocalNames` itself), grouped by when they fill: * * - `argInfoBySite` — per-call-site argument shape (capture-time) * - `noAdlSites` — call sites with parenthesized function (capture-time) * - `classToNamespaceQualifiedName` — class def → its enclosing namespace * qualified name (`populateCppAssociatedNamespaces` time) + * - `adlIndex` / `adlIndexSource` — the lazily-built candidate index and the + * `parsedFiles` reference it was built from (first-`pickCppAdlCandidates` + * time; see `ensureAdlIndex`) * * The class→namespace map uses qualified names (not scope IDs) because * C++ namespaces are open: `namespace N { ... }` in file A and @@ -103,10 +108,260 @@ const argInfoBySite = new Map(); const noAdlSites = new Set(); const classToNamespaceQualifiedName = new Map(); +/** + * ADL candidate index — built **once** per pipeline run from + * `(scopes, parsedFiles)` and reused by every call site. + * + * The legacy `pickCppAdlCandidates` re-scanned all parsed files (rebuilding a + * per-file `scopesById` map each time), all workspace defs (for the + * class-by-simple-name lookup), and used an O(scopes²) child-scope walk for + * hidden friends — once **per unresolved call site**. With hundreds of + * thousands of unresolved C++ sites that made the scope-resolution emit phase + * super-linear (observed ~6.7h on a large repo). This index moves all of that + * work to a single pass; per-site cost drops to O(associated namespaces). + */ +export interface AdlCandidateIndex { + /** simple name → class-like defs (Class/Struct/Interface/Enum), preserving + * `scopes.defs.byId` iteration order so first-match / ambiguous semantics + * match the legacy linear scan. */ + readonly classDefsBySimple: Map; + /** namespace QName → simple name → callable defs owned by that namespace, + * with inline-namespace transparency (inline-ns defs are also registered + * under the parent namespace's QName). */ + readonly nsCandidates: Map>; + /** associated-class enclosing-namespace QName → simple name → hidden-friend + * and class-member callable defs. */ + readonly friendCandidates: Map>; + /** namespace QName (own) → simple name → Function/Method defs, for the + * qualified function-reference ADL path. */ + readonly nsFunctionsByQName: Map>; + /** simple name → Function/Method defs across all namespaces, for the + * unqualified function-reference ADL path. */ + readonly nsFunctionsBySimple: Map; + /** nodeId → visitation sequence number, used to merge per-namespace buckets + * back into the exact legacy candidate order (file-major; namespace defs + * before friend/member defs within a file). */ + readonly seqByNodeId: Map; +} + +let adlIndex: AdlCandidateIndex | undefined; +let adlIndexSource: readonly ParsedFile[] | undefined; + function siteKey(filePath: string, line: number, col: number): string { return `${filePath}:${line}:${col}`; } +/** Last segment of a dotted qualified name (matches legacy inline expression). */ +function adlSimpleName(def: SymbolDefinition): string { + return def.qualifiedName?.split('.').pop() ?? def.qualifiedName ?? ''; +} + +function isAdlCallableType(type: string): boolean { + return type === 'Function' || type === 'Method' || type === 'Constructor'; +} + +function pushNested( + map: Map>, + outerKey: string, + innerKey: string, + def: SymbolDefinition, +): void { + let inner = map.get(outerKey); + if (inner === undefined) { + inner = new Map(); + map.set(outerKey, inner); + } + let arr = inner.get(innerKey); + if (arr === undefined) { + arr = []; + inner.set(innerKey, arr); + } + arr.push(def); +} + +function pushFlat(map: Map, key: string, def: SymbolDefinition): void { + let arr = map.get(key); + if (arr === undefined) { + arr = []; + map.set(key, arr); + } + arr.push(def); +} + +/** Build the ADL candidate index in a single pass over the workspace. The + * visitation order (file-major; per file, all namespace scopes before all + * class scopes; ownedDefs in declaration order) mirrors the legacy push + * order so `seqByNodeId` reconstructs identical candidate ordering. */ +function buildAdlIndex( + scopes: ScopeResolutionIndexes, + parsedFiles: readonly ParsedFile[], +): AdlCandidateIndex { + const idx: AdlCandidateIndex = { + classDefsBySimple: new Map(), + nsCandidates: new Map(), + friendCandidates: new Map(), + nsFunctionsByQName: new Map(), + nsFunctionsBySimple: new Map(), + seqByNodeId: new Map(), + }; + + // (1) class-like defs by simple name — preserve byId order so arr[0] is the + // legacy `firstMatch` and `arr.length > 1` is the legacy `ambiguous`. + for (const def of scopes.defs.byId.values()) { + if ( + def.type !== 'Class' && + def.type !== 'Struct' && + def.type !== 'Interface' && + def.type !== 'Enum' + ) + continue; + pushFlat(idx.classDefsBySimple, adlSimpleName(def), def); + } + + let seq = 0; + for (const parsed of parsedFiles) { + const scopesById = new Map(); + for (const sc of parsed.scopes) scopesById.set(sc.id, sc); + // parent → children, built once (replaces the legacy O(scopes²) walk). + const childrenByParent = new Map(); + for (const sc of parsed.scopes) { + if (sc.parent === null) continue; + let kids = childrenByParent.get(sc.parent); + if (kids === undefined) { + kids = []; + childrenByParent.set(sc.parent, kids); + } + kids.push(sc); + } + + // PASS A — namespace-owned candidates (+ function-reference indexes). + for (const scope of parsed.scopes) { + if (scope.kind !== 'Namespace') continue; + const qName = computeNamespaceQName(scope, scopesById); + // Registration keys reproduce the legacy membership test: own QName + // always; for an inline namespace child of a Namespace, also the parent + // QName (ISO C++ inline-namespace transparency for ADL). + const keys: string[] = []; + if (qName !== '') keys.push(qName); + if (isCppInlineNamespaceScope(scope.id)) { + const parentScope = scope.parent !== null ? scopesById.get(scope.parent) : undefined; + if (parentScope !== undefined && parentScope.kind === 'Namespace') { + const parentQName = computeNamespaceQName(parentScope, scopesById); + if (parentQName !== '' && parentQName !== qName) keys.push(parentQName); + } + } + for (const def of scope.ownedDefs) { + if (def.type === 'Function' || def.type === 'Method') { + const sn = adlSimpleName(def); + pushFlat(idx.nsFunctionsBySimple, sn, def); + if (qName !== '') pushNested(idx.nsFunctionsByQName, qName, sn, def); + } + if (!isAdlCallableType(def.type)) continue; + const s = seq++; + idx.seqByNodeId.set(def.nodeId, s); + const sn = adlSimpleName(def); + for (const key of keys) pushNested(idx.nsCandidates, key, sn, def); + } + } + + // PASS B — hidden-friend + class-member candidates for associated classes. + for (const scope of parsed.scopes) { + if (scope.kind !== 'Class') continue; + // Enclosing-namespace QName(s) of the class def(s) in this scope. + const classNsKeys = new Set(); + for (const def of scope.ownedDefs) { + if (def.type !== 'Class' && def.type !== 'Struct' && def.type !== 'Interface') continue; + const nsQName = classToNamespaceQualifiedName.get(def.nodeId); + if (nsQName !== undefined) classNsKeys.add(nsQName); + } + if (classNsKeys.size === 0) continue; + // Friend functions: callable defs in child Function scopes. + for (const childScope of childrenByParent.get(scope.id) ?? []) { + if (childScope.kind !== 'Function') continue; + for (const def of childScope.ownedDefs) { + if (!isAdlCallableType(def.type)) continue; + const s = seq++; + idx.seqByNodeId.set(def.nodeId, s); + const sn = adlSimpleName(def); + for (const key of classNsKeys) pushNested(idx.friendCandidates, key, sn, def); + } + } + // Class-member callables. + for (const def of scope.ownedDefs) { + if (!isAdlCallableType(def.type)) continue; + const s = seq++; + idx.seqByNodeId.set(def.nodeId, s); + const sn = adlSimpleName(def); + for (const key of classNsKeys) pushNested(idx.friendCandidates, key, sn, def); + } + } + } + + // Dev/test-only invariant guard: every def bucketed into nsCandidates/ + // friendCandidates must have a seqByNodeId entry, otherwise the `?? 0` + // fallback in pickCppAdlCandidates could collapse two seq-0 candidates and + // silently drop a CALLS edge. Gated like the rest of the resolver's opt-in + // validation (see contract/scope-resolver.ts and reconcile-ownership.ts): + // active in dev/test, off in production and when VALIDATE_SEMANTIC_MODEL=0. + if (process.env.NODE_ENV !== 'production' && process.env.VALIDATE_SEMANTIC_MODEL !== '0') { + const missing = validateAdlSeqCoverage(idx); + if (missing.length > 0) { + throw new Error( + `[cpp-adl] seq-coverage invariant violated: ${missing.length} candidate def(s) ` + + `bucketed without a seqByNodeId entry (e.g. ${missing.slice(0, 5).join(', ')}). ` + + `Every def pushed into nsCandidates/friendCandidates must be seq-assigned in the ` + + `same build block — see pickCppAdlCandidates' \`?? 0\` fallback.`, + ); + } + } + + return idx; +} + +/** + * Return the nodeIds present in the index's candidate buckets + * (`nsCandidates` + `friendCandidates`) but missing from `seqByNodeId`, each + * reported once. Empty array means the seq-coverage invariant holds — which it + * must, since `buildAdlIndex` assigns a seq to every callable def in the same + * block that buckets it. Exported for the dev-gated guard in `buildAdlIndex` + * and its unit test. + */ +export function validateAdlSeqCoverage(idx: AdlCandidateIndex): string[] { + const missing = new Set(); + for (const buckets of [idx.nsCandidates, idx.friendCandidates]) { + for (const bySimple of buckets.values()) { + for (const defs of bySimple.values()) { + for (const def of defs) { + if (!idx.seqByNodeId.has(def.nodeId)) missing.add(def.nodeId); + } + } + } + } + return [...missing]; +} + +/** Build the ADL index on first use of a given `parsedFiles` set; reuse it for + * all subsequent call sites in the same pipeline run. Reset by + * `clearCppAdlState`. + * + * Staleness is keyed on `parsedFiles` reference identity ONLY, but the index + * is a function of THREE inputs: `parsedFiles` (namespace/friend candidates), + * `scopes` (`classDefsBySimple`, read from `scopes.defs.byId`), and the + * module-level `classToNamespaceQualifiedName` (friend-candidate keys). This + * is sound for the current pipeline because all three are built together once + * per `runScopeResolution` pass and `clearCppAdlState` runs in + * `loadResolutionConfig` at the start of every pass. Callers MUST call + * `clearCppAdlState` between any two passes that change `scopes` or + * `classToNamespaceQualifiedName` while reusing the same `parsedFiles` array + * reference — otherwise a stale index would be served. (No such caller exists + * today; widening the guard to also key on `scopes` is deferred until one + * does.) */ +function ensureAdlIndex(scopes: ScopeResolutionIndexes, parsedFiles: readonly ParsedFile[]): void { + if (adlIndex !== undefined && adlIndexSource === parsedFiles) return; + adlIndex = buildAdlIndex(scopes, parsedFiles); + adlIndexSource = parsedFiles; +} + /** Record per-call-site argument info. Called once per call site from * `emitCppScopeCaptures`. */ export function markCppAdlSiteArgs( @@ -124,12 +379,15 @@ export function markCppAdlSiteNoAdl(filePath: string, line: number, col: number) noAdlSites.add(siteKey(filePath, line, col)); } -/** Clear ADL state. Called from `clearFileLocalNames` so all C++ resolver - * per-pipeline state is reset together. */ +/** Clear ADL state. Called from `cppScopeResolver.loadResolutionConfig` + * (alongside `clearFileLocalNames`) so all C++ resolver per-pipeline state is + * reset together at the start of each resolution pass. */ export function clearCppAdlState(): void { argInfoBySite.clear(); noAdlSites.clear(); classToNamespaceQualifiedName.clear(); + adlIndex = undefined; + adlIndexSource = undefined; } /** @@ -195,111 +453,51 @@ export function pickCppAdlCandidates( const args = argInfoBySite.get(key); if (args === undefined || args.length === 0) return undefined; + // Build the workspace-wide ADL candidate index once; reuse for every site. + ensureAdlIndex(scopes, parsedFiles); + // Collect associated namespace QNames from every participating class-typed arg // and from function-reference args. const associatedNamespaces = new Set(); for (const arg of args) { collectAssociatedNamespacesForAdlArg(arg, scopes, associatedNamespaces); if (arg.functionRefText !== undefined) { - collectFunctionTypeAssociatedNamespaces( - arg.functionRefText, - scopes, - parsedFiles, - associatedNamespaces, - ); + collectFunctionTypeAssociatedNamespaces(arg.functionRefText, scopes, associatedNamespaces); } } if (associatedNamespaces.size === 0) return undefined; - // Walk every namespace scope in every parsed file; collect callable - // ownedDefs whose enclosing namespace matches one of the associated - // QNames AND whose simple name matches the call's name. - // ISO C++: inline namespaces are transparent — candidates in inline - // children of an associated namespace are also ADL-reachable. - const candidates: SymbolDefinition[] = []; + // Gather candidates from the prebuilt index instead of re-scanning every + // parsed file. For each associated namespace, pull: + // - namespace-owned callables (`nsCandidates`, includes inline-namespace + // transparency), AND + // - hidden-friend / class-member callables of associated classes + // (`friendCandidates`, ISO C++ `[basic.lookup.argdep]` §2). + // Dedup by nodeId and sort by visitation sequence so the candidate list is + // byte-for-byte identical to the legacy file-major scan order. + const idx = adlIndex; + if (idx === undefined) return undefined; + const bySeq = new Map(); const seenKey = new Set(); - for (const parsed of parsedFiles) { - const scopesById = new Map(); - for (const sc of parsed.scopes) scopesById.set(sc.id, sc); - for (const scope of parsed.scopes) { - if (scope.kind !== 'Namespace') continue; - const qName = computeNamespaceQName(scope, scopesById); - if (!associatedNamespaces.has(qName)) { - // Check if this is an inline-namespace child of an associated NS. - // ISO C++ inline namespaces are transparent for ADL: if the outer - // namespace is in the associated set, candidates in the inline child - // are also reachable. - if (!isCppInlineNamespaceScope(scope.id)) continue; - const parentScope = scope.parent !== null ? scopesById.get(scope.parent) : undefined; - if (parentScope === undefined || parentScope.kind !== 'Namespace') continue; - const parentQName = computeNamespaceQName(parentScope, scopesById); - if (!associatedNamespaces.has(parentQName)) continue; - } - for (const def of scope.ownedDefs) { - if (def.type !== 'Function' && def.type !== 'Method' && def.type !== 'Constructor') { - continue; - } - const simple = def.qualifiedName?.split('.').pop() ?? def.qualifiedName ?? ''; - if (simple !== site.name) continue; - // Dedup by nodeId — using normalized parameter-types as the key - // would collapse `process(int)`/`process(long)`-style overloads - // (both normalize to `['int']`) before - // `isOverloadAmbiguousAfterNormalization` can detect them. - if (seenKey.has(def.nodeId)) continue; - seenKey.add(def.nodeId); - candidates.push(def); - } - } - // ISO C++ `[basic.lookup.argdep]` §2: hidden friend functions declared - // inside a class body are visible via ADL when the class is an associated - // class. Scan Class scopes whose enclosing namespace is in the associated - // set for callable ownedDefs matching the call name. This enables the - // canonical "hidden friend" idiom: - // struct Foo { friend void swap(Foo&, Foo&) {} }; - for (const scope of parsed.scopes) { - if (scope.kind !== 'Class') continue; - // Check if ANY class def in this scope has an associated namespace. - let isAssociatedClass = false; - for (const def of scope.ownedDefs) { - if (def.type !== 'Class' && def.type !== 'Struct' && def.type !== 'Interface') continue; - const nsQName = classToNamespaceQualifiedName.get(def.nodeId); - if (nsQName !== undefined && associatedNamespaces.has(nsQName)) { - isAssociatedClass = true; - break; - } - } - if (!isAssociatedClass) continue; - // Also scan Function scopes that are direct children of this class - // scope — friend function definitions create their own Function scope - // underneath the Class scope. - for (const childScope of parsed.scopes) { - if (childScope.parent !== scope.id) continue; - if (childScope.kind !== 'Function') continue; - for (const def of childScope.ownedDefs) { - if (def.type !== 'Function' && def.type !== 'Method' && def.type !== 'Constructor') { - continue; - } - const simple = def.qualifiedName?.split('.').pop() ?? def.qualifiedName ?? ''; - if (simple !== site.name) continue; - if (seenKey.has(def.nodeId)) continue; - seenKey.add(def.nodeId); - candidates.push(def); - } - } - for (const def of scope.ownedDefs) { - if (def.type !== 'Function' && def.type !== 'Method' && def.type !== 'Constructor') { - continue; - } - const simple = def.qualifiedName?.split('.').pop() ?? def.qualifiedName ?? ''; - if (simple !== site.name) continue; + const collectFrom = (buckets: Map>): void => { + for (const ns of associatedNamespaces) { + const matches = buckets.get(ns)?.get(site.name); + if (matches === undefined) continue; + for (const def of matches) { if (seenKey.has(def.nodeId)) continue; seenKey.add(def.nodeId); - candidates.push(def); + // `?? 0` is unreachable: every bucketed def is seq-assigned in the same + // block that buckets it in buildAdlIndex (PASS A / PASS B). The dev-gated + // validateAdlSeqCoverage guard in buildAdlIndex fails loudly if that ever + // breaks, rather than letting two seq-0 defs collide and drop a candidate. + bySeq.set(idx.seqByNodeId.get(def.nodeId) ?? 0, def); } } - } - if (candidates.length === 0) return undefined; - return candidates; + }; + collectFrom(idx.nsCandidates); + collectFrom(idx.friendCandidates); + if (bySeq.size === 0) return undefined; + return [...bySeq.entries()].sort((a, b) => a[0] - b[0]).map(([, def]) => def); } function collectAssociatedNamespacesForAdlArg( @@ -331,7 +529,7 @@ function addAssociatedNamespaceForClassName( associatedNamespaces: Set, ): void { if (simpleClassName.length === 0) return; - const classLookup = findCppClassDefBySimpleName(simpleClassName, scopes); + const classLookup = findCppClassDefBySimpleName(simpleClassName); if (classLookup === undefined) return; const { classDef, ambiguous } = classLookup; const nsQName = classToNamespaceQualifiedName.get(classDef.nodeId); @@ -447,27 +645,14 @@ function findNamespaceDefInScope(scope: { * enclosing namespace to the associated set, just like class types. */ function findCppClassDefBySimpleName( simpleName: string, - scopes: ScopeResolutionIndexes, ): { classDef: SymbolDefinition; ambiguous: boolean } | undefined { - let firstMatch: SymbolDefinition | undefined; - for (const def of scopes.defs.byId.values()) { - if ( - def.type !== 'Class' && - def.type !== 'Struct' && - def.type !== 'Interface' && - def.type !== 'Enum' - ) - continue; - const simple = def.qualifiedName?.split('.').pop() ?? def.qualifiedName ?? ''; - if (simple !== simpleName) continue; - if (firstMatch === undefined) { - firstMatch = def; - continue; - } - return { classDef: firstMatch, ambiguous: true }; - } - if (firstMatch === undefined) return undefined; - return { classDef: firstMatch, ambiguous: false }; + // `classDefsBySimple` preserves `scopes.defs.byId` order, so `[0]` is the + // legacy first-match and `length > 1` is the legacy `ambiguous` flag. + const matches = adlIndex?.classDefsBySimple.get(simpleName); + if (matches === undefined) return undefined; + const first = matches[0]; + if (first === undefined) return undefined; + return { classDef: first, ambiguous: matches.length > 1 }; } /** @@ -477,31 +662,23 @@ function findCppClassDefBySimpleName( function collectFunctionTypeAssociatedNamespaces( refText: string, scopes: ScopeResolutionIndexes, - parsedFiles: readonly ParsedFile[], out: Set, ): void { + const idx = adlIndex; + if (idx === undefined) return; const colonIdx = refText.lastIndexOf('::'); if (colonIdx !== -1) { // Qualified ref: extract namespace prefix and normalise :: → dot notation. const nsText = refText.slice(0, colonIdx).replace(/::/g, '.'); if (nsText === '') return; const simpleName = refText.slice(colonIdx + 2); - // Verify that a Function/Method named `simpleName` exists in `nsText`. - // Without this guard every `a::b` qualified_identifier arg (variable, - // enum value, static member, type alias) would blindly contribute `a` - // to the associated set and risk a false-positive CALLS edge. - for (const parsed of parsedFiles) { - const scopesById = new Map(); - for (const sc of parsed.scopes) scopesById.set(sc.id, sc); - for (const scope of parsed.scopes) { - if (scope.kind !== 'Namespace') continue; - if (computeNamespaceQName(scope, scopesById) !== nsText) continue; - for (const def of scope.ownedDefs) { - if (def.type !== 'Function' && def.type !== 'Method') continue; - const simple = def.qualifiedName?.split('.').pop() ?? def.qualifiedName ?? ''; - if (simple === simpleName) collectAssociatedNamespacesForFunctionDef(def, scopes, out); - } - } + // Only Function/Method defs named `simpleName` in `nsText` contribute + // (the index already restricts to those types); this guards against an + // `a::b` arg that names a variable / enum value / type alias blindly + // contributing `a` to the associated set (false-positive CALLS edge). + const matches = idx.nsFunctionsByQName.get(nsText)?.get(simpleName); + if (matches !== undefined) { + for (const def of matches) collectAssociatedNamespacesForFunctionDef(def, scopes, out); } return; } @@ -510,16 +687,9 @@ function collectFunctionTypeAssociatedNamespaces( // the previous V1 lookup scope. The stricter part of this PR is what each // overload contributes: only namespaces from parameter/return types, never // the function's own enclosing namespace. - for (const parsed of parsedFiles) { - for (const scope of parsed.scopes) { - if (scope.kind !== 'Namespace') continue; - for (const def of scope.ownedDefs) { - if (def.type !== 'Function' && def.type !== 'Method') continue; - const simple = def.qualifiedName?.split('.').pop() ?? def.qualifiedName ?? ''; - if (simple !== refText) continue; - collectAssociatedNamespacesForFunctionDef(def, scopes, out); - } - } + const matches = idx.nsFunctionsBySimple.get(refText); + if (matches !== undefined) { + for (const def of matches) collectAssociatedNamespacesForFunctionDef(def, scopes, out); } } diff --git a/gitnexus/test/fixtures/lang-resolution/cpp-adl-ns-plus-hidden-friend-same-name/app.cpp b/gitnexus/test/fixtures/lang-resolution/cpp-adl-ns-plus-hidden-friend-same-name/app.cpp new file mode 100644 index 0000000000..f2688f29c6 --- /dev/null +++ b/gitnexus/test/fixtures/lang-resolution/cpp-adl-ns-plus-hidden-friend-same-name/app.cpp @@ -0,0 +1,18 @@ +#include "lib.h" + +// Both call sites use an unqualified name with a lib::T argument, so ordinary +// lookup fails and ADL fires via T's associated namespace `lib`. `combine` is +// only reachable as a hidden friend (friendCandidates); `process` only as a +// namespace member (nsCandidates). Both must resolve — that is what proves +// pickCppAdlCandidates consults BOTH buckets when merging. + +void call_friend() { + lib::T a; + lib::T b; + combine(a, b); +} + +void call_ns() { + lib::T t; + process(t); +} diff --git a/gitnexus/test/fixtures/lang-resolution/cpp-adl-ns-plus-hidden-friend-same-name/lib.h b/gitnexus/test/fixtures/lang-resolution/cpp-adl-ns-plus-hidden-friend-same-name/lib.h new file mode 100644 index 0000000000..0368418869 --- /dev/null +++ b/gitnexus/test/fixtures/lang-resolution/cpp-adl-ns-plus-hidden-friend-same-name/lib.h @@ -0,0 +1,12 @@ +namespace lib { + +struct T { + // Hidden friend: a namespace-scope member of `lib` visible ONLY via ADL. + // Exercises the friendCandidates bucket. + friend void combine(T& a, T& b) {} +}; + +// Ordinary namespace-level callable. Exercises the nsCandidates bucket. +void process(T& x) {} + +} diff --git a/gitnexus/test/integration/cpp-adl-benchmark.test.ts b/gitnexus/test/integration/cpp-adl-benchmark.test.ts new file mode 100644 index 0000000000..10ff504186 --- /dev/null +++ b/gitnexus/test/integration/cpp-adl-benchmark.test.ts @@ -0,0 +1,169 @@ +/** + * C++ ADL (argument-dependent lookup) emit-scaling benchmark. + * + * Guards the optimization in PR #1990: `pickCppAdlCandidates` used to rescan all + * parsed files (and all workspace defs) once PER unresolved ADL call site — + * O(sites × files). It now queries a once-built index — O(sites). This benchmark + * reproduces the pathological shape (many unresolved ADL sites) and asserts the + * scope-resolution EMIT phase scales sub-quadratically. + * + * Run: GITNEXUS_BENCH=1 npx vitest run test/integration/cpp-adl-benchmark.test.ts + * + * WHY EMIT MS, NOT WALL TIME: the fixture is parsed single-threaded + * (workerPoolSize: 0, so no dist build is needed), and parse dominates total + * wall time — masking the ADL cost. We isolate the scope-resolution `emit` ms + * from the profiler log (captured in-process via the logger test destination). + * + * WHY CO-SCALE FILES AND SITES: the regression is O(sites × files). At fixed + * files, both the old and new code are linear in sites and indistinguishable. + * Scaling both with N makes the OLD cost O(N²) and the NEW cost O(N); the + * end-to-end emit ratio then separates them cleanly (linear ≈ Nratio, + * quadratic ≈ Nratio²). The guard sits at Nratio^1.5. + */ +import { describe, it, expect } from 'vitest'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { runPipelineFromRepo } from '../../src/core/ingestion/pipeline.js'; +import { _captureLogger } from '../../src/core/logger.js'; + +const BENCH_ENABLED = process.env.GITNEXUS_BENCH === '1'; + +interface BenchResult { + fileCount: number; + siteCount: number; + elapsedMs: number; + emitMs: number; + peakHeapMB: number; + nodeCount: number; + callsResolved: number; +} + +/** + * Generate a workspace of `fileCount` headers, each declaring its own namespace + * + struct, and one app.cpp with `siteCount` callers. Every caller makes a + * class-typed local and calls `ghost(...)` — a name declared NOWHERE — so + * ordinary lookup fails, ADL fires (the arg is class-typed), the index is + * scanned, and the site stays UNRESOLVED. That is the maximal-scan shape the + * optimization targets. Per-file work is constant; sites scale independently. + */ +function generateCppAdlFixture(fileCount: number, siteCount: number): { dir: string } { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), `cpp-adl-bench-${fileCount}-`)); + for (let k = 0; k < fileCount; k++) { + const helpers = Array.from({ length: 3 }, (_, j) => `void helper${k}_${j}(T${k}& x) {}`).join( + '\n', + ); + fs.writeFileSync( + path.join(dir, `lib_${k}.h`), + `namespace lib_${k} {\nstruct T${k} {};\n${helpers}\n}\n`, + ); + } + const includes = Array.from({ length: fileCount }, (_, k) => `#include "lib_${k}.h"`).join('\n'); + const callers = Array.from({ length: siteCount }, (_, i) => { + const k = i % fileCount; + return `void call_${i}() {\n lib_${k}::T${k} t;\n ghost(t);\n}`; + }).join('\n'); + fs.writeFileSync(path.join(dir, 'app.cpp'), `${includes}\n\n${callers}\n`); + return { dir }; +} + +/** Largest `emit=ms` across the captured scope-resolution profiler lines + * (the C++ pass dominates). Returns NaN if no profiler line was captured. */ +function extractEmitMs(records: { msg?: string }[]): number { + let max = NaN; + for (const r of records) { + const m = /\[scope-resolution prof\].*emit=(\d+(?:\.\d+)?)ms/.exec(r.msg ?? ''); + if (m) { + const v = Number(m[1]); + max = Number.isNaN(max) ? v : Math.max(max, v); + } + } + return max; +} + +async function runBenchmark(fileCount: number, siteCount: number): Promise { + const { dir } = generateCppAdlFixture(fileCount, siteCount); + let peakHeapMB = 0; + const heapSampler = setInterval(() => { + const heap = process.memoryUsage().heapUsed / 1024 / 1024; + if (heap > peakHeapMB) peakHeapMB = heap; + }, 50); + + const prevProf = process.env.PROF_SCOPE_RESOLUTION; + process.env.PROF_SCOPE_RESOLUTION = '1'; + const cap = _captureLogger(); + try { + const start = Date.now(); + const result = await runPipelineFromRepo(dir, () => {}, { workerPoolSize: 0 }); + const elapsedMs = Date.now() - start; + const emitMs = extractEmitMs(cap.records()); + + let callsResolved = 0; + for (const rel of result.graph.iterRelationships()) { + if (rel.type === 'CALLS') callsResolved++; + } + + return { + fileCount, + siteCount, + elapsedMs, + emitMs, + peakHeapMB: Math.round(peakHeapMB), + nodeCount: result.graph.nodeCount, + callsResolved, + }; + } finally { + cap.restore(); + if (prevProf === undefined) delete process.env.PROF_SCOPE_RESOLUTION; + else process.env.PROF_SCOPE_RESOLUTION = prevProf; + clearInterval(heapSampler); + fs.rmSync(dir, { recursive: true, force: true }); + } +} + +function printResults(results: BenchResult[]) { + console.log('\nC++ ADL emit-scaling benchmark (unresolved-site pattern)'); + console.log('┌────────┬────────┬───────────┬──────────┬──────────┬───────┬───────────┐'); + console.log('│ Files │ Sites │ Wall (ms) │ Emit (ms)│ Heap MB │ Nodes │ CALLS res │'); + console.log('├────────┼────────┼───────────┼──────────┼──────────┼───────┼───────────┤'); + for (const r of results) { + console.log( + `│ ${String(r.fileCount).padStart(6)} │ ${String(r.siteCount).padStart(6)} │ ${String(r.elapsedMs).padStart(9)} │ ${String(Number.isNaN(r.emitMs) ? 'n/a' : Math.round(r.emitMs)).padStart(8)} │ ${String(r.peakHeapMB).padStart(8)} │ ${String(r.nodeCount).padStart(5)} │ ${String(r.callsResolved).padStart(9)} │`, + ); + } + console.log('└────────┴────────┴───────────┴──────────┴──────────┴───────┴───────────┘'); +} + +describe.skipIf(!BENCH_ENABLED)('C++ ADL emit benchmark', () => { + it('emit phase scales sub-quadratically with co-scaled files and sites', async () => { + // files = N, sites = 6N. OLD emit O(sites × files) = O(6N²); NEW emit O(N). + const scales = [40, 80, 160]; + const results: BenchResult[] = []; + for (const n of scales) { + results.push(await runBenchmark(n, n * 6)); + } + printResults(results); + + const first = results[0]; + const last = results[results.length - 1]; + const fileRatio = last.fileCount / first.fileCount; + + // Primary guard: isolated emit ms. Linear ≈ fileRatio; quadratic ≈ + // fileRatio². The threshold fileRatio^1.5 sits between them with margin for + // wall-clock/GC noise. Only applied when the profiler line was captured at + // both ends (otherwise the in-process capture is unavailable in this env). + if (!Number.isNaN(first.emitMs) && !Number.isNaN(last.emitMs) && first.emitMs > 0) { + const emitRatio = last.emitMs / first.emitMs; + expect(emitRatio).toBeLessThan(Math.pow(fileRatio, 1.5)); + } else { + // Fallback: a coarse catastrophe guard on total wall (parse-dominated, so + // it only catches gross blow-ups, not the constant-factor ADL regression). + const wallRatio = last.elapsedMs / first.elapsedMs; + expect(wallRatio).toBeLessThan(Math.pow(fileRatio, 2)); + } + + // Sanity: the sites are intentionally unresolved (ghost is declared nowhere), + // so this benchmark stresses the scan path, not edge emission. + expect(last.callsResolved).toBe(0); + }, 600_000); +}); diff --git a/gitnexus/test/integration/cpp-pipeline-benchmark.test.ts b/gitnexus/test/integration/cpp-pipeline-benchmark.test.ts new file mode 100644 index 0000000000..9754931a05 --- /dev/null +++ b/gitnexus/test/integration/cpp-pipeline-benchmark.test.ts @@ -0,0 +1,206 @@ +/** + * C++ ingestion pipeline benchmark. + * + * Generates synthetic C++ codebases at increasing scales and measures + * wall-clock time and peak heap through the full pipeline — scanning, parsing, + * structure extraction, scope resolution, and graph emission. Fills the one + * missing slot in the per-language benchmark suite (cobol/csharp/go/php/ruby/ + * rust already have one); modeled on cobol-pipeline-benchmark.test.ts. + * + * Run: GITNEXUS_BENCH=1 npx vitest run test/integration/cpp-pipeline-benchmark.test.ts + * + * Runs build-free (workerPoolSize: 0 → no dist/parse-worker.js needed), so it + * parses single-threaded; scales are kept modest accordingly. + * + * IMPORTANT — this benchmark measures scaling in FILE COUNT, so per-file work + * must stay constant as fileCount grows. Each translation unit therefore + * #includes a FIXED number of shared headers (HEADERS_PER_FILE), independent of + * fileCount. Do NOT make every TU include all headers: headerCount grows as + * floor(fileCount/5), so include-all makes emitted symbol nodes — and thus total + * work — O(fileCount²), which measures header fan-out rather than file-count + * scaling. With constant fan-out the pipeline is O(fileCount); the deterministic + * node-ratio assertion below guards against reintroducing the O(n²) pattern. + */ +import { describe, it, expect } from 'vitest'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { runPipelineFromRepo } from '../../src/core/ingestion/pipeline.js'; + +const BENCH_ENABLED = process.env.GITNEXUS_BENCH === '1'; + +interface BenchResult { + fileCount: number; + headerCount: number; + methodCount: number; + elapsedMs: number; + peakHeapMB: number; + nodeCount: number; + edgeCount: number; +} + +const METHODS_PER_CLASS = 4; +const HEADERS_PER_FILE = 3; + +function generateCppFixture(fileCount: number): { + dir: string; + headerCount: number; + methodCount: number; +} { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), `cpp-bench-${fileCount}-`)); + + // Shared headers (1 per 5 TUs, at least 2): each a small namespace with a + // struct and a free function the TUs call cross-file (constant fan-in). + const headerCount = Math.max(2, Math.floor(fileCount / 5)); + const headerNames: string[] = []; + for (let h = 0; h < headerCount; h++) { + const ns = `hdr${h}`; + headerNames.push(ns); + fs.writeFileSync( + path.join(dir, `${ns}.h`), + [ + `#pragma once`, + `namespace ${ns} {`, + `struct Rec${h} { int value; };`, + `void use${h}(Rec${h}& r);`, + `}`, + '', + ].join('\n'), + ); + } + + const methodCount = fileCount * METHODS_PER_CLASS; + + for (let f = 0; f < fileCount; f++) { + const className = `C${String(f).padStart(5, '0')}`; + // Constant include fan-out, chosen by index so headers stay shared. + const includes = [ + ...new Set( + Array.from({ length: HEADERS_PER_FILE }, (_, k) => headerNames[(f + k) % headerCount]), + ), + ]; + + const methods: string[] = []; + for (let m = 0; m < METHODS_PER_CLASS; m++) { + // Intra-file call (resolves locally) + one cross-file call into an + // included header's free function (constant cross-file fan-out). + const nextM = (m + 1) % METHODS_PER_CLASS; + const hdr = includes[m % includes.length]; + const hdrIdx = hdr.replace('hdr', ''); + methods.push( + ` void m${m}() {`, + ` m${nextM}();`, + ` ${hdr}::Rec${hdrIdx} r;`, + ` ${hdr}::use${hdrIdx}(r);`, + ` }`, + ); + } + + const content = [ + ...includes.map((h) => `#include "${h}.h"`), + `class ${className} {`, + `public:`, + ...methods, + `};`, + '', + ].join('\n'); + + fs.writeFileSync(path.join(dir, `${className}.cpp`), content); + } + + return { dir, headerCount, methodCount }; +} + +async function runBenchmark(fileCount: number, budgetMs: number): Promise { + const { dir, headerCount, methodCount } = generateCppFixture(fileCount); + + let peakHeapMB = 0; + const heapSampler = setInterval(() => { + const heap = process.memoryUsage().heapUsed / 1024 / 1024; + if (heap > peakHeapMB) peakHeapMB = heap; + }, 50); + + try { + const start = Date.now(); + const result = await Promise.race([ + runPipelineFromRepo(dir, () => {}, { workerPoolSize: 0 }), + new Promise((_, reject) => + setTimeout( + () => reject(new Error(`Pipeline exceeded ${budgetMs}ms at ${fileCount} files`)), + budgetMs, + ), + ), + ]); + const elapsedMs = Date.now() - start; + + return { + fileCount, + headerCount, + methodCount, + elapsedMs, + peakHeapMB: Math.round(peakHeapMB), + nodeCount: result.graph.nodeCount, + edgeCount: result.graph.relationshipCount, + }; + } finally { + clearInterval(heapSampler); + fs.rmSync(dir, { recursive: true, force: true }); + } +} + +function printResults(results: BenchResult[]) { + console.log('\nC++ Pipeline'); + console.log('┌──────────┬──────────┬──────────┬───────────┬──────────┬───────┬───────┐'); + console.log('│ Files │ Headers │ Methods │ Time (ms) │ Heap MB │ Nodes │ Edges │'); + console.log('├──────────┼──────────┼──────────┼───────────┼──────────┼───────┼───────┤'); + for (const r of results) { + console.log( + `│ ${String(r.fileCount).padStart(8)} │ ${String(r.headerCount).padStart(8)} │ ${String(r.methodCount).padStart(8)} │ ${String(r.elapsedMs).padStart(9)} │ ${String(r.peakHeapMB).padStart(8)} │ ${String(r.nodeCount).padStart(5)} │ ${String(r.edgeCount).padStart(5)} │`, + ); + } + console.log('└──────────┴──────────┴──────────┴───────────┴──────────┴───────┴───────┘'); + + if (results.length >= 2) { + console.log('\nScaling ratios (time_ratio / file_ratio):'); + for (let i = 1; i < results.length; i++) { + const fileRatio = results[i].fileCount / results[i - 1].fileCount; + const timeRatio = results[i].elapsedMs / results[i - 1].elapsedMs; + const scaling = timeRatio / fileRatio; + console.log( + ` ${results[i - 1].fileCount} → ${results[i].fileCount}: ${scaling.toFixed(2)}x (${scaling < 1.5 ? 'linear' : scaling < 3 ? 'superlinear' : 'WARNING: quadratic'})`, + ); + } + } +} + +describe.skipIf(!BENCH_ENABLED)('C++ pipeline benchmark', () => { + it('scales with file count', async () => { + const scales = [50, 100, 200, 400]; + const results: BenchResult[] = []; + + for (const fileCount of scales) { + const result = await runBenchmark(fileCount, 300_000); + results.push(result); + console.log( + ` ${fileCount} files: ${result.elapsedMs}ms, ${result.peakHeapMB}MB heap, ${result.nodeCount} nodes, ${result.edgeCount} edges`, + ); + } + + printResults(results); + + for (let i = 1; i < results.length; i++) { + const fileRatio = results[i].fileCount / results[i - 1].fileCount; + const timeRatio = results[i].elapsedMs / results[i - 1].elapsedMs; + // Wall-clock is noisy (GC/CI load); keep a coarse upper bound here. + expect(timeRatio / fileRatio).toBeLessThan(4); + + // Deterministic regression guard: with constant per-file include fan-out + // the emitted node count is linear in fileCount (ratio ≈ 1.0). If someone + // reintroduces O(fileCount²) work — e.g. by making every TU include all + // headers — node growth jumps and this fails. Node count is deterministic, + // so this is a non-flaky guard unlike the wall-clock check above. + const nodeRatio = results[i].nodeCount / results[i - 1].nodeCount; + expect(nodeRatio / fileRatio).toBeLessThan(1.3); + } + }, 600_000); +}); diff --git a/gitnexus/test/integration/resolvers/cpp.test.ts b/gitnexus/test/integration/resolvers/cpp.test.ts index da62a243a5..3463ceec1e 100644 --- a/gitnexus/test/integration/resolvers/cpp.test.ts +++ b/gitnexus/test/integration/resolvers/cpp.test.ts @@ -2630,6 +2630,41 @@ describe('C++ ADL — merges with non-empty ordinary lookup', () => { }); }); +describe('C++ ADL — hidden friend and namespace callable in one namespace', () => { + let result: PipelineResult; + + beforeAll(async () => { + result = await runPipelineFromRepo( + path.join(FIXTURES, 'cpp-adl-ns-plus-hidden-friend-same-name'), + () => {}, + ); + }, 60000); + + // pickCppAdlCandidates merges two buckets for one associated namespace: + // friendCandidates (hidden friends of associated classes) and nsCandidates + // (namespace-owned callables). This fixture reaches exactly one callable + // through each bucket — `combine` only as a hidden friend, `process` only as + // a namespace member — so a regression that stopped consulting either bucket + // would drop the corresponding edge. (Candidate ORDER is not observable — + // overload narrowing resolves a unique survivor or suppresses — so the guard + // is on the SET: both edges must be present.) + it('combine(a, b) resolves to the hidden friend via friendCandidates', () => { + const calls = getRelationships(result, 'CALLS').filter( + (c) => c.source === 'call_friend' && c.target === 'combine', + ); + expect(calls.length).toBe(1); + expect(calls[0].targetFilePath).toContain('lib.h'); + }); + + it('process(t) resolves to the namespace callable via nsCandidates', () => { + const calls = getRelationships(result, 'CALLS').filter( + (c) => c.source === 'call_ns' && c.target === 'process', + ); + expect(calls.length).toBe(1); + expect(calls[0].targetFilePath).toContain('lib.h'); + }); +}); + describe('C++ ADL — base-class associated namespaces', () => { let result: PipelineResult; diff --git a/gitnexus/test/unit/scope-resolution/cpp/cpp-adl-seq-coverage.test.ts b/gitnexus/test/unit/scope-resolution/cpp/cpp-adl-seq-coverage.test.ts new file mode 100644 index 0000000000..7500a6bb53 --- /dev/null +++ b/gitnexus/test/unit/scope-resolution/cpp/cpp-adl-seq-coverage.test.ts @@ -0,0 +1,79 @@ +/** + * Unit tests for the C++ ADL seq-coverage invariant guard. + * + * `pickCppAdlCandidates` sorts merged candidates by `seqByNodeId`, falling back + * to `?? 0` if a bucketed def has no seq. That fallback is provably unreachable + * (every def pushed into `nsCandidates`/`friendCandidates` is seq-assigned in the + * same build block), but a future regression could break the invariant and + * silently collapse two seq-0 candidates into one. `validateAdlSeqCoverage` + * detects that break; `buildAdlIndex` runs it under the dev/test validation gate + * so a regression fails loudly in CI rather than dropping a CALLS edge in prod. + */ +import { describe, it, expect } from 'vitest'; +import { + validateAdlSeqCoverage, + type AdlCandidateIndex, +} from '../../../../src/core/ingestion/languages/cpp/adl.js'; +import type { SymbolDefinition } from 'gitnexus-shared'; + +function def(nodeId: string): SymbolDefinition { + return { nodeId } as unknown as SymbolDefinition; +} + +function makeIndex( + nsCandidates: Map>, + friendCandidates: Map>, + seqByNodeId: Map, +): AdlCandidateIndex { + return { + classDefsBySimple: new Map(), + nsCandidates, + friendCandidates, + nsFunctionsByQName: new Map(), + nsFunctionsBySimple: new Map(), + seqByNodeId, + }; +} + +describe('validateAdlSeqCoverage', () => { + it('returns no missing ids when every bucketed def has a seq', () => { + const ns = new Map([['lib', new Map([['act', [def('A')]]])]]); + const friend = new Map([['lib', new Map([['swap', [def('B')]]])]]); + const seq = new Map([ + ['A', 0], + ['B', 1], + ]); + + expect(validateAdlSeqCoverage(makeIndex(ns, friend, seq))).toEqual([]); + }); + + it('flags a namespace-candidate def missing from seqByNodeId', () => { + const ns = new Map([['lib', new Map([['act', [def('A'), def('C')]]])]]); + const friend = new Map>(); + const seq = new Map([['A', 0]]); // 'C' missing + + expect(validateAdlSeqCoverage(makeIndex(ns, friend, seq))).toEqual(['C']); + }); + + it('flags a friend-candidate def missing from seqByNodeId', () => { + const ns = new Map>(); + const friend = new Map([['lib', new Map([['swap', [def('D')]]])]]); + const seq = new Map(); // 'D' missing + + expect(validateAdlSeqCoverage(makeIndex(ns, friend, seq))).toEqual(['D']); + }); + + it('reports each missing nodeId once even when bucketed under multiple keys', () => { + // Inline-namespace transparency registers the same def under its own and + // its parent QName; a missing seq should surface as a single entry. + const inner = new Map([['act', [def('E')]]]); + const ns = new Map([ + ['lib', inner], + ['lib.inline', inner], + ]); + const friend = new Map>(); + const seq = new Map(); // 'E' missing + + expect(validateAdlSeqCoverage(makeIndex(ns, friend, seq))).toEqual(['E']); + }); +});