diff --git a/gitnexus/src/core/ingestion/language-provider.ts b/gitnexus/src/core/ingestion/language-provider.ts index 8edde3dca9..a896933f82 100644 --- a/gitnexus/src/core/ingestion/language-provider.ts +++ b/gitnexus/src/core/ingestion/language-provider.ts @@ -187,6 +187,12 @@ interface LanguageProviderConfig { * `undefined` when no constraints exist / the node isn't a templated * function. Languages without SFINAE / concept semantics leave this * undefined and the disambiguation is a pass-through. + * + * Cloneability contract: the returned payload crosses the worker boundary + * via structured clone, so it MUST be structured-clone-safe (no functions, + * symbols, or tree-sitter `SyntaxNode`s — only plain data). Wrap the return + * with `assertCloneable` from `workers/clone-safety.ts` so a future leak is a + * compile error at the source instead of a runtime DataCloneError (#2143). */ readonly extractTemplateConstraints?: (definitionNode: SyntaxNode) => unknown; @@ -343,8 +349,12 @@ interface LanguageProviderConfig { * disk store WITHOUT a main-thread re-parse. The main thread restores them * via the matching `ScopeResolver.applyCaptureSideChannel` hook. * - * MUST return plain data (objects / arrays / primitives) so it round-trips - * through `JSON.stringify` + the parsedfile-store interning reviver. + * Cloneability contract: MUST return plain data (objects / arrays / + * primitives — no functions, symbols, or tree-sitter `SyntaxNode`s) so it + * survives BOTH the worker→main structured clone AND `JSON.stringify` + the + * parsedfile-store interning reviver. Wrap the return with `assertCloneable` + * from `workers/clone-safety.ts` so a future non-serializable leak is a + * compile error at the source instead of a runtime DataCloneError (#2143). * * Default: undefined (provider has no capture-time module-level side effects). */ diff --git a/gitnexus/src/core/ingestion/languages/c-cpp.ts b/gitnexus/src/core/ingestion/languages/c-cpp.ts index 84523fc709..3d427fed85 100644 --- a/gitnexus/src/core/ingestion/languages/c-cpp.ts +++ b/gitnexus/src/core/ingestion/languages/c-cpp.ts @@ -65,7 +65,11 @@ import { cppReceiverBinding, collectCppCaptureSideChannel, } from './cpp/index.js'; -import { extractCppTemplateConstraints } from './cpp/constraint-extractor.js'; +import { + extractCppTemplateConstraints, + type CppConstraintPayload, +} from './cpp/constraint-extractor.js'; +import { assertCloneable } from '../workers/clone-safety.js'; const C_BUILT_INS: ReadonlySet = new Set([ 'printf', @@ -405,7 +409,11 @@ export const cProvider = defineLanguage({ // `static` functions look non-file-local on the main thread and leak into // cross-file global free-call resolution / wildcard imports. See // `c/capture-side-channel.ts`. - collectCaptureSideChannel: collectCStaticLinkageSideChannel, + // `assertCloneable` is a runtime identity; it makes a future non-serializable + // value in the side-channel payload a compile error here, at the source, rather + // than a DataCloneError at the worker boundary (#2143). + collectCaptureSideChannel: (filePath) => + assertCloneable(collectCStaticLinkageSideChannel(filePath)), interpretImport: interpretCImport, interpretTypeBinding: interpretCTypeBinding, bindingScopeFor: cBindingScopeFor, @@ -480,7 +488,7 @@ export const cppProvider = defineLanguage({ // just populated for this file into plain data on `ParsedFile.captureSideChannel`, // so the main thread can restore them via `applyCaptureSideChannel` WITHOUT a // re-parse (#1983). See `cpp/capture-side-channel.ts`. - collectCaptureSideChannel: collectCppCaptureSideChannel, + collectCaptureSideChannel: (filePath) => assertCloneable(collectCppCaptureSideChannel(filePath)), interpretImport: interpretCppImport, interpretTypeBinding: interpretCppTypeBinding, bindingScopeFor: cppBindingScopeFor, @@ -501,7 +509,9 @@ export const cppProvider = defineLanguage({ * functions whose constraints the extractor can't model — both cases * result in no constraint suffix on the node ID. */ -function extractCppTemplateConstraintsForProvider(definitionNode: SyntaxNode): unknown { +function extractCppTemplateConstraintsForProvider( + definitionNode: SyntaxNode, +): CppConstraintPayload | undefined { // Walk up to the enclosing template_declaration. Bound the walk so we // can't accidentally land on a far-ancestor template_declaration that // wraps an unrelated function. @@ -530,5 +540,8 @@ function extractCppTemplateConstraintsForProvider(definitionNode: SyntaxNode): u } break; } - return extractCppTemplateConstraints(templateDecl, declarator); + // Guard the boundary at the source: a future non-cloneable member of the + // constraint payload becomes a compile error here, not a runtime + // DataCloneError at the worker post (#2143). + return assertCloneable(extractCppTemplateConstraints(templateDecl, declarator)); } diff --git a/gitnexus/src/core/ingestion/languages/kotlin.ts b/gitnexus/src/core/ingestion/languages/kotlin.ts index 6ee7b1ddae..4cf87a76ff 100644 --- a/gitnexus/src/core/ingestion/languages/kotlin.ts +++ b/gitnexus/src/core/ingestion/languages/kotlin.ts @@ -11,6 +11,7 @@ import { SupportedLanguages } from 'gitnexus-shared'; import { createClassExtractor } from '../class-extractors/generic.js'; import { kotlinClassConfig } from '../class-extractors/configs/jvm.js'; import { defineLanguage } from '../language-provider.js'; +import { assertCloneable } from '../workers/clone-safety.js'; import { kotlinTypeConfig } from '../type-extractors/jvm.js'; import { kotlinExportChecker } from '../export-detection.js'; import { createImportResolver } from '../import-resolvers/resolver-factory.js'; @@ -182,7 +183,11 @@ export const kotlinProvider = defineLanguage({ // so the main thread can restore them via `applyCaptureSideChannel` WITHOUT a // re-parse (#1983). Without this, companion/static dispatch emits no CALLS // edges on the worker path. See `kotlin/capture-side-channel.ts`. - collectCaptureSideChannel: collectKotlinCaptureSideChannel, + // `assertCloneable` is a runtime identity; it makes a future non-serializable + // value in the side-channel payload a compile error here, at the source, rather + // than a DataCloneError at the worker boundary (#2143). + collectCaptureSideChannel: (filePath) => + assertCloneable(collectKotlinCaptureSideChannel(filePath)), interpretImport: interpretKotlinImport, interpretTypeBinding: interpretKotlinTypeBinding, bindingScopeFor: kotlinBindingScopeFor, diff --git a/gitnexus/src/core/ingestion/parsing-processor.ts b/gitnexus/src/core/ingestion/parsing-processor.ts index 41c2b4d32e..bf11e9f323 100644 --- a/gitnexus/src/core/ingestion/parsing-processor.ts +++ b/gitnexus/src/core/ingestion/parsing-processor.ts @@ -7,6 +7,7 @@ import { accumulateExportedTypesFromParsedNode, type ExportedTypeMap } from './c import type { ParsedFile } from 'gitnexus-shared'; import { WorkerPool } from './workers/worker-pool.js'; +import type { SkippedPath } from './workers/clone-safety.js'; import { logger } from '../logger.js'; import type { ParseWorkerResult, @@ -196,6 +197,29 @@ export const dispatchChunkParse = async ( logger.warn(` Skipped unsupported languages: ${summary}`); } + // Clone-safety telemetry (#2112): files whose parse output carried a value + // the structured-clone algorithm couldn't serialize across the worker + // boundary. The worker sanitized/dropped the offending value so the run + // could complete; surface the (rare) data loss so it's visible and the + // offending extractor can be fixed at source. + const skippedPaths: SkippedPath[] = []; + for (const result of chunkResults) { + for (const entry of result.skippedPaths ?? []) skippedPaths.push(entry); + } + if (skippedPaths.length > 0) { + // Keep the per-file reason ("stripped N value(s) from nodes" / + // "dropped non-serializable parsedFiles entry") — it distinguishes a + // recoverable strip from a whole-record drop, which a path-only line loses. + const shown = skippedPaths + .slice(0, 10) + .map((e) => `${e.path} (${e.reason})`) + .join(', '); + const more = skippedPaths.length > 10 ? ` …and ${skippedPaths.length - 10} more` : ''; + logger.warn( + ` Sanitized ${skippedPaths.length} file(s) with non-serializable parse output: ${shown}${more}`, + ); + } + onFileProgress?.(total, total, 'done'); return chunkResults; }; diff --git a/gitnexus/src/core/ingestion/workers/clone-safety.ts b/gitnexus/src/core/ingestion/workers/clone-safety.ts new file mode 100644 index 0000000000..cf56691e38 --- /dev/null +++ b/gitnexus/src/core/ingestion/workers/clone-safety.ts @@ -0,0 +1,545 @@ +/** + * Structured-clone safety for the worker result boundary (#2112). + * + * A parse worker delivers its accumulated result to the main thread via + * `parentPort.postMessage(...)`. Node serializes that payload with the + * structured-clone algorithm SYNCHRONOUSLY on the worker thread, and it + * THROWS a `DataCloneError` the instant it meets a value it can't serialize — + * a function, a symbol, a Promise, a WeakMap, etc. The reporter of #2112 hit + * exactly this: a node record whose `properties` carried an own-enumerable + * value pointing at a native function (`function toString() { [native code] } + * could not be cloned`). One such value aborted the entire parse phase, + * because the worker re-posts the throw as `{type:'error'}` which the pool + * counts as a worker death — and under `GITNEXUS_WORKER_POOL_SIZE=1` the same + * graph re-throws on every respawn until the slot's budget is exhausted. + * + * This module is the safety net. It runs ONLY after a real clone failure on + * the fast-path post (zero overhead on healthy runs), and rewrites the + * boundary-crossing arrays so the result becomes cloneable: a non-cloneable + * value inside a plain extraction record is dropped (the record is otherwise + * kept — strictly-missing data, never wrong), and a `ParsedFile` that can't be + * made cloneable is dropped whole so scope-resolution re-derives it on the + * main thread (where there is no clone boundary) with intact edge data. + * + * Language-neutral by construction: it keys on value shape and field name + * only, never on a language (AGENTS.md shared-pipeline rule). The strip + * semantics mirror what the store path's `JSON.stringify` already silently + * drops, so store / no-store / cold / warm runs converge on the same graph. + */ + +/** A file whose parse result was sanitized or dropped at the clone boundary. */ +export interface SkippedPath { + /** Best-effort source path of the offending record (or `(unknown)`). */ + path: string; + /** Human-readable reason, e.g. "dropped 1 non-serializable value from nodes". */ + reason: string; +} + +/** + * True iff `value` survives Node's structured-clone algorithm (the same + * algorithm `postMessage` uses). This is the authoritative probe — it matches + * the real failure exactly, including Map/Set/Date/RegExp/TypedArray support, + * so it never false-positives on the `Scope` Maps that clone fine. + */ +export function isStructuredCloneable(value: unknown): boolean { + try { + structuredClone(value); + return true; + } catch { + return false; + } +} + +// ── Compile-time boundary guard (#2143) ───────────────────────────────────── +// The runtime net above is the production backstop; this is its compile-time +// complement. The worker result is plain data EXCEPT a few `unknown`-typed +// sinks (a node's `properties` bag, the provider `extractTemplateConstraints` / +// `collectCaptureSideChannel` hook returns). `unknown` lets a non-serializable +// value (a function, a leaked tree-sitter `SyntaxNode`, …) pass with no +// compile-time guard — that is the structural hole #2112 leaked through. Typing +// those producers as `Cloneable` turns such a leak into a compile error at +// the source site instead of a runtime DataCloneError far downstream. + +/** The leaf values the structured-clone algorithm copies verbatim. */ +type CloneablePrimitive = undefined | null | boolean | number | bigint | string; + +/** + * Maps `T` to itself when every value reachable from it is structured-clone + * safe, and to a type containing `never` at the first offending property + * otherwise. A function or symbol — the values `postMessage` rejects — becomes + * `never`, so a struct carrying one is no longer assignable to its own + * `Cloneable` and `assertCloneable` rejects it, naming the bad key. + * + * Implemented as a homomorphic mapped type (`{ [K in keyof T]: … }`) so it + * preserves `interface` shapes and `readonly` modifiers and works WITHOUT + * requiring the payload types to carry an index signature — sidestepping the + * "closed interface is not assignable to a recursive index-signature type" wall + * that blocked the value-typed-`Cloneable` approach (#2143). `Map`/`Set`/array + * containers recurse into their element types; `Date`/`RegExp` are clone-safe + * leaves. + */ +/** True iff `T` is `any` (the canonical `IsAny` probe: only `any` satisfies `0 extends 1 & T`). */ +type IsAny = 0 extends 1 & T ? true : false; + +export type Cloneable = + IsAny extends true + ? never // an `any`-typed member defeats the guard — reject it like `unknown` (both → never) + : T extends CloneablePrimitive | Date | RegExp + ? T + : T extends (...args: never[]) => unknown + ? never + : T extends symbol + ? never + : T extends ReadonlyMap + ? ReadonlyMap, Cloneable> + : T extends ReadonlySet + ? ReadonlySet> + : T extends readonly (infer U)[] + ? T extends unknown[] + ? Cloneable[] + : readonly Cloneable[] + : T extends object + ? { [K in keyof T]: Cloneable } + : never; + +/** + * Identity at runtime (zero cost — returns its argument unchanged); a + * compile-time assertion that `value` is structured-clone safe. Wrap a + * producer that feeds an `unknown` worker-result sink: + * + * collectCaptureSideChannel: (filePath) => assertCloneable(collectFoo(filePath)) + * + * If `collectFoo`'s return type ever gains a non-cloneable member (a function, a + * `SyntaxNode`, …) the call fails to compile, pointing at the offending key. + * + * The parameter is a conditional type rather than an `extends Cloneable` + * constraint because a self-referential constraint (`T extends Cloneable`) + * is a "circular constraint" error in TypeScript. For a clone-safe `T` the + * parameter resolves to `T` (call type-checks as a plain identity); for an + * unsafe `T` it resolves to `Cloneable` (which has `never` at the bad key), + * so the argument is rejected. + */ +export function assertCloneable(value: T extends Cloneable ? T : Cloneable): T { + return value as T; +} + +/** + * Recursion cap for the module's own traversal. An over-deep subtree is treated + * as non-cloneable rather than recursing to a stack overflow — without this, a + * deeply-nested record would throw `RangeError` inside the sanitizer and (since + * the recovery path is the safety net) re-arm the very cascade #2112 fixes. Set + * far below the observed ~3000-frame overflow and far above any real + * parse-result record (extraction records are shallow plain data). Note: this + * caps the module's recursion only; `structuredClone`'s own internal recursion + * (the `isStructuredCloneable` probe of non-plain objects) is bounded by that + * helper's catch-all, which turns a probe-side `RangeError` into a + * non-cloneable verdict — so do not narrow that catch. + */ +const MAX_CLONE_DEPTH = 200; + +/** + * True iff `key` is a canonical array-index string (`"0"`, `"1"`, … `< 2^32-1`) + * — i.e. one of the slots the numeric index loop already visits. Everything + * else returned by `Object.keys(array)` is a NON-index own-enumerable property + * (`arr.meta = …`), which the structured-clone algorithm ALSO serializes (and + * throws on if non-cloneable). The array branches of `containsNonCloneable` and + * `stripNonCloneable` use this to scan those extra keys in lockstep. + */ +function isArrayIndexKey(key: string): boolean { + const n = Number(key); + return Number.isInteger(n) && n >= 0 && n < 4294967295 && String(n) === key; +} + +/** + * Non-allocating scan: returns true on the FIRST value structured-clone would + * reject. Used to decide whether an array (or element) needs rewriting at all, + * so clean arrays keep their referential identity and pay no copy cost. + */ +function containsNonCloneable(value: unknown, seen: WeakSet, depth = 0): boolean { + const t = typeof value; + if (t === 'function' || t === 'symbol') return true; + if (value === null || t !== 'object') return false; + // Depth bound: treat an over-deep subtree as non-cloneable (the element is + // then stripped/dropped) instead of overflowing the stack. + if (depth >= MAX_CLONE_DEPTH) return true; + const obj = value as object; + // Cycles clone fine; don't recurse into one twice. + if (seen.has(obj)) return false; + // Structured-clone-native containers carry no non-cloneable payload of their + // own; their *contents* still need scanning (a Map value could be a fn). + if (obj instanceof Date || obj instanceof RegExp) return false; + // Buffers/views usually clone, but a DETACHED one is rejected by + // structuredClone — probe rather than wave it through. No byteLength + // heuristic: a legitimately empty `new Uint8Array(0)` also has byteLength 0 + // yet clones fine, so a length check would false-positive. + if (obj instanceof ArrayBuffer || ArrayBuffer.isView(obj)) return !isStructuredCloneable(obj); + seen.add(obj); + if (Array.isArray(obj)) { + for (let i = 0; i < obj.length; i++) { + if (containsNonCloneable(obj[i], seen, depth + 1)) return true; + } + // structuredClone also serializes an array's NON-index own-enumerable + // properties and throws on a non-cloneable one — scan them too (lockstep + // with stripNonCloneable's array branch; see isArrayIndexKey). + for (const key of Object.keys(obj)) { + if (isArrayIndexKey(key)) continue; + let child: unknown; + try { + child = (obj as unknown as Record)[key]; + } catch { + return true; // a throwing getter can't be serialized either + } + if (containsNonCloneable(child, seen, depth + 1)) return true; + } + return false; + } + if (obj instanceof Map) { + for (const [k, v] of obj) { + if (containsNonCloneable(k, seen, depth + 1) || containsNonCloneable(v, seen, depth + 1)) + return true; + } + return false; + } + if (obj instanceof Set) { + for (const v of obj) { + if (containsNonCloneable(v, seen, depth + 1)) return true; + } + return false; + } + // A non-plain object (Promise, WeakMap, class instance with internal slots) + // that structured clone can't handle: detect via the authoritative probe. + // Plain objects fall through to a property scan (cheap, no allocation). + const proto = Object.getPrototypeOf(obj); + if (proto !== Object.prototype && proto !== null) { + if (!isStructuredCloneable(obj)) return true; + return false; + } + for (const key of Object.keys(obj)) { + let child: unknown; + try { + child = (obj as Record)[key]; + } catch { + // A getter that throws can't be serialized either — treat as non-cloneable. + return true; + } + if (containsNonCloneable(child, seen, depth + 1)) return true; + } + return false; +} + +/** + * State carried through a strip pass. `stripped` counts dropped values for the + * skip report; `seen` memoizes each visited object to its stripped COPY (not a + * bare visited-set) so a DAG-aliased subtree — the same object reached via two + * paths — is sanitized once and shared, never over-dropped, and cycles + * terminate by returning the in-progress copy. + */ +interface StripCtx { + stripped: number; + seen: Map; + /** + * Dotted key paths (relative to the element root) of every value that was + * stripped/dropped — e.g. `properties.toString`, `meta.data[3]`. Surfaced in + * the skip reason so the offending property is named precisely, which is what + * lets a still-unpinned leak be located from a single log line (#2112). + */ + keys: string[]; +} + +/** Record a strip at `path` (root → `(root)`); keeps the count + key path in sync. */ +function recordStrip(ctx: StripCtx, path: string): void { + ctx.stripped++; + ctx.keys.push(path === '' ? '(root)' : path); +} + +/** + * Deep-copy `value`, replacing any value structured-clone would reject with + * `undefined` (which clones fine). Preserves primitives, arrays, plain + * objects, and the structured-clone-native containers (Date, RegExp, Map, + * Set, ArrayBuffer, TypedArray). Rebuilds only what it must — clean leaves are + * returned by reference. `path` is the dotted key path of `value` (for the + * diagnostic record). + */ +function stripNonCloneable(value: unknown, ctx: StripCtx, depth = 0, path = ''): unknown { + const t = typeof value; + if (t === 'function' || t === 'symbol') { + recordStrip(ctx, path); + return undefined; + } + if (value === null || t !== 'object') return value; + // Depth bound (mirrors containsNonCloneable): drop an over-deep subtree to + // `undefined` (itself cloneable, and a legal property value / array element) + // rather than overflowing the stack. + if (depth >= MAX_CLONE_DEPTH) { + recordStrip(ctx, path); + return undefined; + } + const obj = value as object; + // Memoized? Return the SAME stripped copy (preserves DAG shape; terminates + // cycles by returning the in-progress copy inserted before recursing below). + if (ctx.seen.has(obj)) return ctx.seen.get(obj); + // Leaf-like values: returned by reference, but still memoize the decision so + // a second alias resolves identically. + if (obj instanceof Date || obj instanceof RegExp) { + ctx.seen.set(obj, value); + return value; + } + if (obj instanceof ArrayBuffer || ArrayBuffer.isView(obj)) { + // Keep a live buffer/view (even an empty one); drop a detached one, which + // structuredClone rejects. The probe is exact — no byteLength heuristic. + if (!isStructuredCloneable(obj)) { + recordStrip(ctx, path); + ctx.seen.set(obj, undefined); + return undefined; + } + ctx.seen.set(obj, value); + return value; + } + // Containers: allocate the empty copy, memoize it BEFORE recursing, then fill + // — so a cycle/alias that re-enters gets this in-progress copy. + if (Array.isArray(obj)) { + const out: unknown[] = []; + ctx.seen.set(obj, out); + for (let i = 0; i < obj.length; i++) + out.push(stripNonCloneable(obj[i], ctx, depth + 1, `${path}[${i}]`)); + // Carry NON-index own-enumerable props through the same strip (lockstep + // with containsNonCloneable): structuredClone serializes them, so a + // non-cloneable one must be stripped rather than left to throw on re-post. + for (const key of Object.keys(obj)) { + if (isArrayIndexKey(key)) continue; + const childPath = `${path}.${key}`; + let child: unknown; + try { + child = (obj as unknown as Record)[key]; + } catch { + recordStrip(ctx, childPath); + continue; + } + (out as unknown as Record)[key] = stripNonCloneable( + child, + ctx, + depth + 1, + childPath, + ); + } + return out; + } + if (obj instanceof Map) { + // Scope limit (acceptable): object keys aren't identity-preserved across + // stripping. Parse-result Maps are primitive-keyed, so this never bites. + const out = new Map(); + ctx.seen.set(obj, out); + for (const [k, v] of obj) + out.set( + stripNonCloneable(k, ctx, depth + 1, `${path}`), + stripNonCloneable(v, ctx, depth + 1, `${path}`), + ); + return out; + } + if (obj instanceof Set) { + const out = new Set(); + ctx.seen.set(obj, out); + for (const v of obj) out.add(stripNonCloneable(v, ctx, depth + 1, `${path}`)); + return out; + } + const proto = Object.getPrototypeOf(obj); + if (proto !== Object.prototype && proto !== null) { + // Non-plain object that the probe already flagged as non-cloneable and + // that we can't safely reconstruct (Promise, WeakMap, class instance with + // internal slots). Drop it whole — memoize the decision so aliases agree. + if (!isStructuredCloneable(obj)) { + recordStrip(ctx, path); + ctx.seen.set(obj, undefined); + return undefined; + } + ctx.seen.set(obj, value); + return value; + } + const out: Record = {}; + ctx.seen.set(obj, out); + for (const key of Object.keys(obj)) { + const childPath = path === '' ? key : `${path}.${key}`; + let child: unknown; + try { + child = (obj as Record)[key]; + } catch { + // A getter that throws is non-serializable — drop the property. + recordStrip(ctx, childPath); + continue; + } + out[key] = stripNonCloneable(child, ctx, depth + 1, childPath); + } + return out; +} + +/** Keys checked (top-level and one level deep) to attribute a record to a file. */ +const DEFAULT_PATH_KEYS = ['filePath', 'path', 'file'] as const; + +/** Read `obj[key]`, returning undefined if the access throws (throwing getter / Proxy trap). */ +function safeGet(obj: Record, key: string): unknown { + try { + return obj[key]; + } catch { + return undefined; + } +} + +/** Read a path key off a child object (one level deep); never throws. */ +function pathFromChild(child: unknown, pathKeys: readonly string[]): string | undefined { + if (child === null || typeof child !== 'object') return undefined; + const crec = child as Record; + for (const pk of pathKeys) { + const v = safeGet(crec, pk); + if (typeof v === 'string') return v; + } + return undefined; +} + +/** + * Best-effort source-path extraction for reporting; never throws. Reads are + * defensive (a throwing getter / Proxy trap on a path-attribution key must not + * escape and abandon the sanitize — it would re-arm the fail-closed cascade). + */ +function findFilePath(element: unknown, pathKeys: readonly string[]): string | undefined { + if (element === null || typeof element !== 'object') return undefined; + const rec = element as Record; + // Top level first — a ParsedFile carries `filePath` here. + for (const key of pathKeys) { + const v = safeGet(rec, key); + if (typeof v === 'string') return v; + } + // Known child next — a ParsedNode carries its path at `properties.filePath`. + // Prefer it over the generic sweep so attribution is deterministic when a + // sibling child also happens to carry a path-like key. + const fromProps = pathFromChild(safeGet(rec, 'properties'), pathKeys); + if (fromProps !== undefined) return fromProps; + // Generic one-level sweep as the fallback for other shapes. + let keys: string[]; + try { + keys = Object.keys(rec); + } catch { + return undefined; // a Proxy ownKeys trap that throws — give up on attribution + } + for (const key of keys) { + if (key === 'properties') continue; // already checked above + const fromChild = pathFromChild(safeGet(rec, key), pathKeys); + if (fromChild !== undefined) return fromChild; + } + return undefined; +} + +export interface MakeCloneSafeOptions { + /** + * Array field names whose offending elements are DROPPED whole rather than + * stripped in place (e.g. `parsedFiles` — its `captureSideChannel` drives + * edge resolution, so a stripped-and-delivered file would ship WRONG edges; + * dropping it lets scope-resolution re-derive it on the main thread). + */ + dropWholeElement: ReadonlySet; + /** Field names to skip entirely (e.g. the `skippedPaths` field itself). */ + skipFields?: ReadonlySet; + /** Keys to probe for a file path when attributing a skip. */ + pathKeys?: readonly string[]; +} + +/** + * Make a worker result's boundary-crossing array fields structured-cloneable, + * mutating `result` in place. Only arrays that actually contain a + * non-cloneable value are rewritten; everything else keeps referential + * identity. Returns the list of affected file paths for reporting. + * + * Call this after ANY failure of the fast-path post — a `DataCloneError`, OR a + * throwing getter's own error surfaced by structuredClone (the caller in + * `post-result.ts` recovers on any throw, not only `DataCloneError`). + */ +export function makeWorkerResultCloneSafe( + result: Record, + options: MakeCloneSafeOptions, +): { skipped: SkippedPath[] } { + const pathKeys = options.pathKeys ?? DEFAULT_PATH_KEYS; + const skipped: SkippedPath[] = []; + + for (const field of Object.keys(result)) { + if (options.skipFields?.has(field)) continue; + const value = result[field]; + if (!Array.isArray(value)) continue; + + const dropWhole = options.dropWholeElement.has(field); + // `out` is built lazily — only once a dirty element appears — by copying the + // clean prefix, so a fully-clean array is never rebuilt and keeps its + // referential identity (no field reassignment). A dirty element is scanned + // (containsNonCloneable) and then stripped (stripNonCloneable): two passes, + // deliberately. The non-allocating pre-scan is exactly what lets CLEAN + // elements stay by reference (zero-copy) — replacing it with an + // always-allocating strip would regress that. This whole path is + // failure-path-only (the fast post already threw), so the second pass over + // the rare dirty element is acceptable. + let out: unknown[] | null = null; + for (let i = 0; i < value.length; i++) { + const element = value[i]; + try { + if (!containsNonCloneable(element, new WeakSet())) { + if (out) out.push(element); + continue; + } + if (!out) out = value.slice(0, i); // first dirty element: copy clean prefix + const path = findFilePath(element, pathKeys) ?? '(unknown)'; + if (dropWhole) { + skipped.push({ path, reason: `dropped non-serializable ${field} entry` }); + continue; + } + const ctx: StripCtx = { stripped: 0, seen: new Map(), keys: [] }; + const cleaned = stripNonCloneable(element, ctx); + // Last-resort guard: if stripping functions/symbols still left something + // structured-clone rejects, drop the element rather than re-throw. + if (isStructuredCloneable(cleaned)) { + out.push(cleaned); + // Name the offending key path(s) so the leak is locatable from the log + // (e.g. "from nodes: properties.toString") — not just the array field. + const at = ctx.keys.slice(0, 3).join(', '); + const more = ctx.keys.length > 3 ? `, …+${ctx.keys.length - 3}` : ''; + skipped.push({ + path, + reason: `stripped ${ctx.stripped} non-serializable value(s) from ${field}: ${at}${more}`, + }); + } else { + skipped.push({ path, reason: `dropped unsalvageable ${field} entry` }); + } + } catch { + // A throw DURING this element's scan/strip — a Proxy with a throwing + // `getPrototypeOf`/`ownKeys` trap reached by Object.getPrototypeOf / + // Object.keys, or any other structural-enumeration throw. Drop the + // element rather than let the throw escape to postResultCloneSafe's + // fail-closed {type:'error'} (which under POOL_SIZE=1 re-arms the + // cascade this net prevents). One pathological element can't sink the + // whole result. + if (!out) out = value.slice(0, i); + skipped.push({ path: '(unknown)', reason: `dropped ${field} entry (sanitizer error)` }); + } + } + if (out) result[field] = out; + } + + // Final safety gate. The loop above only rewrites ARRAY fields, so a future + // non-array result sink (a nested object / Map) — or an array field whose own + // non-index property the element loop didn't reach — could still hold a + // non-cloneable value and throw on the re-post. Make "the returned result is + // structured-cloneable" a hard postcondition: strip any remaining offending + // field in place. Failure-path-only and a no-op once the result is already + // clean (the per-field probe short-circuits every clean field). + if (!isStructuredCloneable(result)) { + for (const field of Object.keys(result)) { + if (options.skipFields?.has(field)) continue; + if (isStructuredCloneable(result[field])) continue; + const ctx: StripCtx = { stripped: 0, seen: new Map(), keys: [] }; + result[field] = stripNonCloneable(result[field], ctx); + const at = ctx.keys.slice(0, 3).join(', '); + skipped.push({ + path: '(result)', + reason: `stripped ${ctx.stripped} non-serializable value(s) from ${field}${at ? `: ${at}` : ''}`, + }); + } + } + + return { skipped }; +} diff --git a/gitnexus/src/core/ingestion/workers/parse-worker.ts b/gitnexus/src/core/ingestion/workers/parse-worker.ts index aacc41e79e..941c464a48 100644 --- a/gitnexus/src/core/ingestion/workers/parse-worker.ts +++ b/gitnexus/src/core/ingestion/workers/parse-worker.ts @@ -25,6 +25,9 @@ import { deriveDefaultExportHocName, } from '../ts-js-hoc-utils.js'; import { parseSourceSafe } from '../../tree-sitter/safe-parse.js'; +import type { SkippedPath } from './clone-safety.js'; +import { postResultCloneSafe } from './post-result.js'; +import { mergeResult } from './result-merge.js'; import type { SymbolTableReader } from '../model/symbol-table.js'; import type { ExtractedRouterInclude, @@ -389,6 +392,15 @@ export interface ParseWorkerResult { */ parsedFiles: ParsedFile[]; skippedLanguages: Record; + /** + * Files whose parse output carried a value the structured-clone algorithm + * couldn't serialize across the worker boundary (#2112). The clone-safety + * net stripped or dropped the offending value so the result could be + * delivered; these paths are surfaced to the operator so the (rare) data + * loss is visible. Optional for cache backward compatibility — older cache + * entries predate the field; consumers must guard with `?? []`. + */ + skippedPaths?: SkippedPath[]; fileCount: number; } @@ -2333,40 +2345,8 @@ let accumulated: ParseWorkerResult = { fileCount: 0, }; let cumulativeProcessed = 0; - -// Use a loop instead of push(...spread) to avoid hitting V8's argument limit -// when merging large result sets (push(...arr) calls apply() under the hood -// and blows the stack when arr has >~65k elements). -const appendAll = (target: T[], src: T[]) => { - for (let i = 0; i < src.length; i++) target.push(src[i]); -}; - -const mergeResult = (target: ParseWorkerResult, src: ParseWorkerResult) => { - appendAll(target.nodes, src.nodes); - appendAll(target.relationships, src.relationships); - appendAll(target.symbols, src.symbols); - appendAll(target.calls, src.calls); - appendAll(target.assignments, src.assignments); - appendAll(target.routes, src.routes); - appendAll(target.fetchCalls, src.fetchCalls); - appendAll(target.fetchWrapperDefs, src.fetchWrapperDefs); - appendAll(target.decoratorRoutes, src.decoratorRoutes); - if (src.routerIncludes) appendAll(target.routerIncludes, src.routerIncludes); - if (src.routerImports) appendAll(target.routerImports, src.routerImports); - if (src.routerModuleAliases) { - target.routerModuleAliases ??= []; - appendAll(target.routerModuleAliases, src.routerModuleAliases); - } - appendAll(target.toolDefs, src.toolDefs); - appendAll(target.ormQueries, src.ormQueries); - appendAll(target.constructorBindings, src.constructorBindings); - appendAll(target.fileScopeBindings, src.fileScopeBindings); - appendAll(target.parsedFiles, src.parsedFiles); - for (const [lang, count] of Object.entries(src.skippedLanguages)) { - target.skippedLanguages[lang] = (target.skippedLanguages[lang] || 0) + count; - } - target.fileCount += src.fileCount; -}; +// `mergeResult` (+ its `appendAll`) lives in ./result-merge.ts (extracted so it +// can be unit-tested without importing this entry module). // Signal the pool that worker-side initialization (parser imports, language // grammars, type-env setup, all helper modules) is complete and the message @@ -2480,7 +2460,7 @@ parentPort!.on('message', (msg: WorkerIncomingMessage) => { accumulated.parsedFiles = []; } } - parentPort!.postMessage({ type: 'result', data: accumulated }); + postResultCloneSafe(accumulated); // Reset for potential reuse accumulated = { nodes: [], diff --git a/gitnexus/src/core/ingestion/workers/post-result.ts b/gitnexus/src/core/ingestion/workers/post-result.ts new file mode 100644 index 0000000000..9fb5de495b --- /dev/null +++ b/gitnexus/src/core/ingestion/workers/post-result.ts @@ -0,0 +1,91 @@ +/** + * Worker → main result delivery with clone-safety (#2112). + * + * Extracted from `parse-worker.ts` into its own side-effect-free module so it + * can be imported and exercised directly (the parse worker is an entry module: + * importing it would construct the parser, post `ready`, and attach the real + * message handler). The integration test imports `postResultCloneSafe` from + * here to cover the production wiring end to end rather than re-implementing it. + */ +import { parentPort } from 'node:worker_threads'; + +import { makeWorkerResultCloneSafe } from './clone-safety.js'; +import type { ParseWorkerResult } from './parse-worker.js'; + +/** + * Strict mode (opt-in via `GITNEXUS_STRICT_CLONE=1`, inherited by workers). When + * on, a clone failure THROWS with the offending key path instead of silently + * sanitizing + delivering — so a leak introduced by a future provider/extractor + * change fails LOUDLY (in CI / dev) at its origin rather than being quietly + * stripped in production. The silent-recovery behavior is exactly what hid the + * original #2112 leak; strict mode removes the silence where we want loudness. + * Off in production, where the net's job is to keep the run alive. + */ +const STRICT_CLONE = process.env.GITNEXUS_STRICT_CLONE === '1'; + +/** + * Deliver the accumulated result to the pool, surviving a non-cloneable value + * (#2112). Fast path: post as-is — on a healthy result this is the only thing + * that runs, so clone-safety adds zero overhead to normal runs. If structured + * clone rejects the payload (a function/symbol leaked into an extraction + * record — the reporter's case was a node `properties` value pointing at a + * native `toString`), rewrite the boundary-crossing arrays so the result is + * cloneable, record the affected paths on `result.skippedPaths`, warn the + * operator naming the offending field + file (so the still-unpinned leak is + * diagnosable from logs and fixable at source), and re-post. + * + * Recovery is attempted for ANY first-post failure, not only a `DataCloneError`. + * structuredClone invokes getters, and a getter that THROWS surfaces its own + * error (a `RangeError`, etc.) — NOT a `DataCloneError` (confirmed against a + * real MessageChannel). Gating recovery on `DataCloneError` let such a throw + * re-throw past the sanitizer and re-arm, under `POOL_SIZE=1`, the worker-death + * cascade this net prevents. The recovery path is wrapped in its own try/catch + * so a still-uncloneable re-post fails closed to a primitive-only + * `{type:'error'}` DELIBERATELY rather than escaping the worker. + */ +export function postResultCloneSafe(result: ParseWorkerResult): void { + try { + parentPort!.postMessage({ type: 'result', data: result }); + return; + } catch { + // Fall through to recovery on ANY failure (DataCloneError OR a throwing + // getter's own error). A healthy post returned above and never reaches here. + } + try { + // `as unknown as Record` is the standard widening for a + // no-index-signature interface (TS rejects a single-step `as`). The field + // sets are typed to `keyof ParseWorkerResult` so renaming a field is a + // compile error here, not a silent loss of the drop-whole / skip protection. + const { skipped } = makeWorkerResultCloneSafe(result as unknown as Record, { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }); + if (skipped.length > 0) { + if (STRICT_CLONE) { + // Surface the leak loudly with its exact key path(s) instead of + // delivering a sanitized result. Routes to the catch below → a + // primitive-only {type:'error'} the pool reports, failing CI. + const detail = skipped.map((s) => `${s.path}: ${s.reason}`).join('; '); + throw new Error( + `GITNEXUS_STRICT_CLONE: worker result was not structured-cloneable — ${detail}`, + ); + } + result.skippedPaths = [...(result.skippedPaths ?? []), ...skipped]; + const sample = skipped + .slice(0, 5) + .map((s) => `${s.path} (${s.reason})`) + .join('; '); + const more = skipped.length > 5 ? ` …and ${skipped.length - 5} more` : ''; + if (parentPort) { + parentPort.postMessage({ + type: 'warning', + message: `Sanitized ${skipped.length} file(s) with non-serializable parse output before delivery: ${sample}${more}`, + }); + } + } + parentPort!.postMessage({ type: 'result', data: result }); + } catch (err) { + const e = err instanceof Error ? err : new Error(String(err)); + parentPort!.postMessage({ type: 'error', error: e.message, errorStack: e.stack }); + } +} diff --git a/gitnexus/src/core/ingestion/workers/result-merge.ts b/gitnexus/src/core/ingestion/workers/result-merge.ts new file mode 100644 index 0000000000..423709f894 --- /dev/null +++ b/gitnexus/src/core/ingestion/workers/result-merge.ts @@ -0,0 +1,56 @@ +/** + * Merge of accumulated parse-worker results (sub-batch result → the conceptual + * job's running accumulator). + * + * Extracted from `parse-worker.ts` into this side-effect-free module so the + * merge can be imported and unit-tested directly — the parse worker is an entry + * module (importing it constructs the parser, posts `ready`, and attaches the + * real MessagePort handler), so a main-thread test cannot import a helper out of + * it. Mirrors the `post-result.ts` extraction. + * + * `import type` of `ParseWorkerResult` is erased at runtime, so there is no + * import cycle with `parse-worker.ts` (which imports this module's runtime). + */ +import type { ParseWorkerResult } from './parse-worker.js'; + +// Use a loop instead of push(...spread) to avoid hitting V8's argument limit +// when merging large result sets (push(...arr) calls apply() under the hood +// and blows the stack when arr has >~65k elements). +const appendAll = (target: T[], src: T[]): void => { + for (let i = 0; i < src.length; i++) target.push(src[i]); +}; + +/** + * Merge `src` into `target` in place: append every boundary-crossing array, + * sum the per-language skip counts, union the clone-safety `skippedPaths`, and + * add the file count. + */ +export const mergeResult = (target: ParseWorkerResult, src: ParseWorkerResult): void => { + appendAll(target.nodes, src.nodes); + appendAll(target.relationships, src.relationships); + appendAll(target.symbols, src.symbols); + appendAll(target.calls, src.calls); + appendAll(target.assignments, src.assignments); + appendAll(target.routes, src.routes); + appendAll(target.fetchCalls, src.fetchCalls); + appendAll(target.fetchWrapperDefs, src.fetchWrapperDefs); + appendAll(target.decoratorRoutes, src.decoratorRoutes); + if (src.routerIncludes) appendAll(target.routerIncludes, src.routerIncludes); + if (src.routerImports) appendAll(target.routerImports, src.routerImports); + if (src.routerModuleAliases) { + target.routerModuleAliases ??= []; + appendAll(target.routerModuleAliases, src.routerModuleAliases); + } + appendAll(target.toolDefs, src.toolDefs); + appendAll(target.ormQueries, src.ormQueries); + appendAll(target.constructorBindings, src.constructorBindings); + appendAll(target.fileScopeBindings, src.fileScopeBindings); + appendAll(target.parsedFiles, src.parsedFiles); + for (const [lang, count] of Object.entries(src.skippedLanguages)) { + target.skippedLanguages[lang] = (target.skippedLanguages[lang] || 0) + count; + } + if (src.skippedPaths && src.skippedPaths.length > 0) { + (target.skippedPaths ??= []).push(...src.skippedPaths); + } + target.fileCount += src.fileCount; +}; diff --git a/gitnexus/src/core/ingestion/workers/worker-pool.ts b/gitnexus/src/core/ingestion/workers/worker-pool.ts index 5082453a0a..038d8ed654 100644 --- a/gitnexus/src/core/ingestion/workers/worker-pool.ts +++ b/gitnexus/src/core/ingestion/workers/worker-pool.ts @@ -858,6 +858,12 @@ function createJobs( * time spent across all attempts/splits/retries. When the budget is * exhausted, the pool surfaces the in-flight path via `WorkerPoolDispatchError` * instead of letting timeouts compound indefinitely. + * + * Upstream of these layers, the parse worker self-sanitizes a result that the + * structured-clone algorithm can't serialize (#2112) — stripping or dropping + * the offending value and reporting the affected paths on the result — so a + * single non-cloneable value can't masquerade as a worker death and exhaust a + * slot's respawn budget here. */ export const createWorkerPool = ( workerUrl: URL, @@ -1816,14 +1822,19 @@ export const createWorkerPool = ( if (slotGenerations[workerIndex] !== slotGen) return; if (settled || stopped) return; // Native postMessage delivers POJO directly via Node's - // structured clone. V8 deserialization failures (malformed - // frame, non-cloneable value) surface as a `messageerror` - // event handled below — they never reach this handler. The - // only thing we need to guard for here is a worker that - // sends a message without a `type` discriminant (a bug in - // the worker, not a wire-format issue): without the guard - // `null.type` would throw a TypeError out of the - // EventEmitter listener → uncaughtException on the main + // structured clone. Two distinct clone failure modes exist, + // and NEITHER reaches this handler: (1) a SENDER-side + // non-cloneable value (a function/symbol that leaked into the + // result) throws a synchronous `DataCloneError` on the + // worker's own postMessage — the parse worker self-sanitizes + // such results before delivery (#2112) and falls back to a + // primitive-only `{type:'error'}` if it still can't serialize; + // (2) a RECEIVER-side deserialization failure surfaces as a + // `messageerror` event handled below. The only thing THIS + // handler guards is a worker that sends a message without a + // `type` discriminant (a worker bug, not a wire-format issue): + // without the guard `null.type` would throw a TypeError out of + // the EventEmitter listener → uncaughtException on the main // thread. const msg = raw as WorkerOutgoingMessage; if (msg === null || typeof msg !== 'object' || typeof msg.type !== 'string') { @@ -1931,12 +1942,15 @@ export const createWorkerPool = ( } }; - // `messageerror` fires when V8 fails to deserialize a postMessage - // payload (e.g., the worker tries to send a non-cloneable value - // back, or structured-clone hits an unsupported shape). The worker - // stays ALIVE but the message is lost — without this handler the - // pool would sit on the dropped message until the idle timeout - // expires. Treat it as worker death so the resilience layers fire: + // `messageerror` fires when V8 fails to DESERIALIZE a postMessage + // payload on THIS (receiver) side — a value that serialized on the + // worker but can't be reconstructed here. (A non-cloneable value on + // the SENDER side instead throws a synchronous DataCloneError on the + // worker's own postMessage; that path is caught and sanitized + // worker-side (#2112) and never arrives here.) The worker stays ALIVE + // but the message is lost — without this handler the pool would sit on + // the dropped message until the idle timeout expires. Treat it as + // worker death so the resilience layers fire: // requeue the remainder via `recoverAndResume`, attribute the // in-flight file from the `starting-file` signal (if observed), // and let the per-slot respawn budget and circuit breaker decide diff --git a/gitnexus/src/server/analyze-worker-ipc.ts b/gitnexus/src/server/analyze-worker-ipc.ts new file mode 100644 index 0000000000..335e8cb839 --- /dev/null +++ b/gitnexus/src/server/analyze-worker-ipc.ts @@ -0,0 +1,72 @@ +/** + * JSON-safe projection of `AnalyzeResult` for the analyze-worker → parent IPC + * boundary (#2112 boundary audit; #2135). + * + * The forked analyze worker (`analyze-worker.ts`) reports completion to the + * parent over `child_process` IPC, which uses Node's DEFAULT `'json'` + * serialization — `api.ts` forks the worker with no `serialization:` option, so + * the channel runs `JSON.stringify`/`JSON.parse`, NOT V8 structured clone. + * + * `AnalyzeResult.pipelineResult` is populated on every successful analysis + * (`run-analyze.ts`) and carries `pipelineResult.graph` — the live + * `KnowledgeGraph` closure object. Sending the raw result across this channel is + * wrong three ways: + * 1. Waste — the graph's `nodes`/`relationships` getters force-materialize the + * ENTIRE graph into two arrays, then JSON-stringify them, on every analyze. + * On a large repo (the #2112 scenario) that is a multi-hundred-MB + * stringify+parse whose result is immediately discarded. + * 2. Silent corruption — the graph's methods are own function properties; + * `JSON.stringify` drops them with no error, so a `pipelineResult.graph` + * that survived the wire is a data-only husk whose `forEachNode(...)` throws + * "is not a function" far from the cause. + * 3. Conditional crash — a BigInt or circular reference anywhere in the + * payload makes `process.send` throw `TypeError` synchronously; the throw is + * caught in the worker and re-sent as `{type:'error'}`, turning a + * SUCCESSFUL analysis (DB already written) into a reported FAILURE. This is + * the #2112 failure family on the server path, and — unlike the parse-worker + * result boundary — it has no clone-safety net. + * + * The parent (`api.ts`) reads only `result.repoName`; `pipelineResult`'s real + * consumers (CLI skill generation) call `runFullAnalysis` in-process and never + * cross this fork. So the worker sends an explicit allowlist of the scalar + * fields, JSON-safe by construction. + */ +import type { AnalyzeResult } from '../core/run-analyze.js'; + +/** + * The JSON-safe subset of `AnalyzeResult` that crosses the analyze-worker IPC + * boundary. A `Pick` allowlist — NOT `Omit<…, 'pipelineResult'>`. With `Pick` + * the allowlist IS the type, so the projection is exhaustive by construction: + * `projectAnalyzeResultForIpc`'s return literal must name exactly these keys + * (omitting one is a compile error), and a new field added to `AnalyzeResult` + * is simply absent from the wire until it is *deliberately* added here. `Omit` + * couldn't give that guarantee — it kept every other field, including OPTIONAL + * ones (e.g. `isPrimaryBranch?`), so an optional non-serializable field could be + * advertised by the type yet silently dropped by the runtime allowlist. + * + * `isPrimaryBranch` is intentionally excluded: the parent (`api.ts`) reads only + * `repoName`, and nothing consumes `isPrimaryBranch` across this fork (its CLI + * consumer calls `runFullAnalysis` in-process). Add a field here only when a + * server-side IPC consumer actually needs it — and only if it is JSON-safe. + */ +export type AnalyzeResultIpc = Pick< + AnalyzeResult, + 'repoName' | 'repoPath' | 'stats' | 'alreadyUpToDate' | 'ftsRepairedOnly' | 'ftsSkipped' +>; + +/** + * Project an `AnalyzeResult` down to the JSON-safe fields the parent consumes, + * dropping `pipelineResult` (the live `KnowledgeGraph`) and any other field not + * in the `AnalyzeResultIpc` allowlist. The return literal is exhaustive over + * `AnalyzeResultIpc` (a missing key is a compile error). + */ +export function projectAnalyzeResultForIpc(result: AnalyzeResult): AnalyzeResultIpc { + return { + repoName: result.repoName, + repoPath: result.repoPath, + stats: result.stats, + alreadyUpToDate: result.alreadyUpToDate, + ftsRepairedOnly: result.ftsRepairedOnly, + ftsSkipped: result.ftsSkipped, + }; +} diff --git a/gitnexus/src/server/analyze-worker.ts b/gitnexus/src/server/analyze-worker.ts index a87b341795..1cacd1d2fb 100644 --- a/gitnexus/src/server/analyze-worker.ts +++ b/gitnexus/src/server/analyze-worker.ts @@ -11,7 +11,8 @@ * Child -> Parent: { type: 'error', message: string } */ -import { runFullAnalysis, type AnalyzeOptions, type AnalyzeResult } from '../core/run-analyze.js'; +import { runFullAnalysis, type AnalyzeOptions } from '../core/run-analyze.js'; +import { projectAnalyzeResultForIpc, type AnalyzeResultIpc } from './analyze-worker-ipc.js'; import { closeLbug } from '../core/lbug/lbug-adapter.js'; interface StartMessage { @@ -29,7 +30,9 @@ interface ProgressMessage { interface CompleteMessage { type: 'complete'; - result: AnalyzeResult; + // JSON-safe projection (no `pipelineResult` / live KnowledgeGraph). This + // channel is default-JSON child_process IPC — see analyze-worker-ipc.ts. + result: AnalyzeResultIpc; } interface ErrorMessage { @@ -79,7 +82,12 @@ process.on('message', async (msg: StartMessage) => { }, }); - send({ type: 'complete', result }); + // Send a JSON-safe projection, NOT the raw result: the IPC channel is + // default-JSON serialization and `result.pipelineResult` carries the live + // KnowledgeGraph (wasteful to materialize, silently corrupted by JSON, and + // a BigInt/circular value would throw and mis-report this success as a + // failure). See analyze-worker-ipc.ts. + send({ type: 'complete', result: projectAnalyzeResultForIpc(result) }); } catch (err: any) { send({ type: 'error', message: err?.message || 'Analysis failed' }); } diff --git a/gitnexus/src/storage/parse-cache.ts b/gitnexus/src/storage/parse-cache.ts index c61ade5ef6..0e683389d7 100644 --- a/gitnexus/src/storage/parse-cache.ts +++ b/gitnexus/src/storage/parse-cache.ts @@ -202,6 +202,9 @@ export const slimParseWorkerResultsForCache = ( assignments: [], constructorBindings: [], parsedFiles: [], + // #2112: a clone-safety skip list is per-run telemetry, not graph data — + // replay ignores it. Drop it so it doesn't bloat the cached shard. + skippedPaths: [], }); } return slim; diff --git a/gitnexus/test/integration/parse-impl-clone-skip.test.ts b/gitnexus/test/integration/parse-impl-clone-skip.test.ts new file mode 100644 index 0000000000..1fd155cf09 --- /dev/null +++ b/gitnexus/test/integration/parse-impl-clone-skip.test.ts @@ -0,0 +1,291 @@ +/** + * #2112 — Integration regression test for the worker result clone-safety net. + * + * Reproduces the deterministic large-repo killer: a parse worker whose + * accumulated result carries a value the structured-clone algorithm can't + * serialize (the reporter's case was a node `properties` value pointing at a + * native `toString`). Before the fix, `parentPort.postMessage({type:'result', + * data})` threw a `DataCloneError` SENDER-side; the worker re-posted it as + * `{type:'error'}`, the pool counted it as a worker death, and under + * `GITNEXUS_WORKER_POOL_SIZE=1` the same graph re-threw on every respawn until + * the slot's budget was exhausted and the whole parse phase aborted. + * + * Runs with REAL `worker_threads` + `createWorkerPool` over the production + * pool / merge / graph wiring, under `workerPoolSize: 1` (matching the + * conservative workaround that still failed in the issue). The GREEN worker is + * an ESM module that statically imports and calls the REAL built + * `postResultCloneSafe` from `dist/` — so this exercises the actual production + * delivery wiring across a real `postMessage` boundary (the fake-worker doubles + * used by the unit suite bypass structured clone entirely and can't reproduce + * the failure). + * + * Build prerequisite: the worker imports `dist/.../post-result.js`, so + * `node scripts/build.js` must run first (the `pretest:integration` step does + * this; a stale `dist/` would test old behavior). + */ +import { describe, it, expect, afterEach, beforeEach } from 'vitest'; +import { tmpdir } from 'node:os'; +import { mkdtempSync, writeFileSync, mkdirSync, rmSync, statSync } from 'node:fs'; +import path from 'node:path'; +import { pathToFileURL } from 'node:url'; + +import { createKnowledgeGraph } from '../../src/core/graph/graph.js'; +import { runChunkedParseAndResolve } from '../../src/core/ingestion/pipeline-phases/parse-impl.js'; +import { _captureLogger } from '../../src/core/logger.js'; + +// file:// URL of the BUILT production result-delivery helper, imported by the +// ESM test worker so it exercises the REAL postResultCloneSafe wiring (the +// {type:'warning'} post + skippedPaths append), not a re-implementation. +const POST_RESULT_URL = new URL('../../dist/core/ingestion/workers/post-result.js', import.meta.url) + .href; + +const ACCUMULATED_INIT = `{ + nodes: [], relationships: [], symbols: [], calls: [], assignments: [], + routes: [], fetchCalls: [], fetchWrapperDefs: [], decoratorRoutes: [], + routerIncludes: [], routerImports: [], toolDefs: [], ormQueries: [], + constructorBindings: [], fileScopeBindings: [], parsedFiles: [], + skippedLanguages: {}, fileCount: 0, +}`; + +/** + * Synthesizes a Function node per file. For `poison.ts` it leaks an own native + * `toString` into the node's `properties` — the exact #2112 shape that throws + * `DataCloneError` across the real worker boundary. + */ +const SUB_BATCH_HANDLER = ` + if (msg && msg.type === 'sub-batch') { + for (const file of msg.files) { + const baseName = file.path.split('/').pop().replace(/\\.ts$/, ''); + const properties = { + name: baseName, filePath: file.path, startLine: 1, endLine: 1, + language: 'typescript', isExported: true, + }; + if (file.path.endsWith('poison.ts')) { + properties.toString = Object.prototype.toString; // native fn → non-cloneable + } + accumulated.nodes.push({ id: 'func:' + file.path, label: 'Function', properties }); + accumulated.fileCount++; + } + parentPort.postMessage({ type: 'progress', filesProcessed: accumulated.fileCount }); + parentPort.postMessage({ type: 'sub-batch-done' }); + return; + }`; + +/** + * GREEN worker: delivers via the REAL production postResultCloneSafe, so this + * test covers the actual wiring (the {type:'warning'} post + skippedPaths + * append), not a re-implementation that could drift from production. + */ +const CLONE_SAFE_WORKER = ` +import { parentPort } from 'node:worker_threads'; +import { postResultCloneSafe } from '${POST_RESULT_URL}'; +const accumulated = ${ACCUMULATED_INIT}; +parentPort.postMessage({ type: 'ready' }); +parentPort.on('message', (msg) => { +${SUB_BATCH_HANDLER} + if (msg && msg.type === 'flush') { + postResultCloneSafe(accumulated); + } +}); +`; + +/** RED control: posts the non-cloneable result raw (no clone-safety net). */ +const RAW_WORKER = ` +import { parentPort } from 'node:worker_threads'; +const accumulated = ${ACCUMULATED_INIT}; +parentPort.postMessage({ type: 'ready' }); +parentPort.on('message', (msg) => { +${SUB_BATCH_HANDLER} + if (msg && msg.type === 'flush') { + parentPort.postMessage({ type: 'result', data: accumulated }); + } +}); +`; + +/** + * GETTER worker: poison.ts's node carries an own-enumerable getter that THROWS. + * structuredClone invokes getters, so this surfaces a RangeError — NOT a + * DataCloneError — at the boundary. The net must still recover (route it into + * the sanitizer), not re-throw past it. Delivers via the real postResultCloneSafe. + */ +const GETTER_WORKER = ` +import { parentPort } from 'node:worker_threads'; +import { postResultCloneSafe } from '${POST_RESULT_URL}'; +const accumulated = ${ACCUMULATED_INIT}; +parentPort.postMessage({ type: 'ready' }); +parentPort.on('message', (msg) => { + if (msg && msg.type === 'sub-batch') { + for (const file of msg.files) { + const baseName = file.path.split('/').pop().replace(/\\.ts$/, ''); + const properties = { + name: baseName, filePath: file.path, startLine: 1, endLine: 1, + language: 'typescript', isExported: true, + }; + if (file.path.endsWith('poison.ts')) { + Object.defineProperty(properties, 'boom', { + enumerable: true, + get() { throw new RangeError('boom getter'); }, + }); + } + accumulated.nodes.push({ id: 'func:' + file.path, label: 'Function', properties }); + accumulated.fileCount++; + } + parentPort.postMessage({ type: 'progress', filesProcessed: accumulated.fileCount }); + parentPort.postMessage({ type: 'sub-batch-done' }); + return; + } + if (msg && msg.type === 'flush') { + postResultCloneSafe(accumulated); + } +}); +`; + +const FIXTURE_FILES = { + 'src/good_a.ts': 'export function good_a() { return 1; }\n', + 'src/poison.ts': 'export function poison() { return 2; }\n', + 'src/good_c.ts': 'export function good_c() { return 3; }\n', +}; + +const nodeNames = (graph: ReturnType): Set => { + const names = new Set(); + for (const n of graph.nodes.values()) { + if (n.label === 'Function') { + const name = (n.properties as { name?: string }).name; + if (name) names.add(name); + } + } + return names; +}; + +// These cases deliberately inject non-cloneable values, so they're meaningless +// under a global GITNEXUS_STRICT_CLONE=1 run (strict turns the sanitize into a +// throw). Skip the whole suite there — a global strict lane's value is running +// the REAL-extractor integration tests under strict, not this synthetic one. +// The strict-mode case below sets the flag itself (self-contained). +const STRICT = process.env.GITNEXUS_STRICT_CLONE === '1'; + +describe.skipIf(STRICT)('#2112: worker result clone-safety integration (POOL_SIZE=1)', () => { + let tempDir: string; + let repoDir: string; + + const writeWorker = (script: string): URL => { + const p = path.join(tempDir, `clone-skip-worker-${Math.abs(hash(script))}.mjs`); + writeFileSync(p, script); + return pathToFileURL(p) as URL; + }; + // Stable name without Math.random (banned in this harness elsewhere) — index by content. + const hash = (s: string): number => { + let h = 0; + for (let i = 0; i < s.length; i++) h = (h * 31 + s.charCodeAt(i)) | 0; + return h; + }; + + const runWith = async (workerUrl: URL): Promise> => { + const filePaths = Object.keys(FIXTURE_FILES); + const scanned = filePaths.map((rel) => ({ + path: rel, + size: statSync(path.join(repoDir, rel)).size, + })); + const graph = createKnowledgeGraph(); + await runChunkedParseAndResolve( + graph, + scanned, + filePaths, + filePaths.length, + repoDir, + 1, // deterministic start time (Date.now is banned in this harness) + () => {}, + { + skipWorkers: false, + workerUrlForTest: workerUrl, + workerPoolSize: 1, // poison lands on the only slot — the issue's workaround config + }, + ); + return graph; + }; + + beforeEach(() => { + tempDir = mkdtempSync(path.join(tmpdir(), 'parse-impl-clone-skip-')); + repoDir = path.join(tempDir, 'repo'); + mkdirSync(repoDir, { recursive: true }); + for (const [rel, content] of Object.entries(FIXTURE_FILES)) { + const full = path.join(repoDir, rel); + mkdirSync(path.dirname(full), { recursive: true }); + writeFileSync(full, content); + } + }); + + afterEach(() => { + rmSync(tempDir, { recursive: true, force: true }); + }); + + it('GREEN: a non-cloneable result is sanitized and delivered; the run completes with all files', async () => { + // Capture the production telemetry (parsing-processor logs the per-file skip + // with its path + reason) so this asserts the REAL skippedPaths/warning + // wiring surfaced, not just that the graph ended up correct. + const cap = _captureLogger(); + try { + const graph = await runWith(writeWorker(CLONE_SAFE_WORKER)); + const names = nodeNames(graph); + // Survivors AND the sanitized poison file are all present — the run did not abort. + expect(names.has('good_a')).toBe(true); + expect(names.has('good_c')).toBe(true); + // The poison node is delivered with its legitimate data intact (only the + // leaked native `toString` was stripped), so it still lands in the graph. + expect(names.has('poison')).toBe(true); + // The clone-safety telemetry surfaced the offending file AND the exact + // stripped key path — the wiring this suite claims to cover. + const msgs = cap.records().map((r) => String(r.msg ?? '')); + const skipLine = msgs.find( + (m) => m.includes('poison.ts') && m.includes('properties.toString'), + ); + expect( + skipLine, + `expected a sanitize warning naming poison.ts + properties.toString; saw: ${msgs.join(' | ')}`, + ).toBeDefined(); + } finally { + cap.restore(); + } + }); + + it('GREEN: a throwing getter (RangeError, not DataCloneError) is recovered, not re-thrown', async () => { + // structuredClone invokes getters; a throwing getter surfaces its own + // RangeError at the boundary. The net must route it into the sanitizer + // (which drops the offending property) rather than re-throwing past it and + // re-arming the POOL_SIZE=1 worker-death cascade. Without the fix this run + // rejects; with it, all files (incl. the sanitized poison node) are present. + const graph = await runWith(writeWorker(GETTER_WORKER)); + const names = nodeNames(graph); + expect(names.has('good_a')).toBe(true); + expect(names.has('good_c')).toBe(true); + expect(names.has('poison')).toBe(true); + }); + + it('strict mode (GITNEXUS_STRICT_CLONE=1) surfaces the leak loudly with the key path, not silent sanitize', async () => { + // The spawned worker inherits process.env, so postResultCloneSafe runs in + // strict mode: instead of sanitizing + delivering, it THROWS with the exact + // offending key path → the run rejects (a real future extractor leak would + // fail CI loudly at its origin instead of being silently stripped in prod). + const prev = process.env.GITNEXUS_STRICT_CLONE; + process.env.GITNEXUS_STRICT_CLONE = '1'; + try { + await expect(runWith(writeWorker(CLONE_SAFE_WORKER))).rejects.toThrow( + /STRICT_CLONE|not structured-cloneable|properties\.toString/i, + ); + } finally { + if (prev === undefined) delete process.env.GITNEXUS_STRICT_CLONE; + else process.env.GITNEXUS_STRICT_CLONE = prev; + } + }); + + it('RED control: without clone-safety, the same poison result aborts the parse phase', async () => { + // Pre-fix behavior: the raw non-cloneable result throws DataCloneError in + // the worker; under POOL_SIZE=1 the pool exhausts the slot's respawn + // budget and rejects. The matcher is the specific contract — not a bare + // .toThrow() — so an unrelated failure (spawn error, stale dist) can't pass + // it and mask a broken RED→GREEN flip. + await expect(runWith(writeWorker(RAW_WORKER))).rejects.toThrow( + /circuit breaker|consecutive failures|respawn budget|could not be cloned/i, + ); + }); +}); diff --git a/gitnexus/test/unit/analyze-worker-ipc.test.ts b/gitnexus/test/unit/analyze-worker-ipc.test.ts new file mode 100644 index 0000000000..5d4738bee6 --- /dev/null +++ b/gitnexus/test/unit/analyze-worker-ipc.test.ts @@ -0,0 +1,107 @@ +/** + * #2112 boundary audit — the analyze-worker → parent IPC projection. + * + * The forked analyze worker reports completion over default-JSON child_process + * IPC. `AnalyzeResult.pipelineResult` carries the live `KnowledgeGraph` (closure + * methods, getter-materialized arrays) and can transitively hold a BigInt or a + * circular reference — all of which break `JSON.stringify` (the IPC serializer): + * methods drop silently, BigInt/circular THROW. `projectAnalyzeResultForIpc` + * strips `pipelineResult` to an explicit JSON-safe allowlist so the boundary is + * safe by construction. These tests pin that contract. + */ +import { describe, it, expect } from 'vitest'; + +import { projectAnalyzeResultForIpc } from '../../src/server/analyze-worker-ipc.js'; +import type { AnalyzeResult } from '../../src/core/run-analyze.js'; +import { createKnowledgeGraph } from '../../src/core/graph/graph.js'; + +/** + * An `AnalyzeResult` whose `pipelineResult` is hostile to JSON in all three + * ways the real `KnowledgeGraph` can be: a method (dropped), a circular ref + * (throws), and a BigInt (throws). + */ +function hostileResult(): AnalyzeResult { + const graph: Record = { + nodes: [{ id: 'a', label: 'Function', properties: { name: 'a' } }], + relationships: [], + forEachNode: () => {}, // own function property — JSON drops it silently + bigCount: 10n, // BigInt — JSON.stringify throws + }; + graph.self = graph; // circular — JSON.stringify throws + return { + repoName: 'demo', + repoPath: '/repos/demo', + stats: { files: 3, nodes: 1, edges: 0 }, + alreadyUpToDate: false, + ftsSkipped: true, + pipelineResult: { graph, repoPath: '/repos/demo', totalFileCount: 3 }, + }; +} + +describe('#2112: analyze-worker IPC projection', () => { + it('drops pipelineResult and preserves the scalar fields the parent consumes', () => { + const projected = projectAnalyzeResultForIpc(hostileResult()); + expect(projected).toEqual({ + repoName: 'demo', + repoPath: '/repos/demo', + stats: { files: 3, nodes: 1, edges: 0 }, + alreadyUpToDate: false, + ftsRepairedOnly: undefined, + ftsSkipped: true, + }); + expect('pipelineResult' in projected).toBe(false); + }); + + it('the projection is JSON-serializable (survives the default child_process IPC channel)', () => { + const projected = projectAnalyzeResultForIpc(hostileResult()); + // The real failure mode: process.send runs JSON.stringify under the hood. + expect(() => JSON.stringify(projected)).not.toThrow(); + const roundTripped = JSON.parse(JSON.stringify(projected)); + expect(roundTripped.repoName).toBe('demo'); + expect(roundTripped.stats.nodes).toBe(1); + }); + + it('anchors the hazard: serializing the RAW result throws (the bug the projection prevents)', () => { + // Without the projection, `send({type:'complete', result})` would throw a + // TypeError in the worker, get caught, and mis-report this success as a + // failure. This assertion documents why the projection exists. + expect(() => JSON.stringify(hostileResult())).toThrow(TypeError); + }); + + it('drops a REAL KnowledgeGraph from pipelineResult — the payload stays tiny (graph not materialized)', () => { + // The synthetic cases above use a hand-built hostile object; this uses the + // real createKnowledgeGraph the server path actually puts in pipelineResult, + // whose nodes/relationships getters would materialize the whole graph into + // arrays under JSON.stringify. The projection must drop it entirely. + const graph = createKnowledgeGraph(); + for (let i = 0; i < 50; i++) { + graph.addNode({ + id: `n${i}`, + label: 'Function', + properties: { name: `n${i}`, filePath: 'x.ts' }, + }); + } + graph.addRelationship({ id: 'n0->n1', source: 'n0', target: 'n1', type: 'CALLS' }); + const result: AnalyzeResult = { + repoName: 'demo', + repoPath: '/r', + stats: { nodes: 50, edges: 1 }, + pipelineResult: { + graph, + repoPath: '/r', + totalFileCount: 50, + resolutionOutcomes: [], + usedWorkerPool: true, + }, + }; + const projected = projectAnalyzeResultForIpc(result); + expect('pipelineResult' in projected).toBe(false); + const json = JSON.stringify(projected); + // A materialized 50-node graph would be thousands of bytes; the projection + // is just the scalar fields, so this stays small. + expect(json.length).toBeLessThan(300); + const rt = JSON.parse(json); + expect(rt.repoName).toBe('demo'); + expect(rt.stats.nodes).toBe(50); + }); +}); diff --git a/gitnexus/test/unit/clone-safety-cloneable.test.ts b/gitnexus/test/unit/clone-safety-cloneable.test.ts new file mode 100644 index 0000000000..6064245617 --- /dev/null +++ b/gitnexus/test/unit/clone-safety-cloneable.test.ts @@ -0,0 +1,86 @@ +/** + * #2143 — compile-time boundary guard: `Cloneable` + `assertCloneable()`. + * + * `assertCloneable` is a runtime identity (zero cost); its real value is the + * compile-time guarantee that a producer feeding an `unknown` worker-result + * sink returns only structured-clone-safe data. The `@ts-expect-error` lines + * below ARE the assertions — they make the type-check fail (`tsconfig.test.json`) + * if a non-cloneable payload ever becomes assignable to `Cloneable`; the + * runtime cases pin the identity contract callers rely on. + */ +import { describe, it, expect } from 'vitest'; + +import { assertCloneable, type Cloneable } from '../../src/core/ingestion/workers/clone-safety.js'; + +describe('#2143: assertCloneable runtime identity', () => { + it('returns clone-safe values unchanged (zero-cost identity)', () => { + const obj = { kind: 'cpp' as const, names: ['a', 'b'], depth: 3, ok: true }; + expect(assertCloneable(obj)).toBe(obj); + + const withMap = { m: new Map([['a', 1]]) as ReadonlyMap }; + expect(assertCloneable(withMap)).toBe(withMap); + + // `X | undefined` (the real shape provider hooks return — collectFoo(): Foo | undefined). + const maybe: string | undefined = undefined; + expect(assertCloneable(maybe)).toBeUndefined(); + + const nested = { a: { b: { c: [1, 2, 3] } } }; + expect(assertCloneable(nested)).toBe(nested); + }); + + it('a guarded value really is structured-cloneable (the runtime claim behind the type)', () => { + const payload = { kind: 'cpp' as const, ranges: ['1:2'], inner: { xs: [1, 2] } }; + const guarded = assertCloneable(payload); + expect(() => structuredClone(guarded)).not.toThrow(); + }); +}); + +describe('#2143: Cloneable compile-time rejection (type-level)', () => { + it('accepts clean interface payloads and rejects function/symbol members', () => { + interface Clean { + readonly kind: 'cpp'; + readonly names: readonly string[]; + readonly inner: { readonly n: number }; + } + const clean: Clean = { kind: 'cpp', names: ['a'], inner: { n: 1 } }; + expect(assertCloneable(clean)).toBe(clean); // compiles — clean interface is Cloneable + + interface LeakyFn { + readonly name: string; + readonly toString: () => string; + } + const leakyFn: LeakyFn = { name: 'x', toString: () => 'x' }; + // @ts-expect-error — a function member is not Cloneable (toString resolves to never) + assertCloneable(leakyFn); + + interface LeakySym { + readonly tag: symbol; + } + const leakySym: LeakySym = { tag: Symbol('t') }; + // @ts-expect-error — a symbol member is not Cloneable (tag resolves to never) + assertCloneable(leakySym); + + // R4 (#2135 tri-review): an `any`-typed member must NOT defeat the guard. + interface LeakyAny { + readonly name: string; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + readonly bag: any; + } + const leakyAny: LeakyAny = { name: 'x', bag: () => {} }; + // @ts-expect-error — an `any` member resolves to never, so the payload is rejected + assertCloneable(leakyAny); + + // The guard must not be vacuous — these resolve to `never` at the type level. + type FnIsNever = [Cloneable<() => void>] extends [never] ? true : false; + type SymIsNever = [Cloneable] extends [never] ? true : false; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + type AnyIsNever = [Cloneable] extends [never] ? true : false; + const fnIsNever: FnIsNever = true; + const symIsNever: SymIsNever = true; + const anyIsNever: AnyIsNever = true; + // Array equality (not `&&`) so this isn't a trivial-always-true conditional; + // the real assertions are the `: …IsNever = true` annotations above, which + // fail to compile if any guard regresses. + expect([fnIsNever, symIsNever, anyIsNever]).toEqual([true, true, true]); + }); +}); diff --git a/gitnexus/test/unit/clone-safety-payloads.test.ts b/gitnexus/test/unit/clone-safety-payloads.test.ts new file mode 100644 index 0000000000..2f2dabe8c1 --- /dev/null +++ b/gitnexus/test/unit/clone-safety-payloads.test.ts @@ -0,0 +1,44 @@ +/** + * #2143 — the concrete worker-boundary payload types are `Cloneable`. + * + * The provider hooks (`extractTemplateConstraints`, `collectCaptureSideChannel`) + * feed `unknown`-typed worker-result sinks and are wrapped in `assertCloneable` + * at their source sites (c-cpp.ts, kotlin.ts). These type-level assertions pin + * the guarantee INDEPENDENTLY of that wiring: if a future field added to any of + * these payloads is non-serializable (a function, symbol, …), `Cloneable` + * resolves to a type containing `never` at that key and `[T] extends + * [Cloneable]` becomes `false`, so the `: true` annotation fails the + * type-check (`tsconfig.test.json`) — catching the regression even if someone + * later removes the `assertCloneable` wrapper from the provider. + * + * Type-only imports (erased at runtime); the single runtime assertion exists so + * the file is a real, runnable test. + */ +import { describe, it, expect } from 'vitest'; + +import type { Cloneable } from '../../src/core/ingestion/workers/clone-safety.js'; +import type { CppConstraintPayload } from '../../src/core/ingestion/languages/cpp/constraint-extractor.js'; +import type { CppCaptureSideChannel } from '../../src/core/ingestion/languages/cpp/capture-side-channel.js'; +import type { CCaptureSideChannel } from '../../src/core/ingestion/languages/c/capture-side-channel.js'; +import type { KotlinCaptureSideChannel } from '../../src/core/ingestion/languages/kotlin/capture-side-channel.js'; + +// `[T] extends [Cloneable] ? true : false` is `true` iff T is wholly +// clone-safe. Tuple-wrapped to avoid distributing over unions / `never`. +type IsCloneable = [T] extends [Cloneable] ? true : false; + +describe('#2143: worker-boundary payload types are Cloneable', () => { + it('CppConstraintPayload / CppCaptureSideChannel / CCaptureSideChannel / KotlinCaptureSideChannel', () => { + const cppConstraint: IsCloneable = true; + const cppSideChannel: IsCloneable = true; + const cSideChannel: IsCloneable = true; + const kotlinSideChannel: IsCloneable = true; + // Array equality (not `&&`) so this isn't a trivial-always-true conditional; + // the real assertions are the `: IsCloneable<…> = true` annotations above. + expect([cppConstraint, cppSideChannel, cSideChannel, kotlinSideChannel]).toEqual([ + true, + true, + true, + true, + ]); + }); +}); diff --git a/gitnexus/test/unit/clone-safety.test.ts b/gitnexus/test/unit/clone-safety.test.ts new file mode 100644 index 0000000000..c4157067cd --- /dev/null +++ b/gitnexus/test/unit/clone-safety.test.ts @@ -0,0 +1,609 @@ +import { describe, expect, it } from 'vitest'; +import { + isStructuredCloneable, + makeWorkerResultCloneSafe, + type SkippedPath, +} from '../../src/core/ingestion/workers/clone-safety.js'; +import type { ParseWorkerResult } from '../../src/core/ingestion/workers/parse-worker.js'; + +/** + * #2112: the worker result boundary must survive a value the structured-clone + * algorithm can't serialize. The reporter's case was a node `properties` value + * pointing at a native `toString`, which crashed the whole parse phase. + */ +describe('clone-safety', () => { + describe('isStructuredCloneable', () => { + it('accepts plain data and the structured-clone-native containers', () => { + expect(isStructuredCloneable({ a: 1, b: [2, 3], c: 'x' })).toBe(true); + expect(isStructuredCloneable(new Map([['k', [1]]]))).toBe(true); + expect(isStructuredCloneable(new Set([1, 2]))).toBe(true); + expect(isStructuredCloneable(new Date())).toBe(true); + expect(isStructuredCloneable(/re/g)).toBe(true); + }); + + it('rejects functions and symbols', () => { + expect(isStructuredCloneable(() => 1)).toBe(false); + expect(isStructuredCloneable({ fn: () => 1 })).toBe(false); + expect(isStructuredCloneable({ s: Symbol('x') })).toBe(false); + }); + }); + + describe('makeWorkerResultCloneSafe', () => { + const opts = { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }; + + it('leaves a fully cloneable result untouched (referential identity preserved)', () => { + const nodes = [{ id: 'n1', properties: { filePath: 'a.ts', name: 'foo' } }]; + const result: Record = { + nodes, + parsedFiles: [{ filePath: 'a.ts', scopes: [{ bindings: new Map([['x', [1]]]) }] }], + skippedLanguages: { ada: 2 }, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(skipped).toEqual([]); + // Untouched arrays keep their identity (no needless copy). + expect(result.nodes).toBe(nodes); + expect(isStructuredCloneable(result)).toBe(true); + }); + + it('strips a non-cloneable value from a plain record, keeps the record, attributes the path', () => { + // The exact #2112 shape: a node whose properties carry an own native fn. + const props: Record = { filePath: 'pkg/bad.cpp', name: 'wedge' }; + props.toString = Object.prototype.toString; + const result: Record = { + nodes: [ + { id: 'good', properties: { filePath: 'pkg/ok.cpp', name: 'ok' } }, + { id: 'bad', properties: props }, + ], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 2, + }; + expect(isStructuredCloneable(result)).toBe(false); // red: would crash postMessage + + const { skipped } = makeWorkerResultCloneSafe(result, opts); + + expect(isStructuredCloneable(result)).toBe(true); // green: now deliverable + const nodes = result.nodes as Array<{ id: string; properties: Record }>; + expect(nodes).toHaveLength(2); // record kept, not dropped + expect(nodes[1].properties.toString).toBeUndefined(); // offending value stripped + expect(nodes[1].properties.name).toBe('wedge'); // legitimate data preserved + expect(skipped).toHaveLength(1); + expect(skipped[0].path).toBe('pkg/bad.cpp'); + expect(skipped[0].reason).toContain('nodes'); + // The reason names the exact offending key path — what lets the leak be + // located from a single log line, not just the array field. + expect(skipped[0].reason).toContain('properties.toString'); + }); + + it('does not touch a result whose only "exotic" value is a clean Map (the refuted Map hypothesis)', () => { + const result: Record = { + symbols: [{ id: 's', filePath: 'a.ts', bindings: new Map([['t', 'T']]) }], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(skipped).toEqual([]); + expect(result.symbols as unknown[]).toHaveLength(1); + }); + + it('drops a whole ParsedFile when its captureSideChannel is non-cloneable (re-parse path)', () => { + const sideChannel: Record = { staticNames: ['a'] }; + sideChannel.leaked = () => 1; // a function leaked into the side-channel + const result: Record = { + nodes: [], + parsedFiles: [ + { filePath: 'keep.c', scopes: [] }, + { filePath: 'drop.c', captureSideChannel: sideChannel }, + ], + skippedLanguages: {}, + fileCount: 2, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + + expect(isStructuredCloneable(result)).toBe(true); + const parsedFiles = result.parsedFiles as Array<{ filePath: string }>; + expect(parsedFiles).toHaveLength(1); // bad file dropped whole, not stripped + expect(parsedFiles[0].filePath).toBe('keep.c'); + expect(skipped).toHaveLength(1); + expect(skipped[0].path).toBe('drop.c'); + expect(skipped[0].reason).toContain('dropped'); + }); + + it('strips a non-cloneable value that is not a function/symbol (e.g. a Promise) and keeps the record', () => { + const result: Record = { + calls: [ + { id: 'c1', filePath: 'a.ts' }, + { id: 'c2', filePath: 'b.ts', pending: Promise.resolve(1) }, // Promise: not cloneable, not a fn + ], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 2, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(isStructuredCloneable(result)).toBe(true); + const calls = result.calls as Array<{ id: string; pending?: unknown }>; + expect(calls.map((c) => c.id)).toEqual(['c1', 'c2']); // record kept + expect(calls[1].pending).toBeUndefined(); // unsalvageable value stripped to undefined + expect(skipped).toHaveLength(1); + expect(skipped[0].path).toBe('b.ts'); + expect(skipped[0].reason).toContain('calls'); + }); + + it('never recurses into the skippedPaths field it populates', () => { + const result: Record = { + nodes: [{ id: 'n', properties: { filePath: 'a.ts' } }], + skippedPaths: [{ path: 'prior.ts', reason: 'earlier sub-batch' }], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + const before = result.skippedPaths; + makeWorkerResultCloneSafe(result, opts); + expect(result.skippedPaths).toBe(before); // untouched + }); + }); + + // U1 (#2112): the sanitizer must not recurse to a stack overflow on a deeply + // nested record — an over-deep subtree is bounded (treated non-cloneable) and + // the result is salvaged rather than the sanitizer throwing and re-arming the + // cascade it exists to prevent. + describe('bounded recursion depth', () => { + const opts = { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }; + + // Build a plain-object chain `{ child: { child: { … } } }` of the given + // depth with a non-cloneable function at the bottom. + const deepChainWithFn = (depth: number): Record => { + let node: Record = { leaked: () => 1 }; + for (let i = 0; i < depth; i++) node = { child: node }; + return node; + }; + + it('salvages a deeply-nested non-cloneable record without throwing RangeError', () => { + const result: Record = { + nodes: [{ id: 'deep', filePath: 'deep.ts', tree: deepChainWithFn(5000) }], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + // Must not throw (no RangeError escaping the sanitizer)... + expect(() => makeWorkerResultCloneSafe(result, opts)).not.toThrow(); + // ...and the rewritten result is deliverable across postMessage. + expect(isStructuredCloneable(result)).toBe(true); + }); + + it('a shallow result is unaffected by the depth bound', () => { + const nodes = [{ id: 'n', properties: { filePath: 'a.ts', name: 'ok' } }]; + const result: Record = { + nodes, + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(skipped).toEqual([]); + expect(result.nodes).toBe(nodes); // identity preserved, no needless copy + }); + }); + + // U2 (#2112): two sanitizer-defeat vectors that previously let the re-post + // throw — a throwing getter and a detached ArrayBuffer/view. + describe('sanitizer-defeat hardening', () => { + const opts = { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }; + + it('drops a throwing getter and delivers the rest of the record', () => { + const el: Record = { id: 'g', filePath: 'g.ts', name: 'keep' }; + Object.defineProperty(el, 'boom', { + enumerable: true, + get() { + throw new Error('getter boom'); + }, + }); + const result: Record = { + nodes: [el], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + expect(() => makeWorkerResultCloneSafe(result, opts)).not.toThrow(); + expect(isStructuredCloneable(result)).toBe(true); + const out = (result.nodes as Array>)[0]; + expect(out.name).toBe('keep'); // legitimate data preserved + expect('boom' in out).toBe(false); // throwing getter stripped + }); + + it('drops a detached ArrayBuffer view and delivers the rest', () => { + const buf = new ArrayBuffer(8); + const view = new Uint8Array(buf); + structuredClone(buf, { transfer: [buf] }); // detaches buf → view is now detached + const result: Record = { + nodes: [{ id: 'd', filePath: 'd.ts', data: view }], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(isStructuredCloneable(result)).toBe(true); + expect((result.nodes as Array>)[0].data).toBeUndefined(); + expect(skipped).toHaveLength(1); + }); + + it('does NOT drop a legitimately empty but live view (byteLength false-positive guard)', () => { + const nodes = [{ id: 'e', filePath: 'e.ts', data: new Uint8Array(0) }]; + const result: Record = { + nodes, + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(skipped).toEqual([]); + expect(result.nodes).toBe(nodes); // untouched — empty live view clones fine + }); + }); + + // R1 (#2135 tri-review): structuredClone serializes an array's NON-index + // own-enumerable properties and throws on a non-cloneable one. The index-only + // scan used to wave such an array through (`skipped: []`), leaving the result + // non-cloneable so the re-post threw and fail-closed → re-arming the cascade. + describe('array non-index own-enumerable properties', () => { + const opts = { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }; + + it('strips a non-index function property off a nested array and delivers the record', () => { + const tags: unknown[] & { meta?: unknown } = [1, 2, 3]; + tags.meta = () => {}; // non-index own prop carrying a function + // sanity: this is the exact shape structuredClone rejects + expect(isStructuredCloneable({ tags })).toBe(false); + const result: Record = { + nodes: [{ id: 'a', filePath: 'a.ts', tags }], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(isStructuredCloneable(result)).toBe(true); // net no longer defeated + expect(skipped.length).toBeGreaterThan(0); + const outTags = (result.nodes as Array<{ tags: unknown[] & { meta?: unknown } }>)[0].tags; + expect(Array.from(outTags)).toEqual([1, 2, 3]); // indexed elements preserved + expect(outTags.meta).toBeUndefined(); // the function was stripped + }); + + it('carries a CLONEABLE non-index property through the strip', () => { + const tags: unknown[] & { note?: unknown; bad?: unknown } = [1]; + tags.note = 'keep'; // cloneable non-index prop — must survive + tags.bad = () => {}; // forces the array dirty + const result: Record = { + nodes: [{ id: 'b', filePath: 'b.ts', tags }], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + makeWorkerResultCloneSafe(result, opts); + expect(isStructuredCloneable(result)).toBe(true); + const outTags = (result.nodes as Array<{ tags: { note?: unknown; bad?: unknown } }>)[0].tags; + expect(outTags.note).toBe('keep'); // data prop carried onto the stripped copy + expect(outTags.bad).toBeUndefined(); + }); + }); + + // R2 (#2135 tri-review): a throw DURING the sanitizer's own structural + // enumeration (a throwing getter on a path-less element reached by + // findFilePath, or a Proxy structural trap reached by instanceof/Object.keys) + // used to escape makeWorkerResultCloneSafe to the fail-closed {type:'error'}, + // re-arming the cascade. findFilePath now reads defensively, and each element's + // sanitize is wrapped to drop-on-throw. + describe('sanitizer-internal throw is contained (drop-on-throw)', () => { + const opts = { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }; + + it('a throwing getter at a non-path key on a PATH-LESS element does not escape', () => { + // No top-level path key → findFilePath falls to its generic sweep and + // (pre-fix) read the throwing getter, throwing out of the sanitizer. + const el: Record = { id: 'p', name: 'keep' }; + Object.defineProperty(el, 'boom', { + enumerable: true, + get() { + throw new RangeError('boom'); + }, + }); + const result: Record = { + nodes: [el], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + expect(() => makeWorkerResultCloneSafe(result, opts)).not.toThrow(); + expect(isStructuredCloneable(result)).toBe(true); + const out = (result.nodes as Array>)[0]; + expect(out.name).toBe('keep'); // legitimate data delivered + expect('boom' in out).toBe(false); // throwing getter stripped, not escaped + }); + + it('a Proxy with a throwing structural trap is dropped, clean siblings survive', () => { + const trap = new Proxy( + { id: 'b' }, + { + getPrototypeOf() { + throw new Error('structural trap'); + }, + }, + ); + const result: Record = { + nodes: [{ id: 'a', filePath: 'a.ts' }, trap, { id: 'c', filePath: 'c.ts' }], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 3, + }; + let skipped: SkippedPath[] = []; + expect(() => { + skipped = makeWorkerResultCloneSafe(result, opts).skipped; + }).not.toThrow(); + expect(isStructuredCloneable(result)).toBe(true); + const ids = (result.nodes as Array<{ id: string }>).map((n) => n.id); + expect(ids).toEqual(['a', 'c']); // trap element dropped, siblings preserved + expect(skipped.some((s) => s.reason.includes('sanitizer error'))).toBe(true); + }); + }); + + // R3 (#2135 tri-review): the per-field loop rewrites ARRAY fields only. A + // final isStructuredCloneable(result) gate strips any remaining non-array + // field so "the result is cloneable after this call" is a hard postcondition + // regardless of future result-shape changes. + describe('non-array field safety gate', () => { + const opts = { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }; + + it('strips a non-cloneable value carried on a non-ARRAY result field', () => { + const result: Record = { + nodes: [], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 0, + summary: { kept: 1, build: () => {} }, // non-array field, non-cloneable + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(isStructuredCloneable(result)).toBe(true); + expect((result.summary as { kept: number; build?: unknown }).kept).toBe(1); + expect((result.summary as { build?: unknown }).build).toBeUndefined(); + expect(skipped.some((s) => s.reason.includes('summary'))).toBe(true); + }); + + it('is a no-op when the array loop already made the result cloneable', () => { + const result: Record = { + nodes: [{ id: 'a', filePath: 'a.ts', leak: () => {} }], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(isStructuredCloneable(result)).toBe(true); + // Only the array-element strip was recorded; the gate added no '(result)' entry. + expect(skipped.every((s) => s.path !== '(result)')).toBe(true); + }); + }); + + // R12 (#2135 tri-review): the "dropped unsalvageable" branch — a dirty element + // whose stripped copy is STILL not structured-cloneable must be dropped (not + // delivered), so the re-post can't throw. Triggered here with a non-plain + // member that the strip-time probe sees as clean but that turns non-cloneable + // on the final post-strip verification (a stateful getter). + describe('unsalvageable element drop', () => { + const opts = { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }; + + it('drops an element that is still non-cloneable after stripping', () => { + let reads = 0; + class Blob {} + const blob = new Blob(); + Object.defineProperty(blob, 'data', { + enumerable: true, + // Clean on the strip-time probe (kept by reference), a function on the + // post-strip verification probe → the cleaned element is unsalvageable. + get() { + reads++; + return reads === 1 ? 'ok' : () => {}; + }, + }); + const result: Record = { + nodes: [{ id: 'x', filePath: 'x.ts', leak: () => {}, blob }], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(isStructuredCloneable(result)).toBe(true); // run survives + expect((result.nodes as unknown[]).length).toBe(0); // unsalvageable element dropped + expect(skipped.some((s) => s.reason.includes('unsalvageable'))).toBe(true); + }); + }); + + // U3 (#2112): a DAG-aliased record (the same subobject reached via two paths) + // carrying a non-cloneable must be stripped-and-KEPT, not over-dropped — the + // old shared-WeakSet returned the un-stripped original on revisit, failing the + // last-resort guard and dropping the whole record. + describe('DAG-aliased records (memoized strip copies)', () => { + const opts = { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }; + + it('keeps a DAG element whose shared subobject carries a non-cloneable value', () => { + const shared: Record = { tag: 's', leaked: () => 1 }; + const el: Record = { + id: 'dag', + filePath: 'dag.ts', + left: shared, + right: shared, + }; + const result: Record = { + nodes: [el], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(isStructuredCloneable(result)).toBe(true); + const out = (result.nodes as Array>)[0]; + // Record is KEPT (stripped), not dropped as "unsalvageable". + expect(out).toBeDefined(); + expect(skipped[0].reason).toContain('stripped'); + const left = out.left as Record; + const right = out.right as Record; + expect(left.tag).toBe('s'); // legitimate data preserved + expect(left.leaked).toBeUndefined(); // function value stripped to undefined + // DAG shape preserved — the two aliases resolve to the SAME stripped copy. + expect(left).toBe(right); + }); + + it('terminates on a self-referential (cyclic) record', () => { + const cyc: Record = { id: 'c', filePath: 'c.ts', bad: () => 1 }; + cyc.self = cyc; + const result: Record = { + nodes: [cyc], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + expect(() => makeWorkerResultCloneSafe(result, opts)).not.toThrow(); + expect(isStructuredCloneable(result)).toBe(true); + const out = (result.nodes as Array>)[0]; + expect(out.self).toBe(out); // cycle preserved against the stripped copy + expect(out.bad).toBeUndefined(); // function value stripped to undefined + }); + }); + + // U4 (#2112): single-pass scan rebuilds only the dirty array (from the first + // dirty element on), and leaves every clean array untouched by identity. + describe('single-pass identity preservation', () => { + const opts = { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }; + + it('reassigns only the dirty field; clean fields keep identity', () => { + const cleanSymbols = [{ id: 's', filePath: 's.ts' }]; + const cleanPrefix = { id: 'n0', properties: { filePath: 'n0.ts' } }; + const dirtyNodes = [ + cleanPrefix, + { id: 'n1', properties: { filePath: 'n1.ts', bad: () => 1 } }, + ]; + const result: Record = { + nodes: dirtyNodes, + symbols: cleanSymbols, + parsedFiles: [], + skippedLanguages: {}, + fileCount: 2, + }; + makeWorkerResultCloneSafe(result, opts); + expect(result.symbols).toBe(cleanSymbols); // clean field untouched (identity) + expect(result.nodes).not.toBe(dirtyNodes); // dirty field rebuilt + const outNodes = result.nodes as Array>; + expect(outNodes[0]).toBe(cleanPrefix); // clean prefix copied by reference + expect(isStructuredCloneable(result)).toBe(true); + }); + }); + + // U7 (#2112): a ParsedNode is attributed to properties.filePath even when a + // sibling child also carries a path-like key — the generic sweep alone could + // return the wrong sibling's path. + describe('findFilePath attribution (via skip reporting)', () => { + const opts = { + dropWholeElement: new Set(['parsedFiles']), + skipFields: new Set(['skippedPaths']), + }; + + it('prefers properties.filePath over a sibling child path key', () => { + const result: Record = { + nodes: [ + { + id: 'n', + meta: { file: 'sibling-wrong.ts' }, // sibling child with a path-like key, declared first + properties: { filePath: 'right.ts', bad: () => 1 }, + }, + ], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(skipped).toHaveLength(1); + expect(skipped[0].path).toBe('right.ts'); // not 'sibling-wrong.ts' + }); + + it('uses a top-level filePath when present', () => { + const result: Record = { + parsedFiles: [{ filePath: 'top.c', captureSideChannel: { leaked: () => 1 } }], + nodes: [], + skippedLanguages: {}, + fileCount: 1, + }; + const { skipped } = makeWorkerResultCloneSafe(result, opts); + expect(skipped[0].path).toBe('top.c'); + }); + }); + + // C12 (#2112): contract — a representative ParseWorkerResult must be + // structured-cloneable. Typed as ParseWorkerResult so a NEW field added to + // the result shape forces this test to be updated (compile error until it is), + // and the runtime assert catches a field whose type becomes non-cloneable. + describe('ParseWorkerResult clone contract', () => { + it('a representative result is structured-cloneable', () => { + const result: ParseWorkerResult = { + nodes: [ + { + id: 'func:src/a.ts#foo', + label: 'Function', + properties: { + name: 'foo', + filePath: 'src/a.ts', + startLine: 1, + endLine: 3, + language: + 'typescript' as ParseWorkerResult['nodes'][number]['properties']['language'], + isExported: true, + }, + }, + ], + relationships: [], + symbols: [], + calls: [], + assignments: [], + routes: [], + fetchCalls: [], + fetchWrapperDefs: [], + decoratorRoutes: [], + routerIncludes: [], + routerImports: [], + routerModuleAliases: [], + toolDefs: [], + ormQueries: [], + constructorBindings: [], + fileScopeBindings: [], + parsedFiles: [], + skippedLanguages: { ada: 2 }, + skippedPaths: [{ path: 'x.ts', reason: 'prior' }], + fileCount: 1, + }; + expect(isStructuredCloneable(result)).toBe(true); + }); + }); +}); diff --git a/gitnexus/test/unit/result-merge.test.ts b/gitnexus/test/unit/result-merge.test.ts new file mode 100644 index 0000000000..4ec6ac85d1 --- /dev/null +++ b/gitnexus/test/unit/result-merge.test.ts @@ -0,0 +1,73 @@ +/** + * #2135 tri-review (R12): the parse-worker result merge unions the clone-safety + * `skippedPaths` across sub-batch results. mergeResult was extracted from the + * parse-worker entry module into result-merge.ts so it can be unit-tested here + * (a main-thread import of parse-worker.ts would run its MessagePort setup). + */ +import { describe, it, expect } from 'vitest'; + +import { mergeResult } from '../../src/core/ingestion/workers/result-merge.js'; +import type { ParseWorkerResult } from '../../src/core/ingestion/workers/parse-worker.js'; + +function emptyResult(): ParseWorkerResult { + return { + nodes: [], + relationships: [], + symbols: [], + calls: [], + assignments: [], + routes: [], + fetchCalls: [], + fetchWrapperDefs: [], + decoratorRoutes: [], + routerIncludes: [], + routerImports: [], + toolDefs: [], + ormQueries: [], + constructorBindings: [], + fileScopeBindings: [], + parsedFiles: [], + skippedLanguages: {}, + fileCount: 0, + }; +} + +describe('mergeResult', () => { + it('unions skippedPaths across sub-batch results, initializing the target when absent', () => { + const target = emptyResult(); // no skippedPaths on the target (the `??=` path) + mergeResult(target, { + ...emptyResult(), + skippedPaths: [{ path: 'a.ts', reason: 'r1' }], + fileCount: 1, + }); + mergeResult(target, { + ...emptyResult(), + skippedPaths: [{ path: 'b.ts', reason: 'r2' }], + fileCount: 1, + }); + expect(target.skippedPaths).toEqual([ + { path: 'a.ts', reason: 'r1' }, + { path: 'b.ts', reason: 'r2' }, + ]); + expect(target.fileCount).toBe(2); + }); + + it('leaves skippedPaths undefined when no source carries any (no spurious empty array)', () => { + const target = emptyResult(); + mergeResult(target, { ...emptyResult(), fileCount: 1 }); + expect(target.skippedPaths).toBeUndefined(); + }); + + it('also sums skippedLanguages and appends node arrays (sanity of the rest of the merge)', () => { + const target = { ...emptyResult(), skippedLanguages: { rust: 1 } }; + mergeResult(target, { + ...emptyResult(), + nodes: [ + { id: 'n', label: 'Function', properties: { name: 'n' } }, + ] as ParseWorkerResult['nodes'], + skippedLanguages: { rust: 2, go: 1 }, + }); + expect(target.skippedLanguages).toEqual({ rust: 3, go: 1 }); + expect(target.nodes).toHaveLength(1); + }); +});