diff --git a/gitnexus/src/cli/analyze.ts b/gitnexus/src/cli/analyze.ts index 6801f3f773..3481fd2ed1 100644 --- a/gitnexus/src/cli/analyze.ts +++ b/gitnexus/src/cli/analyze.ts @@ -500,6 +500,7 @@ const ANALYZE_CLI_ENV_KEYS = [ 'GITNEXUS_VERBOSE', 'GITNEXUS_PROFILE_DEFERRED', 'GITNEXUS_PROFILE_DEFERRED_SLOW_MS', + 'GITNEXUS_DEBUG_HEAP', 'GITNEXUS_MAX_FILE_SIZE', 'GITNEXUS_WORKER_SUB_BATCH_TIMEOUT_MS', 'GITNEXUS_WAL_CHECKPOINT_THRESHOLD', diff --git a/gitnexus/src/core/ingestion/call-processor.ts b/gitnexus/src/core/ingestion/call-processor.ts index 4a9fba9af4..5c8e447f6e 100644 --- a/gitnexus/src/core/ingestion/call-processor.ts +++ b/gitnexus/src/core/ingestion/call-processor.ts @@ -39,6 +39,34 @@ const MAX_TYPE_NAME_LENGTH = 256; * Consumed by the cross-file re-resolution / enrichment pass. */ export type ExportedTypeMap = Map>; +/** Record one exported graph node into the incremental ExportedTypeMap. */ +export const accumulateExportedTypesFromParsedNode = ( + result: ExportedTypeMap, + node: { id: string; properties?: Record }, + symbolTable: SymbolTableReader, +): void => { + if (!node.properties?.isExported) return; + if (!node.properties?.filePath || !node.properties?.name) return; + const filePath = node.properties.filePath as string; + const name = node.properties.name as string; + if (!name || name.length > MAX_TYPE_NAME_LENGTH) return; + const defs = symbolTable.lookupExactAll(filePath, name); + const def = defs.find((d) => d.nodeId === node.id) ?? defs[0]; + if (!def) return; + const typeName = def.returnType ?? def.declaredType; + if (!typeName || typeName.length > MAX_TYPE_NAME_LENGTH) return; + const simpleType = extractReturnTypeName(typeName) ?? typeName; + if (!simpleType) return; + let fileExports = result.get(filePath); + if (!fileExports) { + fileExports = new Map(); + result.set(filePath, fileExports); + } + if (fileExports.size < MAX_EXPORTS_PER_FILE) { + fileExports.set(name, simpleType); + } +}; + /** Build ExportedTypeMap from graph nodes — used for the worker path where the * sequential TypeEnv is not available in the main thread. Collects * returnType/declaredType from exported symbols with known types. */ @@ -48,29 +76,7 @@ export function buildExportedTypeMapFromGraph( ): ExportedTypeMap { const result: ExportedTypeMap = new Map(); graph.forEachNode((node) => { - if (!node.properties?.isExported) return; - if (!node.properties?.filePath || !node.properties?.name) return; - const filePath = node.properties.filePath as string; - const name = node.properties.name as string; - if (!name || name.length > MAX_TYPE_NAME_LENGTH) return; - // For callable symbols, use returnType; for properties/variables, use declaredType. - // Use lookupExactAll + nodeId match to handle same-name methods in different classes. - const defs = symbolTable.lookupExactAll(filePath, name); - const def = defs.find((d) => d.nodeId === node.id) ?? defs[0]; - if (!def) return; - const typeName = def.returnType ?? def.declaredType; - if (!typeName || typeName.length > MAX_TYPE_NAME_LENGTH) return; - // Extract simple type name (strip Promise<>, etc.) — reuse shared utility - const simpleType = extractReturnTypeName(typeName) ?? typeName; - if (!simpleType) return; - let fileExports = result.get(filePath); - if (!fileExports) { - fileExports = new Map(); - result.set(filePath, fileExports); - } - if (fileExports.size < MAX_EXPORTS_PER_FILE) { - fileExports.set(name, simpleType); - } + accumulateExportedTypesFromParsedNode(result, node, symbolTable); }); return result; } diff --git a/gitnexus/src/core/ingestion/parsing-processor.ts b/gitnexus/src/core/ingestion/parsing-processor.ts index 367fc55509..9f85b87996 100644 --- a/gitnexus/src/core/ingestion/parsing-processor.ts +++ b/gitnexus/src/core/ingestion/parsing-processor.ts @@ -26,6 +26,7 @@ import { } from './utils/ast-helpers.js'; import { detectFrameworkFromAST } from './framework-detection.js'; import { buildTypeEnv } from './type-env.js'; +import { accumulateExportedTypesFromParsedNode, type ExportedTypeMap } from './call-processor.js'; import type { FieldInfo, FieldExtractorContext } from './field-types.js'; import type { MethodInfo } from './method-types.js'; import { @@ -120,9 +121,8 @@ export const mergeChunkResults = ( graph: KnowledgeGraph, symbolTable: SymbolTableWriter, chunkResults: readonly ParseWorkerResult[], + exportedTypeMap?: ExportedTypeMap, ): WorkerExtractedData => { - const allCalls: ExtractedCall[] = []; - const allAssignments: ExtractedAssignment[] = []; const allRoutes: ExtractedRoute[] = []; const allFetchCalls: ExtractedFetchCall[] = []; const allFetchWrapperDefs: FetchWrapperDef[] = []; @@ -132,7 +132,6 @@ export const mergeChunkResults = ( const allRouterModuleAliases: ExtractedRouterModuleAlias[] = []; const allToolDefs: ExtractedToolDef[] = []; const allORMQueries: ExtractedORMQuery[] = []; - const allConstructorBindings: FileConstructorBindings[] = []; const fileScopeBindingsByFile: FileScopeBindings[] = []; const allParsedFiles: ParsedFile[] = []; @@ -160,8 +159,11 @@ export const mergeChunkResults = ( qualifiedName: sym.qualifiedName, }); } - for (const item of result.calls) allCalls.push(item); - for (const item of result.assignments) allAssignments.push(item); + if (exportedTypeMap) { + for (const node of result.nodes) { + accumulateExportedTypesFromParsedNode(exportedTypeMap, node, symbolTable); + } + } for (const item of result.routes) allRoutes.push(item); for (const item of result.fetchCalls) allFetchCalls.push(item); for (const item of result.fetchWrapperDefs ?? []) allFetchWrapperDefs.push(item); @@ -171,15 +173,14 @@ export const mergeChunkResults = ( for (const item of result.routerModuleAliases ?? []) allRouterModuleAliases.push(item); for (const item of result.toolDefs) allToolDefs.push(item); if (result.ormQueries) for (const item of result.ormQueries) allORMQueries.push(item); - for (const item of result.constructorBindings) allConstructorBindings.push(item); if (result.fileScopeBindings) for (const item of result.fileScopeBindings) fileScopeBindingsByFile.push(item); if (result.parsedFiles) for (const item of result.parsedFiles) allParsedFiles.push(item); } return { - calls: allCalls, - assignments: allAssignments, + calls: [], + assignments: [], routes: allRoutes, fetchCalls: allFetchCalls, fetchWrapperDefs: allFetchWrapperDefs, @@ -189,7 +190,7 @@ export const mergeChunkResults = ( routerModuleAliases: allRouterModuleAliases, toolDefs: allToolDefs, ormQueries: allORMQueries, - constructorBindings: allConstructorBindings, + constructorBindings: [], fileScopeBindings: fileScopeBindingsByFile, parsedFiles: allParsedFiles, }; @@ -210,6 +211,7 @@ const processParsingWithWorkers = async ( * `gitnexus/src/storage/parse-cache.ts`. */ outRawResults?: ParseWorkerResult[], + exportedTypeMap?: ExportedTypeMap, ): Promise => { // Filter to parseable files only const parseableFiles: ParseWorkerInput[] = []; @@ -254,7 +256,7 @@ const processParsingWithWorkers = async ( } // Merge results from all workers into graph and symbol table. - const merged = mergeChunkResults(graph, symbolTable, chunkResults); + const merged = mergeChunkResults(graph, symbolTable, chunkResults, exportedTypeMap); // Merge and log skipped languages from workers const skippedLanguages = new Map(); @@ -1017,6 +1019,7 @@ export const processParsing = async ( * artifact to cache there). See `gitnexus/src/storage/parse-cache.ts`. */ outRawResults?: ParseWorkerResult[], + exportedTypeMap?: ExportedTypeMap, ): Promise => { let lastProgress = 0; const reportProgress: FileProgressCallback | undefined = onFileProgress @@ -1062,6 +1065,7 @@ export const processParsing = async ( workerPool, reportProgress, outRawResults, + exportedTypeMap, ); // Session-scoped quarantine (worker-pool resilience Layer 3): surface // any files this pool has decided are unsafe for workers so the diff --git a/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts b/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts index 7b3698e0be..bf9263cbf6 100644 --- a/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts +++ b/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts @@ -20,7 +20,12 @@ import { type BindingEntry, } from '../binding-accumulator.js'; import { processParsing, mergeChunkResults } from '../parsing-processor.js'; -import { fileContentHash, computeChunkHash } from '../../../storage/parse-cache.js'; +import { + fileContentHash, + computeChunkHash, + loadParseCacheChunk, + persistParseCacheChunk, +} from '../../../storage/parse-cache.js'; import type { ParseWorkerResult } from '../workers/parse-worker.js'; import type { WorkerExtractedData } from '../parsing-processor.js'; import { @@ -59,7 +64,7 @@ import fs from 'node:fs'; import path from 'node:path'; import { fileURLToPath, pathToFileURL } from 'node:url'; -import { isDev } from '../utils/env.js'; +import { isDev, parseTruthyEnv } from '../utils/env.js'; import { isVerboseIngestionEnabled } from '../utils/verbose.js'; import { endTimer, @@ -67,6 +72,7 @@ import { logDeferredProfile, startTimer, } from '../utils/deferred-resolution-profile.js'; +import { logHeapProbe } from '../utils/heap-probe.js'; import { extractORMQueriesInline } from './orm-extraction.js'; import { logger } from '../../logger.js'; @@ -470,8 +476,18 @@ export async function runChunkedParseAndResolve( // body, which re-read process.env on every iteration even though // the env can't change mid-run. const verboseThroughputLog = isDev || isVerboseIngestionEnabled(); + const heapProbeEveryN = + parseTruthyEnv(process.env.GITNEXUS_DEBUG_HEAP) || isDeferredResolutionProfileEnabled() + ? 25 + : 0; for (let chunkIdx = 0; chunkIdx < numChunks; chunkIdx++) { + if (heapProbeEveryN > 0 && chunkIdx > 0 && chunkIdx % heapProbeEveryN === 0) { + logHeapProbe( + `parse-chunk-${chunkIdx}`, + `nodes=${graph.nodeCount} parsedFiles=${allParsedFiles.length}`, + ); + } const chunkPaths = chunks[chunkIdx]; // Start wall-clock for the per-chunk throughput log emitted at end // of this iteration. The gate is computed once above; here we just @@ -506,7 +522,8 @@ export async function runChunkedParseAndResolve( } let chunkWorkerData: WorkerExtractedData | null; - const cachedRaw = chunkHash && parseCache ? parseCache.entries.get(chunkHash) : undefined; + const cachedRaw = + chunkHash && parseCache ? await loadParseCacheChunk(parseCache, chunkHash) : undefined; // Track every chunk hash we touched so the orchestrator can // prune stale entries (chunks whose composition no longer @@ -517,7 +534,7 @@ export async function runChunkedParseAndResolve( // Cache hit: replay the cached worker output through the same // merge logic the live worker path uses. chunkCacheHits++; - chunkWorkerData = mergeChunkResults(graph, symbolTable, cachedRaw); + chunkWorkerData = mergeChunkResults(graph, symbolTable, cachedRaw, exportedTypeMap); if (isDev) { logger.info( `📦 parse-cache HIT: chunk ${chunkIdx + 1}/${numChunks} (${chunkFiles.length} files, ${chunkHash?.slice(0, 8) ?? 'unknown'})`, @@ -573,6 +590,7 @@ export async function runChunkedParseAndResolve( // Capture raw results only when we have a cache to write to — // otherwise we'd retain extra arrays for nothing. parseCache && chunkHash && activeWorkerPool ? rawResults : undefined, + exportedTypeMap, ); } catch (err) { if (!(err instanceof WorkerPoolInitializationError)) throw err; @@ -621,7 +639,7 @@ export async function runChunkedParseAndResolve( ); } } else { - parseCache.entries.set(chunkHash, rawResults); + await persistParseCacheChunk(parseCache, chunkHash, rawResults); if (isDev) { logger.info( `📦 parse-cache MISS+store: chunk ${chunkIdx + 1}/${numChunks} (${chunkFiles.length} files, ${chunkHash.slice(0, 8)})`, @@ -724,6 +742,11 @@ export async function runChunkedParseAndResolve( ); } + logHeapProbe( + 'post-parse-chunks', + `routes=${allExtractedRoutes.length} nodes=${graph.nodeCount} parsedFiles=${allParsedFiles.length}`, + ); + // Deferred end-of-loop extraction (moved out of the per-chunk block): // 1. route resolution on all chunks' routes // Resolution sees the full repo graph instead of just current-and-earlier @@ -740,8 +763,10 @@ export async function runChunkedParseAndResolve( // Populate `exportedTypeMap` from the in-progress graph so the post-parse // enrichment pass (enrichExportedTypeMap) sees cross-file export types. if (exportedTypeMap.size === 0 && graph.nodeCount > 0) { + logHeapProbe('pre-buildExportedTypeMapFromGraph'); const graphExports = buildExportedTypeMapFromGraph(graph, model.symbols); for (const [fp, exports] of graphExports) exportedTypeMap.set(fp, exports); + logHeapProbe('post-buildExportedTypeMapFromGraph'); } if (allExtractedRoutes.length > 0) { const tRoutes = startTimer(deferredProfile); diff --git a/gitnexus/src/core/ingestion/scope-resolution/pipeline/migrated-languages.ts b/gitnexus/src/core/ingestion/scope-resolution/pipeline/migrated-languages.ts new file mode 100644 index 0000000000..82c714b3a2 --- /dev/null +++ b/gitnexus/src/core/ingestion/scope-resolution/pipeline/migrated-languages.ts @@ -0,0 +1,32 @@ +/** + * Languages that resolve via the scope-resolution pipeline (RFC #909 Ring 3). + * + * Kept free of `ScopeResolver` imports so worker threads can gate + * `ParsedFile` emission without pulling in resolver implementations. + * Keep in sync with `SCOPE_RESOLVERS` in `registry.ts`. + */ + +import { SupportedLanguages } from 'gitnexus-shared'; + +export const SCOPE_RESOLUTION_LANGUAGES: ReadonlySet = new Set([ + SupportedLanguages.Python, + SupportedLanguages.CSharp, + SupportedLanguages.TypeScript, + SupportedLanguages.Go, + SupportedLanguages.Java, + SupportedLanguages.C, + SupportedLanguages.CPlusPlus, + SupportedLanguages.PHP, + SupportedLanguages.Rust, + SupportedLanguages.JavaScript, + SupportedLanguages.Kotlin, + SupportedLanguages.Ruby, + SupportedLanguages.Cobol, + SupportedLanguages.Swift, + SupportedLanguages.Dart, + SupportedLanguages.Vue, +]); + +export const isScopeResolutionLanguage = ( + lang: SupportedLanguages | null, +): lang is SupportedLanguages => lang !== null && SCOPE_RESOLUTION_LANGUAGES.has(lang); diff --git a/gitnexus/src/core/ingestion/utils/heap-probe.ts b/gitnexus/src/core/ingestion/utils/heap-probe.ts new file mode 100644 index 0000000000..3f44cd9a38 --- /dev/null +++ b/gitnexus/src/core/ingestion/utils/heap-probe.ts @@ -0,0 +1,21 @@ +/** + * Synchronous heap probes for large-repo OOM investigation (#1983). + * + * Writes to stderr (not pino) so lines flush under CI=1 / gdb attach. + * Enabled with `GITNEXUS_DEBUG_HEAP=1` or `GITNEXUS_PROFILE_DEFERRED=1`. + */ + +import { parseTruthyEnv } from './env.js'; +import { isDeferredResolutionProfileEnabled } from './deferred-resolution-profile.js'; + +export const isDebugHeapEnabled = (): boolean => + parseTruthyEnv(process.env.GITNEXUS_DEBUG_HEAP) || isDeferredResolutionProfileEnabled(); + +export const heapUsedMb = (): number => Math.round(process.memoryUsage().heapUsed / 1024 / 1024); + +/** Flush a one-line heap snapshot to stderr. */ +export const logHeapProbe = (label: string, detail?: string): void => { + if (!isDebugHeapEnabled()) return; + const suffix = detail ? ` ${detail}` : ''; + process.stderr.write(`[gitnexus-heap] ${label} used_mb=${heapUsedMb()}${suffix}\n`); +}; diff --git a/gitnexus/src/core/ingestion/workers/parse-worker.ts b/gitnexus/src/core/ingestion/workers/parse-worker.ts index d1d63e832e..ecaec5e154 100644 --- a/gitnexus/src/core/ingestion/workers/parse-worker.ts +++ b/gitnexus/src/core/ingestion/workers/parse-worker.ts @@ -97,6 +97,7 @@ import { extractTemplateArguments, templateArgumentsIdTag } from '../utils/templ import type { LanguageProvider } from '../language-provider.js'; import type { ParsedFile } from 'gitnexus-shared'; import { extractParsedFile, type ScopeCaptureSourceKind } from '../scope-extractor-bridge.js'; +import { isScopeResolutionLanguage } from '../scope-resolution/pipeline/migrated-languages.js'; import { extractLaravelRoutes, type ExtractedRoute } from '../route-extractors/laravel.js'; import { logger } from '../../logger.js'; @@ -1142,25 +1143,26 @@ const processFileGroup = ( const provider = getProvider(language); // RFC #909 Ring 2: produce a `ParsedFile` for the new scope-based - // resolution pipeline. No-op (returns undefined) for every language - // today — only fires once a provider implements `emitScopeCaptures`. - // Runs BEFORE legacy extraction and its result is independent: a - // failure here is caught inside `extractParsedFile` and does NOT - // affect the legacy DAG path that follows. - const parsedFile = extractParsedFile( - provider, - parseContent, - file.path, - (message) => { - if (parentPort) { - parentPort.postMessage({ type: 'warning', message }); - } else { - logger.warn(message); - } - }, - tree, - scopeSourceKind, - ); + // resolution pipeline. Skipped for registry-primary languages — the + // scope-resolution phase re-extracts from source on the main thread, + // which avoids retaining ~2× semantic model in RAM on huge repos (#1983). + let parsedFile: import('gitnexus-shared').ParsedFile | undefined; + if (!isScopeResolutionLanguage(language)) { + parsedFile = extractParsedFile( + provider, + parseContent, + file.path, + (message) => { + if (parentPort) { + parentPort.postMessage({ type: 'warning', message }); + } else { + logger.warn(message); + } + }, + tree, + scopeSourceKind, + ); + } if (parsedFile !== undefined) result.parsedFiles.push(parsedFile); // Build per-file type environment + constructor bindings in a single AST walk. diff --git a/gitnexus/src/storage/parse-cache.ts b/gitnexus/src/storage/parse-cache.ts index 080224eadd..8608522d23 100644 --- a/gitnexus/src/storage/parse-cache.ts +++ b/gitnexus/src/storage/parse-cache.ts @@ -44,11 +44,11 @@ import type { ParseWorkerResult } from '../core/ingestion/workers/parse-worker.j * On version mismatch, `loadParseCache` returns an empty cache and the * next save overwrites the on-disk file with the new version baked in. */ -// Bumped to 3 in RING4-1 (#942): ParseWorkerResult lost its `heritage` field -// when the legacy heritage path was deleted. Invalidating stale on-disk caches -// prevents cross-version replay (e.g. a rollback reading a heritage-less cache -// into legacy code that expects `result.heritage`). -const SCHEMA_BUMP = 3; +// Bumped to 4 in #1983: on-disk shards omit legacy DAG fields (`calls`, +// `assignments`, `constructorBindings`, worker `parsedFiles`) that are no +// longer consumed after RING4-1 (#942). Scope-resolution re-extracts +// `ParsedFile` on the main thread. +const SCHEMA_BUMP = 4; const GITNEXUS_PKG_VERSION = (() => { try { // package.json sits at gitnexus/package.json — two levels up from @@ -108,6 +108,14 @@ export interface ParseCache { * Transient — never serialized to disk. */ usedKeys: Set; + /** + * When set, chunk payloads are loaded from / flushed to sharded files on + * demand instead of retaining every chunk in `entries` for the whole run + * (#1983 — Linux kernel OOM from duplicate in-memory cache + graph). + */ + storagePath?: string; + /** Index of chunk hashes known to exist under `storagePath/parse-cache/`. */ + onDiskKeys?: Set; } /** SHA-256 hex of a single string or buffer. */ @@ -172,6 +180,75 @@ const getCacheIndexPath = (storagePath: string): string => const getCacheChunkPath = (storagePath: string, chunkHash: string): string => path.join(getCacheDirPath(storagePath), `${chunkHash}.json`); +/** + * Drop fields that are not replayed by `mergeChunkResults` / parse-impl after + * RING4-1 (#942). Shrinks on-disk shards and peak RSS during cold runs. + */ +export const slimParseWorkerResultsForCache = ( + chunkResults: readonly ParseWorkerResult[], +): ParseWorkerResult[] => { + const slim: ParseWorkerResult[] = []; + for (const result of chunkResults) { + slim.push({ + ...result, + calls: [], + assignments: [], + constructorBindings: [], + parsedFiles: [], + }); + } + return slim; +}; + +const readParseCacheChunkFromDisk = async ( + storagePath: string, + chunkHash: string, +): Promise => { + if (!isValidChunkCacheKey(chunkHash)) return undefined; + try { + const chunkRaw = await fs.readFile(getCacheChunkPath(storagePath, chunkHash), 'utf-8'); + const chunkData = JSON.parse(chunkRaw, mapReviver) as ParseWorkerResult[]; + return Array.isArray(chunkData) ? chunkData : undefined; + } catch { + return undefined; + } +}; + +/** Load one chunk shard. Does not retain it in `cache.entries`. */ +export const loadParseCacheChunk = async ( + cache: ParseCache, + chunkHash: string, +): Promise => { + const inMemory = cache.entries.get(chunkHash); + if (inMemory !== undefined) return inMemory; + if (cache.storagePath && cache.onDiskKeys?.has(chunkHash)) { + return readParseCacheChunkFromDisk(cache.storagePath, chunkHash); + } + return undefined; +}; + +/** + * Persist one chunk shard and avoid retaining it in RAM for the rest of the + * run. Falls back to `cache.entries` when `storagePath` is unset (unit tests). + */ +export const persistParseCacheChunk = async ( + cache: ParseCache, + chunkHash: string, + chunkResults: readonly ParseWorkerResult[], +): Promise => { + const slim = slimParseWorkerResultsForCache(chunkResults); + if (cache.storagePath) { + await fs.mkdir(getCacheDirPath(cache.storagePath), { recursive: true }); + const payload = JSON.stringify(slim, mapReplacer); + await fs.writeFile(getCacheChunkPath(cache.storagePath, chunkHash), payload, 'utf-8'); + cache.onDiskKeys ??= new Set(); + cache.onDiskKeys.add(chunkHash); + cache.entries.delete(chunkHash); + return; + } + cache.entries.set(chunkHash, slim); +}; + const loadLegacyParseCache = async (storagePath: string): Promise => { const cachePath = getLegacyCachePath(storagePath); try { @@ -184,15 +261,15 @@ const loadLegacyParseCache = async (storagePath: string): Promise => typeof data.entries !== 'object' || data.entries === null ) { - return emptyCache(); + return emptyCache(storagePath); } const entries = new Map(); for (const [k, v] of Object.entries(data.entries)) { if (Array.isArray(v)) entries.set(k, v as ParseWorkerResult[]); } - return { version: PARSE_CACHE_VERSION, entries, usedKeys: new Set() }; + return { version: PARSE_CACHE_VERSION, entries, usedKeys: new Set(), storagePath }; } catch { - return emptyCache(); + return emptyCache(storagePath); } }; @@ -207,22 +284,24 @@ const loadShardedParseCache = async (storagePath: string): Promise(); + const onDiskKeys = new Set(); for (const chunkHash of data.keys) { - if (typeof chunkHash !== 'string' || !isValidChunkCacheKey(chunkHash)) continue; - try { - const chunkRaw = await fs.readFile(getCacheChunkPath(storagePath, chunkHash), 'utf-8'); - const chunkData = JSON.parse(chunkRaw, mapReviver) as ParseWorkerResult[]; - if (Array.isArray(chunkData)) entries.set(chunkHash, chunkData); - } catch { - /* skip corrupt or missing shard */ + if (typeof chunkHash === 'string' && isValidChunkCacheKey(chunkHash)) { + onDiskKeys.add(chunkHash); } } - return { version: PARSE_CACHE_VERSION, entries, usedKeys: new Set() }; + // Lazy: index only — load individual shards on cache hit (#1983). + return { + version: PARSE_CACHE_VERSION, + entries: new Map(), + usedKeys: new Set(), + storagePath, + onDiskKeys, + }; } catch { return null; } @@ -255,20 +334,26 @@ export const saveParseCache = async (storagePath: string, cache: ParseCache): Pr await fs.rm(tmpDir, { recursive: true, force: true }); await fs.mkdir(tmpDir, { recursive: true }); - const keys: string[] = []; - for (const [chunkHash, chunkResults] of cache.entries) { - if (!isValidChunkCacheKey(chunkHash)) continue; - let payload: string; + const keys = [...cache.usedKeys].filter(isValidChunkCacheKey).sort(); + for (const chunkHash of keys) { + const chunkPath = path.join(tmpDir, `${chunkHash}.json`); + const inMemory = cache.entries.get(chunkHash); + if (inMemory !== undefined) { + let payload: string; + try { + payload = JSON.stringify(inMemory, mapReplacer); + } catch { + continue; + } + await fs.writeFile(chunkPath, payload, 'utf-8'); + continue; + } + const existingPath = getCacheChunkPath(storagePath, chunkHash); try { - payload = JSON.stringify(chunkResults, mapReplacer); + await fs.copyFile(existingPath, chunkPath); } catch { - // Extremely dense chunks could theoretically exceed string limits; skip - // rather than failing the entire save (orchestrator catches save errors). - continue; + /* shard missing — skip; next run treats as cache miss */ } - keys.push(chunkHash); - const chunkPath = path.join(tmpDir, `${chunkHash}.json`); - await fs.writeFile(chunkPath, payload, 'utf-8'); } const index: ShardedParseCacheIndex = { @@ -295,11 +380,21 @@ export const pruneCache = (cache: ParseCache, usedHashes: ReadonlySet): removed++; } } + if (cache.onDiskKeys) { + for (const k of cache.onDiskKeys) { + if (!usedHashes.has(k)) { + cache.onDiskKeys.delete(k); + removed++; + } + } + } return removed; }; -const emptyCache = (): ParseCache => ({ +const emptyCache = (storagePath?: string): ParseCache => ({ version: PARSE_CACHE_VERSION, entries: new Map(), usedKeys: new Set(), + storagePath, + onDiskKeys: storagePath ? new Set() : undefined, }); diff --git a/gitnexus/test/unit/incremental-parse-cache.test.ts b/gitnexus/test/unit/incremental-parse-cache.test.ts index eed82c5e8f..3ca0fbddcd 100644 --- a/gitnexus/test/unit/incremental-parse-cache.test.ts +++ b/gitnexus/test/unit/incremental-parse-cache.test.ts @@ -7,8 +7,11 @@ import { computeChunkHash, fileContentHash, loadParseCache, + loadParseCacheChunk, + persistParseCacheChunk, saveParseCache, pruneCache, + slimParseWorkerResultsForCache, type ParseCache, } from '../../src/storage/parse-cache.js'; import type { ParseWorkerResult } from '../../src/core/ingestion/workers/parse-worker.js'; @@ -217,7 +220,7 @@ describe('loadParseCache / saveParseCache (round-trip)', () => { } }); - it('skips corrupt or missing shards while loading the sharded cache', async () => { + it('skips corrupt or missing shards while loading the sharded cache index', async () => { const dir = await mkdtemp(path.join(tmpdir(), 'gnx-pc-')); try { const fs = await import('fs/promises'); @@ -242,8 +245,10 @@ describe('loadParseCache / saveParseCache (round-trip)', () => { await fs.writeFile(path.join(cacheDir, `${badKey}.json`), '{not-json', 'utf-8'); const loaded = await loadParseCache(dir); - expect(loaded.entries.size).toBe(1); - expect(loaded.entries.get(goodKey)?.[0]?.fileCount).toBe(3); + expect(loaded.entries.size).toBe(0); + expect(loaded.onDiskKeys?.size).toBe(3); + const chunk = await loadParseCacheChunk(loaded, goodKey); + expect(chunk?.[0]?.fileCount).toBe(3); } finally { await rm(dir, { recursive: true, force: true }); } @@ -290,7 +295,7 @@ describe('loadParseCache / saveParseCache (round-trip)', () => { expect(persisted).toContain('index.json'); expect(persisted).toContain(`${chunkKey}.json`); const loaded = await loadParseCache(dir); - const reloaded = loaded.entries.get(chunkKey)?.[0]; + const reloaded = (await loadParseCacheChunk(loaded, chunkKey))?.[0]; expect(reloaded).toBeDefined(); const scope = (reloaded as ParseWorkerResult).parsedFiles[0]?.scopes[0] as unknown as { typeBindings?: unknown; @@ -327,8 +332,9 @@ describe('loadParseCache / saveParseCache (round-trip)', () => { 'utf-8', ); const loaded = await loadParseCache(dir); - expect(loaded.entries.size).toBe(1); - expect(loaded.entries.get(safeKey)?.[0]?.fileCount).toBe(9); + expect(loaded.onDiskKeys?.size).toBe(1); + const chunk = await loadParseCacheChunk(loaded, safeKey); + expect(chunk?.[0]?.fileCount).toBe(9); } finally { await rm(dir, { recursive: true, force: true }); } @@ -356,7 +362,7 @@ describe('loadParseCache / saveParseCache (round-trip)', () => { expect(names).toContain('index.json'); expect(names.filter((n) => n.endsWith('.json') && n !== 'index.json').length).toBe(3); const loaded = await loadParseCache(dir); - expect(loaded.entries.size).toBe(3); + expect(loaded.onDiskKeys?.size).toBe(3); } finally { await rm(dir, { recursive: true, force: true }); } @@ -408,8 +414,9 @@ describe('loadParseCache / saveParseCache (round-trip)', () => { expect(names).not.toContain(`${k1}.json`); expect(names).toContain(`${k2}.json`); const loaded = await loadParseCache(dir); - expect(loaded.entries.size).toBe(1); - expect(loaded.entries.get(k2)?.[0]?.fileCount).toBe(99); + expect(loaded.onDiskKeys?.size).toBe(1); + const chunk = await loadParseCacheChunk(loaded, k2); + expect(chunk?.[0]?.fileCount).toBe(99); } finally { await rm(dir, { recursive: true, force: true }); } @@ -435,8 +442,56 @@ describe('loadParseCache / saveParseCache (round-trip)', () => { }); await expect(fs.access(path.join(dir, 'parse-cache.json'))).rejects.toThrow(); const loaded = await loadParseCache(dir); - expect(loaded.entries.get(k)?.[0]?.fileCount).toBe(6); - expect(loaded.entries.has('oldLegacy')).toBe(false); + const chunk = await loadParseCacheChunk(loaded, k); + expect(chunk?.[0]?.fileCount).toBe(6); + expect(loaded.onDiskKeys?.has(k)).toBe(true); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); + + it('slimParseWorkerResultsForCache drops legacy DAG fields', () => { + const raw = minimalResult({ + calls: [{ filePath: 'a.c', calleeName: 'f', line: 1 } as never], + assignments: [ + { filePath: 'a.c', sourceId: 's', receiverText: 'x', propertyName: 'y', line: 1 }, + ], + constructorBindings: [{ filePath: 'a.c', bindings: [] }], + parsedFiles: [ + { + filePath: 'a.c', + moduleScope: 'm', + scopes: [], + parsedImports: [], + localDefs: [], + referenceSites: [], + }, + ], + }); + const slim = slimParseWorkerResultsForCache([raw])[0]; + expect(slim.calls).toEqual([]); + expect(slim.assignments).toEqual([]); + expect(slim.constructorBindings).toEqual([]); + expect(slim.parsedFiles).toEqual([]); + expect(slim.fileCount).toBe(raw.fileCount); + }); + + it('persistParseCacheChunk writes to disk without retaining in-memory entries', async () => { + const dir = await mkdtemp(path.join(tmpdir(), 'gnx-pc-')); + try { + const key = '7'.repeat(64); + const cache: ParseCache = { + version: PARSE_CACHE_VERSION, + entries: new Map(), + usedKeys: new Set(), + storagePath: dir, + onDiskKeys: new Set(), + }; + await persistParseCacheChunk(cache, key, [minimalResult({ fileCount: 11 })]); + expect(cache.entries.has(key)).toBe(false); + expect(cache.onDiskKeys?.has(key)).toBe(true); + const chunk = await loadParseCacheChunk(cache, key); + expect(chunk?.[0]?.fileCount).toBe(11); } finally { await rm(dir, { recursive: true, force: true }); }