diff --git a/gitnexus/src/cli/analyze.ts b/gitnexus/src/cli/analyze.ts index 77855fbb9d..b75edda752 100644 --- a/gitnexus/src/cli/analyze.ts +++ b/gitnexus/src/cli/analyze.ts @@ -9,12 +9,14 @@ import { execFileSync } from 'child_process'; import v8 from 'v8'; import cliProgress from 'cli-progress'; import { runPipelineFromRepo } from '../core/ingestion/pipeline.js'; -import { initLbug, loadGraphToLbug, getLbugStats, executeQuery, executeWithReusedStatement, closeLbug, createFTSIndex, loadCachedEmbeddings } from '../core/lbug/lbug-adapter.js'; +import { initLbug, loadGraphToLbug, getLbugStats, executeQuery, executeWithReusedStatement, closeLbug, createFTSIndex } from '../core/lbug/lbug-adapter.js'; // Embedding imports are lazy (dynamic import) so onnxruntime-node is never // loaded when embeddings are not requested. This avoids crashes on Node // versions whose ABI is not yet supported by the native binary (#89). // disposeEmbedder intentionally not called — ONNX Runtime segfaults on cleanup (see #38) import { getStoragePaths, saveMeta, loadMeta, addToGitignore, registerRepo, getGlobalRegistryPath, cleanupOldKuzuFiles } from '../storage/repo-manager.js'; +import { loadParseCache, saveParseCache, pruneCache } from '../storage/parse-cache.js'; +import { loadEmbeddingCache, loadEmbeddingCacheMeta, validateEmbeddingCacheMeta, saveEmbeddingCache, createEmptyEmbeddingCache } from '../storage/embedding-cache.js'; import { getCurrentCommit, getGitRoot, hasGitDir } from '../storage/git.js'; import { generateAIContextFiles } from './ai-context.js'; import { generateSkillFiles, type GeneratedSkillInfo } from './skill-gen.js'; @@ -199,29 +201,16 @@ export const analyzeCommand = async ( const t0Global = Date.now(); - // ── Cache embeddings from existing index before rebuild ──────────── - let cachedEmbeddingNodeIds = new Set(); - let cachedEmbeddings: Array<{ nodeId: string; embedding: number[] }> = []; - - if (options?.embeddings && existingMeta && !options?.force) { - try { - 
updateBar(0, 'Caching embeddings...'); - await initLbug(lbugPath); - const cached = await loadCachedEmbeddings(); - cachedEmbeddingNodeIds = cached.embeddingNodeIds; - cachedEmbeddings = cached.embeddings; - await closeLbug(); - } catch { - try { await closeLbug(); } catch {} - } - } + // ── Caches ────────────────────────────────────────────────────────── + // Both caches are content-addressed and always safe to reuse (even with --force). + const parseCache = await loadParseCache(storagePath); // ── Phase 1: Full Pipeline (0–60%) ───────────────────────────────── const pipelineResult = await runPipelineFromRepo(repoPath, (progress) => { const phaseLabel = PHASE_LABELS[progress.phase] || progress.phase; const scaled = Math.round(progress.percent * 0.6); updateBar(scaled, phaseLabel); - }); + }, { parseCache }); // ── Phase 2: LadybugDB (60–85%) ────────────────────────────────────── updateBar(60, 'Loading into LadybugDB...'); @@ -258,31 +247,8 @@ export const analyzeCommand = async ( } const ftsTime = ((Date.now() - t0Fts) / 1000).toFixed(1); - // ── Phase 3.5: Re-insert cached embeddings ──────────────────────── - if (cachedEmbeddings.length > 0) { - // Check if cached embedding dimensions match current schema - const cachedDims = cachedEmbeddings[0].embedding.length; - const { EMBEDDING_DIMS } = await import('../core/lbug/schema.js'); - if (cachedDims !== EMBEDDING_DIMS) { - // Dimensions changed (e.g. 
switched embedding model) — discard cache and re-embed all - console.error(`⚠️ Embedding dimensions changed (${cachedDims}d → ${EMBEDDING_DIMS}d), discarding cache`); - cachedEmbeddings = []; - cachedEmbeddingNodeIds = new Set(); - } else { - updateBar(88, `Restoring ${cachedEmbeddings.length} cached embeddings...`); - const EMBED_BATCH = 200; - for (let i = 0; i < cachedEmbeddings.length; i += EMBED_BATCH) { - const batch = cachedEmbeddings.slice(i, i + EMBED_BATCH); - const paramsList = batch.map(e => ({ nodeId: e.nodeId, embedding: e.embedding })); - try { - await executeWithReusedStatement( - `CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`, - paramsList, - ); - } catch { /* some may fail if node was removed, that's fine */ } - } - } - } + // Old LadybugDB-based embedding cache removed — replaced by file-based + // content-addressed cache in embedding-cache.json (passed into runEmbeddingPipeline). // ── Phase 4: Embeddings (90–98%) ────────────────────────────────── const stats = await getLbugStats(); @@ -300,9 +266,21 @@ export const analyzeCommand = async ( if (!embeddingSkipped) { const { isHttpMode } = await import('../core/embeddings/http-client.js'); + const { DEFAULT_EMBEDDING_CONFIG } = await import('../core/embeddings/types.js'); const httpMode = isHttpMode(); updateBar(90, httpMode ? 'Connecting to embedding endpoint...' : 'Loading embedding model...'); const t0Emb = Date.now(); + + // Check metadata first (tiny file) — only deserialize full cache if valid + const embMeta = await loadEmbeddingCacheMeta(storagePath); + let embeddingCache: import('../storage/embedding-cache.js').EmbeddingCache; + if (embMeta && validateEmbeddingCacheMeta(embMeta, DEFAULT_EMBEDDING_CONFIG.dimensions, DEFAULT_EMBEDDING_CONFIG.modelId)) { + const fullCache = await loadEmbeddingCache(storagePath); + embeddingCache = fullCache ?? 
createEmptyEmbeddingCache(DEFAULT_EMBEDDING_CONFIG.dimensions, DEFAULT_EMBEDDING_CONFIG.modelId); + } else { + embeddingCache = createEmptyEmbeddingCache(DEFAULT_EMBEDDING_CONFIG.dimensions, DEFAULT_EMBEDDING_CONFIG.modelId); + } + const { runEmbeddingPipeline } = await import('../core/embeddings/embedding-pipeline.js'); await runEmbeddingPipeline( executeQuery, @@ -315,8 +293,9 @@ export const analyzeCommand = async ( updateBar(scaled, label); }, {}, - cachedEmbeddingNodeIds.size > 0 ? cachedEmbeddingNodeIds : undefined, + embeddingCache, ); + await saveEmbeddingCache(storagePath, embeddingCache); embeddingTime = ((Date.now() - t0Emb) / 1000).toFixed(1); } @@ -344,6 +323,7 @@ export const analyzeCommand = async ( }, }; await saveMeta(storagePath, meta); + await saveParseCache(storagePath, parseCache); await registerRepo(repoPath, meta); // Only attempt to update .gitignore when a .git directory is present. // Use hasGitDir (filesystem check) rather than git CLI subprocess @@ -397,10 +377,11 @@ export const analyzeCommand = async ( bar.stop(); // ── Summary ─────────────────────────────────────────────────────── - const embeddingsCached = cachedEmbeddings.length > 0; - console.log(`\n Repository indexed successfully (${totalTime}s)${embeddingsCached ? ` [${cachedEmbeddings.length} embeddings cached]` : ''}\n`); + console.log(`\n Repository indexed successfully (${totalTime}s)\n`); console.log(` ${stats.nodes.toLocaleString()} nodes | ${stats.edges.toLocaleString()} edges | ${pipelineResult.communityResult?.stats.totalCommunities || 0} clusters | ${pipelineResult.processResult?.stats.totalProcesses || 0} flows`); - console.log(` LadybugDB ${lbugTime}s | FTS ${ftsTime}s | Embeddings ${embeddingSkipped ? embeddingSkipReason : embeddingTime + 's'}`); + const cs = pipelineResult.cacheStats; + const cacheInfo = cs ? 
`Parse cache: ${cs.hits} cached, ${cs.misses} parsed | ` : ''; + console.log(` ${cacheInfo}LadybugDB ${lbugTime}s | FTS ${ftsTime}s | Embeddings ${embeddingSkipped ? embeddingSkipReason : embeddingTime + 's'}`); console.log(` ${repoPath}`); if (aiContext.files.length > 0) { diff --git a/gitnexus/src/core/embeddings/embedding-pipeline.ts b/gitnexus/src/core/embeddings/embedding-pipeline.ts index bb99a6ee2c..08912621c4 100644 --- a/gitnexus/src/core/embeddings/embedding-pipeline.ts +++ b/gitnexus/src/core/embeddings/embedding-pipeline.ts @@ -11,6 +11,7 @@ import { initEmbedder, embedBatch, embedText, embeddingToArray, isEmbedderReady } from './embedder.js'; import { generateBatchEmbeddingTexts, generateEmbeddingText } from './text-generator.js'; +import { embeddingTextHash, type EmbeddingCache } from '../../storage/embedding-cache.js'; import { type EmbeddingProgress, type EmbeddingConfig, @@ -142,58 +143,25 @@ const createVectorIndex = async ( * @param executeWithReusedStatement - Function to execute with reused prepared statement * @param onProgress - Callback for progress updates * @param config - Optional configuration override - * @param skipNodeIds - Optional set of node IDs that already have embeddings (incremental mode) + * @param embeddingCache - Optional content-addressed embedding cache (survives --force) */ export const runEmbeddingPipeline = async ( executeQuery: (cypher: string) => Promise, executeWithReusedStatement: (cypher: string, paramsList: Array>) => Promise, onProgress: EmbeddingProgressCallback, config: Partial = {}, - skipNodeIds?: Set, + embeddingCache?: EmbeddingCache, ): Promise => { const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config }; try { - // Phase 1: Load embedding model - onProgress({ - phase: 'loading-model', - percent: 0, - modelDownloadPercent: 0, - }); - - if (!isEmbedderReady()) { - await initEmbedder((modelProgress: ModelProgress) => { - const downloadPercent = modelProgress.progress ?? 
0; - onProgress({ - phase: 'loading-model', - percent: Math.round(downloadPercent * 0.2), - modelDownloadPercent: downloadPercent, - }); - }, finalConfig); - } - - onProgress({ - phase: 'loading-model', - percent: 20, - modelDownloadPercent: 100, - }); - if (isDev) { console.log('🔍 Querying embeddable nodes...'); } - // Phase 2: Query embeddable nodes + // Phase 1: Query embeddable nodes (before loading model — allows early exit if all cached) let nodes = await queryEmbeddableNodes(executeQuery); - // Incremental mode: filter out nodes that already have embeddings - if (skipNodeIds && skipNodeIds.size > 0) { - const beforeCount = nodes.length; - nodes = nodes.filter(n => !skipNodeIds.has(n.id)); - if (isDev) { - console.log(`📦 Incremental embeddings: ${beforeCount} total, ${skipNodeIds.size} cached, ${nodes.length} to embed`); - } - } - const totalNodes = nodes.length; if (isDev) { @@ -210,15 +178,79 @@ export const runEmbeddingPipeline = async ( return; } - // Phase 3: Batch embed nodes + // Single-pass: hash all texts, split into cached/uncached, derive allCached + const allTexts = generateBatchEmbeddingTexts(nodes, finalConfig); const batchSize = finalConfig.batchSize; - const totalBatches = Math.ceil(totalNodes / batchSize); let processedNodes = 0; + let cacheHits = 0; + + const usedHashes = new Set(); + const uncachedIndices: number[] = []; + const cachedUpdates: Array<{ id: string; embedding: number[] }> = []; + + for (let i = 0; i < nodes.length; i++) { + const hash = embeddingTextHash(allTexts[i]); + usedHashes.add(hash); + const cached = embeddingCache?.entries[hash]; + if (cached) { + cachedUpdates.push({ id: nodes[i].id, embedding: cached.embedding }); + cacheHits++; + } else { + uncachedIndices.push(i); + } + } + + const allCached = uncachedIndices.length === 0; + + // Phase 2: Load embedding model (only if we have uncached nodes) + if (!allCached) { + onProgress({ + phase: 'loading-model', + percent: 0, + modelDownloadPercent: 0, + }); + + if 
(!isEmbedderReady()) { + await initEmbedder((modelProgress: ModelProgress) => { + const downloadPercent = modelProgress.progress ?? 0; + onProgress({ + phase: 'loading-model', + percent: Math.round(downloadPercent * 0.2), + modelDownloadPercent: downloadPercent, + }); + }, finalConfig); + } + + onProgress({ + phase: 'loading-model', + percent: 20, + modelDownloadPercent: 100, + }); + } + + // Insert cached embeddings in bulk + if (cachedUpdates.length > 0) { + const BULK_BATCH = 200; + for (let i = 0; i < cachedUpdates.length; i += BULK_BATCH) { + const slice = cachedUpdates.slice(i, i + BULK_BATCH); + await batchInsertEmbeddings(executeWithReusedStatement, slice); + } + processedNodes += cachedUpdates.length; + } + + if (isDev && cacheHits > 0) { + console.log(`📦 Embedding cache: ${cacheHits} cached, ${uncachedIndices.length} to embed`); + } + + // Embed only uncached nodes + const uncachedNodes = uncachedIndices.map(i => nodes[i]); + const uncachedTexts = uncachedIndices.map(i => allTexts[i]); + const totalBatches = Math.ceil(uncachedNodes.length / batchSize); onProgress({ phase: 'embedding', percent: 20, - nodesProcessed: 0, + nodesProcessed: processedNodes, totalNodes, currentBatch: 0, totalBatches, @@ -226,24 +258,27 @@ export const runEmbeddingPipeline = async ( for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) { const start = batchIndex * batchSize; - const end = Math.min(start + batchSize, totalNodes); - const batch = nodes.slice(start, end); - - // Generate texts for this batch - const texts = generateBatchEmbeddingTexts(batch, finalConfig); + const end = Math.min(start + batchSize, uncachedNodes.length); + const batchNodes = uncachedNodes.slice(start, end); + const batchTexts = uncachedTexts.slice(start, end); // Embed the batch - const embeddings = await embedBatch(texts); - - // Update LadybugDB with embeddings - const updates = batch.map((node, i) => ({ - id: node.id, - embedding: embeddingToArray(embeddings[i]), - })); + const 
embeddings = await embedBatch(batchTexts); + + // Build updates and store in cache + const updates = batchNodes.map((node, i) => { + const vec = embeddingToArray(embeddings[i]); + // Store in file-based cache for next run + if (embeddingCache) { + const hash = embeddingTextHash(batchTexts[i]); + embeddingCache.entries[hash] = { embedding: Array.from(vec) }; + } + return { id: node.id, embedding: vec }; + }); await batchInsertEmbeddings(executeWithReusedStatement, updates); - processedNodes += batch.length; + processedNodes += batchNodes.length; // Report progress (20-90% for embedding phase) const embeddingProgress = 20 + ((processedNodes / totalNodes) * 70); @@ -257,6 +292,20 @@ export const runEmbeddingPipeline = async ( }); } + // Prune stale entries from embedding cache (symbols that no longer exist) + if (embeddingCache) { + let pruned = 0; + for (const hash of Object.keys(embeddingCache.entries)) { + if (!usedHashes.has(hash)) { + delete embeddingCache.entries[hash]; + pruned++; + } + } + if (isDev && pruned > 0) { + console.log(`🧹 Pruned ${pruned} stale embedding cache entries`); + } + } + // Phase 4: Create vector index onProgress({ phase: 'indexing', diff --git a/gitnexus/src/core/ingestion/parsing-processor.ts b/gitnexus/src/core/ingestion/parsing-processor.ts index c9f5eb12d9..2dcfddc7a2 100644 --- a/gitnexus/src/core/ingestion/parsing-processor.ts +++ b/gitnexus/src/core/ingestion/parsing-processor.ts @@ -32,6 +32,97 @@ export interface WorkerExtractedData { typeEnvBindings: FileTypeEnvBindings[]; } +/** Replay a ParseWorkerResult into the graph, symbol table, and accumulators. + * Shared by the worker merge path, sequential path replay, and cache replay. 
*/ +export function mergeParseResult( + graph: KnowledgeGraph, + symbolTable: SymbolTable, + result: ParseWorkerResult, + accumulators: WorkerExtractedData, +): void { + for (const node of result.nodes) { + graph.addNode({ + id: node.id, + label: node.label as any, + properties: node.properties, + }); + } + + for (const rel of result.relationships) { + graph.addRelationship(rel); + } + + for (const sym of result.symbols) { + symbolTable.add(sym.filePath, sym.name, sym.nodeId, sym.type, { + parameterCount: sym.parameterCount, + requiredParameterCount: sym.requiredParameterCount, + parameterTypes: sym.parameterTypes, + returnType: sym.returnType, + declaredType: sym.declaredType, + ownerId: sym.ownerId, + }); + } + + accumulators.imports.push(...result.imports); + accumulators.calls.push(...result.calls); + accumulators.assignments.push(...result.assignments); + accumulators.heritage.push(...result.heritage); + accumulators.routes.push(...result.routes); + accumulators.fetchCalls.push(...result.fetchCalls); + accumulators.decoratorRoutes.push(...result.decoratorRoutes); + accumulators.toolDefs.push(...result.toolDefs); + if (result.ormQueries) accumulators.ormQueries.push(...result.ormQueries); + accumulators.constructorBindings.push(...result.constructorBindings); + accumulators.typeEnvBindings.push(...result.typeEnvBindings); +} + +/** Index raw ParseWorkerResults into a per-file Map in O(n) total. + * Replaces the O(n×m) filterRawResultsByFile approach. 
*/ +export function indexResultsByFile(rawResults: ParseWorkerResult[]): Map { + const byFile = new Map(); + const nodeFileMap = new Map(); + + const emptyResult = (): ParseWorkerResult => ({ + nodes: [], relationships: [], symbols: [], + imports: [], calls: [], assignments: [], heritage: [], + routes: [], fetchCalls: [], decoratorRoutes: [], toolDefs: [], + ormQueries: [], constructorBindings: [], typeEnvBindings: [], + skippedLanguages: {}, fileCount: 1, + }); + + const getEntry = (filePath: string): ParseWorkerResult => { + let entry = byFile.get(filePath); + if (!entry) { entry = emptyResult(); byFile.set(filePath, entry); } + return entry; + }; + + for (const raw of rawResults) { + for (const n of raw.nodes) { + const fp = n.properties.filePath; + nodeFileMap.set(n.id, fp); + getEntry(fp).nodes.push(n); + } + for (const r of raw.relationships) { + const fp = nodeFileMap.get(r.sourceId) ?? nodeFileMap.get(r.targetId); + if (fp) getEntry(fp).relationships.push(r); + } + for (const s of raw.symbols) getEntry(s.filePath).symbols.push(s); + for (const i of raw.imports) getEntry(i.filePath).imports.push(i); + for (const c of raw.calls) getEntry(c.filePath).calls.push(c); + for (const a of raw.assignments) getEntry(a.filePath).assignments.push(a); + for (const h of raw.heritage) getEntry(h.filePath).heritage.push(h); + for (const r of raw.routes) getEntry(r.filePath).routes.push(r); + for (const f of raw.fetchCalls) getEntry(f.filePath).fetchCalls.push(f); + for (const d of raw.decoratorRoutes) getEntry(d.filePath).decoratorRoutes.push(d); + for (const t of raw.toolDefs) getEntry(t.filePath).toolDefs.push(t); + for (const o of raw.ormQueries) getEntry(o.filePath).ormQueries.push(o); + for (const c of raw.constructorBindings) getEntry(c.filePath).constructorBindings.push(c); + for (const t of raw.typeEnvBindings) getEntry(t.filePath).typeEnvBindings.push(t); + } + + return byFile; +} + // ============================================================================ // 
Worker-based parallel parsing // ============================================================================ @@ -43,7 +134,7 @@ const processParsingWithWorkers = async ( astCache: ASTCache, workerPool: WorkerPool, onFileProgress?: FileProgressCallback, -): Promise => { +): Promise => { // Filter to parseable files only const parseableFiles: ParseWorkerInput[] = []; for (const file of files) { @@ -64,52 +155,13 @@ const processParsingWithWorkers = async ( ); // Merge results from all workers into graph and symbol table - const allImports: ExtractedImport[] = []; - const allCalls: ExtractedCall[] = []; - const allAssignments: ExtractedAssignment[] = []; - const allHeritage: ExtractedHeritage[] = []; - const allRoutes: ExtractedRoute[] = []; - const allFetchCalls: ExtractedFetchCall[] = []; - const allDecoratorRoutes: ExtractedDecoratorRoute[] = []; - const allToolDefs: ExtractedToolDef[] = []; - const allORMQueries: ExtractedORMQuery[] = []; - const allConstructorBindings: FileConstructorBindings[] = []; - const allTypeEnvBindings: FileTypeEnvBindings[] = []; + const accumulators: WorkerExtractedData = { + imports: [], calls: [], assignments: [], heritage: [], routes: [], + fetchCalls: [], decoratorRoutes: [], toolDefs: [], ormQueries: [], + constructorBindings: [], typeEnvBindings: [], + }; for (const result of chunkResults) { - for (const node of result.nodes) { - graph.addNode({ - id: node.id, - label: node.label as any, - properties: node.properties, - }); - } - - for (const rel of result.relationships) { - graph.addRelationship(rel); - } - - for (const sym of result.symbols) { - symbolTable.add(sym.filePath, sym.name, sym.nodeId, sym.type, { - parameterCount: sym.parameterCount, - requiredParameterCount: sym.requiredParameterCount, - parameterTypes: sym.parameterTypes, - returnType: sym.returnType, - declaredType: sym.declaredType, - ownerId: sym.ownerId, - }); - } - - allImports.push(...result.imports); - allCalls.push(...result.calls); - 
allAssignments.push(...result.assignments); - allHeritage.push(...result.heritage); - allRoutes.push(...result.routes); - allFetchCalls.push(...result.fetchCalls); - allDecoratorRoutes.push(...result.decoratorRoutes); - allToolDefs.push(...result.toolDefs); - if (result.ormQueries) allORMQueries.push(...result.ormQueries); - allConstructorBindings.push(...result.constructorBindings); - allTypeEnvBindings.push(...result.typeEnvBindings); + mergeParseResult(graph, symbolTable, result, accumulators); } // Merge and log skipped languages from workers @@ -128,7 +180,7 @@ const processParsingWithWorkers = async ( // Final progress onFileProgress?.(total, total, 'done'); - return { imports: allImports, calls: allCalls, assignments: allAssignments, heritage: allHeritage, routes: allRoutes, fetchCalls: allFetchCalls, decoratorRoutes: allDecoratorRoutes, toolDefs: allToolDefs, ormQueries: allORMQueries, constructorBindings: allConstructorBindings, typeEnvBindings: allTypeEnvBindings }; + return { ...accumulators, rawResults: chunkResults }; }; // ============================================================================ @@ -423,7 +475,7 @@ export const processParsing = async ( astCache: ASTCache, onFileProgress?: FileProgressCallback, workerPool?: WorkerPool, -): Promise => { +): Promise<(WorkerExtractedData & { rawResults?: ParseWorkerResult[] }) | null> => { if (workerPool) { try { return await processParsingWithWorkers(graph, files, symbolTable, astCache, workerPool, onFileProgress); diff --git a/gitnexus/src/core/ingestion/pipeline.ts b/gitnexus/src/core/ingestion/pipeline.ts index 39294214ed..3ad21c0c81 100644 --- a/gitnexus/src/core/ingestion/pipeline.ts +++ b/gitnexus/src/core/ingestion/pipeline.ts @@ -2,7 +2,8 @@ import { createKnowledgeGraph } from '../graph/graph.js'; import { processStructure } from './structure-processor.js'; import { processMarkdown } from './markdown-processor.js'; import { processCobol, isCobolFile, isJclFile } from './cobol-processor.js'; 
-import { processParsing } from './parsing-processor.js'; +import { processParsing, mergeParseResult, indexResultsByFile, type WorkerExtractedData } from './parsing-processor.js'; +import { contentHash, pruneCache, type ParseCache } from '../../storage/parse-cache.js'; import { processImports, processImportsFromExtracted, @@ -400,6 +401,8 @@ async function runCrossFileBindingPropagation( export interface PipelineOptions { /** Skip MRO, community detection, and process extraction for faster test runs. */ skipGraphPhases?: boolean; + /** Parse cache from previous run. Files with matching content hashes skip Tree-sitter. */ + parseCache?: import('../../storage/parse-cache.js').ParseCache; } // ── Extracted pipeline phases ────────────────────────────────────────────── @@ -528,6 +531,11 @@ async function runScanAndStructure( * @writes graph (Symbol nodes, IMPORTS/CALLS/EXTENDS/IMPLEMENTS/ACCESSES edges) * @writes ctx.symbolTable, ctx.importMap, ctx.namedImportMap, ctx.moduleAliasMap */ + +/** Extract a single file's data from raw ParseWorkerResult arrays. + * Used to populate the parse cache with per-file entries after a fresh parse. 
*/ +// filterRawResultsByFile removed — replaced by O(n) indexResultsByFile in parsing-processor.ts + async function runChunkedParseAndResolve( graph: ReturnType, ctx: ReturnType, @@ -537,6 +545,7 @@ async function runChunkedParseAndResolve( repoPath: string, pipelineStart: number, onProgress: ProgressFn, + parseCache?: ParseCache, ): Promise<{ exportedTypeMap: ExportedTypeMap; allFetchCalls: ExtractedFetchCall[]; @@ -544,6 +553,7 @@ async function runChunkedParseAndResolve( allDecoratorRoutes: ExtractedDecoratorRoute[]; allToolDefs: ExtractedToolDef[]; allORMQueries: ExtractedORMQuery[]; + cacheStats: { hits: number; misses: number }; }> { const symbolTable = ctx.symbols; @@ -666,6 +676,16 @@ async function runChunkedParseAndResolve( const allToolDefs: ExtractedToolDef[] = []; const allORMQueries: ExtractedORMQuery[] = []; + // Parse cache: replay cached results for unchanged files, only parse changed ones + let cacheHits = 0; + let cacheMisses = 0; + // Accumulator for replaying cached results into the graph + const cacheReplayAccumulators: WorkerExtractedData = { + imports: [], calls: [], assignments: [], heritage: [], routes: [], + fetchCalls: [], decoratorRoutes: [], toolDefs: [], ormQueries: [], + constructorBindings: [], typeEnvBindings: [], + }; + try { for (let chunkIdx = 0; chunkIdx < numChunks; chunkIdx++) { const chunkPaths = chunks[chunkIdx]; @@ -676,28 +696,80 @@ async function runChunkedParseAndResolve( .filter(p => chunkContents.has(p)) .map(p => ({ path: p, content: chunkContents.get(p)! 
})); - // Parse this chunk (workers or sequential fallback) - const chunkWorkerData = await processParsing( - graph, chunkFiles, symbolTable, astCache, - (current, _total, filePath) => { - const globalCurrent = filesParsedSoFar + current; - const parsingProgress = 20 + ((globalCurrent / totalParseable) * 62); - onProgress({ - phase: 'parsing', - percent: Math.round(parsingProgress), - message: `Parsing chunk ${chunkIdx + 1}/${numChunks}...`, - detail: filePath, - stats: { filesProcessed: globalCurrent, totalFiles: totalParseable, nodesCreated: graph.nodeCount }, - }); - }, - workerPool, - ); + // Split files into cache hits (replay) and cache misses (need parsing) + let filesToParse = chunkFiles; + const fileHashes = new Map(); + if (parseCache && Object.keys(parseCache.entries).length > 0) { + const uncached: typeof chunkFiles = []; + for (const file of chunkFiles) { + const entry = parseCache.entries[file.path]; + const hash = contentHash(file.content); + fileHashes.set(file.path, hash); + if (entry && entry.hash === hash) { + // Cache hit — replay stored result into graph + mergeParseResult(graph, symbolTable, entry.result, cacheReplayAccumulators); + cacheHits++; + } else { + uncached.push(file); + cacheMisses++; + } + } + filesToParse = uncached; + } else { + cacheMisses += chunkFiles.length; + } + + // Parse only uncached files (workers or sequential fallback) + const chunkWorkerData = filesToParse.length > 0 + ? 
await processParsing( + graph, filesToParse, symbolTable, astCache, + (current, _total, filePath) => { + const globalCurrent = filesParsedSoFar + current; + const parsingProgress = 20 + ((globalCurrent / totalParseable) * 62); + onProgress({ + phase: 'parsing', + percent: Math.round(parsingProgress), + message: `Parsing chunk ${chunkIdx + 1}/${numChunks}...`, + detail: filePath, + stats: { filesProcessed: globalCurrent, totalFiles: totalParseable, nodesCreated: graph.nodeCount }, + }); + }, + workerPool, + ) + : null; + + // Store freshly parsed results in cache for next run (O(n) indexing) + if (parseCache && chunkWorkerData?.rawResults) { + const perFileMap = indexResultsByFile(chunkWorkerData.rawResults); + for (const file of filesToParse) { + const hash = fileHashes.get(file.path) || contentHash(file.content); + const perFileResult = perFileMap.get(file.path); + if (perFileResult) { + parseCache.entries[file.path] = { hash, result: perFileResult }; + } + } + } const chunkBasePercent = 20 + ((filesParsedSoFar / totalParseable) * 62); - if (chunkWorkerData) { + // Merge cached replay data + freshly parsed data, then drain cache accumulators + const drainAndMerge = (cached: T[], fresh: T[] | undefined): T[] => { + const merged = cached.length > 0 ? [...cached, ...(fresh ?? [])] : (fresh ?? 
[]); + cached.length = 0; + return merged; + }; + const chunkImports = drainAndMerge(cacheReplayAccumulators.imports, chunkWorkerData?.imports); + const chunkCalls = drainAndMerge(cacheReplayAccumulators.calls, chunkWorkerData?.calls); + const chunkHeritage = drainAndMerge(cacheReplayAccumulators.heritage, chunkWorkerData?.heritage); + const chunkRoutes = drainAndMerge(cacheReplayAccumulators.routes, chunkWorkerData?.routes); + const chunkAssignments = drainAndMerge(cacheReplayAccumulators.assignments, chunkWorkerData?.assignments); + const chunkConstructorBindings = drainAndMerge(cacheReplayAccumulators.constructorBindings, chunkWorkerData?.constructorBindings); + + const hasExtractedData = chunkWorkerData || chunkImports.length > 0; + + if (hasExtractedData) { // Imports - await processImportsFromExtracted(graph, allPathObjects, chunkWorkerData.imports, ctx, (current, total) => { + await processImportsFromExtracted(graph, allPathObjects, chunkImports, ctx, (current, total) => { onProgress({ phase: 'parsing', percent: Math.round(chunkBasePercent), @@ -718,7 +790,7 @@ async function runChunkedParseAndResolve( // it activates only if incremental export collection is added per-chunk. 
if (exportedTypeMap.size > 0 && ctx.namedImportMap.size > 0) { const { enrichedCount } = seedCrossFileReceiverTypes( - chunkWorkerData.calls, ctx.namedImportMap, exportedTypeMap, + chunkCalls, ctx.namedImportMap, exportedTypeMap, ); if (isDev && enrichedCount > 0) { console.log(`🔗 E1: Seeded ${enrichedCount} cross-file receiver types (chunk ${chunkIdx + 1})`); @@ -730,7 +802,7 @@ async function runChunkedParseAndResolve( await Promise.all([ processCallsFromExtracted( graph, - chunkWorkerData.calls, + chunkCalls, ctx, (current, total) => { onProgress({ @@ -741,11 +813,11 @@ async function runChunkedParseAndResolve( stats: { filesProcessed: filesParsedSoFar, totalFiles: totalParseable, nodesCreated: graph.nodeCount }, }); }, - chunkWorkerData.constructorBindings, + chunkConstructorBindings, ), processHeritageFromExtracted( graph, - chunkWorkerData.heritage, + chunkHeritage, ctx, (current, total) => { onProgress({ @@ -759,7 +831,7 @@ async function runChunkedParseAndResolve( ), processRoutesFromExtracted( graph, - chunkWorkerData.routes ?? 
[], + chunkRoutes, ctx, (current, total) => { onProgress({ @@ -773,34 +845,29 @@ async function runChunkedParseAndResolve( ), ]); // Process field write assignments (synchronous, runs after calls resolve) - if (chunkWorkerData.assignments?.length) { - processAssignmentsFromExtracted(graph, chunkWorkerData.assignments, ctx, chunkWorkerData.constructorBindings); - } - // Collect TypeEnv file-scope bindings for exported type enrichment - if (chunkWorkerData.typeEnvBindings?.length) { - workerTypeEnvBindings.push(...chunkWorkerData.typeEnvBindings); - } - // Collect fetch() calls for Next.js route matching - if (chunkWorkerData.fetchCalls?.length) { - allFetchCalls.push(...chunkWorkerData.fetchCalls); - } - if (chunkWorkerData.routes?.length) { - allExtractedRoutes.push(...chunkWorkerData.routes); - } - if (chunkWorkerData.decoratorRoutes?.length) { - allDecoratorRoutes.push(...chunkWorkerData.decoratorRoutes); + if (chunkAssignments.length > 0) { + processAssignmentsFromExtracted(graph, chunkAssignments, ctx, chunkConstructorBindings); } - if (chunkWorkerData.toolDefs?.length) { - allToolDefs.push(...chunkWorkerData.toolDefs); - } - if (chunkWorkerData.ormQueries?.length) { - allORMQueries.push(...chunkWorkerData.ormQueries); - } - } else { + } else if (filesToParse.length > 0) { + // Sequential fallback — no worker data AND no cache hits with extracted data await processImports(graph, chunkFiles, astCache, ctx, undefined, repoPath, allPaths); sequentialChunkPaths.push(chunkPaths); } + // Drain remaining accumulators unconditionally to prevent leak across chunks + const chunkTypeEnvBindings = drainAndMerge(cacheReplayAccumulators.typeEnvBindings, chunkWorkerData?.typeEnvBindings); + const chunkFetchCalls = drainAndMerge(cacheReplayAccumulators.fetchCalls, chunkWorkerData?.fetchCalls); + const chunkDecoratorRoutes = drainAndMerge(cacheReplayAccumulators.decoratorRoutes, chunkWorkerData?.decoratorRoutes); + const chunkToolDefs = 
drainAndMerge(cacheReplayAccumulators.toolDefs, chunkWorkerData?.toolDefs); + const chunkOrmQueries = drainAndMerge(cacheReplayAccumulators.ormQueries, chunkWorkerData?.ormQueries); + + if (chunkTypeEnvBindings.length > 0) workerTypeEnvBindings.push(...chunkTypeEnvBindings); + if (chunkFetchCalls.length > 0) allFetchCalls.push(...chunkFetchCalls); + if (chunkRoutes.length > 0) allExtractedRoutes.push(...chunkRoutes); + if (chunkDecoratorRoutes.length > 0) allDecoratorRoutes.push(...chunkDecoratorRoutes); + if (chunkToolDefs.length > 0) allToolDefs.push(...chunkToolDefs); + if (chunkOrmQueries.length > 0) allORMQueries.push(...chunkOrmQueries); + filesParsedSoFar += chunkFiles.length; // Clear AST cache between chunks to free memory @@ -891,7 +958,7 @@ async function runChunkedParseAndResolve( importCtx.index = EMPTY_INDEX; // Release suffix index memory (~30MB for large repos) importCtx.normalizedFileList = []; - return { exportedTypeMap, allFetchCalls, allExtractedRoutes, allDecoratorRoutes, allToolDefs, allORMQueries }; + return { exportedTypeMap, allFetchCalls, allExtractedRoutes, allDecoratorRoutes, allToolDefs, allORMQueries, cacheStats: { hits: cacheHits, misses: cacheMisses } }; } /** @@ -1112,9 +1179,14 @@ export const runPipelineFromRepo = async ( // Phase 1+2: Scan paths, build structure, process markdown const { scannedFiles, allPaths, totalFiles } = await runScanAndStructure(repoPath, graph, onProgress); + // Prune cache entries for files that no longer exist + if (options?.parseCache) { + pruneCache(options.parseCache, new Set(allPaths)); + } + // Phase 3+4: Chunked parse + resolve (imports, calls, heritage, routes) - const { exportedTypeMap, allFetchCalls, allExtractedRoutes, allDecoratorRoutes, allToolDefs, allORMQueries } = await runChunkedParseAndResolve( - graph, ctx, scannedFiles, allPaths, totalFiles, repoPath, pipelineStart, onProgress, + const { exportedTypeMap, allFetchCalls, allExtractedRoutes, allDecoratorRoutes, allToolDefs, allORMQueries, 
cacheStats } = await runChunkedParseAndResolve( + graph, ctx, scannedFiles, allPaths, totalFiles, repoPath, pipelineStart, onProgress, options?.parseCache, ); // ── Phase 3.5: Route Registry (Next.js + PHP + Laravel + decorators) ── @@ -1411,7 +1483,7 @@ export const runPipelineFromRepo = async ( }, }); - return { graph, repoPath, totalFileCount: totalFiles, communityResult, processResult }; + return { graph, repoPath, totalFileCount: totalFiles, communityResult, processResult, cacheStats }; } catch (error) { ctx.clear(); throw error; diff --git a/gitnexus/src/core/lbug/lbug-adapter.ts b/gitnexus/src/core/lbug/lbug-adapter.ts index 679baf6643..a053242e3c 100644 --- a/gitnexus/src/core/lbug/lbug-adapter.ts +++ b/gitnexus/src/core/lbug/lbug-adapter.ts @@ -612,41 +612,6 @@ export const getLbugStats = async (): Promise<{ nodes: number; edges: number }> return { nodes: totalNodes, edges: totalEdges }; }; -/** - * Load cached embeddings from LadybugDB before a rebuild. - * Returns all embedding vectors so they can be re-inserted after the graph is reloaded, - * avoiding expensive re-embedding of unchanged nodes. - */ -export const loadCachedEmbeddings = async (): Promise<{ - embeddingNodeIds: Set; - embeddings: Array<{ nodeId: string; embedding: number[] }>; -}> => { - if (!conn) { - return { embeddingNodeIds: new Set(), embeddings: [] }; - } - - const embeddingNodeIds = new Set(); - const embeddings: Array<{ nodeId: string; embedding: number[] }> = []; - try { - const rows = await conn.query(`MATCH (e:${EMBEDDING_TABLE_NAME}) RETURN e.nodeId AS nodeId, e.embedding AS embedding`); - const result = Array.isArray(rows) ? rows[0] : rows; - for (const row of await result.getAll()) { - const nodeId = String(row.nodeId ?? row[0] ?? ''); - if (!nodeId) continue; - embeddingNodeIds.add(nodeId); - const embedding = row.embedding ?? row[1]; - if (embedding) { - embeddings.push({ - nodeId, - embedding: Array.isArray(embedding) ? 
/**
 * Shared cache I/O utilities.
 *
 * Helpers used by the on-disk caches: a SHA-256 hex digest plus JSON
 * load / save / delete inside a storage directory. Saves are written to a
 * temporary file and then renamed over the target, so a reader never observes
 * a partially written cache file.
 */

import crypto from 'crypto';
import fs from 'fs/promises';
import path from 'path';

/**
 * Hash a string with SHA-256.
 *
 * @param content Text to hash.
 * @returns The digest as 64 lowercase hexadecimal characters.
 */
export function sha256(content: string): string {
  return crypto.createHash('sha256').update(content).digest('hex');
}

/**
 * Load a versioned JSON cache file.
 *
 * @param storagePath Directory containing the cache file.
 * @param filename Name of the cache file inside `storagePath`.
 * @param expectedVersion Value the file's top-level `version` field must equal.
 * @returns The parsed object, or `null` when the file is missing, unreadable,
 *   not valid JSON, not a JSON object, or carries a different `version`.
 */
export async function loadJsonCache<T extends { version: number }>(
  storagePath: string,
  filename: string,
  expectedVersion: number,
): Promise<T | null> {
  let parsed: unknown;
  try {
    const raw = await fs.readFile(path.join(storagePath, filename), 'utf-8');
    parsed = JSON.parse(raw);
  } catch {
    // Best effort: a missing, unreadable, or corrupt file is treated as "no cache".
    return null;
  }

  // Reject `null`, primitives, and arrays explicitly instead of relying on a
  // property access throwing or returning undefined.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) return null;
  if ((parsed as { version?: unknown }).version !== expectedVersion) return null;

  // Only the version is checked here; callers own validation of the remaining shape.
  return parsed as T;
}

/**
 * Serialize `data` as JSON and write it to `<storagePath>/<filename>`.
 *
 * The directory is created if needed. The payload goes to a uniquely named
 * `*.tmp` sibling first and is then renamed over the target. The unique name
 * keeps concurrent saves of the same cache from writing into each other's
 * temp file, and the temp file is removed again if the write or rename fails.
 *
 * @param storagePath Directory to write into (created recursively).
 * @param filename Name of the cache file inside `storagePath`.
 * @param data JSON-serializable value to store.
 * @throws TypeError when `data` has no JSON representation (e.g. `undefined`).
 * @throws Any file-system error raised by mkdir, writeFile, or rename.
 */
export async function saveJsonCache(
  storagePath: string,
  filename: string,
  data: unknown,
): Promise<void> {
  const serialized: string | undefined = JSON.stringify(data);
  if (serialized === undefined) {
    throw new TypeError(`Cannot write ${filename}: value is not JSON-serializable`);
  }

  await fs.mkdir(storagePath, { recursive: true });
  const cachePath = path.join(storagePath, filename);
  // Keeps the `.tmp` suffix so existing "no *.tmp left behind" checks still apply.
  const uniqueTag = `${process.pid}.${crypto.randomBytes(6).toString('hex')}`;
  const tmpPath = `${cachePath}.${uniqueTag}.tmp`;

  try {
    await fs.writeFile(tmpPath, serialized, 'utf-8');
    await fs.rename(tmpPath, cachePath);
  } catch (error: unknown) {
    try {
      await fs.rm(tmpPath, { force: true });
    } catch {
      // Cleanup is best effort; the original failure below is the one that matters.
    }
    throw error;
  }
}

/**
 * Remove `<storagePath>/<filename>` if it exists. Never throws.
 *
 * @param storagePath Directory containing the cache file.
 * @param filename Name of the cache file inside `storagePath`.
 */
export async function deleteJsonCache(
  storagePath: string,
  filename: string,
): Promise<void> {
  try {
    // `force: true` already makes a missing file a no-op.
    await fs.rm(path.join(storagePath, filename), { force: true });
  } catch {
    // Deliberately best effort: any other removal failure is ignored too, so
    // callers must not assume the file is gone afterwards.
  }
}
/**
 * Embedding Cache — cache for per-node embedding vectors.
 * Entries are keyed by a hash of the embedding input text (`embeddingTextHash`,
 * i.e. SHA-256), so they live in plain files in the storage directory.
 *
 * Metadata (version, dimensions, modelId) is stored separately in
 * embedding-cache-meta.json so callers can check staleness without
 * deserializing the full entries file. Entries are stored in a single
 * embedding-cache.json. Both files are written via tmp+rename.
 *
 * The meta file is the commit marker: a cache without a readable,
 * version-matching meta file is treated as absent.
 */

import { sha256, loadJsonCache, saveJsonCache, deleteJsonCache } from './cache-io.js';
import fs from 'fs/promises';
import path from 'path';

export const EMBEDDING_CACHE_VERSION = 1;
const ENTRIES_FILENAME = 'embedding-cache.json';
const META_FILENAME = 'embedding-cache-meta.json';

export { sha256 as embeddingTextHash };

/** One cached vector. */
export interface EmbeddingCacheEntry {
  embedding: number[];
}

/** Contents of the meta file: everything needed to decide whether entries are reusable. */
export interface EmbeddingCacheMeta {
  version: number;
  dimensions: number;
  modelId: string;
}

/** In-memory form of the cache: the metadata plus all entries keyed by text hash. */
export interface EmbeddingCache {
  version: number;
  dimensions: number;
  modelId: string;
  entries: Record<string, EmbeddingCacheEntry>;
}

/**
 * Create a cache with no entries for the given model configuration.
 *
 * @param dimensions Vector length produced by the model.
 * @param modelId Identifier of the embedding model.
 */
export function createEmptyEmbeddingCache(dimensions: number, modelId: string): EmbeddingCache {
  return { version: EMBEDDING_CACHE_VERSION, dimensions, modelId, entries: {} };
}

/**
 * Read only the metadata file.
 *
 * @returns The metadata, or `null` when the file is missing, unreadable, has a
 *   different `version`, or lacks a numeric `dimensions` / string `modelId`.
 */
export async function loadEmbeddingCacheMeta(storagePath: string): Promise<EmbeddingCacheMeta | null> {
  const meta = await loadJsonCache<EmbeddingCacheMeta>(
    storagePath,
    META_FILENAME,
    EMBEDDING_CACHE_VERSION,
  );
  if (meta === null) return null;
  // The file is external input: do not hand back metadata that could never validate.
  if (typeof meta.dimensions !== 'number' || typeof meta.modelId !== 'string') return null;
  return meta;
}

/**
 * Check whether cached vectors were produced by the given model configuration.
 *
 * @returns `true` only when both `dimensions` and `modelId` match `meta`.
 */
export function validateEmbeddingCacheMeta(
  meta: EmbeddingCacheMeta,
  dimensions: number,
  modelId: string,
): boolean {
  return meta.dimensions === dimensions && meta.modelId === modelId;
}

/**
 * Load metadata and entries together.
 *
 * @returns The full cache, or `null` when the metadata is unusable (see
 *   `loadEmbeddingCacheMeta`) or the entries file is missing, unreadable, not
 *   valid JSON, or not a JSON object.
 */
export async function loadEmbeddingCache(storagePath: string): Promise<EmbeddingCache | null> {
  const meta = await loadEmbeddingCacheMeta(storagePath);
  if (meta === null) return null;

  let parsed: unknown;
  try {
    const raw = await fs.readFile(path.join(storagePath, ENTRIES_FILENAME), 'utf-8');
    parsed = JSON.parse(raw);
  } catch {
    return null;
  }

  // Only the container shape is verified (`null` / arrays / primitives are
  // rejected); individual entries are not inspected here.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) return null;

  return {
    version: meta.version,
    dimensions: meta.dimensions,
    modelId: meta.modelId,
    entries: parsed as Record<string, EmbeddingCacheEntry>,
  };
}

/**
 * Persist the cache as an entries file plus a meta file.
 *
 * Ordering makes the meta file a real commit marker:
 *  1. if the metadata on disk differs from the metadata being saved, remove it
 *     first, so an interrupted save can never leave old metadata describing
 *     new entries;
 *  2. write the entries;
 *  3. write the metadata.
 * When the metadata on disk already matches, it is left in place during step 2
 * so a failed entries write does not discard a still-consistent cache.
 *
 * @throws Any file-system error raised while creating the directory, removing
 *   stale metadata, or writing either file.
 */
export async function saveEmbeddingCache(storagePath: string, cache: EmbeddingCache): Promise<void> {
  await fs.mkdir(storagePath, { recursive: true });

  const meta: EmbeddingCacheMeta = {
    version: cache.version,
    dimensions: cache.dimensions,
    modelId: cache.modelId,
  };

  const onDisk = await loadEmbeddingCacheMeta(storagePath);
  const metaUnchanged =
    onDisk !== null &&
    onDisk.version === meta.version &&
    validateEmbeddingCacheMeta(onDisk, meta.dimensions, meta.modelId);
  if (!metaUnchanged) {
    // Not deleteJsonCache: a failure to invalidate must surface, not be swallowed.
    await fs.rm(path.join(storagePath, META_FILENAME), { force: true });
  }

  // Write entries first — meta acts as the commit marker.
  await saveJsonCache(storagePath, ENTRIES_FILENAME, cache.entries);
  await saveJsonCache(storagePath, META_FILENAME, meta);
}

/** Remove both cache files if present. Never throws (see `deleteJsonCache`). */
export async function deleteEmbeddingCache(storagePath: string): Promise<void> {
  await deleteJsonCache(storagePath, ENTRIES_FILENAME);
  await deleteJsonCache(storagePath, META_FILENAME);
}
/**
 * Parse Cache — cache of per-file Tree-sitter parse results.
 * Entries are keyed by file path; each entry stores a `hash` next to the cached
 * result. `contentHash` (SHA-256) is exported for producing that hash — the
 * comparison that decides whether a file is unchanged happens in the caller,
 * not in this module.
 *
 * Storage: a single `parse-cache.json` inside the storage directory, written
 * with tmp+rename.
 */

import type { ParseWorkerResult } from '../core/ingestion/workers/parse-worker.js';
import { sha256, loadJsonCache, saveJsonCache, deleteJsonCache } from './cache-io.js';

export const PARSE_CACHE_VERSION = 1;
const FILENAME = 'parse-cache.json';

export { sha256 as contentHash };

/** Cached parse output for one file, plus the hash recorded when it was stored. */
export interface ParseCacheEntry {
  hash: string;
  result: ParseWorkerResult;
}

/** The whole cache: a format version and the entries keyed by file path. */
export interface ParseCache {
  version: number;
  entries: Record<string, ParseCacheEntry>;
}

/** Create a cache with the current version and no entries. */
export function createEmptyCache(): ParseCache {
  return { version: PARSE_CACHE_VERSION, entries: {} };
}

/**
 * Load the parse cache from `storagePath`.
 *
 * @returns The stored cache, or a fresh empty cache when the file is missing,
 *   unreadable, has a different version, or has no usable `entries` object.
 *   Never returns `null`.
 */
export async function loadParseCache(storagePath: string): Promise<ParseCache> {
  const loaded = await loadJsonCache<ParseCache>(storagePath, FILENAME, PARSE_CACHE_VERSION);
  if (loaded === null) return createEmptyCache();

  // loadJsonCache only vouches for `version`. A file such as `{"version":1}`
  // would otherwise make `pruneCache` throw on `Object.keys(undefined)`.
  const entries: unknown = loaded.entries;
  if (typeof entries !== 'object' || entries === null || Array.isArray(entries)) {
    return createEmptyCache();
  }
  return loaded;
}

/** Write the cache to `storagePath`, creating the directory if needed. */
export async function saveParseCache(storagePath: string, cache: ParseCache): Promise<void> {
  await saveJsonCache(storagePath, FILENAME, cache);
}

/** Remove the cache file if present. */
export async function deleteParseCache(storagePath: string): Promise<void> {
  await deleteJsonCache(storagePath, FILENAME);
}

/**
 * Drop entries whose key is not in `currentPaths`. Mutates `cache` in place.
 *
 * @param cache Cache to prune.
 * @param currentPaths Paths that should be kept; compared to entry keys by exact string equality.
 * @returns Number of entries removed.
 */
export function pruneCache(cache: ParseCache, currentPaths: ReadonlySet<string>): number {
  const stale = Object.keys(cache.entries).filter((filePath) => !currentPaths.has(filePath));
  for (const filePath of stale) {
    delete cache.entries[filePath];
  }
  return stale.length;
}
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import fs from 'fs/promises';
import path from 'path';
import os from 'os';
import {
  EMBEDDING_CACHE_VERSION,
  embeddingTextHash,
  createEmptyEmbeddingCache,
  loadEmbeddingCache,
  loadEmbeddingCacheMeta,
  validateEmbeddingCacheMeta,
  saveEmbeddingCache,
  deleteEmbeddingCache,
  type EmbeddingCache,
  type EmbeddingCacheMeta,
} from '../../src/storage/embedding-cache.js';

// File names the module writes; asserted on directly so a rename is caught here.
const META_FILE = 'embedding-cache-meta.json';
const ENTRIES_FILE = 'embedding-cache.json';

/** Build a cache for (384, modelId) holding the given entries. */
function cacheWith(entries: EmbeddingCache['entries'], modelId = 'test-model'): EmbeddingCache {
  return { ...createEmptyEmbeddingCache(384, modelId), entries };
}

describe('embedding-cache', () => {
  let tmpDir: string;

  /** Write a meta file by hand, bypassing saveEmbeddingCache. */
  const writeRawMeta = (meta: unknown): Promise<void> =>
    fs.writeFile(path.join(tmpDir, META_FILE), JSON.stringify(meta), 'utf-8');

  beforeEach(async () => {
    tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'gitnexus-emb-cache-test-'));
  });

  afterEach(async () => {
    await fs.rm(tmpDir, { recursive: true, force: true });
  });

  describe('embeddingTextHash', () => {
    it('is deterministic', () => {
      expect(embeddingTextHash('Function: foo\nFile: bar.ts')).toBe(
        embeddingTextHash('Function: foo\nFile: bar.ts'),
      );
    });

    it('differs for different text', () => {
      expect(embeddingTextHash('Function: foo')).not.toBe(embeddingTextHash('Function: bar'));
    });

    it('returns a 64-char hex string', () => {
      expect(embeddingTextHash('test')).toMatch(/^[0-9a-f]{64}$/);
    });
  });

  describe('createEmptyEmbeddingCache', () => {
    it('has correct version, dimensions, and modelId', () => {
      expect(createEmptyEmbeddingCache(384, 'test-model')).toEqual({
        version: EMBEDDING_CACHE_VERSION,
        dimensions: 384,
        modelId: 'test-model',
        entries: {},
      });
    });
  });

  describe('loadEmbeddingCacheMeta', () => {
    it('returns null when no meta file', async () => {
      expect(await loadEmbeddingCacheMeta(tmpDir)).toBeNull();
    });

    it('returns null when version mismatches', async () => {
      await writeRawMeta({ version: -1, dimensions: 384, modelId: 'm' });
      expect(await loadEmbeddingCacheMeta(tmpDir)).toBeNull();
    });

    it('returns metadata when valid', async () => {
      await saveEmbeddingCache(tmpDir, cacheWith({ h1: { embedding: [1, 2, 3] } }, 'model-a'));
      expect(await loadEmbeddingCacheMeta(tmpDir)).toEqual({
        version: EMBEDDING_CACHE_VERSION,
        dimensions: 384,
        modelId: 'model-a',
      });
    });
  });

  describe('validateEmbeddingCacheMeta', () => {
    const meta: EmbeddingCacheMeta = { version: EMBEDDING_CACHE_VERSION, dimensions: 384, modelId: 'a' };

    it.each([
      ['matching dimensions and model', 384, 'a', true],
      ['mismatched dimensions', 768, 'a', false],
      ['mismatched model', 384, 'b', false],
    ] as const)('%s -> %s', (_label, dimensions, modelId, expected) => {
      expect(validateEmbeddingCacheMeta(meta, dimensions, modelId)).toBe(expected);
    });
  });

  describe('loadEmbeddingCache', () => {
    it('returns null when no files exist', async () => {
      expect(await loadEmbeddingCache(tmpDir)).toBeNull();
    });

    it('returns null when meta version mismatches', async () => {
      await writeRawMeta({ version: -1, dimensions: 384, modelId: 'm' });
      expect(await loadEmbeddingCache(tmpDir)).toBeNull();
    });

    it('returns null when meta exists but the entries file is missing', async () => {
      await writeRawMeta({ version: EMBEDDING_CACHE_VERSION, dimensions: 384, modelId: 'm' });
      expect(await loadEmbeddingCache(tmpDir)).toBeNull();
    });

    it('returns null when the entries file is not valid JSON', async () => {
      await writeRawMeta({ version: EMBEDDING_CACHE_VERSION, dimensions: 384, modelId: 'm' });
      await fs.writeFile(path.join(tmpDir, ENTRIES_FILE), '{not json', 'utf-8');
      expect(await loadEmbeddingCache(tmpDir)).toBeNull();
    });

    it('loads valid cache', async () => {
      const cache = cacheWith({ abc123: { embedding: [0.1, 0.2, 0.3] } });
      await saveEmbeddingCache(tmpDir, cache);
      expect(await loadEmbeddingCache(tmpDir)).toEqual(cache);
    });
  });

  describe('saveEmbeddingCache', () => {
    it('writes and loads back correctly', async () => {
      const cache = cacheWith({ h1: { embedding: [1, 2, 3] } }, 'model-a');
      await saveEmbeddingCache(tmpDir, cache);
      expect(await loadEmbeddingCache(tmpDir)).toEqual(cache);
    });

    it('creates directory if missing', async () => {
      const nested = path.join(tmpDir, 'deep', 'path');
      await saveEmbeddingCache(nested, createEmptyEmbeddingCache(384, 'x'));
      expect(await loadEmbeddingCache(nested)).not.toBeNull();
    });

    it('writes meta and entries files', async () => {
      await saveEmbeddingCache(tmpDir, cacheWith({ a1b2c3: { embedding: [1] } }, 'x'));
      const files = await fs.readdir(tmpDir);
      expect(files).toContain(META_FILE);
      expect(files).toContain(ENTRIES_FILE);
    });

    it('no temp files left behind', async () => {
      await saveEmbeddingCache(tmpDir, createEmptyEmbeddingCache(384, 'x'));
      const files = await fs.readdir(tmpDir);
      expect(files.filter((f) => f.endsWith('.tmp'))).toHaveLength(0);
    });
  });

  describe('deleteEmbeddingCache', () => {
    it('removes entries and meta files', async () => {
      await saveEmbeddingCache(tmpDir, createEmptyEmbeddingCache(384, 'x'));
      await deleteEmbeddingCache(tmpDir);
      const files = await fs.readdir(tmpDir);
      expect(files).not.toContain(ENTRIES_FILE);
      expect(files).not.toContain(META_FILE);
    });

    it('does not throw if nothing exists', async () => {
      await expect(deleteEmbeddingCache(tmpDir)).resolves.toBeUndefined();
    });
  });

  // Round-trips through disk, unlike the pure validateEmbeddingCacheMeta cases above.
  describe('model/dimension invalidation', () => {
    /** Save a (384, 'model-a') cache and read its metadata back from disk. */
    const savedMeta = async (): Promise<EmbeddingCacheMeta> => {
      await saveEmbeddingCache(tmpDir, cacheWith({ h1: { embedding: [1, 2, 3] } }, 'model-a'));
      const meta = await loadEmbeddingCacheMeta(tmpDir);
      if (meta === null) throw new Error('expected metadata to be readable after save');
      return meta;
    };

    it('cache with different dimensions is treated as stale by validateEmbeddingCacheMeta', async () => {
      expect(validateEmbeddingCacheMeta(await savedMeta(), 768, 'model-a')).toBe(false);
    });

    it('cache with different model is treated as stale by validateEmbeddingCacheMeta', async () => {
      expect(validateEmbeddingCacheMeta(await savedMeta(), 384, 'model-b')).toBe(false);
    });

    it('cache with matching dimensions and model is valid', async () => {
      expect(validateEmbeddingCacheMeta(await savedMeta(), 384, 'model-a')).toBe(true);
    });
  });
});
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import fs from 'fs/promises';
import path from 'path';
import os from 'os';
import {
  PARSE_CACHE_VERSION,
  contentHash,
  createEmptyCache,
  loadParseCache,
  saveParseCache,
  deleteParseCache,
  pruneCache,
  type ParseCache,
} from '../../src/storage/parse-cache.js';

// File name the module writes; asserted on directly so a rename is caught here.
const CACHE_FILE = 'parse-cache.json';

/**
 * Smallest parse result used as a fixture.
 * Typed loosely on purpose: these tests only need a value that survives a JSON
 * round trip, not a fully valid worker result.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function minimalResult(): any {
  return {
    nodes: [], relationships: [], symbols: [],
    imports: [], calls: [], assignments: [], heritage: [],
    routes: [], fetchCalls: [], decoratorRoutes: [], toolDefs: [],
    ormQueries: [], constructorBindings: [], typeEnvBindings: [],
    skippedLanguages: {}, fileCount: 0,
  };
}

/** Build a current-version cache from a `{ path: hash }` map. */
function cacheOf(hashes: Record<string, string>): ParseCache {
  const cache = createEmptyCache();
  for (const [filePath, hash] of Object.entries(hashes)) {
    cache.entries[filePath] = { hash, result: minimalResult() };
  }
  return cache;
}

describe('parse-cache', () => {
  let tmpDir: string;

  beforeEach(async () => {
    tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'gitnexus-parse-cache-test-'));
  });

  afterEach(async () => {
    await fs.rm(tmpDir, { recursive: true, force: true });
  });

  describe('contentHash', () => {
    it('is deterministic', () => {
      expect(contentHash('hello')).toBe(contentHash('hello'));
    });

    it('differs for different content', () => {
      expect(contentHash('hello')).not.toBe(contentHash('world'));
    });

    it('returns a 64-char hex string (SHA-256)', () => {
      expect(contentHash('test')).toMatch(/^[0-9a-f]{64}$/);
    });
  });

  describe('createEmptyCache', () => {
    it('has correct version', () => {
      expect(createEmptyCache().version).toBe(PARSE_CACHE_VERSION);
    });

    it('has empty entries', () => {
      expect(Object.keys(createEmptyCache().entries)).toHaveLength(0);
    });
  });

  describe('loadParseCache', () => {
    it('returns empty cache when no file exists', async () => {
      expect(await loadParseCache(tmpDir)).toEqual(createEmptyCache());
    });

    it('returns empty cache when version mismatches', async () => {
      await fs.writeFile(
        path.join(tmpDir, CACHE_FILE),
        JSON.stringify({ version: -1, entries: { 'stale.ts': { hash: 'h', result: minimalResult() } } }),
        'utf-8',
      );
      expect(await loadParseCache(tmpDir)).toEqual(createEmptyCache());
    });

    it('returns empty cache when the file is not valid JSON', async () => {
      await fs.writeFile(path.join(tmpDir, CACHE_FILE), '{not json', 'utf-8');
      expect(await loadParseCache(tmpDir)).toEqual(createEmptyCache());
    });

    it('loads valid cache', async () => {
      await saveParseCache(tmpDir, cacheOf({ 'foo.ts': 'abc' }));
      const loaded = await loadParseCache(tmpDir);
      expect(loaded.entries['foo.ts'].hash).toBe('abc');
    });
  });

  describe('saveParseCache', () => {
    it('writes and loads back correctly', async () => {
      const cache = cacheOf({ 'bar.rs': 'xyz' });
      await saveParseCache(tmpDir, cache);
      expect(await loadParseCache(tmpDir)).toEqual(cache);
    });

    it('creates directory if missing', async () => {
      const nested = path.join(tmpDir, 'sub', 'dir');
      await saveParseCache(nested, createEmptyCache());
      const loaded = await loadParseCache(nested);
      expect(loaded.version).toBe(PARSE_CACHE_VERSION);
    });

    it('writes a single cache file', async () => {
      await saveParseCache(tmpDir, cacheOf({ 'a.ts': '1' }));
      // Exact match: the directory must hold the cache file and nothing else.
      expect(await fs.readdir(tmpDir)).toEqual([CACHE_FILE]);
    });

    it('no temp files left behind', async () => {
      await saveParseCache(tmpDir, createEmptyCache());
      const files = await fs.readdir(tmpDir);
      expect(files.filter((f) => f.endsWith('.tmp'))).toHaveLength(0);
    });
  });

  describe('deleteParseCache', () => {
    it('removes cache file', async () => {
      await saveParseCache(tmpDir, createEmptyCache());
      await deleteParseCache(tmpDir);
      expect(await fs.readdir(tmpDir)).not.toContain(CACHE_FILE);
    });

    it('does not throw if nothing exists', async () => {
      await expect(deleteParseCache(tmpDir)).resolves.toBeUndefined();
    });
  });

  describe('pruneCache', () => {
    it('removes entries for deleted files', () => {
      const cache = cacheOf({ 'a.ts': '1', 'b.ts': '2', 'c.ts': '3' });
      const removed = pruneCache(cache, new Set(['a.ts', 'c.ts']));
      expect(removed).toBe(1);
      expect(Object.keys(cache.entries).sort()).toEqual(['a.ts', 'c.ts']);
    });

    it('preserves all entries when all files exist', () => {
      const cache = cacheOf({ 'x.ts': 'h' });
      const removed = pruneCache(cache, new Set(['x.ts']));
      expect(removed).toBe(0);
      expect(cache.entries['x.ts']).toBeDefined();
    });

    it('removes all entries when no files match', () => {
      const cache = cacheOf({ 'old.ts': 'h' });
      const removed = pruneCache(cache, new Set<string>([]));
      expect(removed).toBe(1);
      expect(Object.keys(cache.entries)).toHaveLength(0);
    });
  });
});