diff --git a/gitnexus/src/cli/analyze.ts b/gitnexus/src/cli/analyze.ts index e54601bab8..55b701f949 100644 --- a/gitnexus/src/cli/analyze.ts +++ b/gitnexus/src/cli/analyze.ts @@ -669,6 +669,7 @@ export interface AnalyzeOptions { embeddingBatchSize?: string; embeddingSubBatchSize?: string; embeddingDevice?: string; + json?: boolean; /** * Extra fetch-wrapper function names to treat as HTTP consumers (#1589/#1852 * residual). Supplied via `.gitnexusrc` `fetchWrappers: [...]`. Threaded into @@ -1310,11 +1311,24 @@ const analyzeCommandImpl = async ( // ── Summary ──────────────────────────────────────────────────── const s = result.stats; - console.log(`\n Repository indexed successfully (${totalTime}s)\n`); - console.log( - ` ${(s.nodes ?? 0).toLocaleString()} nodes | ${(s.edges ?? 0).toLocaleString()} edges | ${s.communities ?? 0} clusters | ${s.processes ?? 0} flows`, - ); - console.log(` ${repoPath}`); + if (options?.json) { + console.log(JSON.stringify({ repoPath, totalTime, ...s }, null, 2)); + } else { + console.log(`\n Repository indexed successfully (${totalTime}s)\n`); + console.log( + ` ${(s.nodes ?? 0).toLocaleString()} nodes | ${(s.edges ?? 0).toLocaleString()} edges | ${s.communities ?? 0} clusters | ${s.processes ?? 0} flows`, + ); + if (s.parserCoverage && s.parserCoverage.unsupportedFiles > 0) { + const pc = s.parserCoverage; + const topExts = pc.unsupportedByExtension + .slice(0, 5) + .map((e) => `${e.extension}: ${e.count}`); + console.log( + ` Skipped ${pc.unsupportedFiles} files with unsupported extensions (${topExts.join(', ')}${pc.unsupportedByExtension.length > 5 ? ', ...' 
: ''})`, + ); + } + console.log(` ${repoPath}`); + } // Persistent (non-scrolling) warning when FTS indexing was skipped — the // progress-bar log() that fired mid-run has already scrolled away, so the diff --git a/gitnexus/src/cli/index.ts b/gitnexus/src/cli/index.ts index ffe6a64838..5f126c4161 100644 --- a/gitnexus/src/cli/index.ts +++ b/gitnexus/src/cli/index.ts @@ -112,6 +112,7 @@ program .option('--embedding-batch-size ', 'Number of nodes per embedding batch') .option('--embedding-sub-batch-size ', 'Number of chunks per embedding model call') .option('--embedding-device ', 'Embedding device: auto, cpu, dml, cuda, or wasm') + .option('--json', 'Output analysis result as JSON (includes parserCoverage stats)') .addHelpText('after', () => t('help.analyze.environment')) .action(createLbugLazyAction(() => import('./analyze.js'), 'analyzeCommand')); diff --git a/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts b/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts index 3885df16e9..31e9114f0d 100644 --- a/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts +++ b/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts @@ -256,6 +256,13 @@ export async function runChunkedParseAndResolve( * files. There is no sequential parser — the pool is the sole parse path * whenever a chunk misses the cache. */ usedWorkerPool: boolean; + /** Parser coverage — which files were parsed vs skipped */ + parserCoverage: { + totalFiles: number; + supportedFiles: number; + unsupportedFiles: number; + unsupportedByExtension: Array<{ extension: string; count: number }>; + }; /** Worker-produced ParsedFile artifacts aggregated across chunks. 
* Threaded into scope-resolution as a re-extract cache so the warm- * cache analyze run can skip the dominant `extractParsedFile` cost @@ -270,6 +277,28 @@ export async function runChunkedParseAndResolve( return lang && isLanguageAvailable(lang); }); + // ── Parser coverage stats ────────────────────────────────────────── + const unsupportedExtCounts = new Map(); + for (const f of scannedFiles) { + const lang = getLanguageFromFilename(f.path); + if (!lang) { + const ext = path.extname(f.path).toLowerCase() || '(no extension)'; + unsupportedExtCounts.set(ext, (unsupportedExtCounts.get(ext) || 0) + 1); + } + } + const unsupportedByExtension = Array.from(unsupportedExtCounts.entries()) + .map(([extension, count]) => ({ extension, count })) + .sort((a, b) => b.count - a.count); + const unsupportedFiles = unsupportedByExtension.reduce((sum, e) => sum + e.count, 0); + const supportedFiles = parseableScanned.length; + + const parserCoverage = { + totalFiles: scannedFiles.length, + supportedFiles, + unsupportedFiles, + unsupportedByExtension, + }; + // Warn about files skipped due to unavailable parsers const skippedByLang = new Map(); for (const f of scannedFiles) { @@ -293,6 +322,14 @@ export async function runChunkedParseAndResolve( } } + // Warn about files with unsupported extensions (no grammar at all) + if (unsupportedFiles > 0) { + const topExts = unsupportedByExtension.slice(0, 5).map((e) => `${e.extension}: ${e.count}`); + logger.warn( + `Skipped ${unsupportedFiles} files with unsupported extensions (${topExts.join(', ')}${unsupportedByExtension.length > 5 ? ', ...' : ''})`, + ); + } + // Sort parseableScanned alphabetically for stable chunk membership // across runs (Finding 4). 
Without this, filesystem-scan order can // shift between runs (notably on macOS APFS where directory entry @@ -1162,6 +1199,7 @@ export async function runChunkedParseAndResolve( // no pool was needed: a warm all-cache-hit run replays cached worker output // without spawning workers, or there were no parseable files. usedWorkerPool: workerPool !== undefined, + parserCoverage, // Per-file ParsedFile artifacts produced by workers' calls to // `extractParsedFile`. Consumed by scope-resolution as a re-extraction // cache: when the file's ParsedFile is here, scope-resolution skips its own diff --git a/gitnexus/src/core/ingestion/pipeline-phases/parse.ts b/gitnexus/src/core/ingestion/pipeline-phases/parse.ts index 1875d99112..c1caf22e2c 100644 --- a/gitnexus/src/core/ingestion/pipeline-phases/parse.ts +++ b/gitnexus/src/core/ingestion/pipeline-phases/parse.ts @@ -31,6 +31,7 @@ import type { } from '../workers/parse-worker.js'; import { runChunkedParseAndResolve } from './parse-impl.js'; import type { MutableSemanticModel } from '../model/index.js'; +import type { ParserCoverage } from '../../../types/pipeline.js'; export interface ParseOutput { /** @@ -77,6 +78,8 @@ export interface ParseOutput { * costing ~58s on a 1000-file repo). 
*/ readonly parsedFiles: readonly ParsedFile[]; + /** Parser coverage — which files were parsed vs skipped */ + readonly parserCoverage: ParserCoverage; } export const parsePhase: PipelinePhase = { diff --git a/gitnexus/src/core/ingestion/pipeline.ts b/gitnexus/src/core/ingestion/pipeline.ts index f1f8326321..7e0cd446f3 100644 --- a/gitnexus/src/core/ingestion/pipeline.ts +++ b/gitnexus/src/core/ingestion/pipeline.ts @@ -40,6 +40,7 @@ import { type PipelinePhase, type CommunitiesOutput, type ProcessesOutput, + type ParseOutput, } from './pipeline-phases/index.js'; export interface PipelineOptions { @@ -237,10 +238,10 @@ export const runPipelineFromRepo = async ( }); // Extract final results for the PipelineResult contract - const { totalFiles, usedWorkerPool } = getPhaseOutput<{ - totalFiles: number; - usedWorkerPool: boolean; - }>(results, 'parse'); + const { totalFiles, usedWorkerPool, parserCoverage } = getPhaseOutput( + results, + 'parse', + ); let communityResult: CommunitiesOutput['communityResult'] | undefined; let processResult: ProcessesOutput['processResult'] | undefined; @@ -276,5 +277,6 @@ export const runPipelineFromRepo = async ( processResult, resolutionOutcomes, usedWorkerPool, + parserCoverage, }; }; diff --git a/gitnexus/src/core/run-analyze.ts b/gitnexus/src/core/run-analyze.ts index 66300b8b31..915fabfbe8 100644 --- a/gitnexus/src/core/run-analyze.ts +++ b/gitnexus/src/core/run-analyze.ts @@ -200,6 +200,12 @@ export interface AnalyzeResult { communities?: number; processes?: number; embeddings?: number; + parserCoverage?: { + totalFiles: number; + supportedFiles: number; + unsupportedFiles: number; + unsupportedByExtension: Array<{ extension: string; count: number }>; + }; }; alreadyUpToDate?: boolean; /** The raw pipeline result — only populated when needed by callers (e.g. skill generation). 
*/ @@ -1221,6 +1227,7 @@ export async function runFullAnalysis( communities: pipelineResult.communityResult?.stats.totalCommunities, processes: pipelineResult.processResult?.stats.totalProcesses, embeddings: embeddingCount, + parserCoverage: pipelineResult.parserCoverage, }, capabilities: { graph: { provider: 'ladybugdb', status: runtimeCapabilities.graph }, diff --git a/gitnexus/src/mcp/local/local-backend.ts b/gitnexus/src/mcp/local/local-backend.ts index f2d17b3bd0..16dd723682 100644 --- a/gitnexus/src/mcp/local/local-backend.ts +++ b/gitnexus/src/mcp/local/local-backend.ts @@ -259,6 +259,12 @@ export interface CodebaseContext { communityCount: number; processCount: number; }; + parserCoverage?: { + totalFiles: number; + supportedFiles: number; + unsupportedFiles: number; + unsupportedByExtension: Array<{ extension: string; count: number }>; + }; } interface RepoHandle { @@ -601,6 +607,7 @@ export class LocalBackend { communityCount: s.communities || 0, processCount: s.processes || 0, }, + parserCoverage: s.parserCoverage, }); } diff --git a/gitnexus/src/mcp/resources.ts b/gitnexus/src/mcp/resources.ts index 707a5e047c..87f303b585 100644 --- a/gitnexus/src/mcp/resources.ts +++ b/gitnexus/src/mcp/resources.ts @@ -286,6 +286,9 @@ async function getReposResource(backend: LocalBackend): Promise { lines.push(` files: ${repo.stats.files || 0}`); lines.push(` symbols: ${repo.stats.nodes || 0}`); lines.push(` processes: ${repo.stats.processes || 0}`); + if (repo.stats.parserCoverage?.unsupportedFiles) { + lines.push(` unsupported_files: ${repo.stats.parserCoverage.unsupportedFiles}`); + } } } @@ -330,6 +333,21 @@ async function getContextResource(backend: LocalBackend, repoName?: string): Pro lines.push(` files: ${context.stats.fileCount}`); lines.push(` symbols: ${context.stats.functionCount}`); lines.push(` processes: ${context.stats.processCount}`); + + if (context.parserCoverage && context.parserCoverage.unsupportedFiles > 0) { + const pc = context.parserCoverage; + 
lines.push(''); + lines.push('parser_coverage:'); + lines.push(` total_files: ${pc.totalFiles}`); + lines.push(` supported: ${pc.supportedFiles}`); + lines.push(` unsupported: ${pc.unsupportedFiles}`); + lines.push(' unsupported_by_extension:'); + for (const ext of pc.unsupportedByExtension.slice(0, 10)) { + lines.push(` - extension: "${ext.extension}"`); + lines.push(` count: ${ext.count}`); + } + } + lines.push(''); lines.push('tools_available:'); lines.push(' - query: Process-grouped code intelligence (execution flows related to a concept)'); diff --git a/gitnexus/src/storage/repo-manager.ts b/gitnexus/src/storage/repo-manager.ts index 5e9fd972f8..f2b52ece3b 100644 --- a/gitnexus/src/storage/repo-manager.ts +++ b/gitnexus/src/storage/repo-manager.ts @@ -82,6 +82,12 @@ export interface RepoMeta { communities?: number; processes?: number; embeddings?: number; + parserCoverage?: { + totalFiles: number; + supportedFiles: number; + unsupportedFiles: number; + unsupportedByExtension: Array<{ extension: string; count: number }>; + }; }; /** * Bumped whenever incremental-indexing invariants change in an diff --git a/gitnexus/src/types/pipeline.ts b/gitnexus/src/types/pipeline.ts index 5ad08cae64..29a8ad3737 100644 --- a/gitnexus/src/types/pipeline.ts +++ b/gitnexus/src/types/pipeline.ts @@ -3,6 +3,24 @@ import { CommunityDetectionResult } from '../core/ingestion/community-processor. 
import { ProcessDetectionResult } from '../core/ingestion/process-processor.js'; import type { ResolutionOutcome } from '../core/ingestion/scope-resolution/resolution-outcome.js'; +/** Per-extension breakdown of unsupported files */ +export interface UnsupportedExtension { + extension: string; + count: number; +} + +/** Parser coverage stats — tracks which files were parsed vs skipped */ +export interface ParserCoverage { + /** Total source files in repo (before language filtering) */ + totalFiles: number; + /** Files that entered the parse pipeline (language recognized and its parser available). Files whose language is recognized but whose parser is unavailable are counted in neither this nor `unsupportedFiles`, so the two need not sum to `totalFiles`. */ + supportedFiles: number; + /** Files whose name maps to no known language (no grammar defined); extensionless files are grouped under '(no extension)' */ + unsupportedFiles: number; + /** Per-extension breakdown of unsupported files, sorted by count desc */ + unsupportedByExtension: UnsupportedExtension[]; +} + // CLI-specific: in-memory result with graph + detection results export interface PipelineResult { graph: KnowledgeGraph; @@ -27,4 +45,6 @@ export interface PipelineResult { * affordance so regression suites can prove the pool engaged. 
*/ usedWorkerPool: boolean; + /** Parser coverage stats — which files were parsed vs skipped */ + parserCoverage?: ParserCoverage; } diff --git a/gitnexus/test/unit/language-availability-skip.test.ts b/gitnexus/test/unit/language-availability-skip.test.ts index 76ad659c28..79a375cf94 100644 --- a/gitnexus/test/unit/language-availability-skip.test.ts +++ b/gitnexus/test/unit/language-availability-skip.test.ts @@ -86,4 +86,42 @@ describe('native parser availability — unavailable language is skipped, not cr ); expect(warned).toBe(true); }); + + it('reports parser coverage for unsupported extensions without spawning a pool', async () => { + const files: Record = { + 'scripts/bootstrap.sh': '#!/usr/bin/env bash\necho hi\n', + 'data/report.csv': 'a,b,c\n', + }; + for (const [rel, content] of Object.entries(files)) { + const abs = path.join(repoDir, rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content); + } + + const scanned = Object.keys(files).map((rel) => ({ + path: rel, + size: fs.statSync(path.join(repoDir, rel)).size, + })); + + const result = await runChunkedParseAndResolve( + createKnowledgeGraph(), + scanned, + Object.keys(files), + Object.keys(files).length, + repoDir, + Date.now(), + () => {}, + ); + + expect(result.parserCoverage).toEqual({ + totalFiles: 2, + supportedFiles: 0, + unsupportedFiles: 2, + unsupportedByExtension: [ + { extension: '.sh', count: 1 }, + { extension: '.csv', count: 1 }, + ], + }); + expect(result.usedWorkerPool).toBe(false); + }); });