diff --git a/gitnexus/src/cli/analyze.ts b/gitnexus/src/cli/analyze.ts index 3f196b11d1..a3bf6cd5a6 100644 --- a/gitnexus/src/cli/analyze.ts +++ b/gitnexus/src/cli/analyze.ts @@ -37,7 +37,7 @@ import { } from './analyze-config.js'; import { runFullAnalysis } from '../core/run-analyze.js'; import { getMaxFileSizeBannerMessage } from '../core/ingestion/utils/max-file-size.js'; -import { warnMissingOptionalGrammars } from './optional-grammars.js'; +import { warnMissingOptionalGrammars, getOptionalGrammarExtensions } from './optional-grammars.js'; import { glob } from 'glob'; import fs from 'fs/promises'; import { cliError } from './cli-message.js'; @@ -943,11 +943,13 @@ const analyzeCommandImpl = async ( } // If the target repo contains files an optional grammar would parse but - // that grammar's native binding is absent, warn before analysis so users - // learn why those files end up unparsed instead of silently getting a - // degraded index. + // that grammar's native binding is absent (or disabled via + // GITNEXUS_SKIP_OPTIONAL_GRAMMARS), warn before analysis so users learn why + // those files end up unparsed instead of silently getting a degraded index. + // The extension set is derived from OPTIONAL_GRAMMARS so it can't drift. try { - const matches = await glob(['**/*.dart', '**/*.proto'], { + const optionalGlobs = getOptionalGrammarExtensions().map((e) => `**/*${e}`); + const matches = await glob(optionalGlobs, { cwd: repoPath, ignore: ['**/node_modules/**', '**/.git/**', '**/dist/**', '**/build/**'], dot: false, diff --git a/gitnexus/src/cli/optional-grammars.ts b/gitnexus/src/cli/optional-grammars.ts index e12b471e0e..994016a569 100644 --- a/gitnexus/src/cli/optional-grammars.ts +++ b/gitnexus/src/cli/optional-grammars.ts @@ -4,18 +4,22 @@ * tree-sitter-dart, tree-sitter-proto, and tree-sitter-swift are vendored * under vendor/ and materialized into node_modules/ at postinstall. 
Dart * and Proto are built from source with node-gyp; Swift ships platform - * prebuilds activated via node-gyp-build. All three can be skipped via + * prebuilds activated via node-gyp-build. tree-sitter-kotlin is a declared + * optionalDependency (not vendored). All can be skipped via * GITNEXUS_SKIP_OPTIONAL_GRAMMARS=1 (postinstall scripts), or can silently - * soft-fail when the toolchain is missing (Dart/Proto) or no prebuild - * matches the host platform (Swift). + * soft-fail when the toolchain is missing (Dart/Proto), when no prebuild + * matches the host platform (Swift), or when the optional install was + * skipped or its native build failed (Kotlin). * * Either path produces the same observable: the .node binding is absent * at runtime. This helper detects that condition and surfaces a single - * stderr line per missing grammar so users learn why .dart/.proto/.swift + * stderr line per missing grammar so users learn why .dart/.proto/.swift/.kt * support is unavailable instead of silently getting a degraded index. */ import { createRequire } from 'module'; +import { SupportedLanguages } from 'gitnexus-shared'; +import { isGrammarRuntimeSkipped } from '../core/tree-sitter/parser-loader.js'; import { cliWarn } from './cli-message.js'; const _require = createRequire(import.meta.url); @@ -27,17 +31,55 @@ interface OptionalGrammar { pkg: string; /** File extensions this grammar parses */ extensions: string[]; + /** + * SupportedLanguages id, when this grammar backs an ingestion language. + * Used to ask `isGrammarRuntimeSkipped` whether the grammar was disabled via + * `GITNEXUS_SKIP_OPTIONAL_GRAMMARS` (vs. genuinely missing). Omitted for + * `.proto`, which is a gRPC-extractor concern, not a SupportedLanguages. 
+ */ + language?: SupportedLanguages; } const OPTIONAL_GRAMMARS: OptionalGrammar[] = [ - { name: 'tree-sitter-dart', pkg: 'tree-sitter-dart', extensions: ['.dart'] }, + { + name: 'tree-sitter-dart', + pkg: 'tree-sitter-dart', + extensions: ['.dart'], + language: SupportedLanguages.Dart, + }, { name: 'tree-sitter-proto', pkg: 'tree-sitter-proto', extensions: ['.proto'] }, - { name: 'tree-sitter-swift', pkg: 'tree-sitter-swift', extensions: ['.swift'] }, + { + name: 'tree-sitter-swift', + pkg: 'tree-sitter-swift', + extensions: ['.swift'], + language: SupportedLanguages.Swift, + }, + { + name: 'tree-sitter-kotlin', + pkg: 'tree-sitter-kotlin', + extensions: ['.kt', '.kts'], + language: SupportedLanguages.Kotlin, + }, ]; +/** + * The file extensions backed by an optional grammar — the single source for + * the `analyze` preflight glob (so the glob can't drift from this list). + */ +export function getOptionalGrammarExtensions(): string[] { + return [...new Set(OPTIONAL_GRAMMARS.flatMap((g) => g.extensions))]; +} + export interface MissingGrammar { name: string; extensions: string[]; + /** + * `missing` — the native binding could not be loaded (not installed / build + * soft-failed / no prebuild). `skipped` — the binding is fine but the user + * disabled it via `GITNEXUS_SKIP_OPTIONAL_GRAMMARS`. Drives the warning text + * so a deliberate opt-out is not told to reinstall. + */ + reason: 'missing' | 'skipped'; } /** @@ -59,6 +101,13 @@ export interface MissingGrammar { export function detectMissingOptionalGrammars(): MissingGrammar[] { const missing: MissingGrammar[] = []; for (const g of OPTIONAL_GRAMMARS) { + // Deliberate runtime opt-out comes first: even an installed binding is + // treated as unavailable, with a `skipped` reason so the warning says so + // instead of suggesting a reinstall (#2101 review). 
+ if (g.language !== undefined && isGrammarRuntimeSkipped(g.language)) { + missing.push({ name: g.name, extensions: g.extensions, reason: 'skipped' }); + continue; + } try { _require(g.pkg); } catch (err) { @@ -80,7 +129,7 @@ export function detectMissingOptionalGrammars(): MissingGrammar[] { { grammar: g.name, extensions: g.extensions, error: msg }, ); } - missing.push({ name: g.name, extensions: g.extensions }); + missing.push({ name: g.name, extensions: g.extensions, reason: 'missing' }); } } return missing; @@ -110,9 +159,16 @@ export function warnMissingOptionalGrammars(opts?: { if (relevantExtensions && !g.extensions.some((e) => relevantExtensions.has(e))) { continue; } - cliWarn( - `GitNexus${ctx}: optional grammar "${g.name}" is unavailable — ${g.extensions.join('/')} files will not be parsed. Reinstall without GITNEXUS_SKIP_OPTIONAL_GRAMMARS=1 (and ensure python3, make, g++) to enable.`, - { grammar: g.name, extensions: g.extensions, context: opts?.context }, - ); + const exts = g.extensions.join('/'); + const message = + g.reason === 'skipped' + ? `GitNexus${ctx}: optional grammar "${g.name}" is disabled via GITNEXUS_SKIP_OPTIONAL_GRAMMARS — ${exts} files will not be parsed. Unset the variable to re-enable.` + : `GitNexus${ctx}: optional grammar "${g.name}" is unavailable — ${exts} files will not be parsed. 
Reinstall without GITNEXUS_SKIP_OPTIONAL_GRAMMARS=1 (and ensure python3, make, g++) to enable.`; + cliWarn(message, { + grammar: g.name, + extensions: g.extensions, + reason: g.reason, + context: opts?.context, + }); } } diff --git a/gitnexus/src/core/ingestion/languages/dart/query.ts b/gitnexus/src/core/ingestion/languages/dart/query.ts index 7d66b1c4a5..5314b0c8b1 100644 --- a/gitnexus/src/core/ingestion/languages/dart/query.ts +++ b/gitnexus/src/core/ingestion/languages/dart/query.ts @@ -23,7 +23,15 @@ */ import Parser from 'tree-sitter'; -import Dart from 'tree-sitter-dart'; +import { SupportedLanguages } from 'gitnexus-shared'; +// `tree-sitter-dart` is an optional/vendored grammar that may be absent on a +// default install. Loaded lazily + guarded via parser-loader rather than +// statically imported: this module is pulled onto the main thread eagerly by +// the scope-resolution registry and the language-provider index, so a top-level +// `import Dart from 'tree-sitter-dart'` would throw ERR_MODULE_NOT_FOUND at +// module-load and crash `analyze` even for repos with no Dart files (#2091, +// #2093). The grammar is only ever needed inside the lazy getters below. 
+import { getLanguageGrammar } from '../../../tree-sitter/parser-loader.js'; const DART_SCOPE_QUERY = ` ; ── Scopes ─────────────────────────────────────────────────────────────────── @@ -134,14 +142,19 @@ let _query: Parser.Query | null = null; export function getDartParser(): Parser { if (_parser === null) { _parser = new Parser(); - _parser.setLanguage(Dart as Parameters[0]); + _parser.setLanguage( + getLanguageGrammar(SupportedLanguages.Dart) as Parameters[0], + ); } return _parser; } export function getDartScopeQuery(): Parser.Query { if (_query === null) { - _query = new Parser.Query(Dart as Parameters[0], DART_SCOPE_QUERY); + _query = new Parser.Query( + getLanguageGrammar(SupportedLanguages.Dart) as Parameters[0], + DART_SCOPE_QUERY, + ); } return _query; } diff --git a/gitnexus/src/core/ingestion/languages/kotlin/query.ts b/gitnexus/src/core/ingestion/languages/kotlin/query.ts index 31613a38e3..f3a7490866 100644 --- a/gitnexus/src/core/ingestion/languages/kotlin/query.ts +++ b/gitnexus/src/core/ingestion/languages/kotlin/query.ts @@ -1,5 +1,13 @@ import Parser from 'tree-sitter'; -import Kotlin from 'tree-sitter-kotlin'; +import { SupportedLanguages } from 'gitnexus-shared'; +// `tree-sitter-kotlin` is an optionalDependency that may be absent on a default +// install (or fail its native build). Loaded lazily + guarded via parser-loader +// rather than statically imported: this module is pulled onto the main thread +// eagerly by the scope-resolution registry and the language-provider index, so +// a top-level `import Kotlin from 'tree-sitter-kotlin'` would throw +// ERR_MODULE_NOT_FOUND at module-load and crash `analyze` even for repos with no +// Kotlin files (#2091, #2093). The grammar is only ever needed in the getters. 
+import { getLanguageGrammar } from '../../../tree-sitter/parser-loader.js'; const KOTLIN_SCOPE_QUERY = ` ;; Scopes @@ -179,14 +187,19 @@ let query: Parser.Query | null = null; export function getKotlinParser(): Parser { if (parser === null) { parser = new Parser(); - parser.setLanguage(Kotlin as Parameters[0]); + parser.setLanguage( + getLanguageGrammar(SupportedLanguages.Kotlin) as Parameters[0], + ); } return parser; } export function getKotlinScopeQuery(): Parser.Query { if (query === null) { - query = new Parser.Query(Kotlin as Parameters[0], KOTLIN_SCOPE_QUERY); + query = new Parser.Query( + getLanguageGrammar(SupportedLanguages.Kotlin) as Parameters[0], + KOTLIN_SCOPE_QUERY, + ); } return query; } diff --git a/gitnexus/src/core/ingestion/languages/swift/query.ts b/gitnexus/src/core/ingestion/languages/swift/query.ts index cf26994232..78c1efa417 100644 --- a/gitnexus/src/core/ingestion/languages/swift/query.ts +++ b/gitnexus/src/core/ingestion/languages/swift/query.ts @@ -43,7 +43,15 @@ */ import Parser from 'tree-sitter'; -import Swift from 'tree-sitter-swift'; +import { SupportedLanguages } from 'gitnexus-shared'; +// `tree-sitter-swift` is an optional/vendored grammar that may be absent on a +// default install. It is loaded lazily + guarded via parser-loader rather than +// statically imported: this module is pulled onto the main thread eagerly by +// the scope-resolution registry and the language-provider index, so a top-level +// `import Swift from 'tree-sitter-swift'` would throw ERR_MODULE_NOT_FOUND at +// module-load and crash `analyze` even for repos with no Swift files (#2091, +// #2093). The grammar is only ever needed inside the lazy getters below. 
+import { getLanguageGrammar } from '../../../tree-sitter/parser-loader.js'; const SWIFT_SCOPE_QUERY = ` ;; ── Scopes ────────────────────────────────────────────────────────── @@ -186,14 +194,19 @@ let _query: Parser.Query | null = null; export function getSwiftParser(): Parser { if (_parser === null) { _parser = new Parser(); - _parser.setLanguage(Swift as Parameters[0]); + _parser.setLanguage( + getLanguageGrammar(SupportedLanguages.Swift) as Parameters[0], + ); } return _parser; } export function getSwiftScopeQuery(): Parser.Query { if (_query === null) { - _query = new Parser.Query(Swift as Parameters[0], SWIFT_SCOPE_QUERY); + _query = new Parser.Query( + getLanguageGrammar(SupportedLanguages.Swift) as Parameters[0], + SWIFT_SCOPE_QUERY, + ); } return _query; } diff --git a/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts b/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts index 244f178dd5..ddc8c902e1 100644 --- a/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts +++ b/gitnexus/src/core/ingestion/pipeline-phases/parse-impl.ts @@ -43,9 +43,13 @@ import { type ExportedTypeMap, } from '../call-processor.js'; import { createSemanticModel, type MutableSemanticModel } from '../model/index.js'; -import { type PipelineProgress, getLanguageFromFilename } from 'gitnexus-shared'; +import { + type PipelineProgress, + getLanguageFromFilename, + SupportedLanguages, +} from 'gitnexus-shared'; import { readFileContents } from '../filesystem-walker.js'; -import { isLanguageAvailable } from '../../tree-sitter/parser-loader.js'; +import { isLanguageAvailable, isGrammarRuntimeSkipped } from '../../tree-sitter/parser-loader.js'; import { createWorkerPool, workerPoolDisabledByEnv, @@ -274,9 +278,18 @@ export async function runChunkedParseAndResolve( } } for (const [lang, count] of skippedByLang) { - logger.warn( - `Skipping ${count} ${lang} file(s) — ${lang} parser not available (native binding may not have built). 
Try: npm rebuild tree-sitter-${lang}`, - ); + // Distinguish a deliberate runtime opt-out from a genuinely-missing binding + // so we don't tell a user who set GITNEXUS_SKIP_OPTIONAL_GRAMMARS to + // `npm rebuild` a grammar that built fine (#2091/#2093 review). + if (isGrammarRuntimeSkipped(lang as SupportedLanguages)) { + logger.warn( + `Skipping ${count} ${lang} file(s) — ${lang} parsing disabled via GITNEXUS_SKIP_OPTIONAL_GRAMMARS.`, + ); + } else { + logger.warn( + `Skipping ${count} ${lang} file(s) — ${lang} parser not available (native binding may not have built). Try: npm rebuild tree-sitter-${lang}`, + ); + } } // Sort parseableScanned alphabetically for stable chunk membership diff --git a/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts b/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts index 3c3f688ec6..d686bdd430 100644 --- a/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts +++ b/gitnexus/src/core/ingestion/scope-resolution/pipeline/phase.ts @@ -31,6 +31,7 @@ import type { ParseOutput } from '../../pipeline-phases/parse.js'; import { SupportedLanguages, getLanguageFromFilename } from 'gitnexus-shared'; import { readFileContents } from '../../filesystem-walker.js'; import { runScopeResolution, type ScopeResolutionSubPhase } from './run.js'; +import { isLanguageAvailable } from '../../../tree-sitter/parser-loader.js'; import { buildGraphNodeLookup } from '../graph-bridge/node-lookup.js'; import { SCOPE_RESOLVERS } from './registry.js'; import { isDev, isSemanticModelValidatorEnabled } from '../../utils/env.js'; @@ -170,6 +171,15 @@ export const scopeResolutionPhase: PipelinePhase = { for (const f of scannedFiles) { const fileLang = getLanguageFromFilename(f.path); if (fileLang === null) continue; + // Skip files whose grammar isn't available (optional grammars like + // swift/dart/kotlin on an install where the binding is absent or the + // user set GITNEXUS_SKIP_OPTIONAL_GRAMMARS). 
The parse phase already + // excluded and warned about these (parse-impl.ts); without this guard the + // file would fall through to the main-thread re-extract in run.ts and + // throw "Unsupported language" (caught, but noisy, and it needlessly + // loads the grammar on the main thread). `isLanguageAvailable` is + // memoized, so this stays O(1) per language. (#2091, #2093) + if (!isLanguageAvailable(fileLang)) continue; let bucket = filesByLang.get(fileLang); if (bucket === undefined) { bucket = []; diff --git a/gitnexus/src/core/ingestion/workers/parse-worker.ts b/gitnexus/src/core/ingestion/workers/parse-worker.ts index b0f7878550..b32d957589 100644 --- a/gitnexus/src/core/ingestion/workers/parse-worker.ts +++ b/gitnexus/src/core/ingestion/workers/parse-worker.ts @@ -36,6 +36,19 @@ import type { /** Language grammar type accepted by Parser.setLanguage(). */ type TreeSitterLanguage = Parameters[0]; +// ── Worker grammar loading — enforcement boundary (#2091/#2093, #2101) ─────── +// The worker maintains its own grammar table (the guarded `_require`s below + +// `languageMap`) and intentionally does NOT consult the runtime +// `GITNEXUS_SKIP_OPTIONAL_GRAMMARS` opt-out. It does not need to: the MAIN +// THREAD's `parseableScanned` filter (pipeline-phases/parse-impl.ts, gated on +// `parser-loader.isLanguageAvailable`, which honors the runtime opt-out and a +// genuinely-absent binding alike) excludes files of an unavailable/opted-out +// language BEFORE any chunk is dispatched, so the worker never receives them. +// That main-thread filter is the single enforcement point. Any future change +// that dispatches files to the worker WITHOUT first passing them through +// `isLanguageAvailable` must re-introduce the gate here. (The cleaner end-state +// — routing this table through `parser-loader.getLanguageGrammar` so there is +// one loader — is the deferred Tier-1 consolidation.) 
// tree-sitter-swift is an optionalDependency — may not be installed const _require = createRequire(import.meta.url); let Swift: TreeSitterLanguage | null = null; diff --git a/gitnexus/src/core/tree-sitter/parser-loader.ts b/gitnexus/src/core/tree-sitter/parser-loader.ts index a51634fbe5..3d378f2f03 100644 --- a/gitnexus/src/core/tree-sitter/parser-loader.ts +++ b/gitnexus/src/core/tree-sitter/parser-loader.ts @@ -39,6 +39,15 @@ interface GrammarSource { unavailableNote: string; optional?: boolean; severity?: 'warn' | 'error'; + /** + * When true, this grammar may be disabled at runtime via + * `GITNEXUS_SKIP_OPTIONAL_GRAMMARS`. Set ONLY on genuinely-optional grammars + * (optionalDependencies / vendored — swift/dart/kotlin). Required dependencies + * routed through the optional machinery for ABI safety (e.g. C, which is + * `optional: true` + `severity: 'error'`) must NOT set this — opting out of a + * required parser is always an install/platform problem, never a user choice. + */ + userSkippable?: boolean; } const ISSUES_URL = 'https://github.com/abhigyanpatwari/GitNexus/issues'; @@ -139,6 +148,7 @@ const SOURCES: Record = { [SupportedLanguages.Swift]: { load: () => _require('tree-sitter-swift'), optional: true, + userSkippable: true, unavailableNote: 'Swift parsing disabled: vendored `tree-sitter-swift` (under ' + '`gitnexus/vendor/tree-sitter-swift`) failed to load. ' + @@ -148,6 +158,7 @@ const SOURCES: Record = { [SupportedLanguages.Dart]: { load: () => _require('tree-sitter-dart'), optional: true, + userSkippable: true, unavailableNote: 'Dart parsing disabled: vendored `tree-sitter-dart` (under ' + '`gitnexus/vendor/tree-sitter-dart`) failed to load. 
' + @@ -157,6 +168,7 @@ const SOURCES: Record = { [SupportedLanguages.Kotlin]: { load: () => _require('tree-sitter-kotlin'), optional: true, + userSkippable: true, unavailableNote: 'Kotlin parsing disabled: `tree-sitter-kotlin` is an optionalDependency ' + 'and is not installed (or its native binding failed to build).', @@ -189,6 +201,63 @@ type LoadResult = const loadCache = new Map(); const logged = new Set(); +/** + * Runtime opt-out for genuinely-optional grammars (Swift/Dart/Kotlin). + * + * `GITNEXUS_SKIP_OPTIONAL_GRAMMARS` has historically been an *install-time* + * env only — the postinstall build scripts read it to skip building the + * vendored grammars. There was no way to disable an optional grammar at + * analyze time, so users on a platform with a broken/partial binding had no + * escape hatch short of uninstalling the package (#2091, #2093). This honors + * the same env name at runtime: when set, the named optional grammars report + * as unavailable and the pipeline skips their files (mirroring a genuinely + * absent binding) instead of attempting to load them. + * + * Accepts `1` / `true` / `all` / `*` (every skippable grammar), or a + * comma-separated list of language ids and/or package names + * (e.g. `swift,tree-sitter-dart`). Only grammars flagged `userSkippable` (the + * genuinely-optional swift/dart/kotlin) can be skipped — required dependencies + * routed through the optional machinery for ABI safety (C) carry no + * `userSkippable` and are never skippable here. + */ +type SkipDirective = 'all' | Set | null; + +// Parsed form of GITNEXUS_SKIP_OPTIONAL_GRAMMARS, resolved lazily ONCE per +// process. The env is set before analyze runs, so re-reading + re-allocating a +// Set on every call was wasted work (and a latent trap for any future per-file +// caller). `vi.resetModules()` gives the unit tests a fresh module — and thus a +// fresh memo — per case, so this stays test-friendly. 
+// 'all' → every userSkippable grammar; Set → only the named ids +// (and `tree-sitter-<id>` spellings); null → env unset/empty (nothing). +let _skipDirective: SkipDirective | undefined; +const skipDirective = (): SkipDirective => { + if (_skipDirective !== undefined) return _skipDirective; + const raw = (process.env.GITNEXUS_SKIP_OPTIONAL_GRAMMARS ?? '').trim().toLowerCase(); + if (raw === '') return (_skipDirective = null); + if (raw === '1' || raw === 'true' || raw === 'all' || raw === '*') + return (_skipDirective = 'all'); + return (_skipDirective = new Set( + raw + .split(',') + .map((s) => s.trim()) + .filter(Boolean) + .flatMap((s) => [s, s.replace(/^tree-sitter-/, '')]), + )); +}; + +const isRuntimeSkippedGrammar = (key: string, source: GrammarSource): boolean => { + // Only grammars explicitly flagged user-skippable (swift/dart/kotlin) — never + // required deps that use the optional machinery for ABI safety (C carries no + // `userSkippable`). + if (source.userSkippable !== true) return false; + const directive = skipDirective(); + if (directive === null) return false; + if (directive === 'all') return true; + // `key` is the SupportedLanguages value (e.g. `swift`); the directive Set + // always holds the bare id, even when given as `tree-sitter-<id>`. + return directive.has(key) || directive.has(`tree-sitter-${key}`); +}; + const logFailure = (key: string, result: LoadResult): void => { if (result.ok === true) return; if (logged.has(key)) return; @@ -227,6 +296,25 @@ const loadGrammar = (key: string): LoadResult => { return result; } + // Runtime opt-out: treat a user-skipped optional grammar exactly like an + // absent binding (non-fatal unavailable + one warning), without attempting + // the native load. See `isRuntimeSkippedGrammar`. 
+ if (isRuntimeSkippedGrammar(key, source)) { + // Deliberate opt-out: emit an accurate "disabled on purpose" note rather + // than `source.unavailableNote` (which blames a missing/unbuilt binding and + // would mislead a user who set the env intentionally — #2101 review). + const result: LoadResult = { + ok: false, + error: new Error('runtime opt-out'), + note: `${key} parsing disabled via GITNEXUS_SKIP_OPTIONAL_GRAMMARS (unset it to re-enable).`, + fatal: false, + severity: 'warn', + }; + loadCache.set(key, result); + logFailure(key, result); + return result; + } + let result: LoadResult; try { result = { ok: true, grammar: source.load() }; @@ -248,6 +336,22 @@ const loadGrammar = (key: string): LoadResult => { export const isLanguageAvailable = (language: SupportedLanguages, filePath?: string): boolean => loadGrammar(resolveLanguageKey(language, filePath)).ok; +/** + * True when `language`'s grammar is being treated as unavailable specifically + * because of the runtime GITNEXUS_SKIP_OPTIONAL_GRAMMARS opt-out — as opposed + * to a genuinely-missing/broken native binding. Lets callers surface an + * accurate "skipped on purpose" message instead of a spurious "npm rebuild" + * recovery hint. Returns false for required grammars and for an absent env. 
+ */ +export const isGrammarRuntimeSkipped = ( + language: SupportedLanguages, + filePath?: string, +): boolean => { + const key = resolveLanguageKey(language, filePath); + const source = SOURCES[key]; + return source !== undefined && isRuntimeSkippedGrammar(key, source); +}; + export const getLanguageGrammar = (language: SupportedLanguages, filePath?: string): unknown => { const key = resolveLanguageKey(language, filePath); const result = loadGrammar(key); diff --git a/gitnexus/test/integration/optional-grammars/registry-import-closure.test.ts b/gitnexus/test/integration/optional-grammars/registry-import-closure.test.ts new file mode 100644 index 0000000000..09acea0264 --- /dev/null +++ b/gitnexus/test/integration/optional-grammars/registry-import-closure.test.ts @@ -0,0 +1,130 @@ +/** + * Optional-grammar static-import-closure regression test (#2091, #2093). + * + * The scope-resolution registry (`scope-resolution/pipeline/registry.ts`) and + * the language-provider index statically import all 16 language providers. Each + * per-language `query.ts` used to do a top-level `import X from 'tree-sitter-Y'`. + * For the OPTIONAL grammars (swift/dart/kotlin) that import resolved — and on a + * default install where the vendored/optional binding is absent, THREW + * `ERR_MODULE_NOT_FOUND` — at module-load on the main thread, before any runtime + * gate, crashing `gitnexus analyze` regardless of the repo's actual languages. + * + * The fix routes those three `query.ts` modules through the lazy, guarded + * `parser-loader.getLanguageGrammar()` so the grammar binding is only required + * at first use (inside the worker, for a file of that language) — never at + * module-load. + * + * This test locks the fix in WITHOUT needing to simulate a missing grammar: + * spawn a child Node process, import the built scope-resolution `registry.js` + * (the crash-chain root), and assert no OPTIONAL tree-sitter binding + * (swift/dart/kotlin) appears in the module cache. 
Pre-fix the static imports + * loaded those bindings at import time (this assertion fails); post-fix they + * are lazy (it passes). Required grammars (python/typescript/...) still load + * eagerly via their own `query.ts` — that is expected and NOT asserted against. + * + * Characterization-first: this MUST fail against the pre-fix code (run against + * the parent commit to verify the regression signal works). + */ + +import { describe, it, expect } from 'vitest'; +import { spawnSync } from 'node:child_process'; +import path from 'node:path'; +import fs from 'node:fs'; +import { fileURLToPath, pathToFileURL } from 'node:url'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const REPO_ROOT = path.resolve(__dirname, '..', '..', '..'); +const DIST_REGISTRY = path.join( + REPO_ROOT, + 'dist', + 'core', + 'ingestion', + 'scope-resolution', + 'pipeline', + 'registry.js', +); +const DIST_REGISTRY_URL = pathToFileURL(DIST_REGISTRY).href; + +// Import the registry, then report every newly-loaded CJS-cache key. The cache +// tracks native/.node bindings loaded by either ESM or CJS importers, which is +// exactly how a tree-sitter grammar binding surfaces. 
+const PROBE = ` + import { createRequire } from 'node:module'; + const req = createRequire(import.meta.url); + const before = new Set(Object.keys(req.cache)); + await import(process.env.PROBE_TARGET); + const after = new Set(Object.keys(req.cache)); + process.stdout.write(JSON.stringify([...after].filter((k) => !before.has(k)))); +`; + +const OPTIONAL_GRAMMAR_RE = /tree-sitter-(swift|dart|kotlin)[\\/]/; + +describe('optional-grammar static-import closure (#2091/#2093)', () => { + it('importing the scope-resolution registry loads NO optional grammar binding', () => { + if (!fs.existsSync(DIST_REGISTRY)) { + throw new Error( + `${DIST_REGISTRY} missing — run \`npm run build\` first (or \`npm run test:integration\`, ` + + `which builds via pretest:integration).`, + ); + } + + const result = spawnSync(process.execPath, ['--input-type=module', '-e', PROBE], { + cwd: REPO_ROOT, + // NODE_OPTIONS cleared so a session-pinned --max-old-space-size etc. can't + // perturb the child. The skip env is cleared so install state is probed. + env: { + ...process.env, + PROBE_TARGET: DIST_REGISTRY_URL, + NODE_OPTIONS: '', + GITNEXUS_SKIP_OPTIONAL_GRAMMARS: '', + }, + timeout: 60_000, + encoding: 'utf8', + }); + + // Post-fix, importing the registry must not throw even though the chain + // reaches swift/dart/kotlin query.ts. (Pre-fix on a machine missing a + // grammar this would be ERR_MODULE_NOT_FOUND; here the grammar is present + // so pre-fix it would instead surface as a loaded binding below.) + if (result.status !== 0) { + // status is null when the child was killed by a signal (e.g. a native + // addon SIGSEGV) — surface the signal so that's distinguishable from a + // non-zero exit / module-not-found. + const exit = + result.status !== null ? `status ${result.status}` : `signal ${result.signal ?? 
'unknown'}`; + throw new Error( + `importing the scope-resolution registry failed (${exit}):\n` + + `stderr:\n${result.stderr}\nstdout:\n${result.stdout}`, + ); + } + + const newlyLoaded = JSON.parse(result.stdout) as string[]; + + // Non-vacuity guard: the registry's static-import closure MUST still reach + // the per-language query.ts modules (which is what makes "no optional + // binding loaded" meaningful). The REQUIRED grammars (python/typescript/…) + // still import their binding eagerly in their own query.ts, so at least one + // non-optional tree-sitter binding must appear. If a future refactor severs + // the registry→query.ts edge, this fails loudly instead of letting the + // optional-binding assertion pass green on a no-longer-exercised path. + const requiredLoaded = newlyLoaded.filter( + (p) => /tree-sitter-[a-z-]+[\\/]/.test(p) && !OPTIONAL_GRAMMAR_RE.test(p), + ); + expect( + requiredLoaded.length, + `Expected the registry import closure to load at least one REQUIRED tree-sitter ` + + `binding (proving the chain still reaches the per-language query.ts modules). ` + + `Newly-loaded (${newlyLoaded.length}):\n${newlyLoaded.join('\n')}`, + ).toBeGreaterThan(0); + + // Headline assertion: no OPTIONAL grammar binding (swift/dart/kotlin) is + // loaded at registry static-import time — they must load lazily. + const optionalLoaded = newlyLoaded.filter((p) => OPTIONAL_GRAMMAR_RE.test(p)); + expect( + optionalLoaded, + `Optional tree-sitter grammar binding(s) loaded at registry static-import time. ` + + `query.ts must load swift/dart/kotlin lazily via parser-loader, not via a ` + + `top-level \`import\`. 
Offending paths:\n${optionalLoaded.join('\n')}`, + ).toEqual([]); + }); +}); diff --git a/gitnexus/test/integration/optional-grammars/skip-optional-pipeline.test.ts b/gitnexus/test/integration/optional-grammars/skip-optional-pipeline.test.ts new file mode 100644 index 0000000000..d60fca427b --- /dev/null +++ b/gitnexus/test/integration/optional-grammars/skip-optional-pipeline.test.ts @@ -0,0 +1,110 @@ +/** + * Pipeline-level regression for the optional-grammar exclusion (#2091, #2093). + * + * Locks the scope-resolution phase guard added alongside the lazy query.ts + * load: `scopeResolutionPhase` filters its `filesByLang` partition by + * `isLanguageAvailable`, so a file of an unavailable optional grammar never + * falls through to the main-thread re-extract in `run.ts` (which would throw + * "Unsupported language" — caught, but noisy, and it needlessly loads the + * grammar on the main thread). + * + * Drives the REAL pipeline over a mixed Python+Swift repo with the runtime + * `GITNEXUS_SKIP_OPTIONAL_GRAMMARS` opt-out set (so Swift is treated as + * unavailable even though its binding is installed). This is the automated + * analog of the manual end-to-end verification: Python indexes, Swift is + * cleanly skipped, and the "scope extraction failed for …swift" noise never + * appears. + * + * `parser-loader` memoizes availability per process, so we `vi.resetModules()` + * BEFORE setting the env and dynamically import the pipeline + logger from the + * same fresh registry. That makes the first `isLanguageAvailable` call observe + * our env regardless of import order, and keeps the logger capture wired to the + * loader's logger instance (a static import would not survive resetModules). 
+ */ + +import { describe, it, expect, beforeAll, afterAll, vi } from 'vitest'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import type { PipelineResult } from '../resolvers/helpers.js'; + +const ENV = 'GITNEXUS_SKIP_OPTIONAL_GRAMMARS'; + +describe('optional-grammar pipeline exclusion (#2091/#2093)', () => { + let repoDir = ''; + let result: PipelineResult; + let messages: string[] = []; + let prevEnv: string | undefined; + let getNodesByLabel: (r: PipelineResult, label: string) => string[]; + + beforeAll(async () => { + prevEnv = process.env[ENV]; + vi.resetModules(); + process.env[ENV] = 'swift'; + const helpers = await import('../resolvers/helpers.js'); + const loggerMod = await import('../../../src/core/logger.js'); + getNodesByLabel = helpers.getNodesByLabel; + + repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'og-skip-pipeline-')); + fs.writeFileSync( + path.join(repoDir, 'app.py'), + 'def greet(name):\n return f"hi {name}"\n\n\nclass Service:\n def run(self):\n return greet("world")\n', + ); + fs.writeFileSync( + path.join(repoDir, 'Foo.swift'), + 'struct Foo {\n func bar() -> Int { return 42 }\n}\n', + ); + + const cap = loggerMod._captureLogger(); + try { + result = await helpers.runPipelineFromRepo(repoDir, () => {}, { skipGraphPhases: true }); + messages = cap + .records() + .map((r) => (typeof r.msg === 'string' ? 
r.msg : '')) + .filter(Boolean); + } finally { + cap.restore(); + } + }, 60_000); + + afterAll(() => { + if (prevEnv === undefined) delete process.env[ENV]; + else process.env[ENV] = prevEnv; + if (repoDir) fs.rmSync(repoDir, { recursive: true, force: true }); + }); + + it('completes without crashing when an optional grammar is opted out', () => { + expect(result).toBeDefined(); + }); + + it('skips the Swift file at the parse phase (non-vacuity: Swift was present)', () => { + expect(messages.some((m) => /Skipping 1 swift file\(s\)/.test(m))).toBe(true); + }); + + it('routes the opt-out message, not the missing-binding "npm rebuild" hint', () => { + // The "Skipping N swift file(s)" prefix is shared by BOTH the opt-out and + // the missing-binding branches — so assert the opt-out branch specifically: + // a message naming the env var, and NO "npm rebuild" hint anywhere. This is + // what proves the isGrammarRuntimeSkipped routing in parse-impl.ts fired. + expect(messages.some((m) => /GITNEXUS_SKIP_OPTIONAL_GRAMMARS/.test(m))).toBe(true); + expect( + messages.every((m) => !/npm rebuild/i.test(m)), + messages.join('\n'), + ).toBe(true); + }); + + it('never falls through to the main-thread re-extract (no "scope extraction failed")', () => { + // This is the precise signal the scope-resolution phase guard eliminates. + // Without the `if (!isLanguageAvailable(fileLang)) continue;` in phase.ts + // the Swift file would reach run.ts's extractParsedFile and log this. + const offending = messages.filter((m) => /scope extraction failed/i.test(m)); + expect(offending, offending.join('\n')).toEqual([]); + }); + + it('indexes the available Python language and excludes Swift symbols', () => { + // Python indexed (proves the pipeline actually ran end-to-end). + expect(getNodesByLabel(result, 'Class')).toContain('Service'); + // Swift's struct must not be in the graph — it was excluded, not parsed. 
+ expect(getNodesByLabel(result, 'Struct')).not.toContain('Foo'); + }); +}); diff --git a/gitnexus/test/unit/parser-loader-skip-optional.test.ts b/gitnexus/test/unit/parser-loader-skip-optional.test.ts new file mode 100644 index 0000000000..ae8cff396f --- /dev/null +++ b/gitnexus/test/unit/parser-loader-skip-optional.test.ts @@ -0,0 +1,141 @@ +import { describe, it, expect, afterEach, vi } from 'vitest'; +import { SupportedLanguages } from '../../src/config/supported-languages.js'; + +/** + * Runtime opt-out for optional grammars (#2091, #2093). + * + * `GITNEXUS_SKIP_OPTIONAL_GRAMMARS` used to be an install-time-only env (the + * postinstall build scripts read it). `parser-loader` now also honors it at + * analyze time: when set, genuinely-optional grammars (swift/dart/kotlin) + * report unavailable so the ingestion pipeline skips their files, mirroring a + * genuinely-absent binding. Grammars that are required `dependencies` routed + * through the optional machinery for ABI safety (C — `severity: 'error'`) are + * NEVER skippable this way. + * + * `parser-loader` memoizes load results at module scope, so each case loads a + * fresh copy via `vi.resetModules()` after setting the env. These assertions + * are install-state-robust: they only assert the SKIP direction (skip → false) + * and that required grammars are unaffected (true) — never that an optional + * grammar is positively available, which depends on the install/platform. 
+ */ + +const ENV = 'GITNEXUS_SKIP_OPTIONAL_GRAMMARS'; + +async function freshLoader(skipValue: string | undefined) { + vi.resetModules(); + if (skipValue === undefined) delete process.env[ENV]; + else process.env[ENV] = skipValue; + return import('../../src/core/tree-sitter/parser-loader.js'); +} + +afterEach(() => { + delete process.env[ENV]; + vi.resetModules(); +}); + +describe('parser-loader GITNEXUS_SKIP_OPTIONAL_GRAMMARS runtime gate', () => { + it('skip=1 reports every optional grammar as unavailable', async () => { + const { isLanguageAvailable } = await freshLoader('1'); + expect(isLanguageAvailable(SupportedLanguages.Swift)).toBe(false); + expect(isLanguageAvailable(SupportedLanguages.Dart)).toBe(false); + expect(isLanguageAvailable(SupportedLanguages.Kotlin)).toBe(false); + }); + + it('skip=all/true/* also skip every optional grammar', async () => { + for (const v of ['all', 'true', '*']) { + const { isLanguageAvailable } = await freshLoader(v); + expect(isLanguageAvailable(SupportedLanguages.Swift), `value=${v}`).toBe(false); + expect(isLanguageAvailable(SupportedLanguages.Dart), `value=${v}`).toBe(false); + expect(isLanguageAvailable(SupportedLanguages.Kotlin), `value=${v}`).toBe(false); + } + }); + + it('does NOT skip required grammars — skip=all is a no-op for C / Python', async () => { + // Compare availability WITH skip=all against the baseline (no skip). The + // runtime opt-out must never change a required grammar's availability: + // C is `optional: true` + `severity: 'error'` (a required dep routed + // through the optional machinery for ABI safety, #1242), and Python is a + // plain required dep. Asserting EQUALITY (not positive truth) keeps this + // install-state-robust — C's native binding is intentionally fallible, so + // a positive assertion could flake on an ABI-mismatched matrix. 
+ const base = await freshLoader(undefined); + const cBase = base.isLanguageAvailable(SupportedLanguages.C); + const pyBase = base.isLanguageAvailable(SupportedLanguages.Python); + const skipped = await freshLoader('all'); + expect(skipped.isLanguageAvailable(SupportedLanguages.C)).toBe(cBase); + expect(skipped.isLanguageAvailable(SupportedLanguages.Python)).toBe(pyBase); + }); + + it('a comma list skips ONLY the named grammars — un-named ones unaffected', async () => { + // Baseline (no skip) so the isolation check is install-state-robust. + const base = await freshLoader(undefined); + const dartBase = base.isLanguageAvailable(SupportedLanguages.Dart); + const kotlinBase = base.isLanguageAvailable(SupportedLanguages.Kotlin); + const { isLanguageAvailable } = await freshLoader('swift'); + expect(isLanguageAvailable(SupportedLanguages.Swift)).toBe(false); + // A prefix/union bug would skip these too — assert they match baseline. + expect(isLanguageAvailable(SupportedLanguages.Dart)).toBe(dartBase); + expect(isLanguageAvailable(SupportedLanguages.Kotlin)).toBe(kotlinBase); + }); + + it('accepts the tree-sitter- package spelling — others unaffected', async () => { + const base = await freshLoader(undefined); + const swiftBase = base.isLanguageAvailable(SupportedLanguages.Swift); + const kotlinBase = base.isLanguageAvailable(SupportedLanguages.Kotlin); + const { isLanguageAvailable } = await freshLoader('tree-sitter-dart'); + expect(isLanguageAvailable(SupportedLanguages.Dart)).toBe(false); + expect(isLanguageAvailable(SupportedLanguages.Swift)).toBe(swiftBase); + expect(isLanguageAvailable(SupportedLanguages.Kotlin)).toBe(kotlinBase); + }); + + it('accepts a multi-entry list', async () => { + const { isLanguageAvailable } = await freshLoader('kotlin, dart'); + expect(isLanguageAvailable(SupportedLanguages.Kotlin)).toBe(false); + expect(isLanguageAvailable(SupportedLanguages.Dart)).toBe(false); + }); + + it('getLanguageGrammar throws a clean "Unsupported language" for 
a skipped optional grammar', async () => { + const { getLanguageGrammar } = await freshLoader('all'); + expect(() => getLanguageGrammar(SupportedLanguages.Swift)).toThrow(/Unsupported language/); + }); + + it('an empty / unset env does not skip (required grammars load)', async () => { + const { isLanguageAvailable } = await freshLoader(undefined); + expect(isLanguageAvailable(SupportedLanguages.Python)).toBe(true); + }); + + it('isGrammarRuntimeSkipped reflects the opt-out, never for required grammars', async () => { + const swiftOnly = await freshLoader('swift'); + expect(swiftOnly.isGrammarRuntimeSkipped(SupportedLanguages.Swift)).toBe(true); + expect(swiftOnly.isGrammarRuntimeSkipped(SupportedLanguages.Dart)).toBe(false); + const all = await freshLoader('all'); + expect(all.isGrammarRuntimeSkipped(SupportedLanguages.Swift)).toBe(true); + // C is not `userSkippable` (required dep via the optional machinery) — even + // `all` must not mark it runtime-skipped. + expect(all.isGrammarRuntimeSkipped(SupportedLanguages.C)).toBe(false); + }); + + it('logs an accurate runtime-skip note, not a missing-binding message', async () => { + // Import the logger AND parser-loader from the SAME fresh module registry so + // the capture intercepts the loader's logger instance. + vi.resetModules(); + process.env[ENV] = 'swift'; + const { _captureLogger } = await import('../../src/core/logger.js'); + const { isLanguageAvailable } = await import('../../src/core/tree-sitter/parser-loader.js'); + const cap = _captureLogger(); + try { + isLanguageAvailable(SupportedLanguages.Swift); // triggers the one-time skip log + const msgs = cap + .records() + .map((r) => (typeof r.msg === 'string' ? 
r.msg : '')) + .filter(Boolean); + const skipMsg = msgs.find((m) => m.includes('GITNEXUS_SKIP_OPTIONAL_GRAMMARS')); + expect(skipMsg, `captured:\n${msgs.join('\n')}`).toBeTruthy(); + // The opt-out note must NOT borrow the install/platform "missing binding" + // language — that would tell a deliberate opt-out to reinstall/rebuild. + expect(skipMsg).not.toMatch(/no prebuilt|failed to load|npm rebuild/i); + } finally { + cap.restore(); + } + }); +});