diff --git a/gitnexus/src/core/ingestion/export-detection.ts b/gitnexus/src/core/ingestion/export-detection.ts index 3eb1b4f070..31d0722f4e 100644 --- a/gitnexus/src/core/ingestion/export-detection.ts +++ b/gitnexus/src/core/ingestion/export-detection.ts @@ -73,11 +73,6 @@ const CSHARP_DECL_TYPES = new Set([ 'struct_declaration', 'enum_declaration', 'record_declaration', - // tree-sitter-c-sharp absorbs 'record struct' and 'record class' into - // record_declaration — these two node types are listed defensively but - // never emitted by the grammar in practice (verified against ^0.23.1). - 'record_struct_declaration', - 'record_class_declaration', 'delegate_declaration', 'property_declaration', 'field_declaration', diff --git a/gitnexus/src/core/ingestion/field-extractors/configs/dart.ts b/gitnexus/src/core/ingestion/field-extractors/configs/dart.ts index ce68c1f34b..52f4c0ca73 100644 --- a/gitnexus/src/core/ingestion/field-extractors/configs/dart.ts +++ b/gitnexus/src/core/ingestion/field-extractors/configs/dart.ts @@ -45,12 +45,7 @@ export const dartConfig: FieldExtractionConfig = { // declaration > type_identifier (first named child usually) for (let i = 0; i < node.namedChildCount; i++) { const child = node.namedChild(i); - if ( - child && - (child.type === 'type_identifier' || - child.type === 'generic_type' || - child.type === 'function_type') - ) { + if (child && (child.type === 'type_identifier' || child.type === 'function_type')) { return extractSimpleTypeName(child) ?? 
child.text?.trim(); } } diff --git a/gitnexus/src/core/ingestion/field-extractors/configs/php.ts b/gitnexus/src/core/ingestion/field-extractors/configs/php.ts index ac4e6010b5..a9dec1f6fe 100644 --- a/gitnexus/src/core/ingestion/field-extractors/configs/php.ts +++ b/gitnexus/src/core/ingestion/field-extractors/configs/php.ts @@ -53,8 +53,7 @@ export const phpConfig: FieldExtractionConfig = { child.type === 'named_type' || child.type === 'optional_type' || child.type === 'primitive_type' || - child.type === 'intersection_type' || - child.type === 'nullable_type' + child.type === 'intersection_type' ) { return extractSimpleTypeName(child) ?? child.text?.trim(); } diff --git a/gitnexus/src/core/ingestion/field-extractors/configs/swift.ts b/gitnexus/src/core/ingestion/field-extractors/configs/swift.ts index 007ad66899..75c70ab95d 100644 --- a/gitnexus/src/core/ingestion/field-extractors/configs/swift.ts +++ b/gitnexus/src/core/ingestion/field-extractors/configs/swift.ts @@ -22,7 +22,7 @@ const SWIFT_VIS = new Set([ */ export const swiftConfig: FieldExtractionConfig = { language: SupportedLanguages.Swift, - typeDeclarationNodes: ['class_declaration', 'struct_declaration', 'protocol_declaration'], + typeDeclarationNodes: ['class_declaration', 'protocol_declaration'], fieldNodeTypes: ['property_declaration'], bodyNodeTypes: ['class_body', 'protocol_body'], defaultVisibility: 'internal', diff --git a/gitnexus/src/core/ingestion/field-extractors/typescript.ts b/gitnexus/src/core/ingestion/field-extractors/typescript.ts index 0a6a614434..974148a8d1 100644 --- a/gitnexus/src/core/ingestion/field-extractors/typescript.ts +++ b/gitnexus/src/core/ingestion/field-extractors/typescript.ts @@ -86,18 +86,6 @@ export class TypeScriptFieldExtractor extends BaseFieldExtractor { } } - // Check for modifier node (tree-sitter typescript may group these) - const modifiers = node.childForFieldName('modifiers'); - if (modifiers) { - for (let i = 0; i < modifiers.childCount; i++) { - const 
modifier = modifiers.child(i); - const modText = modifier?.text.trim() as FieldVisibility | undefined; - if (modText && TypeScriptFieldExtractor.VISIBILITY_MODIFIERS.has(modText)) { - return modText; - } - } - } - // TypeScript class members are public by default return 'public'; } @@ -113,16 +101,6 @@ export class TypeScriptFieldExtractor extends BaseFieldExtractor { } } - const modifiers = node.childForFieldName('modifiers'); - if (modifiers) { - for (let i = 0; i < modifiers.childCount; i++) { - const modifier = modifiers.child(i); - if (modifier && modifier.text === 'static') { - return true; - } - } - } - return false; } @@ -137,16 +115,6 @@ export class TypeScriptFieldExtractor extends BaseFieldExtractor { } } - const modifiers = node.childForFieldName('modifiers'); - if (modifiers) { - for (let i = 0; i < modifiers.childCount; i++) { - const modifier = modifiers.child(i); - if (modifier && modifier.text === 'readonly') { - return true; - } - } - } - return false; } diff --git a/gitnexus/src/core/ingestion/languages/cpp/arity-metadata.ts b/gitnexus/src/core/ingestion/languages/cpp/arity-metadata.ts index ad7b172bdc..e2afd51a5d 100644 --- a/gitnexus/src/core/ingestion/languages/cpp/arity-metadata.ts +++ b/gitnexus/src/core/ingestion/languages/cpp/arity-metadata.ts @@ -33,7 +33,6 @@ export function computeCppDeclarationArity(node: SyntaxNode): CppArityInfo { if ( child.type === 'parameter_declaration' || child.type === 'optional_parameter_declaration' || - child.type === 'variadic_parameter' || child.type === 'variadic_parameter_declaration' ) { params.push(child); @@ -60,11 +59,7 @@ export function computeCppDeclarationArity(node: SyntaxNode): CppArityInfo { // token in tree-sitter-cpp, detected via `hasEllipsis` above. // C++ parameter packs: `template void foo(Ts... args)` — // detected as `variadic_parameter_declaration`. 
- const isVariadic = - hasEllipsis || - params.some( - (p) => p.type === 'variadic_parameter' || p.type === 'variadic_parameter_declaration', - ); + const isVariadic = hasEllipsis || params.some((p) => p.type === 'variadic_parameter_declaration'); const optionalCount = params.filter((p) => p.type === 'optional_parameter_declaration').length; const requiredCount = params.filter( (p) => @@ -77,10 +72,7 @@ export function computeCppDeclarationArity(node: SyntaxNode): CppArityInfo { const types: string[] = []; const typeClasses: ParameterTypeClass[] = []; for (const p of params) { - if (p.type === 'variadic_parameter') { - types.push('...'); - typeClasses.push(unknownTypeClass('...')); - } else if (p.type === 'variadic_parameter_declaration') { + if (p.type === 'variadic_parameter_declaration') { // Parameter pack: treated as variadic types.push('...'); typeClasses.push(unknownTypeClass('...')); diff --git a/gitnexus/src/core/ingestion/languages/cpp/captures.ts b/gitnexus/src/core/ingestion/languages/cpp/captures.ts index f0e5e9a88e..f249284c86 100644 --- a/gitnexus/src/core/ingestion/languages/cpp/captures.ts +++ b/gitnexus/src/core/ingestion/languages/cpp/captures.ts @@ -1360,7 +1360,7 @@ function lookupAdlIdentifierType(identNode: SyntaxNode): CppAdlArgInfo | null { inner = next; continue; } - if (inner.type === 'reference_declarator' || inner.type === 'rvalue_reference_declarator') { + if (inner.type === 'reference_declarator') { // reference_declarator has a single child (the inner declarator). 
let next: SyntaxNode | null = null; for (let j = 0; j < inner.namedChildCount; j++) { diff --git a/gitnexus/src/core/ingestion/languages/csharp/captures.ts b/gitnexus/src/core/ingestion/languages/csharp/captures.ts index 5aafe6d193..2dba41803a 100644 --- a/gitnexus/src/core/ingestion/languages/csharp/captures.ts +++ b/gitnexus/src/core/ingestion/languages/csharp/captures.ts @@ -293,7 +293,9 @@ function terminalTypeNameNode(node: SyntaxNode): SyntaxNode | null { case 'qualified_name': return node.lastNamedChild; case 'generic_name': - return node.childForFieldName('name') ?? node.firstNamedChild; + // generic_name has no `name` field (verified by real parse, #1920); the + // base identifier is the first named child. + return node.firstNamedChild; default: return null; } diff --git a/gitnexus/src/core/ingestion/languages/java/captures.ts b/gitnexus/src/core/ingestion/languages/java/captures.ts index 5ca4700251..f922276310 100644 --- a/gitnexus/src/core/ingestion/languages/java/captures.ts +++ b/gitnexus/src/core/ingestion/languages/java/captures.ts @@ -165,10 +165,15 @@ export function emitJavaScopeCaptures( findNodeAtRange(tree.rootNode, anchor.range, 'object_creation_expression'); if (callNode !== null) { const argList = callNode.childForFieldName('arguments'); + // Exclude interleaved comments — tree-sitter-java emits `block_comment` / + // `line_comment` as named children of argument_list, which would inflate + // arity (and arity feeds call-processor symbol-ID generation). #1920 const args = argList === null ? 
[] - : argList.namedChildren.filter((c) => c !== null && c.type !== 'comment'); + : argList.namedChildren.filter( + (c) => c !== null && c.type !== 'block_comment' && c.type !== 'line_comment', + ); grouped['@reference.arity'] = syntheticCapture( '@reference.arity', callNode, diff --git a/gitnexus/src/core/ingestion/languages/php/import-decomposer.ts b/gitnexus/src/core/ingestion/languages/php/import-decomposer.ts index f174568050..5875df8c7a 100644 --- a/gitnexus/src/core/ingestion/languages/php/import-decomposer.ts +++ b/gitnexus/src/core/ingestion/languages/php/import-decomposer.ts @@ -116,23 +116,7 @@ function parseUseClause(clause: SyntaxNode, qualifier: PhpImportKind): PhpImport const source = qualName.text.trim(); if (source === '') return null; - // Strategy 1: explicit alias_clause wrapper (older grammar versions). - const aliasClause = findNamedChild(clause, 'alias_clause'); - if (aliasClause !== null) { - // alias_clause: "as" name - const aliasName = findNamedChild(aliasClause, 'name') ?? aliasClause.firstNamedChild; - const alias = aliasName?.text.trim() ?? ''; - if (alias === '') return null; - return { - kind: 'alias', - source, - name: alias, - alias, - atNode: clause, - }; - } - - // Strategy 2: bare sibling `name` node after the qualified_name. + // Strategy: bare sibling `name` node after the qualified_name. // tree-sitter-php (≥ 0.22) emits `use Foo\Bar as Baz` as: // namespace_use_clause // qualified_name "Foo\Bar" @@ -231,22 +215,7 @@ function parseInnerClause( const source = prefix !== '' ? `${prefix}\\${innerPath}` : innerPath; - // Strategy 1: explicit alias_clause wrapper (older grammar versions). - const aliasClause = findNamedChild(clause, 'alias_clause'); - if (aliasClause !== null) { - const aliasName = findNamedChild(aliasClause, 'name') ?? aliasClause.firstNamedChild; - const alias = aliasName?.text.trim() ?? 
''; - if (alias === '') return null; - return { - kind: 'alias', - source, - name: alias, - alias, - atNode: clause, - }; - } - - // Strategy 2: bare sibling `name` node after the qualified_name (tree-sitter-php ≥ 0.22). + // Strategy: bare sibling `name` node after the qualified_name (tree-sitter-php ≥ 0.22). if (clause.namedChildCount >= 2) { const lastChild = clause.namedChild(clause.namedChildCount - 1); if (lastChild !== null && lastChild !== qualName && lastChild.type === 'name') { diff --git a/gitnexus/src/core/ingestion/languages/php/receiver-binding.ts b/gitnexus/src/core/ingestion/languages/php/receiver-binding.ts index 79709fe614..1074d3335a 100644 --- a/gitnexus/src/core/ingestion/languages/php/receiver-binding.ts +++ b/gitnexus/src/core/ingestion/languages/php/receiver-binding.ts @@ -27,6 +27,11 @@ const TYPE_DECL_NODE_TYPES = new Set([ 'interface_declaration', 'trait_declaration', 'enum_declaration', + // tree-sitter-php node for `new class {...}` (real node is `anonymous_class`, + // not `anonymous_class_declaration`). Included so the enclosing-type walk + // stops AT the anon class and the guard below skips it (otherwise a method in + // an anon class nested in a named class would mis-bind $this to the outer class). + 'anonymous_class', ]); const FUNCTION_NODE_TYPES = new Set([ @@ -98,7 +103,7 @@ export function synthesizePhpReceiverBinding(fnNode: SyntaxNode): CaptureMatch[] if (enclosingType === null) return []; // Anonymous class — skip (no stable name). - if (enclosingType.type === 'anonymous_class_declaration') return []; + if (enclosingType.type === 'anonymous_class') return []; const enclosingName = typeName(enclosingType); if (enclosingName === null) return []; @@ -106,10 +111,8 @@ export function synthesizePhpReceiverBinding(fnNode: SyntaxNode): CaptureMatch[] // Anchor the synthesized captures to the method body (compound_statement) // so they land inside the function scope, not at the class scope. 
// For interface/abstract methods that have no body, skip. - const bodyNode = - fnNode.childForFieldName('body') ?? - // arrow_function: body is the expression after `=>` - fnNode.childForFieldName('return_value'); + // tree-sitter-php arrow_function also exposes its expression via the `body` field. + const bodyNode = fnNode.childForFieldName('body'); if (bodyNode === null) return []; const out: CaptureMatch[] = []; diff --git a/gitnexus/src/core/ingestion/languages/python/depends-references.ts b/gitnexus/src/core/ingestion/languages/python/depends-references.ts index 333c4f7e28..4982b7dce7 100644 --- a/gitnexus/src/core/ingestion/languages/python/depends-references.ts +++ b/gitnexus/src/core/ingestion/languages/python/depends-references.ts @@ -32,7 +32,7 @@ export function synthesizeDependsReferences(fnNode: SyntaxNode): readonly Captur continue; } - const defaultValue = param.childForFieldName('value') ?? param.childForFieldName('default'); + const defaultValue = param.childForFieldName('value'); if (defaultValue === null) continue; const callNode = defaultValue.type === 'call' ? 
defaultValue : null; diff --git a/gitnexus/src/core/ingestion/languages/ruby/captures.ts b/gitnexus/src/core/ingestion/languages/ruby/captures.ts index 3cff9dbb36..63d9bf053e 100644 --- a/gitnexus/src/core/ingestion/languages/ruby/captures.ts +++ b/gitnexus/src/core/ingestion/languages/ruby/captures.ts @@ -183,7 +183,7 @@ export function emitRubyScopeCaptures( if (argList !== null) { for (let ai = 0; ai < argList.namedChildCount; ai++) { const arg = argList.namedChild(ai); - if (arg !== null && (arg.type === 'simple_symbol' || arg.type === 'symbol')) { + if (arg !== null && arg.type === 'simple_symbol') { const propName = arg.text.replace(/^:/, ''); out.push({ '@import.statement': grouped['@reference.call.free']!, @@ -327,7 +327,7 @@ export function emitRubyScopeCaptures( if (argList !== null) { for (let ai = 0; ai < argList.namedChildCount; ai++) { const arg = argList.namedChild(ai); - if (arg !== null && (arg.type === 'simple_symbol' || arg.type === 'symbol')) { + if (arg !== null && arg.type === 'simple_symbol') { const propName = arg.text.replace(/^:/, ''); out.push({ '@type-binding.return': syntheticCapture('@type-binding.return', attrNode, text), diff --git a/gitnexus/src/core/ingestion/languages/rust/range-binding.ts b/gitnexus/src/core/ingestion/languages/rust/range-binding.ts index a09eaa23c5..0309c7a523 100644 --- a/gitnexus/src/core/ingestion/languages/rust/range-binding.ts +++ b/gitnexus/src/core/ingestion/languages/rust/range-binding.ts @@ -310,9 +310,9 @@ function processStructDestructuring( for (const fieldNode of patternNode.namedChildren) { let fieldName: string | undefined; if (fieldNode.type === 'field_pattern') { + // shorthand `{ a }` and full `{ b: c }` are both field_pattern; the + // `name` field is shorthand_field_identifier or field_identifier. 
fieldName = fieldNode.childForFieldName('name')?.text; - } else if (fieldNode.type === 'shorthand_field_pattern') { - fieldName = fieldNode.firstNamedChild?.text; } if (fieldName === undefined) continue; diff --git a/gitnexus/src/core/ingestion/languages/typescript/receiver-binding.ts b/gitnexus/src/core/ingestion/languages/typescript/receiver-binding.ts index bc213e353b..9ecd42b7a7 100644 --- a/gitnexus/src/core/ingestion/languages/typescript/receiver-binding.ts +++ b/gitnexus/src/core/ingestion/languages/typescript/receiver-binding.ts @@ -52,7 +52,6 @@ const TYPE_DECL_NODE_TYPES = new Set([ 'class_declaration', 'abstract_class_declaration', 'class', - 'class_expression', 'interface_declaration', ]); diff --git a/gitnexus/src/core/ingestion/method-extractors/configs/dart.ts b/gitnexus/src/core/ingestion/method-extractors/configs/dart.ts index 8f3225c103..f9ae25fcbb 100644 --- a/gitnexus/src/core/ingestion/method-extractors/configs/dart.ts +++ b/gitnexus/src/core/ingestion/method-extractors/configs/dart.ts @@ -17,7 +17,6 @@ import type { SyntaxNode } from '../../utils/ast-helpers.js'; /** Type node types that represent a return type in function/getter/setter signatures. 
*/ const TYPE_NODE_TYPES = new Set([ 'type_identifier', - 'generic_type', 'function_type', 'nullable_type', 'void_type', diff --git a/gitnexus/src/core/ingestion/method-extractors/configs/php.ts b/gitnexus/src/core/ingestion/method-extractors/configs/php.ts index c3a8ff65da..2d04a6bb39 100644 --- a/gitnexus/src/core/ingestion/method-extractors/configs/php.ts +++ b/gitnexus/src/core/ingestion/method-extractors/configs/php.ts @@ -113,7 +113,6 @@ function extractPhpReturnType(node: SyntaxNode): string | undefined { 'named_type', 'union_type', 'optional_type', - 'nullable_type', 'intersection_type', ]); diff --git a/gitnexus/src/core/ingestion/type-env.ts b/gitnexus/src/core/ingestion/type-env.ts index a38df617b3..2dd261dd4b 100644 --- a/gitnexus/src/core/ingestion/type-env.ts +++ b/gitnexus/src/core/ingestion/type-env.ts @@ -112,7 +112,6 @@ type PatternOverrides = Map>; * Includes both multi-arm pattern-match branches AND if-statement bodies for null-check narrowing. */ const NARROWING_BRANCH_TYPES = new Set([ 'when_entry', // Kotlin when - 'switch_block_label', // Java switch (enhanced) 'if_statement', // TS/JS, Java, C/C++ 'if_expression', // Kotlin (if is an expression) 'statement_block', // TS/JS: { ... } body of if @@ -977,7 +976,6 @@ export const buildTypeEnv = ( (child.type === 'user_type' || child.type === 'type_identifier' || child.type === 'generic_type' || - child.type === 'parameterized_type' || child.type === 'nullable_type') ) { fallbackType = child; diff --git a/gitnexus/src/core/ingestion/type-extractors/c-cpp.ts b/gitnexus/src/core/ingestion/type-extractors/c-cpp.ts index 0544d4f9da..0284d4ba0f 100644 --- a/gitnexus/src/core/ingestion/type-extractors/c-cpp.ts +++ b/gitnexus/src/core/ingestion/type-extractors/c-cpp.ts @@ -142,14 +142,14 @@ const extractInitializer: InitializerExtractor = ( const templateFunc = func.type === 'template_function' ? 
func - : func.type === 'qualified_identifier' || func.type === 'scoped_identifier' + : func.type === 'qualified_identifier' ? (func.namedChildren.find((c: SyntaxNode) => c.type === 'template_function') ?? null) : null; if (templateFunc) { const nameNode = templateFunc.firstNamedChild; if (nameNode) { const funcName = - nameNode.type === 'qualified_identifier' || nameNode.type === 'scoped_identifier' + nameNode.type === 'qualified_identifier' ? (nameNode.lastNamedChild?.text ?? '') : nameNode.text; if (SMART_PTR_FACTORIES.has(funcName)) { @@ -214,7 +214,7 @@ const scanConstructorBinding: ConstructorBindingScanner = (node) => { if (!value || value.type !== 'call_expression') return undefined; const func = value.childForFieldName('function'); if (!func) return undefined; - if (func.type === 'qualified_identifier' || func.type === 'scoped_identifier') { + if (func.type === 'qualified_identifier') { const last = func.lastNamedChild; if (!last) return undefined; const nameNode = declarator.childForFieldName('declarator'); @@ -331,17 +331,13 @@ const extractCppElementTypeFromTypeNode = ( const args = extractCppTemplateTypeArgs(typeNode); if (args.length >= 1) return pos === 'first' ? 
args[0] : args[args.length - 1]; } - // reference/pointer types: unwrap and recurse (vector& → vector) - if ( - typeNode.type === 'reference_type' || - typeNode.type === 'pointer_type' || - typeNode.type === 'type_descriptor' - ) { + // type_descriptor wrapper: unwrap and recurse (vector& → vector) + if (typeNode.type === 'type_descriptor') { const inner = typeNode.lastNamedChild; if (inner) return extractCppElementTypeFromTypeNode(inner, pos, depth + 1); } - // qualified/scoped types: std::vector → unwrap to template_type child - if (typeNode.type === 'qualified_identifier' || typeNode.type === 'scoped_type_identifier') { + // qualified types: std::vector → unwrap to template_type child + if (typeNode.type === 'qualified_identifier') { const inner = typeNode.lastNamedChild; if (inner) return extractCppElementTypeFromTypeNode(inner, pos, depth + 1); } @@ -527,7 +523,7 @@ const detectCppConstructorType: ConstructorTypeDetector = (node, classNames) => const nameNode = func.firstNamedChild; if (!nameNode) return undefined; let funcName: string; - if (nameNode.type === 'qualified_identifier' || nameNode.type === 'scoped_identifier') { + if (nameNode.type === 'qualified_identifier') { funcName = nameNode.lastNamedChild?.text ?? ''; } else { funcName = nameNode.text; diff --git a/gitnexus/src/core/ingestion/type-extractors/csharp.ts b/gitnexus/src/core/ingestion/type-extractors/csharp.ts index bfc898f452..b3af048568 100644 --- a/gitnexus/src/core/ingestion/type-extractors/csharp.ts +++ b/gitnexus/src/core/ingestion/type-extractors/csharp.ts @@ -52,7 +52,7 @@ const extractDeclaration: TypeBindingExtractor = ( const child = node.namedChild(i); if (!child) continue; - if (!typeNode && child.type !== 'variable_declarator' && child.type !== 'equals_value_clause') { + if (!typeNode && child.type !== 'variable_declarator') { // First non-declarator child is the type (identifier, implicit_type, generic_name, etc.) 
typeNode = child; } @@ -67,12 +67,9 @@ const extractDeclaration: TypeBindingExtractor = ( let typeName: string | undefined; if (typeNode.type === 'implicit_type' && typeNode.text === 'var') { // Try to infer from initializer: var x = new Foo() - // tree-sitter-c-sharp may put object_creation_expression as direct child - // or inside equals_value_clause depending on grammar version + // tree-sitter-c-sharp puts object_creation_expression as a direct child if (declarators.length === 1) { - const initializer = - findChild(declarators[0], 'object_creation_expression') ?? - findChild(declarators[0], 'equals_value_clause')?.firstNamedChild; + const initializer = findChild(declarators[0], 'object_creation_expression'); if (initializer?.type === 'object_creation_expression') { const ctorType = initializer.childForFieldName('type'); if (ctorType) typeName = extractSimpleTypeName(ctorType); @@ -101,7 +98,7 @@ const extractParameter: ParameterExtractor = (node: SyntaxNode, env: Map { if (!declarator) return undefined; const nameNode = declarator.childForFieldName('name') ?? 
declarator.firstNamedChild; if (!nameNode || nameNode.type !== 'identifier') return undefined; - // Find the initializer value: either inside equals_value_clause or as a direct child + // Find the initializer value as a direct child // (tree-sitter-c-sharp puts invocation_expression directly inside variable_declarator) let value: SyntaxNode | null = null; for (let i = 0; i < declarator.namedChildCount; i++) { const child = declarator.namedChild(i); if (!child) continue; - if (child.type === 'equals_value_clause') { - value = child.firstNamedChild; - break; - } if ( child.type === 'invocation_expression' || child.type === 'object_creation_expression' || @@ -471,20 +464,9 @@ const extractPendingAssignment: PendingAssignmentExtractor = (node, scopeEnv) => if (!nameNode) continue; const lhs = nameNode.text; if (scopeEnv.has(lhs)) continue; - // C# wraps value in equals_value_clause; fall back to last named child - let evc: SyntaxNode | null = null; - for (let j = 0; j < child.childCount; j++) { - if (child.child(j)?.type === 'equals_value_clause') { - evc = child.child(j); - break; - } - } - const valueNode = evc?.firstNamedChild ?? 
child.namedChild(child.namedChildCount - 1); - if ( - valueNode && - valueNode !== nameNode && - (valueNode.type === 'identifier' || valueNode.type === 'simple_identifier') - ) { + // C# variable_declarator holds the initializer value as a direct named child + const valueNode = child.namedChild(child.namedChildCount - 1); + if (valueNode && valueNode !== nameNode && valueNode.type === 'identifier') { return { kind: 'copy', lhs, rhs: valueNode.text }; } // member_access_expression RHS → fieldAccess (a.Field) @@ -498,7 +480,7 @@ const extractPendingAssignment: PendingAssignmentExtractor = (node, scopeEnv) => // invocation_expression RHS if (valueNode?.type === 'invocation_expression') { const funcNode = valueNode.firstNamedChild; - if (funcNode?.type === 'identifier_name' || funcNode?.type === 'identifier') { + if (funcNode?.type === 'identifier') { return { kind: 'callResult', lhs, callee: funcNode.text }; } // method call with receiver → methodCallResult: a.GetC() @@ -515,7 +497,7 @@ const extractPendingAssignment: PendingAssignmentExtractor = (node, scopeEnv) => const inner = valueNode.firstNamedChild; if (inner?.type === 'invocation_expression') { const funcNode = inner.firstNamedChild; - if (funcNode?.type === 'identifier_name' || funcNode?.type === 'identifier') { + if (funcNode?.type === 'identifier') { return { kind: 'callResult', lhs, callee: funcNode.text }; } if (funcNode?.type === 'member_access_expression') { @@ -565,15 +547,13 @@ export const typeConfig: LanguageTypeConfig = { const direct = node.childForFieldName('type'); if (direct) return direct; - const wrapped = - node.childForFieldName('declaration') ?? 
- (() => { - for (let i = 0; i < node.namedChildCount; i++) { - const c = node.namedChild(i); - if (c?.type === 'variable_declaration') return c; - } - return null; - })(); + const wrapped = (() => { + for (let i = 0; i < node.namedChildCount; i++) { + const c = node.namedChild(i); + if (c?.type === 'variable_declaration') return c; + } + return null; + })(); return wrapped?.childForFieldName('type') ?? null; }, diff --git a/gitnexus/src/core/ingestion/type-extractors/go.ts b/gitnexus/src/core/ingestion/type-extractors/go.ts index 75aa40e50b..7338c731d0 100644 --- a/gitnexus/src/core/ingestion/type-extractors/go.ts +++ b/gitnexus/src/core/ingestion/type-extractors/go.ts @@ -145,16 +145,8 @@ const extractDeclaration: TypeBindingExtractor = ( /** Go: parameter → name type */ const extractParameter: ParameterExtractor = (node: SyntaxNode, env: Map): void => { - let nameNode: SyntaxNode | null = null; - let typeNode: SyntaxNode | null = null; - - if (node.type === 'parameter') { - nameNode = node.childForFieldName('name'); - typeNode = node.childForFieldName('type'); - } else { - nameNode = node.childForFieldName('name') ?? node.childForFieldName('pattern'); - typeNode = node.childForFieldName('type'); - } + const nameNode = node.childForFieldName('name'); + const typeNode = node.childForFieldName('type'); if (!nameNode || !typeNode) return; const varName = extractVarName(nameNode); diff --git a/gitnexus/src/core/ingestion/type-extractors/jvm.ts b/gitnexus/src/core/ingestion/type-extractors/jvm.ts index 3382097e21..c8538914b8 100644 --- a/gitnexus/src/core/ingestion/type-extractors/jvm.ts +++ b/gitnexus/src/core/ingestion/type-extractors/jvm.ts @@ -87,7 +87,7 @@ const extractJavaParameter: ParameterExtractor = ( nameNode = node.childForFieldName('name'); } else { // Generic fallback - nameNode = node.childForFieldName('name') ?? 
node.childForFieldName('pattern'); + nameNode = node.childForFieldName('name'); typeNode = node.childForFieldName('type'); } @@ -382,9 +382,10 @@ const extractKotlinDeclaration: TypeBindingExtractor = ( if (varName && typeName) env.set(varName, typeName); return; } - // Fallback: try direct fields - const nameNode = node.childForFieldName('name') ?? findChild(node, 'simple_identifier'); - const typeNode = node.childForFieldName('type') ?? findChild(node, 'user_type'); + // Fallback: Kotlin property_declaration has no name/type fields (verified by + // real parse, #1920); the name/type are positional children. + const nameNode = findChild(node, 'simple_identifier'); + const typeNode = findChild(node, 'user_type'); if (!nameNode || !typeNode) return; const varName = extractVarName(nameNode); const typeName = extractSimpleTypeName(typeNode); @@ -416,7 +417,7 @@ const extractKotlinParameter: ParameterExtractor = ( typeNode = node.childForFieldName('type'); nameNode = node.childForFieldName('name'); } else { - nameNode = node.childForFieldName('name') ?? 
node.childForFieldName('pattern'); + nameNode = node.childForFieldName('name'); typeNode = node.childForFieldName('type'); } diff --git a/gitnexus/src/core/ingestion/type-extractors/php.ts b/gitnexus/src/core/ingestion/type-extractors/php.ts index ca517ee906..1eb4719a78 100644 --- a/gitnexus/src/core/ingestion/type-extractors/php.ts +++ b/gitnexus/src/core/ingestion/type-extractors/php.ts @@ -290,7 +290,7 @@ const extractParameter: ParameterExtractor = (node: SyntaxNode, env: Map const rhsNode = node.childForFieldName('right'); if (!rhsNode) return undefined; if (rhsNode.type === 'identifier') return { kind: 'copy', lhs: varName, rhs: rhsNode.text }; - // call/method_call RHS — Ruby uses method calls for both field access and method calls - if (rhsNode.type === 'call' || rhsNode.type === 'method_call') { + // call RHS — Ruby uses method calls for both field access and method calls + if (rhsNode.type === 'call') { const methodNode = rhsNode.childForFieldName('method'); const receiverNode = rhsNode.childForFieldName('receiver'); if (!receiverNode && methodNode?.type === 'identifier') { diff --git a/gitnexus/src/core/ingestion/type-extractors/rust.ts b/gitnexus/src/core/ingestion/type-extractors/rust.ts index a233c0ea93..721ef366df 100644 --- a/gitnexus/src/core/ingestion/type-extractors/rust.ts +++ b/gitnexus/src/core/ingestion/type-extractors/rust.ts @@ -277,16 +277,6 @@ const extractPendingAssignment: PendingAssignmentExtractor = (node, scopeEnv) => return { kind: 'callResult', lhs, callee: funcNode.text }; } } - // method_call_expression RHS → methodCallResult (receiver.method()) - if (unwrapped.type === 'method_call_expression') { - const obj = unwrapped.firstNamedChild; - if (obj?.type === 'identifier') { - const methodNode = unwrapped.childForFieldName('name') ?? 
unwrapped.namedChild(1); - if (methodNode?.type === 'field_identifier') { - return { kind: 'methodCallResult', lhs, receiver: obj.text, method: methodNode.text }; - } - } - } return undefined; }; @@ -410,11 +400,6 @@ const extractRustElementTypeFromTypeNode = ( const elemNode = typeNode.firstNamedChild; if (elemNode) return extractSimpleTypeName(elemNode); } - // slice_type: [User] — element is the first child - if (typeNode.type === 'slice_type') { - const elemNode = typeNode.firstNamedChild; - if (elemNode) return extractSimpleTypeName(elemNode); - } return undefined; }; diff --git a/gitnexus/src/core/ingestion/type-extractors/shared.ts b/gitnexus/src/core/ingestion/type-extractors/shared.ts index 576588c83d..17cb962cd8 100644 --- a/gitnexus/src/core/ingestion/type-extractors/shared.ts +++ b/gitnexus/src/core/ingestion/type-extractors/shared.ts @@ -257,11 +257,7 @@ export const extractSimpleTypeName = (typeNode: SyntaxNode, depth = 0): string | // Generic types: extract the base type (e.g., List → List) // For nullable wrappers (Optional, Option), unwrap to inner type. - if ( - typeNode.type === 'generic_type' || - typeNode.type === 'parameterized_type' || - typeNode.type === 'generic_name' - ) { + if (typeNode.type === 'generic_type' || typeNode.type === 'generic_name') { const base = typeNode.childForFieldName('name') ?? typeNode.childForFieldName('type') ?? @@ -411,17 +407,20 @@ export const TYPED_PARAMETER_TYPES = new Set([ * Note: Go slices/maps use slice_type/map_type, not generic_type — those are * NOT handled here. Use language-specific extractors for Go container types. * - * @param typeNode A generic_type or parameterized_type AST node (or any node — - * returns [] for non-generic types). + * @param typeNode A generic_type / generic_name / user_type AST node (or any + * node — returns [] for non-generic types). * @returns Array of resolved type argument names. Unresolvable arguments are omitted. 
*/ export const extractGenericTypeArgs = (typeNode: SyntaxNode, depth = 0): string[] => { if (depth > 50) return []; - // Unwrap wrapper nodes that may sit above the generic_type + // Unwrap pure wrapper nodes (which carry no type_arguments of their own) that + // may sit above the generic type. `user_type` is intentionally NOT unwrapped + // here: a Kotlin `user_type` can itself carry a `type_arguments` child + // (`List` → user_type > [type_identifier, type_arguments]), so it is + // handled as a generic-bearing node below. if ( typeNode.type === 'type_annotation' || typeNode.type === 'type' || - typeNode.type === 'user_type' || typeNode.type === 'nullable_type' || typeNode.type === 'optional_type' ) { @@ -430,11 +429,15 @@ export const extractGenericTypeArgs = (typeNode: SyntaxNode, depth = 0): string[ return []; } - // Only process generic/parameterized type nodes (includes C#'s generic_name) + // Generic-bearing nodes hold their arguments in a `type_arguments` / + // `type_argument_list` child: generic_type (Java/TypeScript/Rust/Go), + // generic_name (C#), and Kotlin's user_type. Verified against the installed + // grammars by real parse (#1920). A user_type without its own type_arguments + // is unwrapped at the argsNode guard below. if ( typeNode.type !== 'generic_type' && - typeNode.type !== 'parameterized_type' && - typeNode.type !== 'generic_name' + typeNode.type !== 'generic_name' && + typeNode.type !== 'user_type' ) { return []; } @@ -448,7 +451,17 @@ export const extractGenericTypeArgs = (typeNode: SyntaxNode, depth = 0): string[ break; } } - if (!argsNode) return []; + if (!argsNode) { + // A `user_type` without its own type_arguments wraps an inner type node + // (e.g. user_type > generic_type, or a plain user_type > type_identifier with + // no generics) — recurse into that child. generic_type / generic_name with no + // args simply have no type arguments to report. 
+ if (typeNode.type === 'user_type') { + const inner = typeNode.firstNamedChild; + return inner ? extractGenericTypeArgs(inner, depth + 1) : []; + } + return []; + } const result: string[] = []; for (let i = 0; i < argsNode.namedChildCount; i++) { diff --git a/gitnexus/src/core/ingestion/type-extractors/swift.ts b/gitnexus/src/core/ingestion/type-extractors/swift.ts index 89e63ecc55..ec4e43b488 100644 --- a/gitnexus/src/core/ingestion/type-extractors/swift.ts +++ b/gitnexus/src/core/ingestion/type-extractors/swift.ts @@ -51,7 +51,7 @@ const extractDeclaration: TypeBindingExtractor = ( env: Map, ): void => { // Swift property_declaration has pattern and type_annotation - const pattern = node.childForFieldName('pattern') ?? findChild(node, 'pattern'); + const pattern = findChild(node, 'pattern'); const typeAnnotation = node.childForFieldName('type') ?? findChild(node, 'type_annotation'); if (!pattern || !typeAnnotation) return; const varName = extractVarName(pattern) ?? pattern.text; @@ -65,10 +65,10 @@ const extractParameter: ParameterExtractor = (node: SyntaxNode, env: Map { if (node.type !== 'property_declaration') return undefined; if (hasTypeAnnotation(node)) return undefined; - const pattern = node.childForFieldName('pattern') ?? findChild(node, 'pattern'); + const pattern = findChild(node, 'pattern'); if (!pattern) return undefined; const varName = pattern.text; if (!varName) return undefined; diff --git a/gitnexus/src/core/ingestion/type-extractors/typescript.ts b/gitnexus/src/core/ingestion/type-extractors/typescript.ts index fcc58cff47..8aba03c177 100644 --- a/gitnexus/src/core/ingestion/type-extractors/typescript.ts +++ b/gitnexus/src/core/ingestion/type-extractors/typescript.ts @@ -300,8 +300,7 @@ const findTsIterableElementType = ( while (current) { if (TS_FUNCTION_NODE_TYPES.has(current.type)) { // Search function parameters - const paramsNode = - current.childForFieldName('parameters') ?? 
current.childForFieldName('formal_parameters'); + const paramsNode = current.childForFieldName('parameters'); if (paramsNode) { for (let i = 0; i < paramsNode.namedChildCount; i++) { const param = paramsNode.namedChild(i); diff --git a/gitnexus/src/core/ingestion/variable-extractors/configs/dart.ts b/gitnexus/src/core/ingestion/variable-extractors/configs/dart.ts index a7e52afa4c..bd626212b4 100644 --- a/gitnexus/src/core/ingestion/variable-extractors/configs/dart.ts +++ b/gitnexus/src/core/ingestion/variable-extractors/configs/dart.ts @@ -3,7 +3,6 @@ import { SupportedLanguages } from 'gitnexus-shared'; import type { VariableExtractionConfig } from '../../variable-types.js'; import type { VariableVisibility } from '../../variable-types.js'; -import { extractSimpleTypeName } from '../../type-extractors/shared.js'; import type { SyntaxNode } from '../../utils/ast-helpers.js'; /** @@ -47,13 +46,6 @@ function extractDartVarName(node: SyntaxNode): string | undefined { } function extractDartVarType(node: SyntaxNode): string | undefined { - for (let i = 0; i < node.namedChildCount; i++) { - const child = node.namedChild(i); - if (child?.type === 'initialized_variable_definition') { - const typeNode = child.childForFieldName('type'); - if (typeNode) return extractSimpleTypeName(typeNode) ?? typeNode.text?.trim(); - } - } // Look for type_identifier directly on the node for (let i = 0; i < node.namedChildCount; i++) { const child = node.namedChild(i); diff --git a/gitnexus/test/helpers/grammar-introspection.ts b/gitnexus/test/helpers/grammar-introspection.ts new file mode 100644 index 0000000000..74558e6c0c --- /dev/null +++ b/gitnexus/test/helpers/grammar-introspection.ts @@ -0,0 +1,320 @@ +/** + * Grammar introspection helper for the tree-sitter node-type / field-name + * validation gate (issue #1920). + * + * Two oracles, layered (see the plan's KTD1): + * 1. 
A fast **membership set** built from each grammar's static + * `node-types.json` — the union of every top-level `type`, every + * `subtypes[].type`, and every children/per-field `types[].type`, + * retaining anonymous (`named:false`) tokens and supertype names. + * 2. A `probeNodeType` **authoritative fallback** that compiles a probe + * query against the *live* grammar — used for any literal the static + * JSON under-reports (regex / `token(...)` tokens, aliased nodes). + * + * This file lives under `test/` and is therefore allowed to name languages + * (the AGENTS.md "shared pipeline code must not name languages" rule applies + * to `src/core/ingestion/`, not to test helpers). The live-grammar access and + * the tsx/php_only variant handling are delegated to the production + * `parser-loader.ts` so the gate validates against exactly the grammar the + * runtime uses. + */ +import Parser from 'tree-sitter'; +import { createRequire } from 'node:module'; +import { readFileSync, existsSync } from 'node:fs'; +import { dirname, join } from 'node:path'; +import { SupportedLanguages } from '../../src/config/supported-languages.js'; +import { + getLanguageGrammar, + isLanguageAvailable, + resolveLanguageKey, +} from '../../src/core/tree-sitter/parser-loader.js'; + +const _require = createRequire(import.meta.url); + +/** + * Per-language grammar package + the `node-types.json` subpath(s) to union. + * COBOL is intentionally absent (regex preprocessor, no grammar). Vue has no + * grammar of its own and reuses tree-sitter-typescript, so its literals are + * validated against the typescript ∪ tsx node set (JSX/TSX-only nodes + * included). The package names mirror `parser-loader.ts` `SOURCES`. 
+ */ +const GRAMMAR_PACKAGES: Partial> = { + [SupportedLanguages.JavaScript]: { + pkg: 'tree-sitter-javascript', + subpaths: ['src/node-types.json'], + }, + [SupportedLanguages.TypeScript]: { + pkg: 'tree-sitter-typescript', + subpaths: ['typescript/src/node-types.json', 'tsx/src/node-types.json'], + }, + [SupportedLanguages.Python]: { pkg: 'tree-sitter-python', subpaths: ['src/node-types.json'] }, + [SupportedLanguages.Java]: { pkg: 'tree-sitter-java', subpaths: ['src/node-types.json'] }, + [SupportedLanguages.C]: { pkg: 'tree-sitter-c', subpaths: ['src/node-types.json'] }, + [SupportedLanguages.CPlusPlus]: { pkg: 'tree-sitter-cpp', subpaths: ['src/node-types.json'] }, + [SupportedLanguages.CSharp]: { pkg: 'tree-sitter-c-sharp', subpaths: ['src/node-types.json'] }, + [SupportedLanguages.Go]: { pkg: 'tree-sitter-go', subpaths: ['src/node-types.json'] }, + [SupportedLanguages.Ruby]: { pkg: 'tree-sitter-ruby', subpaths: ['src/node-types.json'] }, + [SupportedLanguages.Rust]: { pkg: 'tree-sitter-rust', subpaths: ['src/node-types.json'] }, + // tree-sitter-php's runtime export is `php_only` (see parser-loader), so the + // gate must validate against that variant's node set, not the embedded-HTML + // `php` grammar. + [SupportedLanguages.PHP]: { pkg: 'tree-sitter-php', subpaths: ['php_only/src/node-types.json'] }, + [SupportedLanguages.Kotlin]: { pkg: 'tree-sitter-kotlin', subpaths: ['src/node-types.json'] }, + [SupportedLanguages.Swift]: { pkg: 'tree-sitter-swift', subpaths: ['src/node-types.json'] }, + [SupportedLanguages.Dart]: { pkg: 'tree-sitter-dart', subpaths: ['src/node-types.json'] }, + [SupportedLanguages.Vue]: { + pkg: 'tree-sitter-typescript', + subpaths: ['typescript/src/node-types.json', 'tsx/src/node-types.json'], + }, +}; + +/** Languages the gate validates (everything with a grammar package). 
*/ +export const GATED_LANGUAGES: readonly SupportedLanguages[] = Object.keys( + GRAMMAR_PACKAGES, +) as SupportedLanguages[]; + +export interface GrammarModel { + language: SupportedLanguages; + /** Every node-type string the grammar can surface (named + anonymous + supertypes). */ + nodeTypes: ReadonlySet; + /** Valid field names per node type. */ + fieldsByNode: ReadonlyMap>; + /** Union of every field name across all node types (sound global existence check). */ + allFields: ReadonlySet; +} + +// ---- node-types.json shape (only the parts we read) ---- +interface ChildType { + type: string; + named: boolean; +} +interface FieldInfo { + types?: ChildType[]; +} +interface NodeTypeEntry { + type: string; + named?: boolean; + fields?: Record; + children?: { types?: ChildType[] }; + subtypes?: ChildType[]; +} + +/** Resolve the on-disk directory of an installed package, or null if absent. */ +function resolvePackageDir(pkg: string): string | null { + try { + return dirname(_require.resolve(`${pkg}/package.json`)); + } catch { + /* package.json may be blocked by an `exports` map — fall back to main */ + } + try { + let dir = dirname(_require.resolve(pkg)); + for (let i = 0; i < 10; i++) { + if (existsSync(join(dir, 'package.json'))) return dir; + const parent = dirname(dir); + if (parent === dir) break; + dir = parent; + } + } catch { + /* not installed (optional grammar) */ + } + return null; +} + +function addChildTypes(into: Set, types: ChildType[] | undefined): void { + if (!types) return; + for (const t of types) into.add(t.type); +} + +/** + * Build the membership model for one language by unioning its node-types.json + * file(s). Returns null when no node-types.json can be resolved (e.g. an + * optional grammar is not installed) so callers can skip rather than fail. 
+ */ +export function loadGrammarModel(language: SupportedLanguages): GrammarModel | null { + const entry = GRAMMAR_PACKAGES[language]; + if (!entry) return null; + const dir = resolvePackageDir(entry.pkg); + if (!dir) return null; + + const nodeTypes = new Set(); + const fieldsByNode = new Map>(); + const allFields = new Set(); + let read = 0; + + for (const subpath of entry.subpaths) { + const file = join(dir, subpath); + if (!existsSync(file)) continue; + let parsed: NodeTypeEntry[]; + try { + parsed = JSON.parse(readFileSync(file, 'utf8')) as NodeTypeEntry[]; + } catch { + continue; + } + read += 1; + for (const node of parsed) { + if (typeof node.type === 'string') nodeTypes.add(node.type); + addChildTypes(nodeTypes, node.subtypes); + addChildTypes(nodeTypes, node.children?.types); + if (node.fields) { + const fieldSet = fieldsByNode.get(node.type) ?? new Set(); + for (const [fieldName, info] of Object.entries(node.fields)) { + fieldSet.add(fieldName); + allFields.add(fieldName); + addChildTypes(nodeTypes, info.types); + } + fieldsByNode.set(node.type, fieldSet); + } + } + } + + if (read === 0) return null; + return { language, nodeTypes, fieldsByNode, allFields }; +} + +/** True when the thrown object is tree-sitter's "invalid node type" query error. */ +export function isNodeTypeError(err: unknown): boolean { + return err instanceof Error && /TSQueryErrorNodeType/.test(err.message); +} + +/** + * True when the thrown object is tree-sitter's "field invalid for this node" + * query error. `TSQueryErrorStructure` is thrown when a field exists on other + * nodes but not on the queried one (the common dead-field case, e.g. + * `(parameter pattern: (_))`); `TSQueryErrorField` is thrown for a field name + * unknown to the grammar entirely. Both mean the field is dead on that node. 
+ * `TSQueryErrorNodeType` is deliberately NOT a field error — it means the node + * type is absent in this grammar, which `probeField` reports as `unavailable` + * (abstain), never `dead`. + */ +export function isFieldError(err: unknown): boolean { + return err instanceof Error && /TSQueryError(Structure|Field)/.test(err.message); +} + +/** Escape a string so it is safe inside a `"..."` anonymous-node query literal. */ +function escapeAnonymous(literal: string): string { + return literal.replace(/\\/g, '\\\\').replace(/"/g, '\\"'); +} + +/** The live grammar object(s) a language's literals should be probed against. */ +function grammarsFor(language: SupportedLanguages): unknown[] { + if (!isLanguageAvailable(language)) return []; + const grammars: unknown[] = [getLanguageGrammar(language)]; + // TypeScript and Vue (which reuses the TS grammar) also have a tsx grammar + // with JSX-only node types; probe both. + if (language === SupportedLanguages.TypeScript || language === SupportedLanguages.Vue) { + try { + // resolveLanguageKey only switches TypeScript -> tsx on a .tsx path. + const tsx = getLanguageGrammar(SupportedLanguages.TypeScript, 'x.tsx'); + if (resolveLanguageKey(SupportedLanguages.TypeScript, 'x.tsx').endsWith(':tsx')) { + grammars.push(tsx); + } + } catch { + /* tsx unavailable — base grammar still probed */ + } + } + return grammars; +} + +/** + * Authoritative fallback: ask the live grammar whether `literal` can be a node + * type. A literal is `valid` if it compiles in ANY of the named `(x)`, + * anonymous `"x"`, or supertype `(_x)` forms against ANY of the language's + * grammars; `dead` only if every form is rejected; `unavailable` if no grammar + * loads (so the caller skips rather than fails). See KTD1. 
+ */ +export function probeNodeType( + language: SupportedLanguages, + literal: string, +): 'valid' | 'dead' | 'unavailable' { + const grammars = grammarsFor(language); + if (grammars.length === 0) return 'unavailable'; + + const forms = [`(${literal}) @_`, `"${escapeAnonymous(literal)}" @_`, `(_${literal}) @_`]; + for (const grammar of grammars) { + for (const form of forms) { + try { + // Constructing the Query is the validation: it throws + // TSQueryErrorNodeType iff the node type cannot exist. + new Parser.Query(grammar as ConstructorParameters[0], form); + return 'valid'; + } catch { + /* this (form, grammar) rejected — try the next */ + } + } + } + return 'dead'; +} + +/** + * Field-existence oracle — the node-scoped analogue of `probeNodeType`. Compiles + * a field-bearing probe query `( : (_)) @_` against the live + * grammar(s) for `language`: + * - compiles on ANY grammar → `valid` + * - rejected as a field/structure error on a grammar that HAS the node, and + * never accepted → `dead` + * - the node type is absent in every probed grammar (only `TSQueryErrorNodeType`), + * or no grammar loads → `unavailable` (abstain — never `dead`, so multi-language + * valid-if-any can defer to the grammar that actually emits the node) + * + * Conservative-toward-valid: supertype-typed fields make some structurally-wrong + * field queries compile, so the probe can return `valid` for a semantically wrong + * field. That is the sound direction — false negatives only, never a false + * positive that would block CI on correct code. 
+ */ +export function probeField( + language: SupportedLanguages, + nodeType: string, + field: string, +): 'valid' | 'dead' | 'unavailable' { + const grammars = grammarsFor(language); + if (grammars.length === 0) return 'unavailable'; + + const form = `(${nodeType} ${field}: (_)) @_`; + let sawFieldDead = false; + for (const grammar of grammars) { + try { + new Parser.Query(grammar as ConstructorParameters[0], form); + return 'valid'; + } catch (err) { + if (isFieldError(err)) sawFieldDead = true; + // TSQueryErrorNodeType (node absent here) or any other error → abstain + } + } + return sawFieldDead ? 'dead' : 'unavailable'; +} + +/** + * Combined check used by the gate: fast membership first, authoritative live + * probe only for literals the static JSON does not list. Returns `valid`, + * `dead`, or `unavailable`. + */ +export function validateNodeType( + language: SupportedLanguages, + model: GrammarModel | null, + literal: string, +): 'valid' | 'dead' | 'unavailable' { + if (model && model.nodeTypes.has(literal)) return 'valid'; + return probeNodeType(language, literal); +} + +/** + * Field-name validation. Node-scoped when `receiverNodeType` is given: + * membership hit is authoritative, and a miss falls through to the live + * `probeField` rather than declaring `dead` — node-types.json is not a sound + * negative oracle for fields (it can under-report). Without a receiver it is a + * sound global existence check. Returns `unavailable` when the model could not + * be loaded. See KTD1. + */ +export function validateField( + model: GrammarModel | null, + field: string, + receiverNodeType?: string, +): 'valid' | 'dead' | 'unavailable' { + if (!model) return 'unavailable'; + if (receiverNodeType) { + const scoped = model.fieldsByNode.get(receiverNodeType); + if (scoped && scoped.has(field)) return 'valid'; + return probeField(model.language, receiverNodeType, field); + } + return model.allFields.has(field) ? 
'valid' : 'dead'; +} diff --git a/gitnexus/test/helpers/literal-collectors.ts b/gitnexus/test/helpers/literal-collectors.ts new file mode 100644 index 0000000000..e75a44e2d6 --- /dev/null +++ b/gitnexus/test/helpers/literal-collectors.ts @@ -0,0 +1,811 @@ +/** + * Literal collectors for the node-type / field validation gate (issue #1920). + * + * Collects every tree-sitter node-type and field-name literal the ingestion + * layer references in CODE (the query strings themselves are validated by + * compilation — see Mode 3 and query-compilation.test.ts), each tagged with + * the grammar language(s) it is checked against. + * + * FOUR modes (plan KTD3): + * 1. Config reflection — import `*-extractors/configs/*.ts`, read each + * config-shaped export's node-type-array keys. Exact `config.language`. + * 2. AST scan (`typescript` parser, no type-checker) over the EXTRACTION + * surface — `*-extractors/**`, every `languages//captures.ts`, and + * `export-detection.ts`. Collected BY CONSUMPTION SITE: `.type === '..'`, + * `childForFieldName('..')` (capturing the receiver node type when an + * enclosing `recv.type === 'X'` guard / `case 'X':` narrows it — see + * `receiverNodeTypeOf`), `findNodeAtRange(.., '..')`, and members of a + * `Set`/array consumed via `SET.has(.type)`. No `*_TYPES` name heuristic: + * a `Set`'s members are collected only when consumed against a node's `.type`. + * 3. Registry scope-query probes — invoke each `languages//query.ts` + * `get*ScopeQuery()` (gated by `isLanguageAvailable`) so the gate compiles + * the registry scope queries too (new coverage vs query-compilation.test.ts). + * 4. Resolution-layer scan (TypeChecker-gated) — the registry production path + * (`languages//{scope-resolver,type-binding,receiver-binding,interpret, + * arity,import-decomposer,…}`) PLUS shared resolution files directly under + * `ingestion/` (e.g. `type-env.ts`). 
These files MIX SyntaxNode `.type` with + * resolved-symbol `.type` (kinds like 'Class'), so a literal is collected + * ONLY when its `.type` / `childForFieldName` receiver resolves to a + * tree-sitter SyntaxNode (via the TS TypeChecker). Per-`languages//` + * files tag to that one grammar; shared (non-`languages//`) files tag + * to the full gated set (valid-if-any). + * + * (In-file section order is 1, 2, 4, 3 for historical reasons; the logical order + * is as numbered above.) + * + * Test-only file: allowed to name languages. + */ +import ts from 'typescript'; +import { readFileSync, readdirSync, existsSync, statSync } from 'node:fs'; +import { join } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { SupportedLanguages } from '../../src/config/supported-languages.js'; +import { isLanguageAvailable } from '../../src/core/tree-sitter/parser-loader.js'; +import { GATED_LANGUAGES } from './grammar-introspection.js'; + +const INGESTION_DIR = fileURLToPath(new URL('../../src/core/ingestion/', import.meta.url)); + +export interface CollectedNodeType { + literal: string; + languages: SupportedLanguages[]; + file: string; // ingestion-relative + line: number; + source: 'config' | 'compare' | 'set-member' | 'find-node-arg'; +} +export interface CollectedField { + field: string; + languages: SupportedLanguages[]; + file: string; + line: number; + /** Receiver node type when statically narrowed by an enclosing positive guard + * (`if (recv.type === 'X')` / `case 'X':`). When set, the gate validates the + * field node-scoped (membership-then-probe); otherwise it uses the sound + * global existence check. See `receiverNodeTypeOf` / KTD2. */ + receiverNodeType?: string; +} +export interface RegistryQueryProbe { + language: SupportedLanguages; + getter: string; + error: string | null; +} + +const ALL_LANGS = GATED_LANGUAGES; + +/** Directory name (under languages/) → language. 
*/ +const DIR_LANG: Record = { + javascript: SupportedLanguages.JavaScript, + typescript: SupportedLanguages.TypeScript, + python: SupportedLanguages.Python, + java: SupportedLanguages.Java, + c: SupportedLanguages.C, + cpp: SupportedLanguages.CPlusPlus, + csharp: SupportedLanguages.CSharp, + go: SupportedLanguages.Go, + ruby: SupportedLanguages.Ruby, + rust: SupportedLanguages.Rust, + php: SupportedLanguages.PHP, + kotlin: SupportedLanguages.Kotlin, + swift: SupportedLanguages.Swift, + dart: SupportedLanguages.Dart, + vue: SupportedLanguages.Vue, +}; + +/** Basename (no .ts) → language set, for extractor files that name a language. */ +const BASENAME_LANGS: Record = { + 'c-cpp': [SupportedLanguages.C, SupportedLanguages.CPlusPlus], + jvm: [SupportedLanguages.Java, SupportedLanguages.Kotlin], + 'typescript-javascript': [SupportedLanguages.TypeScript, SupportedLanguages.JavaScript], + csharp: [SupportedLanguages.CSharp], + dart: [SupportedLanguages.Dart], + go: [SupportedLanguages.Go], + php: [SupportedLanguages.PHP], + python: [SupportedLanguages.Python], + ruby: [SupportedLanguages.Ruby], + rust: [SupportedLanguages.Rust], + swift: [SupportedLanguages.Swift], + typescript: [SupportedLanguages.TypeScript], + javascript: [SupportedLanguages.JavaScript], + java: [SupportedLanguages.Java], + kotlin: [SupportedLanguages.Kotlin], + laravel: [SupportedLanguages.PHP], + nextjs: [SupportedLanguages.TypeScript, SupportedLanguages.JavaScript], + expo: [SupportedLanguages.TypeScript, SupportedLanguages.JavaScript], + 'fastapi-router-bindings': [SupportedLanguages.Python], +}; + +/** const-name prefix → language (for export-detection.ts style named sets). 
*/ +const PREFIX_LANGS: Record = { + CSHARP: [SupportedLanguages.CSharp], + RUST: [SupportedLanguages.Rust], + GO: [SupportedLanguages.Go], + JAVA: [SupportedLanguages.Java], + KOTLIN: [SupportedLanguages.Kotlin], + PYTHON: [SupportedLanguages.Python], + RUBY: [SupportedLanguages.Ruby], + PHP: [SupportedLanguages.PHP], + SWIFT: [SupportedLanguages.Swift], + DART: [SupportedLanguages.Dart], + CPP: [SupportedLanguages.CPlusPlus], + TS: [SupportedLanguages.TypeScript], + JS: [SupportedLanguages.JavaScript], +}; + +/** Candidate grammar languages a CODE literal in `relPath` should be checked against. */ +function fileLanguages(relPath: string): SupportedLanguages[] { + const langsMatch = relPath.match(/(?:^|\/)languages\/([^/]+)\//); + if (langsMatch) { + const lang = DIR_LANG[langsMatch[1]]; + return lang ? [lang] : [...ALL_LANGS]; + } + const base = relPath.replace(/\.ts$/, '').split('/').pop() ?? ''; + if (BASENAME_LANGS[base]) return BASENAME_LANGS[base]; + // generic / shared / cross-language helpers → any grammar (valid-if-any) + return [...ALL_LANGS]; +} + +/** Narrow a Set's candidate languages by a `_...` const-name prefix. 
*/ +function constNameLanguages( + constName: string, + fallback: SupportedLanguages[], +): SupportedLanguages[] { + const m = constName.match(/^([A-Z]+)_/); + if (m && PREFIX_LANGS[m[1]]) return PREFIX_LANGS[m[1]]; + return fallback; +} + +// --------------------------------------------------------------------------- +// File discovery +// --------------------------------------------------------------------------- +function walkTs(dir: string, out: string[]): void { + if (!existsSync(dir)) return; + for (const entry of readdirSync(dir)) { + const full = join(dir, entry); + const st = statSync(full); + if (st.isDirectory()) { + walkTs(full, out); + } else if (entry.endsWith('.ts') && !entry.endsWith('.test.ts')) { + out.push(full); + } + } +} + +/** The Mode-2 scan surface: every *-extractors/** file + each captures.ts + export-detection.ts. */ +function mode2Files(): string[] { + const files: string[] = []; + for (const entry of readdirSync(INGESTION_DIR)) { + if (entry.endsWith('-extractors')) walkTs(join(INGESTION_DIR, entry), files); + } + const langsDir = join(INGESTION_DIR, 'languages'); + if (existsSync(langsDir)) { + for (const lang of readdirSync(langsDir)) { + if (lang === 'cobol') continue; + const cap = join(langsDir, lang, 'captures.ts'); + if (existsSync(cap)) files.push(cap); + } + } + const exportDetection = join(INGESTION_DIR, 'export-detection.ts'); + if (existsSync(exportDetection)) files.push(exportDetection); + return files; +} + +/** The config files for Mode-1 reflection. 
*/ +function configFiles(): string[] { + const files: string[] = []; + for (const entry of readdirSync(INGESTION_DIR)) { + if (!entry.endsWith('-extractors')) continue; + const cfgDir = join(INGESTION_DIR, entry, 'configs'); + if (existsSync(cfgDir)) walkTs(cfgDir, files); + } + return files; +} + +const rel = (abs: string): string => abs.slice(INGESTION_DIR.length); + +// --------------------------------------------------------------------------- +// Mode 1 — config reflection +// --------------------------------------------------------------------------- +const CONFIG_NODE_TYPE_KEYS = new Set([ + 'typeDeclarationNodes', + 'methodNodeTypes', + 'bodyNodeTypes', + 'fieldNodeTypes', + 'variableNodeTypes', + 'staticNodeTypes', + 'constNodeTypes', + 'ancestorScopeNodeTypes', + 'fileScopeNodeTypes', + 'enumNodeTypes', + 'propertyNodeTypes', +]); + +const isStringArray = (v: unknown): v is string[] => + Array.isArray(v) && v.every((x) => typeof x === 'string'); + +async function collectConfigNodeTypes(): Promise { + const out: CollectedNodeType[] = []; + for (const file of configFiles()) { + const relPath = rel(file); + let mod: Record; + try { + // import the compiled .js sibling (vitest transpiles src on import) + mod = (await import(file)) as Record; + } catch { + continue; + } + for (const exported of Object.values(mod)) { + if (!exported || typeof exported !== 'object') continue; + const cfg = exported as Record; + const lang = cfg.language; + if (typeof lang !== 'string' || !ALL_LANGS.includes(lang as SupportedLanguages)) continue; + // Tag by the config FILE's served language set, not the single config + // object's `.language`: a shared file (typescript-javascript, c-cpp, jvm) + // legitimately lists nodes valid in a sibling grammar, so a node valid in + // ANY served language must not be flagged dead. Union the object's own + // language in case the file map is broader/narrower. 
+ const fileLangs = fileLanguages(relPath); + const languages = fileLangs.includes(lang as SupportedLanguages) + ? fileLangs + : [...fileLangs, lang as SupportedLanguages]; + for (const [key, value] of Object.entries(cfg)) { + if (!CONFIG_NODE_TYPE_KEYS.has(key) || !isStringArray(value)) continue; + for (const literal of value) { + out.push({ literal, languages, file: relPath, line: 0, source: 'config' }); + } + } + } + } + return out; +} + +// --------------------------------------------------------------------------- +// Mode 2 — AST scan +// --------------------------------------------------------------------------- +const FIELD_LOOKUP_NAMES = new Set(['childForFieldName', 'childrenForFieldName']); +const MEMBERSHIP_NAMES = new Set(['has', 'includes']); + +/** Is `node` a `.type` property access? */ +function isDotType(node: ts.Node): node is ts.PropertyAccessExpression { + return ts.isPropertyAccessExpression(node) && node.name.text === 'type'; +} + +function lineOf(sf: ts.SourceFile, node: ts.Node): number { + return sf.getLineAndCharacterOfPosition(node.getStart(sf)).line + 1; +} + +interface ScanResult { + nodeTypes: CollectedNodeType[]; + fields: CollectedField[]; +} + +/** Extract string members of `new Set([...])` / `[...]` / `[...] as const`, or null if not a literal string array. */ +function collectConstMembers(init: ts.Expression): string[] | null { + let arr: ts.Expression | undefined; + if (ts.isNewExpression(init) && init.arguments && init.arguments.length > 0) { + arr = init.arguments[0]; + } else if (ts.isArrayLiteralExpression(init)) { + arr = init; + } else if (ts.isAsExpression(init)) { + return collectConstMembers(init.expression); + } + if (arr && ts.isArrayLiteralExpression(arr)) { + const members = arr.elements + .filter((e): e is ts.StringLiteral => ts.isStringLiteral(e)) + .map((e) => e.text); + return members.length === arr.elements.length ? 
members : null; + } + return null; +} + +// ── Receiver-node-type capture (KTD2) ────────────────────────────────────── +function isFunctionLikeNode(n: ts.Node): boolean { + return ( + ts.isFunctionDeclaration(n) || + ts.isFunctionExpression(n) || + ts.isArrowFunction(n) || + ts.isMethodDeclaration(n) || + ts.isConstructorDeclaration(n) || + ts.isGetAccessorDeclaration(n) || + ts.isSetAccessorDeclaration(n) + ); +} + +function rangeContains(outer: ts.Node, inner: ts.Node): boolean { + return inner.getStart() >= outer.getStart() && inner.getEnd() <= outer.getEnd(); +} + +function enclosingFunctionOf(node: ts.Node): ts.Node | undefined { + let cur: ts.Node | undefined = node.parent; + while (cur) { + if (isFunctionLikeNode(cur)) return cur; + cur = cur.parent; + } + return undefined; +} + +/** True if `recvText` is reassigned, mutated (++/--), or re-declared (shadowed) within `scope`. */ +function receiverMutatedIn(recvText: string, scope: ts.Node): boolean { + let mutated = false; + const walk = (n: ts.Node): void => { + if (mutated) return; + if ( + ts.isBinaryExpression(n) && + n.left.getText() === recvText && + n.operatorToken.kind >= ts.SyntaxKind.FirstAssignment && + n.operatorToken.kind <= ts.SyntaxKind.LastAssignment + ) { + mutated = true; + return; + } + if ( + (ts.isPrefixUnaryExpression(n) || ts.isPostfixUnaryExpression(n)) && + (n.operator === ts.SyntaxKind.PlusPlusToken || + n.operator === ts.SyntaxKind.MinusMinusToken) && + n.operand.getText() === recvText + ) { + mutated = true; + return; + } + if (ts.isVariableDeclaration(n) && ts.isIdentifier(n.name) && n.name.text === recvText) { + mutated = true; // re-declaration / shadow + return; + } + ts.forEachChild(n, walk); + }; + walk(scope); + return mutated; +} + +/** + * Conservative receiver-node-type capture for `recv.childForFieldName('field')`. 
+ * Returns X only when `recv` is unambiguously narrowed by a single enclosing + * positive guard — `if (recv.type === 'X') {…}` (then-branch only) or + * `switch (recv.type) { case 'X': … }` — and the receiver is not reassigned or + * shadowed within the enclosing function. Any uncertainty → undefined, so the + * gate falls back to the sound global field check. Fail-safe by design: the + * failure mode is a benign false negative, never a false positive (KTD2). + */ +function receiverNodeTypeOf(call: ts.CallExpression, sf: ts.SourceFile): string | undefined { + if (!ts.isPropertyAccessExpression(call.expression)) return undefined; + const recvText = call.expression.expression.getText(sf); + + const isRecvDotType = (e: ts.Node): boolean => + ts.isPropertyAccessExpression(e) && + e.name.text === 'type' && + e.expression.getText(sf) === recvText; + const bareEq = (e: ts.Expression): string | undefined => { + if ( + ts.isBinaryExpression(e) && + e.operatorToken.kind === ts.SyntaxKind.EqualsEqualsEqualsToken + ) { + const lit = ts.isStringLiteralLike(e.left) + ? e.left + : ts.isStringLiteralLike(e.right) + ? 
e.right + : undefined; + if (lit && (isRecvDotType(e.left) || isRecvDotType(e.right))) return lit.text; + } + return undefined; + }; + + let found: string | undefined; + let enclosingFn: ts.Node | undefined; + let cur: ts.Node = call; + while (cur.parent) { + const p: ts.Node = cur.parent; + if ( + ts.isIfStatement(p) && + rangeContains(p.thenStatement, call) && + !(p.elseStatement !== undefined && rangeContains(p.elseStatement, call)) + ) { + const x = bareEq(p.expression); + if (x !== undefined) { + found = x; + break; + } + } else if (ts.isCaseClause(p)) { + const sw = p.parent.parent; + if ( + ts.isSwitchStatement(sw) && + isRecvDotType(sw.expression) && + ts.isStringLiteralLike(p.expression) + ) { + found = p.expression.text; + break; + } + } + if (isFunctionLikeNode(p)) { + enclosingFn = p; + break; + } + cur = p; + } + if (found === undefined) return undefined; + const scope = enclosingFn ?? enclosingFunctionOf(call) ?? sf; + return receiverMutatedIn(recvText, scope) ? undefined : found; +} + +function scanFile(file: string): ScanResult { + const relPath = rel(file); + const langs = fileLanguages(relPath); + const src = readFileSync(file, 'utf8'); + const sf = ts.createSourceFile(file, src, ts.ScriptTarget.Latest, true); + const nodeTypes: CollectedNodeType[] = []; + const fields: CollectedField[] = []; + + // First pass: index module-level string Set/array consts, and record which + // const identifiers are consumed via `SET.has(.type)` / `.includes(.type)`. 
+ const constMembers = new Map(); + const typeConsumed = new Set(); + + const visit = (node: ts.Node): void => { + // module-level const Set/array of strings + if (ts.isVariableDeclaration(node) && ts.isIdentifier(node.name) && node.initializer) { + const members = collectConstMembers(node.initializer); + if (members) constMembers.set(node.name.text, members); + } + + // `.type === 'lit'` / `!==` + if ( + ts.isBinaryExpression(node) && + (node.operatorToken.kind === ts.SyntaxKind.EqualsEqualsEqualsToken || + node.operatorToken.kind === ts.SyntaxKind.ExclamationEqualsEqualsToken) + ) { + const { left, right } = node; + const lit = ts.isStringLiteral(left) ? left : ts.isStringLiteral(right) ? right : null; + const dot = isDotType(left) ? left : isDotType(right) ? right : null; + if (lit && dot) { + nodeTypes.push({ + literal: lit.text, + languages: langs, + file: relPath, + line: lineOf(sf, lit), + source: 'compare', + }); + } + } + + if (ts.isCallExpression(node) && ts.isPropertyAccessExpression(node.expression)) { + const method = node.expression.name.text; + const arg0 = node.arguments[0]; + // childForFieldName('field') + if (FIELD_LOOKUP_NAMES.has(method) && arg0 && ts.isStringLiteral(arg0)) { + fields.push({ + field: arg0.text, + languages: langs, + file: relPath, + line: lineOf(sf, arg0), + receiverNodeType: receiverNodeTypeOf(node, sf), + }); + } + // SET.has(.type) / SET.includes(.type) → mark the receiver set + if ( + MEMBERSHIP_NAMES.has(method) && + arg0 && + isDotType(arg0) && + ts.isIdentifier(node.expression.expression) + ) { + typeConsumed.add(node.expression.expression.text); + } + } + + // findNodeAtRange(a, b, 'lit') — 3rd arg, literal only (skip dynamic) + if ( + ts.isCallExpression(node) && + ((ts.isIdentifier(node.expression) && node.expression.text === 'findNodeAtRange') || + (ts.isPropertyAccessExpression(node.expression) && + node.expression.name.text === 'findNodeAtRange')) + ) { + const a2 = node.arguments[2]; + if (a2 && 
ts.isStringLiteral(a2)) { + nodeTypes.push({ + literal: a2.text, + languages: langs, + file: relPath, + line: lineOf(sf, a2), + source: 'find-node-arg', + }); + } + } + + ts.forEachChild(node, visit); + }; + visit(sf); + + // Second pass: emit members of every set that was consumed against `.type`. + for (const constName of typeConsumed) { + const members = constMembers.get(constName); + if (!members) continue; // imported or non-literal set — skip (sound: don't guess) + const memberLangs = constNameLanguages(constName, langs); + for (const literal of members) { + nodeTypes.push({ + literal, + languages: memberLangs, + file: relPath, + line: 0, + source: 'set-member', + }); + } + } + + return { nodeTypes, fields }; +} + +function collectInCodeLiterals(): ScanResult { + const nodeTypes: CollectedNodeType[] = []; + const fields: CollectedField[] = []; + for (const file of mode2Files()) { + const r = scanFile(file); + nodeTypes.push(...r.nodeTypes); + fields.push(...r.fields); + } + return { nodeTypes, fields }; +} + +// --------------------------------------------------------------------------- +// Mode 4 — registry RESOLUTION layer (scope-resolver/type-binding/receiver- +// binding/interpret/arity/import-decomposer/...), the production path for +// migrated languages. These files mix SyntaxNode `.type` (grammar nodes) with +// resolved-symbol `.type` (kinds like 'Class'); a naive scan would false- +// positive on the latter. So this mode uses the TS TypeChecker to collect a +// literal ONLY when its `.type` receiver / childForFieldName target resolves to +// a tree-sitter SyntaxNode. Per-language dir => grammar (no cross-lang ambiguity). 
// ---------------------------------------------------------------------------
const REPO_ROOT = fileURLToPath(new URL('../../', import.meta.url));
/** File names skipped inside each `languages/<lang>/` directory. */
const RES_SKIP = new Set(['captures.ts', 'query.ts', 'index.ts']);
/** Helper functions called as `fn(node, 'lit')`: the 2nd argument is a node-type literal. */
const NODE_ARG_FNS = new Set(['findChild', 'findNamedChild', 'findSiblingChild']);
/**
 * Shared resolution-layer files directly under ingestion/ (NOT in
 * `languages/<lang>/`). They mix SyntaxNode `.type` with resolved-symbol `.type`,
 * so they belong in the TypeChecker-gated Mode 4; being language-agnostic, they
 * are tagged with the full gated set (valid-if-any). See KTD3.
 */
const SHARED_RESOLUTION_FILES = ['type-env.ts'];

/**
 * Lists the files Mode 4 scans, each paired with the languages its literals
 * are validated against: every file `walkTs` finds under a mapped
 * `languages/<dir>/` directory (minus RES_SKIP names, and never `cobol`), plus
 * the SHARED_RESOLUTION_FILES that exist.
 */
function resolutionLayerFiles(): { file: string; langs: SupportedLanguages[] }[] {
  const out: { file: string; langs: SupportedLanguages[] }[] = [];
  const langsDir = join(INGESTION_DIR, 'languages');
  // Per-language registry resolution files → tagged to that one grammar.
  if (existsSync(langsDir)) {
    for (const dir of readdirSync(langsDir)) {
      if (dir === 'cobol') continue;
      const lang = DIR_LANG[dir];
      if (!lang) continue;
      const d = join(langsDir, dir);
      if (!statSync(d).isDirectory()) continue;
      const sub: string[] = [];
      walkTs(d, sub);
      for (const f of sub) {
        // Split on either separator so RES_SKIP also matches back-slash paths.
        const base = f.split(/[\\/]/).pop() ?? '';
        if (!RES_SKIP.has(base)) out.push({ file: f, langs: [lang] });
      }
    }
  }
  // Shared, language-agnostic resolution files → full gated set via fileLanguages.
  for (const name of SHARED_RESOLUTION_FILES) {
    const f = join(INGESTION_DIR, name);
    if (existsSync(f)) out.push({ file: f, langs: fileLanguages(rel(f)) });
  }
  return out;
}

let _program: ts.Program | null = null;
let _checker: ts.TypeChecker | null = null;

/**
 * Builds (once) a TypeScript program over `rootFiles` using the compiler
 * options from `<REPO_ROOT>/tsconfig.json`, with `noEmit` and `skipLibCheck`
 * forced on.
 *
 * The program and checker are cached at module level; once built, later calls
 * return the cached pair and ignore `rootFiles`.
 *
 * @returns the program and its type checker, or null when building threw
 */
function buildProgram(
  rootFiles: string[],
): { program: ts.Program; checker: ts.TypeChecker } | null {
  if (_program && _checker) return { program: _program, checker: _checker };
  try {
    const cfg = ts.readConfigFile(join(REPO_ROOT, 'tsconfig.json'), ts.sys.readFile);
    const parsed = ts.parseJsonConfigFileContent(cfg.config ?? {}, ts.sys, REPO_ROOT);
    const options: ts.CompilerOptions = { ...parsed.options, noEmit: true, skipLibCheck: true };
    _program = ts.createProgram(rootFiles, options);
    _checker = _program.getTypeChecker();
    return { program: _program, checker: _checker };
  } catch {
    // The caller records the failure in `resolutionLayerProgramOk`.
    return null;
  }
}

/** True when `node`'s resolved type is (or includes) a tree-sitter SyntaxNode. */
function isSyntaxNodeReceiver(checker: ts.TypeChecker, node: ts.Node): boolean {
  try {
    // Decided on the printed type: any whole-word `SyntaxNode` in it counts.
    const s = checker.typeToString(checker.getTypeAtLocation(node));
    return /\bSyntaxNode\b/.test(s);
  } catch {
    return false;
  }
}

/** Did the build succeed? (false => mode degraded; surfaced so coverage isn't silently lost) */
export let resolutionLayerProgramOk = true;

/**
 * Mode 4 scan: the same literal shapes as the syntax-only scan, but a literal
 * is collected only when the TypeChecker resolves the relevant receiver to a
 * SyntaxNode (the `findNodeAtRange` 3rd argument is the one ungated shape).
 *
 * If the program cannot be built, sets `resolutionLayerProgramOk` to false and
 * returns empty lists.
 */
function collectResolutionLayerLiterals(): ScanResult {
  const nodeTypes: CollectedNodeType[] = [];
  const fields: CollectedField[] = [];
  const entries = resolutionLayerFiles();
  const built = buildProgram(entries.map((e) => e.file));
  if (!built) {
    resolutionLayerProgramOk = false;
    return { nodeTypes, fields };
  }
  const { program, checker } = built;

  for (const { file, langs } of entries) {
    const sf = program.getSourceFile(file);
    if (!sf) continue;
    const relPath = rel(file);
    const constMembers = new Map<string, NonNullable<ReturnType<typeof collectConstMembers>>>();
    const consumedSets = new Set<string>();

    const visit = (node: ts.Node): void => {
      if (ts.isVariableDeclaration(node) && ts.isIdentifier(node.name) && node.initializer) {
        const m = collectConstMembers(node.initializer);
        if (m) constMembers.set(node.name.text, m);
      }
      // `recv.type === 'lit'` — only when recv is a SyntaxNode
      if (
        ts.isBinaryExpression(node) &&
        (node.operatorToken.kind === ts.SyntaxKind.EqualsEqualsEqualsToken ||
          node.operatorToken.kind === ts.SyntaxKind.ExclamationEqualsEqualsToken)
      ) {
        const { left, right } = node;
        const lit = ts.isStringLiteral(left) ? left : ts.isStringLiteral(right) ? right : null;
        const dot = isDotType(left) ? left : isDotType(right) ? right : null;
        if (lit && dot && isSyntaxNodeReceiver(checker, dot.expression)) {
          nodeTypes.push({
            literal: lit.text,
            languages: langs,
            file: relPath,
            line: lineOf(sf, lit),
            source: 'compare',
          });
        }
      }
      if (ts.isCallExpression(node) && ts.isPropertyAccessExpression(node.expression)) {
        const method = node.expression.name.text;
        const arg0 = node.arguments[0];
        // recv.childForFieldName('field') on a SyntaxNode
        if (
          FIELD_LOOKUP_NAMES.has(method) &&
          arg0 &&
          ts.isStringLiteral(arg0) &&
          isSyntaxNodeReceiver(checker, node.expression.expression)
        ) {
          fields.push({
            field: arg0.text,
            languages: langs,
            file: relPath,
            line: lineOf(sf, arg0),
            receiverNodeType: receiverNodeTypeOf(node, sf),
          });
        }
        // SET.has(recv.type) where recv is a SyntaxNode
        if (
          MEMBERSHIP_NAMES.has(method) &&
          arg0 &&
          isDotType(arg0) &&
          ts.isIdentifier(node.expression.expression) &&
          isSyntaxNodeReceiver(checker, arg0.expression)
        ) {
          consumedSets.add(node.expression.expression.text);
        }
      }
      // findChild/findNamedChild/findSiblingChild(node, 'lit') 2nd arg, or
      // findNodeAtRange(_, _, 'lit') 3rd arg — node-type literals; gate recv.
      if (ts.isCallExpression(node)) {
        const callee = node.expression;
        const fname = ts.isIdentifier(callee)
          ? callee.text
          : ts.isPropertyAccessExpression(callee)
            ? callee.name.text
            : '';
        if (NODE_ARG_FNS.has(fname)) {
          const recv = node.arguments[0];
          const a1 = node.arguments[1];
          if (a1 && ts.isStringLiteral(a1) && recv && isSyntaxNodeReceiver(checker, recv)) {
            nodeTypes.push({
              literal: a1.text,
              languages: langs,
              file: relPath,
              line: lineOf(sf, a1),
              source: 'find-node-arg',
            });
          }
        } else if (fname === 'findNodeAtRange') {
          const a2 = node.arguments[2];
          if (a2 && ts.isStringLiteral(a2)) {
            nodeTypes.push({
              literal: a2.text,
              languages: langs,
              file: relPath,
              line: lineOf(sf, a2),
              source: 'find-node-arg',
            });
          }
        }
      }
      ts.forEachChild(node, visit);
    };
    visit(sf);

    for (const constName of consumedSets) {
      const members = constMembers.get(constName);
      if (!members) continue;
      // Same for every member of this const, so resolve it once.
      const memberLangs = constNameLanguages(constName, langs);
      for (const literal of members) {
        nodeTypes.push({
          literal,
          languages: memberLangs,
          file: relPath,
          line: 0, // the member's own position is not tracked
          source: 'set-member',
        });
      }
    }
  }
  return { nodeTypes, fields };
}

// ---------------------------------------------------------------------------
// Mode 3 — registry scope-query probes
// ---------------------------------------------------------------------------
/**
 * For every mapped `languages/<dir>/query.ts` whose grammar is available,
 * imports the module and calls each exported function whose name ends in
 * `ScopeQuery`, recording one probe per getter (`error: null` when the call
 * returned, the error text when it threw). A failing import is recorded as a
 * single probe with getter `(import)`.
 */
async function collectRegistryQueryProbes(): Promise<RegistryQueryProbe[]> {
  // Tolerates any thrown value, including null/undefined, when extracting a message.
  const describeError = (e: unknown): string =>
    String((e as { message?: unknown } | null | undefined)?.message ?? e);

  const out: RegistryQueryProbe[] = [];
  const langsDir = join(INGESTION_DIR, 'languages');
  if (!existsSync(langsDir)) return out;
  for (const dir of readdirSync(langsDir)) {
    if (dir === 'cobol') continue;
    const lang = DIR_LANG[dir];
    if (!lang) continue;
    const queryFile = join(langsDir, dir, 'query.ts');
    if (!existsSync(queryFile)) continue;
    // Importing query.ts loads the grammar at module top level — gate it.
    if (!isLanguageAvailable(lang)) continue;
    let mod: Record<string, unknown>;
    try {
      mod = (await import(queryFile)) as Record<string, unknown>;
    } catch (e) {
      out.push({ language: lang, getter: '(import)', error: describeError(e) });
      continue;
    }
    for (const [name, value] of Object.entries(mod)) {
      if (typeof value !== 'function' || !/ScopeQuery$/.test(name)) continue;
      try {
        (value as () => unknown)();
        out.push({ language: lang, getter: name, error: null });
      } catch (e) {
        out.push({ language: lang, getter: name, error: describeError(e) });
      }
    }
  }
  return out;
}

// ---------------------------------------------------------------------------
// Public entry point
// ---------------------------------------------------------------------------
/** Everything the collectors gather, merged across modes. */
export interface CollectedLiterals {
  nodeTypes: CollectedNodeType[];
  fields: CollectedField[];
  queryProbes: RegistryQueryProbe[];
}

/**
 * Runs every collector in sequence and merges the results: node types from the
 * config, in-code and resolution-layer collectors; fields from the in-code and
 * resolution-layer collectors; plus the registry scope-query probes.
 */
export async function collectAllLiterals(): Promise<CollectedLiterals> {
  const config = await collectConfigNodeTypes();
  const inCode = collectInCodeLiterals();
  const resolution = collectResolutionLayerLiterals(); // Mode 4 (TypeChecker-gated)
  const queryProbes = await collectRegistryQueryProbes();
  return {
    nodeTypes: [...config, ...inCode.nodeTypes, ...resolution.nodeTypes],
    fields: [...inCode.fields, ...resolution.fields],
    queryProbes,
  };
}

// Exposed for focused unit tests.
+export const __test = { + collectConfigNodeTypes, + collectInCodeLiterals, + collectResolutionLayerLiterals, + resolutionLayerFiles, + mode2Files, + fileLanguages, +}; diff --git a/gitnexus/test/integration/grammar-introspection.test.ts b/gitnexus/test/integration/grammar-introspection.test.ts new file mode 100644 index 0000000000..70194b0c33 --- /dev/null +++ b/gitnexus/test/integration/grammar-introspection.test.ts @@ -0,0 +1,206 @@ +import { describe, it, expect } from 'vitest'; +import Parser from 'tree-sitter'; +import { SupportedLanguages } from '../../src/config/supported-languages.js'; +import { + getLanguageGrammar, + isLanguageAvailable, +} from '../../src/core/tree-sitter/parser-loader.js'; +import { + GATED_LANGUAGES, + loadGrammarModel, + probeNodeType, + probeField, + validateNodeType, + validateField, + isNodeTypeError, + isFieldError, +} from '../helpers/grammar-introspection.js'; + +describe('grammar-introspection helper', () => { + describe('loadGrammarModel — membership set', () => { + it('builds named, anonymous, supertype node types and per-node fields for Python', () => { + const model = loadGrammarModel(SupportedLanguages.Python); + expect(model).not.toBeNull(); + // named node, anonymous token, and a supertype name are all members + expect(model!.nodeTypes.has('function_definition')).toBe(true); + expect(model!.nodeTypes.has('{')).toBe(true); + expect(model!.nodeTypes.has('expression')).toBe(true); + // per-node fields + const fields = model!.fieldsByNode.get('function_definition'); + expect(fields).toBeDefined(); + expect(fields!.has('name')).toBe(true); + expect(fields!.has('body')).toBe(true); + expect(fields!.has('parameters')).toBe(true); + expect(model!.allFields.has('name')).toBe(true); + }); + + it('unions typescript ∪ tsx so JSX-only nodes are members', () => { + const model = loadGrammarModel(SupportedLanguages.TypeScript); + expect(model).not.toBeNull(); + expect(model!.nodeTypes.has('jsx_element')).toBe(true); // tsx-only + 
expect(model!.nodeTypes.has('type_annotation')).toBe(true); // typescript + }); + + it('resolves PHP to the php_only variant (excludes embedded-HTML nodes)', () => { + const model = loadGrammarModel(SupportedLanguages.PHP); + expect(model).not.toBeNull(); + expect(model!.nodeTypes.has('function_definition')).toBe(true); + // text_interpolation exists only in the full `php` (embedded-HTML) grammar + expect(model!.nodeTypes.has('text_interpolation')).toBe(false); + }); + + it('excludes COBOL and never throws for any gated language', () => { + expect(GATED_LANGUAGES).not.toContain(SupportedLanguages.Cobol); + for (const lang of GATED_LANGUAGES) { + // returns a model (installed) or null (optional grammar absent) — never throws + expect(() => loadGrammarModel(lang)).not.toThrow(); + } + }); + }); + + describe('probeNodeType — live-grammar fallback', () => { + it('classifies an absent node type as dead and a real one as valid (Rust)', () => { + if (!isLanguageAvailable(SupportedLanguages.Rust)) return; + expect(probeNodeType(SupportedLanguages.Rust, 'method_call_expression')).toBe('dead'); + expect(probeNodeType(SupportedLanguages.Rust, 'call_expression')).toBe('valid'); + }); + + it('accepts an anonymous token via the "x" form (Python)', () => { + if (!isLanguageAvailable(SupportedLanguages.Python)) return; + expect(probeNodeType(SupportedLanguages.Python, '{')).toBe('valid'); + }); + + it('accepts a supertype via membership without needing a probe (Python)', () => { + const model = loadGrammarModel(SupportedLanguages.Python); + expect(validateNodeType(SupportedLanguages.Python, model, 'expression')).toBe('valid'); + }); + + it('classifies a bogus node type as dead for installed grammars (never just not-throw)', () => { + for (const lang of GATED_LANGUAGES) { + const verdict = probeNodeType(lang, 'definitely_not_a_node_type_xyz'); + // installed → an absent node type is 'dead'; uninstalled optional grammar → 'unavailable'. 
+ if (isLanguageAvailable(lang)) { + expect(verdict, `${lang} should classify a bogus node type as dead`).toBe('dead'); + } else { + expect(verdict).toBe('unavailable'); + } + } + }); + + it('distinguishes the null-model paths: validateField unavailable vs validateNodeType still probes', () => { + // validateField short-circuits to unavailable with no model (no grammar set). + expect(validateField(null, 'anything', 'some_node')).toBe('unavailable'); + // validateNodeType, by contrast, still probes the LIVE grammar when the model + // is null, so for an installed language a bogus node type is 'dead'. + if (isLanguageAvailable(SupportedLanguages.Python)) { + expect(validateNodeType(SupportedLanguages.Python, null, 'definitely_not_xyz')).toBe( + 'dead', + ); + } + }); + }); + + describe('isNodeTypeError — classifier self-test', () => { + it('matches the TSQueryErrorNodeType message and rejects valid queries', () => { + if (!isLanguageAvailable(SupportedLanguages.Rust)) return; + const grammar = getLanguageGrammar(SupportedLanguages.Rust) as ConstructorParameters< + typeof Parser.Query + >[0]; + let caught: unknown; + try { + // method_call_expression does not exist in tree-sitter-rust + new Parser.Query(grammar, '(method_call_expression) @_'); + } catch (e) { + caught = e; + } + expect(caught).toBeDefined(); + // If a future tree-sitter bump changes the wording, this fails loudly + // instead of silently passing every literal. 
+ expect(isNodeTypeError(caught)).toBe(true); + // a valid node type compiles without throwing + expect(() => new Parser.Query(grammar, '(call_expression) @_')).not.toThrow(); + }); + }); + + describe('validateField', () => { + it('passes a real node-scoped field and fails a non-existent one', () => { + const model = loadGrammarModel(SupportedLanguages.Python); + expect(validateField(model, 'name', 'function_definition')).toBe('valid'); + expect(validateField(model, 'nonexistent_field_xyz', 'function_definition')).toBe('dead'); + }); + + it('rescues a JSON-under-reported / supertype-permissive field via the probe (not a false positive)', () => { + // C# `parameter` has no `pattern` field (TSQueryErrorStructure), but + // `binary_expression` accepts `pattern` through its supertype-typed slots, + // so the probe compiles and validateField must NOT flag it dead. This pins + // the conservative-toward-valid direction: a membership miss falls through + // to the probe, never straight to dead. 
+ if (!isLanguageAvailable(SupportedLanguages.CSharp)) return; + const model = loadGrammarModel(SupportedLanguages.CSharp); + expect(validateField(model, 'pattern', 'parameter')).toBe('dead'); // structurally impossible + expect(validateField(model, 'pattern', 'binary_expression')).toBe('valid'); // probe-rescued + }); + }); + + describe('probeField — conservative node-scoped field oracle', () => { + it('classifies a structurally-impossible field as dead (C# parameter/pattern)', () => { + if (!isLanguageAvailable(SupportedLanguages.CSharp)) return; + // (parameter pattern: (_)) throws TSQueryErrorStructure + expect(probeField(SupportedLanguages.CSharp, 'parameter', 'pattern')).toBe('dead'); + // an unknown field name throws TSQueryErrorField + expect(probeField(SupportedLanguages.CSharp, 'parameter', 'total_garbage_field')).toBe( + 'dead', + ); + // a real field compiles + expect(probeField(SupportedLanguages.CSharp, 'parameter', 'type')).toBe('valid'); + }); + + it('returns unavailable (not dead) when the node type is absent in the grammar', () => { + if (!isLanguageAvailable(SupportedLanguages.Java)) return; + // `parameter` is not a Java node (Java uses `formal_parameter`) → NodeType error + // → unavailable, so multi-language ANY-semantics can defer to the right grammar. + expect(probeField(SupportedLanguages.Java, 'parameter', 'name')).toBe('unavailable'); + }); + + it('is conservative-toward-valid for supertype-typed fields (never false-positive)', () => { + if (!isLanguageAvailable(SupportedLanguages.CSharp)) return; + // `binary_expression` has no `pattern` field, but its supertype-typed slots + // make the query compile → valid. The probe errs toward valid by design. 
+ expect(probeField(SupportedLanguages.CSharp, 'binary_expression', 'pattern')).toBe('valid'); + }); + + it('never throws for any gated language', () => { + for (const lang of GATED_LANGUAGES) { + expect(() => probeField(lang, 'some_node', 'some_field')).not.toThrow(); + } + }); + }); + + describe('isFieldError — classifier self-test', () => { + it('matches TSQueryErrorStructure and TSQueryErrorField but not NodeType', () => { + if (!isLanguageAvailable(SupportedLanguages.CSharp)) return; + const grammar = getLanguageGrammar(SupportedLanguages.CSharp) as ConstructorParameters< + typeof Parser.Query + >[0]; + const grab = (q: string): unknown => { + try { + new Parser.Query(grammar, q); + return undefined; + } catch (e) { + return e; + } + }; + const structureErr = grab('(parameter pattern: (_)) @_'); // TSQueryErrorStructure + const fieldErr = grab('(parameter total_garbage_field: (_)) @_'); // TSQueryErrorField + const nodeTypeErr = grab('(nonexistent_node_xyz) @_'); // TSQueryErrorNodeType + expect(structureErr).toBeDefined(); + expect(fieldErr).toBeDefined(); + expect(nodeTypeErr).toBeDefined(); + expect(isFieldError(structureErr)).toBe(true); + expect(isFieldError(fieldErr)).toBe(true); + // a node-type error is NOT a field error (it routes to `unavailable`, not `dead`) + expect(isFieldError(nodeTypeErr)).toBe(false); + expect(isNodeTypeError(nodeTypeErr)).toBe(true); + }); + }); +}); diff --git a/gitnexus/test/integration/grammar-literal-validation.test.ts b/gitnexus/test/integration/grammar-literal-validation.test.ts new file mode 100644 index 0000000000..ffba56b6c6 --- /dev/null +++ b/gitnexus/test/integration/grammar-literal-validation.test.ts @@ -0,0 +1,160 @@ +import { describe, it, expect, beforeAll } from 'vitest'; +import { SupportedLanguages } from '../../src/config/supported-languages.js'; +import { + GATED_LANGUAGES, + loadGrammarModel, + validateNodeType, + validateField, + type GrammarModel, +} from '../helpers/grammar-introspection.js'; +import { 
+ collectAllLiterals, + resolutionLayerProgramOk, + type CollectedLiterals, +} from '../helpers/literal-collectors.js'; + +/** + * Grammar-drift gate (issue #1920): every tree-sitter node-type and field-name + * literal referenced in the ingestion CODE must be emittable by at least one of + * the grammar(s) that code path serves. A literal absent from every candidate + * grammar is a "dead branch keyed on a node type the grammar never emits" — + * the systemic defect this gate kills. + * + * Complements query-compilation.test.ts (which compiles the legacy *_QUERIES + * banks): this gate covers the NON-compiled literal surface (node.type ===, + * childForFieldName, Set/array node-type lists) plus the registry scope queries. + */ + +// Empty by design: every dead grammar literal this gate surfaces is removed in +// this PR — no allowlisted debt. Mirrors query-compilation.test.ts:40. Keep it +// empty; fix the literal at its source rather than allowlisting it here. +const knownFailures = new Set([]); + +interface Failure { + kind: 'node-type' | 'field' | 'query'; + literal: string; + file: string; + line: number; + languages: string[]; +} + +const fmt = (f: Failure): string => + `${f.kind} "${f.literal}" — ${f.file}:${f.line} — not valid in [${f.languages.join(', ')}]`; + +describe('grammar literal validation gate', () => { + let collected: CollectedLiterals; + const models = new Map(); + + beforeAll(async () => { + for (const lang of GATED_LANGUAGES) models.set(lang, loadGrammarModel(lang)); + collected = await collectAllLiterals(); + }, 120_000); + + /** + * "valid" if ANY candidate grammar accepts it; "dead" if at least one + * candidate rejects it and none accept; "unavailable" if every candidate + * grammar is absent (so we skip rather than fail — R9). 
+ */ + function classify( + languages: SupportedLanguages[], + check: (lang: SupportedLanguages) => 'valid' | 'dead' | 'unavailable', + ): 'valid' | 'dead' | 'unavailable' { + let sawDead = false; + for (const lang of languages) { + const r = check(lang); + if (r === 'valid') return 'valid'; + if (r === 'dead') sawDead = true; + } + return sawDead ? 'dead' : 'unavailable'; + } + + it('every node-type and field literal exists in its grammar; registry queries compile', () => { + const failures: Failure[] = []; + + for (const n of collected.nodeTypes) { + if (knownFailures.has(n.literal)) continue; + const verdict = classify(n.languages, (lang) => + validateNodeType(lang, models.get(lang) ?? null, n.literal), + ); + if (verdict === 'dead') { + failures.push({ + kind: 'node-type', + literal: n.literal, + file: n.file, + line: n.line, + languages: n.languages, + }); + } + } + + for (const f of collected.fields) { + if (knownFailures.has(f.field)) continue; + const verdict = classify(f.languages, (lang) => + validateField(models.get(lang) ?? null, f.field, f.receiverNodeType), + ); + if (verdict === 'dead') { + failures.push({ + kind: 'field', + literal: f.field, + file: f.file, + line: f.line, + languages: f.languages, + }); + } + } + + for (const q of collected.queryProbes) { + if (q.error) { + failures.push({ + kind: 'query', + literal: `${q.getter} (${q.error})`, + file: `languages/${q.language}/query.ts`, + line: 0, + languages: [q.language], + }); + } + } + + // De-dup identical (kind, literal, file) rows for a readable report. + const seen = new Set(); + const unique = failures.filter((f) => { + const k = `${f.kind}|${f.literal}|${f.file}`; + if (seen.has(k)) return false; + seen.add(k); + return true; + }); + + const report = + unique.length === 0 + ? 
'' + : `\n${unique.length} dead grammar literal(s) found:\n` + + unique + .slice() + .sort((a, b) => a.file.localeCompare(b.file)) + .map((f) => ` - ${fmt(f)}`) + .join('\n') + + '\n'; + + expect(unique, report).toHaveLength(0); + }, 120_000); + + it('runs non-vacuously: collector populated and the Mode-4 resolution layer built', () => { + // A vacuous pass — empty collection, or a degraded TS-program build that + // silently zeroes Mode-4 — must FAIL the gate rather than slip through green. + // (#1937 tri-review: Mode-4 silent-degrade + gate-vacuity holes.) + expect(resolutionLayerProgramOk, 'Mode-4 TypeScript program failed to build').toBe(true); + expect(collected.nodeTypes.length, 'collector returned too few node types').toBeGreaterThan(50); + expect(collected.fields.length, 'collector returned too few fields').toBeGreaterThan(50); + expect(knownFailures.size, 'knownFailures must stay empty per policy').toBe(0); + }); + + it('does not flag capture-tag strings', () => { + expect(collected.nodeTypes.some((n) => n.literal.startsWith('@'))).toBe(false); + }); + + it('validates a real node-scoped field and rejects a bogus one (Python)', () => { + const model = loadGrammarModel(SupportedLanguages.Python); + expect(validateField(model, 'name', 'function_definition')).toBe('valid'); + expect(validateField(model, 'definitely_not_a_field', 'function_definition')).toBe('dead'); + }); +}); diff --git a/gitnexus/test/integration/literal-collectors.test.ts b/gitnexus/test/integration/literal-collectors.test.ts new file mode 100644 index 0000000000..9715cdb188 --- /dev/null +++ b/gitnexus/test/integration/literal-collectors.test.ts @@ -0,0 +1,119 @@ +import { describe, it, expect } from 'vitest'; +import { SupportedLanguages } from '../../src/config/supported-languages.js'; +import { + collectAllLiterals, + __test, + type CollectedNodeType, +} from '../helpers/literal-collectors.js'; + +const hasNodeType = ( + list: CollectedNodeType[], + literal: string, + lang?: 
SupportedLanguages, +): boolean => + list.some((n) => n.literal === literal && (lang === undefined || n.languages.includes(lang))); + +describe('literal-collectors', () => { + describe('Mode 1 — config reflection', () => { + it('splits c-cpp configs by language', async () => { + const { nodeTypes } = await collectAllLiterals(); + const config = nodeTypes.filter((n) => n.source === 'config'); + expect(hasNodeType(config, 'struct_specifier', SupportedLanguages.C)).toBe(true); + expect(hasNodeType(config, 'class_specifier', SupportedLanguages.CPlusPlus)).toBe(true); + }); + }); + + describe('Mode 2 — AST scan over the extraction surface', () => { + const { nodeTypes, fields } = __test.collectInCodeLiterals(); + + it('collects literals that live OUTSIDE configs/ (the surface fix)', () => { + // direct `.type ===` in a single-language type-extractor (a valid, kept literal) + expect(hasNodeType(nodeTypes, 'call_expression', SupportedLanguages.Rust)).toBe(true); + // a literal inside a per-language captures.ts (valid, kept) + expect(hasNodeType(nodeTypes, 'reference_declarator', SupportedLanguages.CPlusPlus)).toBe( + true, + ); + }); + + it('collects members of a Set consumed against node.type (RUBY_METHOD_NODE_TYPES)', () => { + const setMembers = nodeTypes.filter((n) => n.source === 'set-member'); + expect(hasNodeType(setMembers, 'singleton_method', SupportedLanguages.Ruby)).toBe(true); + }); + + it('collects export-detection.ts language-named set members tagged by const prefix', () => { + // CSHARP_DECL_TYPES is consumed via `.has(node.type)`; a valid, kept member + expect(hasNodeType(nodeTypes, 'record_declaration', SupportedLanguages.CSharp)).toBe(true); + }); + + it('B1 guard: semantic type-name sets are NOT collected as node types', () => { + // PRIMITIVE_TYPES / NULLABLE_WRAPPER_TYPES are consumed via .has(text) / + // .has(name), never .has(node.type), so their members must never appear. 
+ expect(hasNodeType(nodeTypes, 'i32')).toBe(false); + expect(hasNodeType(nodeTypes, 'usize')).toBe(false); + expect(hasNodeType(nodeTypes, 'Optional')).toBe(false); + }); + + it('collects field literals and never collects capture-tag strings as node types', () => { + expect(fields.length).toBeGreaterThan(0); + // capture tags start with '@' and are compared by name/role, never as node types + expect(nodeTypes.some((n) => n.literal.startsWith('@'))).toBe(false); + }); + + it('captures the receiver node type from a positive type-guard; leaves ungated lookups unscoped', () => { + // if (node.type === 'is_pattern_expression') { ... node.childForFieldName('pattern') } + const scoped = fields.find( + (f) => + f.field === 'pattern' && + f.receiverNodeType === 'is_pattern_expression' && + f.file.endsWith('type-extractors/csharp.ts'), + ); + expect(scoped).toBeDefined(); + // a childForFieldName NOT inside a single positive type-guard stays unscoped + // (receiverNodeType undefined) → the gate uses the sound global field check. + const unscoped = fields.find((f) => f.receiverNodeType === undefined); + expect(unscoped).toBeDefined(); + }); + + it('does not scan the COBOL or resolution layer', () => { + expect(nodeTypes.some((n) => n.file.includes('cobol'))).toBe(false); + // resolution-layer files (where .type is a resolved-symbol kind) are excluded + expect(nodeTypes.some((n) => n.file.endsWith('call-processor.ts'))).toBe(false); + expect(nodeTypes.some((n) => n.file.endsWith('type-env.ts'))).toBe(false); + }); + }); + + describe('Mode 4 — registry resolution layer (TypeChecker-gated)', () => { + it('scans the resolution layer and tags literals by language dir', () => { + const { nodeTypes } = __test.collectResolutionLayerLiterals(); + // The TS Program must have built (else coverage is silently lost). 
+ expect(nodeTypes.length).toBeGreaterThan(0); + // a real cpp resolution-layer node type (arity-metadata.ts) tagged C++ + expect(hasNodeType(nodeTypes, 'parameter_declaration', SupportedLanguages.CPlusPlus)).toBe( + true, + ); + // discriminator: resolution-layer literals are grammar nodes (snake_case / + // anonymous), never resolved-symbol PascalCase kinds like 'Class'/'Struct'. + expect(nodeTypes.some((n) => /^[A-Z]/.test(n.literal))).toBe(false); + }); + + it('scans shared resolution files (type-env.ts) tagged with the full language set', () => { + const { nodeTypes } = __test.collectResolutionLayerLiterals(); + const typeEnv = nodeTypes.filter((n) => n.file.endsWith('type-env.ts')); + expect(typeEnv.length).toBeGreaterThan(0); + // shared (non-languages//) file → tagged with the full gated set + // (valid-if-any), not a single language. + expect(typeEnv.every((n) => n.languages.length > 1)).toBe(true); + }); + }); + + describe('Mode 3 — registry scope-query probes', () => { + it('probes available languages registry scope queries', async () => { + const { queryProbes } = await collectAllLiterals(); + expect(queryProbes.length).toBeGreaterThan(0); + // every probe carries a language + getter name + for (const p of queryProbes) { + expect(p.getter).toBeTruthy(); + } + }); + }); +}); diff --git a/gitnexus/test/integration/parsing.test.ts b/gitnexus/test/integration/parsing.test.ts index 501cfaced6..9f9079c180 100644 --- a/gitnexus/test/integration/parsing.test.ts +++ b/gitnexus/test/integration/parsing.test.ts @@ -697,8 +697,12 @@ describe('parsing', () => { it('record_struct with public modifier is exported', () => { const modifier = mockNode('modifier', 'public'); const nameNode = mockNode('identifier', 'Coord'); + // tree-sitter-c-sharp emits `record_declaration` for `record`, `record + // struct`, and `record class` alike (verified via real parse, #1920) — + // there is no separate `record_struct_declaration` node type, so the + // export check sees a 
`record_declaration`. const recStruct = mockNode( - 'record_struct_declaration', + 'record_declaration', 'public record struct Coord {}', undefined, [modifier, nameNode], @@ -709,8 +713,10 @@ describe('parsing', () => { it('record_class with public modifier is exported', () => { const modifier = mockNode('modifier', 'public'); const nameNode = mockNode('identifier', 'UserRecord'); + // `record class` also parses to `record_declaration` (see note above); + // there is no `record_class_declaration` node in tree-sitter-c-sharp. const recClass = mockNode( - 'record_class_declaration', + 'record_declaration', 'public record class UserRecord {}', undefined, [modifier, nameNode], diff --git a/gitnexus/test/unit/extract-generic-type-args.test.ts b/gitnexus/test/unit/extract-generic-type-args.test.ts index 7c03ba6477..5366e3a1a8 100644 --- a/gitnexus/test/unit/extract-generic-type-args.test.ts +++ b/gitnexus/test/unit/extract-generic-type-args.test.ts @@ -1,5 +1,8 @@ import { describe, it, expect } from 'vitest'; +import Parser from 'tree-sitter'; import { extractGenericTypeArgs } from '../../src/core/ingestion/type-extractors/shared.js'; +import { getLanguageGrammar } from '../../src/core/tree-sitter/parser-loader.js'; +import { SupportedLanguages } from '../../src/config/supported-languages.js'; import type { SyntaxNode } from '../../src/core/ingestion/utils/ast-helpers.js'; /** @@ -112,19 +115,108 @@ describe('extractGenericTypeArgs', () => { }); }); - describe('parameterized_type (Java/Kotlin alternate node type)', () => { - it('extracts type arguments from parameterized_type', () => { - const baseNode = mockNode('type_identifier', { text: 'List' }); - const argNode = mockNode('type_identifier', { text: 'User' }); - const typeArgsNode = mockNode('type_arguments', { - namedChildren: [argNode], - }); - const node = mockNode('parameterized_type', { - namedChildren: [baseNode, typeArgsNode], - fields: { name: baseNode }, + // Ground the extractor against the REAL node types 
each shipped grammar emits
+  // for a generic — no mocks. This is what catches grammar drift / wrong guesses
+  // (e.g. the never-emitted `parameterized_type` the extractor used to special-
+  // case): Java/TypeScript/Rust → generic_type, C# → generic_name, Kotlin →
+  // user_type (List<User> → user_type > [type_identifier, type_arguments]). #1920
+  describe('real grammar generic types (parsed, not mocked)', () => {
+    // Return the last node in pre-order whose text is exactly `typeText`; throw when none parses.
+    function parseTypeNode(
+      lang: SupportedLanguages,
+      file: string,
+      code: string,
+      typeText: string,
+    ): SyntaxNode {
+      const parser = new Parser();
+      parser.setLanguage(getLanguageGrammar(lang, file) as Parameters<Parser['setLanguage']>[0]);
+      const tree = parser.parse(code);
+      let best: SyntaxNode | null = null;
+      const walk = (n: SyntaxNode): void => {
+        if (n.text === typeText && (best === null || n.text.length <= best.text.length)) best = n;
+        for (let i = 0; i < n.childCount; i++) {
+          const c = n.child(i);
+          if (c) walk(c as unknown as SyntaxNode);
+        }
+      };
+      walk(tree.rootNode as unknown as SyntaxNode);
+      if (best === null) throw new Error(`no node with text "${typeText}" parsed for ${lang}`);
+      return best;
+    }
+
+    const cases: Array<{
+      lang: SupportedLanguages;
+      file: string;
+      code: string;
+      typeText: string;
+      expected: string[];
+    }> = [
+      {
+        lang: SupportedLanguages.Java,
+        file: 'C.java',
+        code: 'class C { List<User> f; }',
+        typeText: 'List<User>',
+        expected: ['User'],
+      },
+      {
+        lang: SupportedLanguages.TypeScript,
+        file: 'c.ts',
+        code: 'let f: Array<User>;',
+        typeText: 'Array<User>',
+        expected: ['User'],
+      },
+      {
+        lang: SupportedLanguages.CSharp,
+        file: 'C.cs',
+        code: 'class C { List<User> f; }',
+        typeText: 'List<User>',
+        expected: ['User'],
+      },
+      {
+        lang: SupportedLanguages.Rust,
+        file: 'c.rs',
+        code: 'struct C { f: Vec<User> }',
+        typeText: 'Vec<User>',
+        expected: ['User'],
+      },
+      {
+        lang: SupportedLanguages.Kotlin,
+        file: 'C.kt',
+        code: 'class C { val f: List<User> = x }',
+        typeText: 'List<User>',
+        expected: ['User'],
+      },
+      {
+        lang: SupportedLanguages.Java,
+        file: 'C.java',
+        code: 'class C { Map<String, User> f; }',
+        typeText: 'Map<String, User>',
+        expected: ['String', 'User'],
+      },
+      {
+        // Kotlin multi-arg through user_type > type_arguments > type_projection.
+        lang: SupportedLanguages.Kotlin,
+        file: 'C.kt',
+        code: 'class C { val f: Map<String, User> = x }',
+        typeText: 'Map<String, User>',
+        expected: ['String', 'User'],
+      },
+      {
+        // C# multi-arg through generic_name > type_argument_list.
+        lang: SupportedLanguages.CSharp,
+        file: 'C.cs',
+        code: 'class C { Dictionary<string, User> f; }',
+        typeText: 'Dictionary<string, User>',
+        expected: ['string', 'User'],
+      },
+    ];
+
+    for (const { lang, file, code, typeText, expected } of cases) {
+      it(`captures [${expected.join(', ')}] from a real ${lang} \`${typeText}\``, () => {
+        const node = parseTypeNode(lang, file, code, typeText);
+        expect(extractGenericTypeArgs(node)).toEqual(expected);
       });
-      expect(extractGenericTypeArgs(node)).toEqual(['User']);
-    });
+    }
   });
 
   describe('wrapper node unwrapping', () => {
diff --git a/gitnexus/test/unit/java-call-arity.test.ts b/gitnexus/test/unit/java-call-arity.test.ts
new file mode 100644
index 0000000000..7ed49a847e
--- /dev/null
+++ b/gitnexus/test/unit/java-call-arity.test.ts
@@ -0,0 +1,39 @@
+import { describe, expect, it } from 'vitest';
+import { emitJavaScopeCaptures } from '../../src/core/ingestion/languages/java/captures.js';
+
+/** Return the `@reference.arity` of the call named `callName`, or undefined.
*/
+function arityOf(source: string, callName: string): string | undefined {
+  // Flatten every capture match into a plain `tag → captured text` record up front.
+  const flattened = emitJavaScopeCaptures(source, 'Fixture.java').map((match) => {
+    const textPairs = Object.entries(match).map(([tag, cap]) => [tag, cap.text] as const);
+    return Object.fromEntries(textPairs);
+  });
+  // The first record that names `callName` AND carries an arity capture wins.
+  for (const captured of flattened) {
+    const arity = captured['@reference.arity'];
+    if (captured['@reference.name'] === callName && arity !== undefined) {
+      return arity;
+    }
+  }
+  return undefined;
+}
+
+// In Java argument lists, `block_comment` / `line_comment` nodes sit between the
+// real arguments. Arity feeds call-processor symbol-ID generation, so those
+// comment nodes must not be counted. The `comment` literal that was removed never
+// matched, because the grammar only emits `block_comment` / `line_comment`.
+// (#1920 / PR #1937 tri-review)
+describe('Java call arity excludes interleaved comments', () => {
+  // One row per scenario; the loop below registers them in this order.
+  const scenarios: Array<{ title: string; java: string; callee: string; arity: string }> = [
+    {
+      title: 'ignores a block comment between arguments',
+      java: 'class A { void m(){ foo(a, /* x */ b, c); } }',
+      callee: 'foo',
+      arity: '3',
+    },
+    {
+      title: 'ignores a line comment between arguments',
+      java: 'class A { void m(){ foo(a, // hi\n b); } }',
+      callee: 'foo',
+      arity: '2',
+    },
+    {
+      title: 'ignores a leading block comment on the first argument',
+      java: 'class A { void m(){ foo(/* lead */ a); } }',
+      callee: 'foo',
+      arity: '1',
+    },
+    {
+      title: 'excludes comments in a constructor (object_creation_expression) call',
+      java: 'class A { void m(){ new Bar(a, /*c*/ b); } }',
+      callee: 'Bar',
+      arity: '2',
+    },
+    {
+      title: 'regression: a comment-free call counts normally',
+      java: 'class A { void m(){ foo(a, b, c); } }',
+      callee: 'foo',
+      arity: '3',
+    },
+  ];
+
+  for (const { title, java, callee, arity } of scenarios) {
+    it(title, () => {
+      expect(arityOf(java, callee)).toBe(arity);
+    });
+  }
+});