From af1ed13bd3fa12363ffcab3456a180a91b6fec75 Mon Sep 17 00:00:00 2001 From: Grouchy Date: Thu, 5 Mar 2026 17:18:12 -0800 Subject: [PATCH] feat(python): index module-level singleton instances as CodeElement nodes - Add definition.instance query to PYTHON_QUERIES, anchored to module scope with call on right-hand side - Add definition.instance to DEFINITION_CAPTURE_KEYS and label maps in both sequential (parsing-processor.ts) and worker (parse-worker.ts) paths - Compose with #137: instances are now linkable targets for File->Symbol IMPORTS edges created by symbol-level import resolution --- .../src/core/ingestion/import-processor.ts | 80 ++++++++++++++++++- .../src/core/ingestion/parsing-processor.ts | 2 + gitnexus/src/core/ingestion/pipeline.ts | 4 +- .../src/core/ingestion/tree-sitter-queries.ts | 7 ++ .../core/ingestion/workers/parse-worker.ts | 49 ++++++++++++ 5 files changed, 139 insertions(+), 3 deletions(-) diff --git a/gitnexus/src/core/ingestion/import-processor.ts b/gitnexus/src/core/ingestion/import-processor.ts index 990f968afd..b80cde3d79 100644 --- a/gitnexus/src/core/ingestion/import-processor.ts +++ b/gitnexus/src/core/ingestion/import-processor.ts @@ -2,6 +2,7 @@ import fs from 'fs/promises'; import path from 'path'; import { KnowledgeGraph } from '../graph/types.js'; import { ASTCache } from './ast-cache.js'; +import { SymbolTable } from './symbol-table.js'; import Parser from 'tree-sitter'; import { loadParser, loadLanguage } from '../tree-sitter/parser-loader.js'; import { LANGUAGE_QUERIES } from './tree-sitter-queries.js'; @@ -728,6 +729,7 @@ export const processImports = async ( onProgress?: (current: number, total: number) => void, repoRoot?: string, allPaths?: string[], + symbolTable?: SymbolTable, ) => { // Use allPaths (full repo) when available for cross-chunk resolution, else fall back to chunk files const allFileList = allPaths ?? 
files.map(f => f.path); @@ -773,6 +775,57 @@ export const processImports = async ( importMap.get(filePath)!.add(resolvedPath); }; + // Helper: add symbol-level IMPORTS edges for named imports + const addSymbolImportEdges = (filePath: string, resolvedPath: string, symbolNames?: string[]) => { + if (!symbolNames || !symbolTable) return; + const sourceId = generateId('File', filePath); + for (const name of symbolNames) { + const targetNodeId = symbolTable.lookupExact(resolvedPath, name); + if (!targetNodeId) continue; + const relId = generateId('IMPORTS', `${filePath}:${name}->${resolvedPath}`); + graph.addRelationship({ + id: relId, + sourceId, + targetId: targetNodeId, + type: 'IMPORTS', + confidence: 1.0, + reason: '', + }); + } + }; + + // Helper: extract imported symbol names from AST node (for sequential path). Python: `module_name` is a field of import_from_statement, not a node type, so the module node is skipped by position; plain `import X` names no symbols. + const extractSymbolNames = (importNode: any, language: string): string[] => { + const names: string[] = []; + if (language === SupportedLanguages.Python) { + for (const child of importNode.type === 'import_from_statement' ? importNode.namedChildren : []) { + if (child.startIndex === importNode.childForFieldName?.('module_name')?.startIndex) continue; + if (child.type === 'wildcard_import') continue; + if (child.type === 'dotted_name' || child.type === 'identifier') { + names.push(child.text); + } else if (child.type === 'aliased_import') { + const nameNode = child.childForFieldName?.('name') || child.namedChildren?.[0]; + if (nameNode) names.push(nameNode.text); + } + } + return names; + } + if (language === SupportedLanguages.TypeScript || language === SupportedLanguages.JavaScript) { + const importClause = importNode.namedChildren?.find((c: any) => c.type === 'import_clause'); + const namedImports = importClause?.namedChildren?.find((c: any) => c.type === 'named_imports'); + if (namedImports) { + for (const spec of namedImports.namedChildren) { + if (spec.type === 'import_specifier') { + const nameNode = spec.childForFieldName?.('name'); + if (nameNode) names.push(nameNode.text); + } + } + } + return names; + } + return names; + }; + for (let i = 0; i
< files.length; i++) { const file = files[i]; onProgress?.(i + 1, files.length); @@ -844,6 +897,9 @@ export const processImports = async ( : sourceNode.text.replace(/['"<>]/g, ''); totalImportsFound++; + // Extract imported symbol names for symbol-level edges + const symbolNames = extractSymbolNames(captureMap['import'], language); + // ---- JVM languages (Java + Kotlin): handle wildcards and member imports ---- if (language === SupportedLanguages.Java || language === SupportedLanguages.Kotlin) { const exts = language === SupportedLanguages.Java ? ['.java'] : KOTLIN_EXTENSIONS; @@ -932,6 +988,7 @@ export const processImports = async ( if (resolvedPath) { addImportEdge(file.path, resolvedPath); + addSymbolImportEdges(file.path, resolvedPath, symbolNames); } } }); @@ -956,6 +1013,7 @@ export const processImportsFromExtracted = async ( onProgress?: (current: number, total: number) => void, repoRoot?: string, prebuiltCtx?: ImportResolutionContext, + symbolTable?: SymbolTable, ) => { const ctx = prebuiltCtx ?? 
buildImportResolutionContext(files.map(f => f.path)); const { allFilePaths, allFileList, normalizedFileList, suffixIndex: index, resolveCache } = ctx; @@ -991,6 +1049,25 @@ export const processImportsFromExtracted = async ( importMap.get(filePath)!.add(resolvedPath); }; + // Helper: add symbol-level IMPORTS edges for named imports + const addSymbolImportEdges = (filePath: string, resolvedPath: string, symbolNames?: string[]) => { + if (!symbolNames || !symbolTable) return; + const sourceId = generateId('File', filePath); + for (const name of symbolNames) { + const targetNodeId = symbolTable.lookupExact(resolvedPath, name); + if (!targetNodeId) continue; + const relId = generateId('IMPORTS', `${filePath}:${name}->${resolvedPath}`); + graph.addRelationship({ + id: relId, + sourceId, + targetId: targetNodeId, + type: 'IMPORTS', + confidence: 1.0, + reason: '', + }); + } + }; + // Group by file for progress reporting (users see file count, not import count) const importsByFile = new Map(); for (const imp of extractedImports) { @@ -1027,7 +1104,7 @@ export const processImportsFromExtracted = async ( await yieldToEventLoop(); } - for (const { rawImportPath, language } of fileImports) { + for (const { rawImportPath, language, symbolNames } of fileImports) { totalImportsFound++; // Check resolve cache first @@ -1120,6 +1197,7 @@ export const processImportsFromExtracted = async ( if (resolvedPath) { addImportEdge(filePath, resolvedPath); + addSymbolImportEdges(filePath, resolvedPath, symbolNames); } } } diff --git a/gitnexus/src/core/ingestion/parsing-processor.ts b/gitnexus/src/core/ingestion/parsing-processor.ts index ae0e7026f2..d6125e2cce 100644 --- a/gitnexus/src/core/ingestion/parsing-processor.ts +++ b/gitnexus/src/core/ingestion/parsing-processor.ts @@ -42,6 +42,7 @@ const DEFINITION_CAPTURE_KEYS = [ 'definition.annotation', 'definition.constructor', 'definition.template', + 'definition.instance', ] as const; const getDefinitionNodeFromCaptures = (captureMap: 
Record): any | null => { @@ -365,6 +366,7 @@ const processParsingSequential = async ( else if (captureMap['definition.annotation']) nodeLabel = 'Annotation'; else if (captureMap['definition.constructor']) nodeLabel = 'Constructor'; else if (captureMap['definition.template']) nodeLabel = 'Template'; + else if (captureMap['definition.instance']) nodeLabel = 'CodeElement'; const definitionNodeForRange = getDefinitionNodeFromCaptures(captureMap); const startLine = definitionNodeForRange ? definitionNodeForRange.startPosition.row : (nameNode ? nameNode.startPosition.row : 0); diff --git a/gitnexus/src/core/ingestion/pipeline.ts b/gitnexus/src/core/ingestion/pipeline.ts index f067eaccc4..6dba2355f8 100644 --- a/gitnexus/src/core/ingestion/pipeline.ts +++ b/gitnexus/src/core/ingestion/pipeline.ts @@ -192,7 +192,7 @@ export const runPipelineFromRepo = async ( if (chunkWorkerData) { // Imports - await processImportsFromExtracted(graph, allPathObjects, chunkWorkerData.imports, importMap, undefined, repoPath, importCtx); + await processImportsFromExtracted(graph, allPathObjects, chunkWorkerData.imports, importMap, undefined, repoPath, importCtx, symbolTable); // Calls — resolve immediately, then free the array if (chunkWorkerData.calls.length > 0) { await processCallsFromExtracted(graph, chunkWorkerData.calls, symbolTable, importMap); @@ -206,7 +206,7 @@ export const runPipelineFromRepo = async ( await processRoutesFromExtracted(graph, chunkWorkerData.routes, symbolTable, importMap); } } else { - await processImports(graph, chunkFiles, astCache, importMap, undefined, repoPath, allPaths); + await processImports(graph, chunkFiles, astCache, importMap, undefined, repoPath, allPaths, symbolTable); sequentialChunkPaths.push(chunkPaths); } diff --git a/gitnexus/src/core/ingestion/tree-sitter-queries.ts b/gitnexus/src/core/ingestion/tree-sitter-queries.ts index 7eeeb73e05..d88ed5ae5b 100644 --- a/gitnexus/src/core/ingestion/tree-sitter-queries.ts +++ 
b/gitnexus/src/core/ingestion/tree-sitter-queries.ts @@ -141,6 +141,13 @@ export const PYTHON_QUERIES = ` function: (attribute attribute: (identifier) @call.name)) @call +; Module-level singleton instances: service = ServiceClass() +(module + (expression_statement + (assignment + left: (identifier) @name + right: (call))) @definition.instance) + ; Heritage queries - Python class inheritance (class_definition name: (identifier) @heritage.class diff --git a/gitnexus/src/core/ingestion/workers/parse-worker.ts b/gitnexus/src/core/ingestion/workers/parse-worker.ts index 2e9890b7b6..826282c9fb 100644 --- a/gitnexus/src/core/ingestion/workers/parse-worker.ts +++ b/gitnexus/src/core/ingestion/workers/parse-worker.ts @@ -63,6 +63,7 @@ export interface ExtractedImport { filePath: string; rawImportPath: string; language: string; + symbolNames?: string[]; } export interface ExtractedCall { @@ -280,6 +281,47 @@ const FUNCTION_NODE_TYPES = new Set([ 'init_declaration', 'deinit_declaration', ]); +/** Extract the specific symbol names from an import AST node. + * Python: `from X import foo, bar` → ['foo', 'bar'] + * JS/TS: `import { foo, bar } from 'X'` → ['foo', 'bar'] + * Returns empty array for bare module imports or unsupported patterns. 
*/ +const extractImportedSymbolNames = (importNode: any, language: string): string[] => { + const names: string[] = []; + + if (language === SupportedLanguages.Python) { + // `module_name` is a FIELD of import_from_statement, not a node type: its node is a dotted_name or relative_import, so skip it by position. Plain `import X` has no imported symbols and yields []. + for (const child of importNode.type === 'import_from_statement' ? importNode.namedChildren : []) { + if (child.startIndex === importNode.childForFieldName?.('module_name')?.startIndex) continue; + if (child.type === 'wildcard_import') continue; + if (child.type === 'dotted_name' || child.type === 'identifier') { + names.push(child.text); + } else if (child.type === 'aliased_import') { + // from X import foo as bar — use original name 'foo' + const nameNode = child.childForFieldName?.('name') || child.namedChildren?.[0]; + if (nameNode) names.push(nameNode.text); + } + } + return names; + } + + if (language === SupportedLanguages.TypeScript || language === SupportedLanguages.JavaScript) { + // import_statement > import_clause > named_imports > import_specifier* + const importClause = importNode.namedChildren?.find((c: any) => c.type === 'import_clause'); + const namedImports = importClause?.namedChildren?.find((c: any) => c.type === 'named_imports'); + if (namedImports) { + for (const spec of namedImports.namedChildren) { + if (spec.type === 'import_specifier') { + const nameNode = spec.childForFieldName?.('name'); + if (nameNode) names.push(nameNode.text); + } + } + } + return names; + } + + return names; +}; + /** Walk up AST to find enclosing function, return its generateId or null for top-level */ const findEnclosingFunctionId = (node: any, filePath: string): string | null => { let current = node.parent; @@ -478,6 +520,7 @@ const getLabelFromCaptures = (captureMap: Record): string | null => if (captureMap['definition.annotation']) return 'Annotation'; if (captureMap['definition.constructor']) return 'Constructor'; if (captureMap['definition.template']) return 'Template'; + if (captureMap['definition.instance']) return 'CodeElement'; return 'CodeElement'; }; @@ -504,6 +547,7 @@ const
DEFINITION_CAPTURE_KEYS = [ 'definition.annotation', 'definition.constructor', 'definition.template', + 'definition.instance', ] as const; const getDefinitionNodeFromCaptures = (captureMap: Record): any | null => { @@ -1133,10 +1177,15 @@ const processFileGroup = ( const rawImportPath = language === SupportedLanguages.Kotlin ? appendKotlinWildcard(captureMap['import.source'].text.replace(/['"<>]/g, ''), captureMap['import']) : captureMap['import.source'].text.replace(/['"<>]/g, ''); + + // Extract imported symbol names from the AST node + const symbolNames = extractImportedSymbolNames(captureMap['import'], language); + result.imports.push({ filePath: file.path, rawImportPath, language: language, + symbolNames: symbolNames.length > 0 ? symbolNames : undefined, }); continue; }