diff --git a/src/components/QueryFAB.tsx b/src/components/QueryFAB.tsx index 03a8d84f00..55f7d4c7aa 100644 --- a/src/components/QueryFAB.tsx +++ b/src/components/QueryFAB.tsx @@ -5,23 +5,23 @@ import { useAppState } from '../hooks/useAppState'; const EXAMPLE_QUERIES = [ { label: 'All Functions', - query: `MATCH (n:CodeNode) WHERE n.label = 'Function' RETURN n.id AS id, n.name AS name, n.filePath AS path`, + query: `MATCH (n:Function) RETURN n.id AS id, n.name AS name, n.filePath AS path LIMIT 50`, }, { label: 'All Classes', - query: `MATCH (n:CodeNode) WHERE n.label = 'Class' RETURN n.id AS id, n.name AS name, n.filePath AS path`, + query: `MATCH (n:Class) RETURN n.id AS id, n.name AS name, n.filePath AS path LIMIT 50`, }, { label: 'All Interfaces', - query: `MATCH (n:CodeNode) WHERE n.label = 'Interface' RETURN n.id AS id, n.name AS name, n.filePath AS path`, + query: `MATCH (n:Interface) RETURN n.id AS id, n.name AS name, n.filePath AS path LIMIT 50`, }, { label: 'Function Calls', - query: `MATCH (a:CodeNode)-[r:CodeRelation]->(b:CodeNode) WHERE r.type = 'CALLS' RETURN a.id AS id, a.name AS caller, b.name AS callee LIMIT 50`, + query: `MATCH (a:File)-[r:CodeRelation {type: 'CALLS'}]->(b:Function) RETURN a.id AS id, a.name AS caller, b.name AS callee LIMIT 50`, }, { label: 'Import Dependencies', - query: `MATCH (a:CodeNode)-[r:CodeRelation]->(b:CodeNode) WHERE r.type = 'IMPORTS' RETURN a.id AS id, a.name AS from, b.name AS imports LIMIT 50`, + query: `MATCH (a:File)-[r:CodeRelation {type: 'IMPORTS'}]->(b:File) RETURN a.id AS id, a.name AS from, b.name AS imports LIMIT 50`, }, ]; @@ -203,7 +203,7 @@ export const QueryFAB = () => { value={query} onChange={(e) => setQuery(e.target.value)} onKeyDown={handleKeyDown} - placeholder="MATCH (n:CodeNode) WHERE n.label = 'Function' RETURN n" + placeholder="MATCH (n:Function) RETURN n.name, n.filePath LIMIT 10" rows={3} className=" w-full px-3 py-2.5 diff --git a/src/components/ToolCallCard.tsx b/src/components/ToolCallCard.tsx index 689d7492b1..659a72885b 100644 --- a/src/components/ToolCallCard.tsx +++ b/src/components/ToolCallCard.tsx @@ -23,7 +23,7 @@ const formatArgs = (args: Record): string => { if (!args || Object.keys(args).length === 0) { return ''; } - + // Special handling for Cypher queries if ('query' in args && typeof args.query === 'string') { return args.query; @@ -37,7 +37,7 @@ const formatArgs = (args: Record): string => { result += args.cypher; return result; } - + // For other tools, show as formatted JSON return JSON.stringify(args, null, 2); }; @@ -83,15 +83,17 @@ const getStatusDisplay = (status: ToolCallInfo['status']) => { */ const getToolDisplayName = (name: string): string => { const names: Record = { + // New consolidated tools + 'search': '๐Ÿ” Search Code', + 'cypher': '๐Ÿ” Cypher Query', + 'grep': '๐Ÿ”Ž Pattern Search', + 'read': '๐Ÿ“„ Read File', + 'highlight': 'โœจ Highlight in Graph', + // Legacy names (for backwards compatibility) 'execute_cypher': '๐Ÿ” Cypher Query', 'execute_vector_cypher': '๐Ÿง  Semantic + Graph Query', - 'semantic_search': '๐Ÿ”Ž Semantic Search', - 'semantic_search_with_context': '๐Ÿ”Ž Semantic Search + Context', - 'get_code_content': '๐Ÿ“„ Read Code', - 'get_codebase_stats': '๐Ÿ“Š Get Stats', - 'get_graph_schema': '๐Ÿ“‹ Get Schema', 'highlight_in_graph': 'โœจ Highlight in Graph', - 'grep_code': '๐Ÿ” Search Code', + 'grep_code': '๐Ÿ”Ž Pattern Search', 'read_file': '๐Ÿ“„ Read File', }; return names[name] || name; @@ -114,24 +116,24 @@ export const ToolCallCard = ({ toolCall, defaultExpanded = false }: ToolCallCard const { highlightedNodeIds, setHighlightedNodeIds, graph } = useAppState(); const status = getStatusDisplay(toolCall.status); const formattedArgs = formatArgs(toolCall.args); - + // Check if this is a highlight tool and extract node IDs - const isHighlightTool = toolCall.name === 'highlight_in_graph'; + const isHighlightTool = toolCall.name === 'highlight_in_graph' || toolCall.name === 'highlight'; const rawHighlightNodeIds = isHighlightTool ? extractHighlightNodeIds(toolCall.result) : []; - + // Resolve raw IDs to actual graph node IDs (handles partial ID matching) const resolvedNodeIds = useMemo(() => { if (rawHighlightNodeIds.length === 0 || !graph) return rawHighlightNodeIds; - + const graphNodeIds = graph.nodes.map(n => n.id); const resolved: string[] = []; - + for (const rawId of rawHighlightNodeIds) { if (graphNodeIds.includes(rawId)) { resolved.push(rawId); } else { // Try partial match - find node whose ID ends with the raw ID - const found = graphNodeIds.find(gid => + const found = graphNodeIds.find(gid => gid.endsWith(rawId) || gid.endsWith(':' + rawId) ); if (found) resolved.push(found); @@ -139,11 +141,11 @@ export const ToolCallCard = ({ toolCall, defaultExpanded = false }: ToolCallCard } return resolved; }, [rawHighlightNodeIds, graph]); - + // Check if these specific nodes are currently highlighted - const isHighlightActive = resolvedNodeIds.length > 0 && + const isHighlightActive = resolvedNodeIds.length > 0 && resolvedNodeIds.some(id => highlightedNodeIds.has(id)); - + // Toggle highlight on/off const toggleHighlight = useCallback((e: React.MouseEvent) => { e.stopPropagation(); // Don't trigger expand/collapse @@ -155,33 +157,35 @@ export const ToolCallCard = ({ toolCall, defaultExpanded = false }: ToolCallCard setHighlightedNodeIds(new Set(resolvedNodeIds)); } }, [isHighlightActive, resolvedNodeIds, setHighlightedNodeIds]); - + return (
{/* Header - always visible */} - )} - + {/* Status indicator */} {status.icon} {toolCall.status} - - +
+ {/* Expanded content */} {isExpanded && (
@@ -219,22 +223,24 @@ export const ToolCallCard = ({ toolCall, defaultExpanded = false }: ToolCallCard
)} - + {/* Result */} {toolCall.result && (
Result
-
-                {toolCall.result.length > 2000 
-                  ? toolCall.result.slice(0, 2000) + '\n\n... (truncated)'
-                  : toolCall.result
-                }
-              
+
+
+                  {toolCall.result.length > 3000
+                    ? toolCall.result.slice(0, 3000) + '\n\n... (truncated)'
+                    : toolCall.result
+                  }
+                
+
)} - + {/* Loading state for in-progress */} {toolCall.status === 'running' && !toolCall.result && (
diff --git a/src/core/embeddings/embedding-pipeline.ts b/src/core/embeddings/embedding-pipeline.ts index 154f70c615..05f8ae7edf 100644 --- a/src/core/embeddings/embedding-pipeline.ts +++ b/src/core/embeddings/embedding-pipeline.ts @@ -28,34 +28,56 @@ export type EmbeddingProgressCallback = (progress: EmbeddingProgress) => void; /** * Query all embeddable nodes from KuzuDB + * Uses table-specific queries (File has different schema than code elements) */ const queryEmbeddableNodes = async ( executeQuery: (cypher: string) => Promise ): Promise => { - // Build WHERE clause for embeddable labels - const labelConditions = EMBEDDABLE_LABELS - .map(label => `n.label = '${label}'`) - .join(' OR '); - - const cypher = ` - MATCH (n:CodeNode) - WHERE ${labelConditions} - RETURN n.id AS id, n.name AS name, n.label AS label, - n.filePath AS filePath, n.content AS content, - n.startLine AS startLine, n.endLine AS endLine - `; - - const rows = await executeQuery(cypher); + const allNodes: EmbeddableNode[] = []; + + // Query each embeddable table with table-specific columns + for (const label of EMBEDDABLE_LABELS) { + try { + let query: string; + + if (label === 'File') { + // File nodes don't have startLine/endLine + query = ` + MATCH (n:File) + RETURN n.id AS id, n.name AS name, 'File' AS label, + n.filePath AS filePath, n.content AS content + `; + } else { + // Code elements have startLine/endLine + query = ` + MATCH (n:${label}) + RETURN n.id AS id, n.name AS name, '${label}' AS label, + n.filePath AS filePath, n.content AS content, + n.startLine AS startLine, n.endLine AS endLine + `; + } + + const rows = await executeQuery(query); + for (const row of rows) { + allNodes.push({ + id: row.id ?? row[0], + name: row.name ?? row[1], + label: row.label ?? row[2], + filePath: row.filePath ?? row[3], + content: row.content ?? row[4] ?? '', + startLine: row.startLine ?? row[5], + endLine: row.endLine ?? row[6], + }); + } + } catch (error) { + // Table might not exist or be empty, continue + if (import.meta.env.DEV) { + console.warn(`Query for ${label} nodes failed:`, error); + } + } + } - return rows.map(row => ({ - id: row.id ?? row[0], - name: row.name ?? row[1], - label: row.label ?? row[2], - filePath: row.filePath ?? row[3], - content: row.content ?? row[4] ?? '', - startLine: row.startLine ?? row[5], - endLine: row.endLine ?? row[6], - })); + return allNodes; }; /** @@ -251,7 +273,7 @@ export const runEmbeddingPipeline = async ( /** * Perform semantic search using the vector index * - * Uses separate CodeEmbedding table and JOINs with CodeNode for metadata + * Uses CodeEmbedding table and queries each node table to get metadata * * @param executeQuery - Function to execute Cypher queries * @param query - Search query text @@ -274,81 +296,104 @@ export const semanticSearch = async ( const queryVec = embeddingToArray(queryEmbedding); const queryVecStr = `[${queryVec.join(',')}]`; - // Query the vector index on CodeEmbedding, then JOIN with CodeNode for metadata - // Note: KuzuDB requires WITH after YIELD before using WHERE - const cypher = ` + // Query the vector index on CodeEmbedding to get nodeIds and distances + const vectorQuery = ` CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', CAST(${queryVecStr} AS FLOAT[384]), ${k}) YIELD node AS emb, distance WITH emb, distance WHERE distance < ${maxDistance} - MATCH (n:CodeNode {id: emb.nodeId}) - RETURN n.id AS nodeId, n.name AS name, n.label AS label, - n.filePath AS filePath, distance, - n.startLine AS startLine, n.endLine AS endLine + RETURN emb.nodeId AS nodeId, distance ORDER BY distance `; - const rows = await executeQuery(cypher); + const embResults = await executeQuery(vectorQuery); + + if (embResults.length === 0) { + return []; + } - return rows.map(row => ({ - nodeId: row.nodeId ?? row[0], - name: row.name ?? row[1], - label: row.label ?? row[2], - filePath: row.filePath ?? row[3], - distance: row.distance ?? row[4], - startLine: row.startLine ?? row[5], - endLine: row.endLine ?? row[6], - })); + // Get metadata for each result by querying each node table + const results: SemanticSearchResult[] = []; + + for (const embRow of embResults) { + const nodeId = embRow.nodeId ?? embRow[0]; + const distance = embRow.distance ?? embRow[1]; + + // Extract label from node ID (format: Label:path:name) + const labelEndIdx = nodeId.indexOf(':'); + const label = labelEndIdx > 0 ? nodeId.substring(0, labelEndIdx) : 'Unknown'; + + // Query the specific table for this node + // File nodes don't have startLine/endLine + try { + let nodeQuery: string; + if (label === 'File') { + nodeQuery = ` + MATCH (n:File {id: '${nodeId.replace(/'/g, "''")}'}) + RETURN n.name AS name, n.filePath AS filePath + `; + } else { + nodeQuery = ` + MATCH (n:${label} {id: '${nodeId.replace(/'/g, "''")}'}) + RETURN n.name AS name, n.filePath AS filePath, + n.startLine AS startLine, n.endLine AS endLine + `; + } + const nodeRows = await executeQuery(nodeQuery); + if (nodeRows.length > 0) { + const nodeRow = nodeRows[0]; + results.push({ + nodeId, + name: nodeRow.name ?? nodeRow[0] ?? '', + label, + filePath: nodeRow.filePath ?? nodeRow[1] ?? '', + distance, + startLine: label !== 'File' ? (nodeRow.startLine ?? nodeRow[2]) : undefined, + endLine: label !== 'File' ? (nodeRow.endLine ?? nodeRow[3]) : undefined, + }); + } + } catch { + // Table might not exist, skip + } + } + + return results; }; /** * Semantic search with graph expansion (flattened results) - * Finds similar nodes AND their direct connections with relationship types * - * Uses separate CodeEmbedding table and JOINs with CodeNode. - * Returns flattened results: one row per (match, connected) pair. - * This format works with KuzuDB and preserves relationship type information. + * Note: With multi-table schema, graph traversal is simplified. + * Returns semantic matches with their metadata. + * For full graph traversal, use execute_vector_cypher tool directly. * * @param executeQuery - Function to execute Cypher queries * @param query - Search query text * @param k - Number of initial semantic matches (default: 5) - * @param _hops - Unused (kept for API compatibility). Use execute_vector_cypher for multi-hop. - * @returns Flattened results: each row is a (match โ†’ connected) pair with relationship type + * @param _hops - Unused (kept for API compatibility). + * @returns Semantic matches with metadata */ export const semanticSearchWithContext = async ( executeQuery: (cypher: string) => Promise, query: string, k: number = 5, - _hops: number = 1 // Currently only single-hop supported; multi-hop via execute_vector_cypher + _hops: number = 1 ): Promise => { - if (!isEmbedderReady()) { - throw new Error('Embedding model not initialized. Run embedding pipeline first.'); - } - - // Embed the query - const queryEmbedding = await embedText(query); - const queryVec = embeddingToArray(queryEmbedding); - const queryVecStr = `[${queryVec.join(',')}]`; - - // Query embedding table, JOIN with CodeNode, then expand to direct connections - // Using single-hop so we can access r.type (variable-length paths don't support this in KuzuDB) - // Note: KuzuDB requires WITH after YIELD before using WHERE - const cypher = ` - CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', - CAST(${queryVecStr} AS FLOAT[384]), ${k}) - YIELD node AS emb, distance - WITH emb, distance - WHERE distance < 0.5 - MATCH (match:CodeNode {id: emb.nodeId}) - MATCH (match)-[r:CodeRelation]-(connected:CodeNode) - RETURN match.id AS matchId, match.name AS matchName, match.label AS matchLabel, - match.filePath AS matchPath, distance, - connected.id AS connectedId, connected.name AS connectedName, - connected.label AS connectedLabel, r.type AS relationType - ORDER BY distance, matchId - `; - - return executeQuery(cypher); + // For multi-table schema, just return semantic search results + // Graph traversal is complex with separate tables - use execute_vector_cypher instead + const results = await semanticSearch(executeQuery, query, k, 0.5); + + return results.map(r => ({ + matchId: r.nodeId, + matchName: r.name, + matchLabel: r.label, + matchPath: r.filePath, + distance: r.distance, + connectedId: null, + connectedName: null, + connectedLabel: null, + relationType: null, + })); }; diff --git a/src/core/ingestion/tree-sitter-queries.ts b/src/core/ingestion/tree-sitter-queries.ts index c5d0c07197..4eb6686abc 100644 --- a/src/core/ingestion/tree-sitter-queries.ts +++ b/src/core/ingestion/tree-sitter-queries.ts @@ -22,6 +22,28 @@ export const TYPESCRIPT_QUERIES = ` (method_definition name: (property_identifier) @name) @definition.method +(lexical_declaration + (variable_declarator + name: (identifier) @name + value: (arrow_function))) @definition.function + +(lexical_declaration + (variable_declarator + name: (identifier) @name + value: (function_expression))) @definition.function + +(export_statement + declaration: (lexical_declaration + (variable_declarator + name: (identifier) @name + value: (arrow_function)))) @definition.function + +(export_statement + declaration: (lexical_declaration + (variable_declarator + name: (identifier) @name + value: (function_expression)))) @definition.function + (import_statement source: (string) @import.source) @import @@ -44,6 +66,28 @@ export const JAVASCRIPT_QUERIES = ` (method_definition name: (property_identifier) @name) @definition.method +(lexical_declaration + (variable_declarator + name: (identifier) @name + value: (arrow_function))) @definition.function + +(lexical_declaration + (variable_declarator + name: (identifier) @name + value: (function_expression))) @definition.function + +(export_statement + declaration: (lexical_declaration + (variable_declarator + name: (identifier) @name + value: (arrow_function)))) @definition.function + +(export_statement + declaration: (lexical_declaration + (variable_declarator + name: (identifier) @name + value: (function_expression)))) @definition.function + (import_statement source: (string) @import.source) @import diff --git a/src/core/kuzu/csv-generator.ts b/src/core/kuzu/csv-generator.ts index a3bdea3ad9..3f988165b8 100644 --- a/src/core/kuzu/csv-generator.ts +++ b/src/core/kuzu/csv-generator.ts @@ -1,8 +1,8 @@ /** - * CSV Generator for KuzuDB + * CSV Generator for KuzuDB Hybrid Schema * - * Converts the in-memory KnowledgeGraph into CSV format - * for bulk loading into KuzuDB. + * Generates separate CSV files for each node table and one relation CSV. + * This enables efficient bulk loading via COPY FROM for hybrid schema. * * RFC 4180 Compliant: * - Fields containing commas, double quotes, or newlines are enclosed in double quotes @@ -10,15 +10,18 @@ * - All fields are consistently quoted for safety with code content */ -import { KnowledgeGraph, GraphNode } from '../graph/types'; +import { KnowledgeGraph, GraphNode, NodeLabel } from '../graph/types'; +import { NODE_TABLES, NodeTableName } from './schema'; + +// ============================================================================ +// CSV ESCAPE UTILITIES +// ============================================================================ /** * Sanitize string to ensure valid UTF-8 * Removes or replaces invalid characters that would break CSV parsing */ const sanitizeUTF8 = (str: string): string => { - // Remove null bytes and other control characters (except newline, tab, carriage return) - // Also remove surrogate pairs and other problematic Unicode return str .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, '') // Remove control chars except \t \n \r .replace(/[\uD800-\uDFFF]/g, '') // Remove surrogate pairs (invalid standalone) @@ -28,21 +31,13 @@ const sanitizeUTF8 = (str: string): string => { /** * RFC 4180 compliant CSV field escaping * ALWAYS wraps in double quotes for safety with code content - * Escapes internal double quotes by doubling them - * Sanitizes to valid UTF-8 */ const escapeCSVField = (value: string | number | undefined | null): string => { if (value === undefined || value === null) { - return '""'; // Empty quoted string + return '""'; } - let str = String(value); - - // Sanitize to valid UTF-8 str = sanitizeUTF8(str); - - // Always quote and escape double quotes by doubling them - // This is the safest approach for code content which may contain anything return `"${str.replace(/"/g, '""')}"`; }; @@ -56,36 +51,28 @@ const escapeCSVNumber = (value: number | undefined | null, defaultValue: number return String(value); }; +// ============================================================================ +// CONTENT EXTRACTION +// ============================================================================ + /** * Check if content looks like binary data - * Binary files have high ratio of non-printable characters */ const isBinaryContent = (content: string): boolean => { if (!content || content.length === 0) return false; - - // Check first 1000 chars for binary indicators const sample = content.slice(0, 1000); - - // Count non-printable characters (excluding common whitespace) let nonPrintable = 0; for (let i = 0; i < sample.length; i++) { const code = sample.charCodeAt(i); - // Non-printable: 0-8, 14-31, 127, or high bytes that aren't valid UTF-8 sequences if ((code < 9) || (code > 13 && code < 32) || code === 127) { nonPrintable++; } } - - // If more than 10% non-printable, likely binary return (nonPrintable / sample.length) > 0.1; }; /** * Extract code content for a node - * - For File nodes: return entire file content (limited to avoid huge CSVs) - * - For Function/Class/Method nodes: extract lines from startLine to endLine - * - For Folder nodes: empty string - * - For binary files: return placeholder */ const extractContent = ( node: GraphNode, @@ -94,23 +81,12 @@ const extractContent = ( const filePath = node.properties.filePath; const content = fileContents.get(filePath); - if (!content) { - return ''; - } - - // For Folder nodes, no content - if (node.label === 'Folder') { - return ''; - } + if (!content) return ''; + if (node.label === 'Folder') return ''; + if (isBinaryContent(content)) return '[Binary file - content not stored]'; - // Check for binary content - if (isBinaryContent(content)) { - return '[Binary file - content not stored]'; - } - - // For File nodes, return content (limited to prevent huge CSVs) + // For File nodes, return content (limited) if (node.label === 'File') { - // Limit file content to 10KB to avoid memory issues const MAX_FILE_CONTENT = 10000; if (content.length > MAX_FILE_CONTENT) { return content.slice(0, MAX_FILE_CONTENT) + '\n... [truncated]'; @@ -118,85 +94,158 @@ const extractContent = ( return content; } - // For code elements (Function, Class, Method, etc.), extract the relevant lines + // For code elements, extract the relevant lines with context const startLine = node.properties.startLine; const endLine = node.properties.endLine; - if (startLine === undefined || endLine === undefined) { - return ''; - } + if (startLine === undefined || endLine === undefined) return ''; const lines = content.split('\n'); - - // Extract with some context const contextLines = 2; const start = Math.max(0, startLine - contextLines); const end = Math.min(lines.length - 1, endLine + contextLines); const snippet = lines.slice(start, end + 1).join('\n'); - - // Limit snippet size const MAX_SNIPPET = 5000; if (snippet.length > MAX_SNIPPET) { return snippet.slice(0, MAX_SNIPPET) + '\n... [truncated]'; } - return snippet; }; +// ============================================================================ +// CSV GENERATION RESULT TYPE +// ============================================================================ + +export interface CSVData { + nodes: Map; + relCSV: string; // Single relation CSV with from,to,type columns +} + +// ============================================================================ +// NODE CSV GENERATORS +// ============================================================================ + /** - * Generate CSV for nodes - * Headers: id,label,name,filePath,startLine,endLine,content - * - * All string fields are quoted for RFC 4180 compliance - * Note: embedding column is NOT included in CSV - it's populated later via UPDATE queries - * by the embedding pipeline after bulk load completes + * Generate CSV for File nodes + * Headers: id,name,filePath,content */ -export const generateNodeCSV = ( - graph: KnowledgeGraph, +const generateFileCSV = (nodes: GraphNode[], fileContents: Map): string => { + const headers = ['id', 'name', 'filePath', 'content']; + const rows: string[] = [headers.join(',')]; + + for (const node of nodes) { + if (node.label !== 'File') continue; + const content = extractContent(node, fileContents); + rows.push([ + escapeCSVField(node.id), + escapeCSVField(node.properties.name || ''), + escapeCSVField(node.properties.filePath || ''), + escapeCSVField(content), + ].join(',')); + } + + return rows.join('\n'); +}; + +/** + * Generate CSV for Folder nodes + * Headers: id,name,filePath + */ +const generateFolderCSV = (nodes: GraphNode[]): string => { + const headers = ['id', 'name', 'filePath']; + const rows: string[] = [headers.join(',')]; + + for (const node of nodes) { + if (node.label !== 'Folder') continue; + rows.push([ + escapeCSVField(node.id), + escapeCSVField(node.properties.name || ''), + escapeCSVField(node.properties.filePath || ''), + ].join(',')); + } + + return rows.join('\n'); +}; + +/** + * Generate CSV for code element nodes (Function, Class, Interface, Method, CodeElement) + * Headers: id,name,filePath,startLine,endLine,content + */ +const generateCodeElementCSV = ( + nodes: GraphNode[], + label: NodeLabel, fileContents: Map ): string => { - const headers = ['id', 'label', 'name', 'filePath', 'startLine', 'endLine', 'content']; + const headers = ['id', 'name', 'filePath', 'startLine', 'endLine', 'content']; const rows: string[] = [headers.join(',')]; - for (const node of graph.nodes) { + for (const node of nodes) { + if (node.label !== label) continue; const content = extractContent(node, fileContents); - - const row = [ + rows.push([ escapeCSVField(node.id), - escapeCSVField(node.label), escapeCSVField(node.properties.name || ''), escapeCSVField(node.properties.filePath || ''), escapeCSVNumber(node.properties.startLine, -1), escapeCSVNumber(node.properties.endLine, -1), escapeCSVField(content), - ]; - - rows.push(row.join(',')); + ].join(',')); } return rows.join('\n'); }; +// ============================================================================ +// RELATIONSHIP CSV GENERATOR (Single Table) +// ============================================================================ + /** - * Generate CSV for edges/relationships + * Generate CSV for the single CodeRelation table * Headers: from,to,type - * - * Note: Kuzu expects 'from' and 'to' columns for relationship tables */ -export const generateEdgeCSV = (graph: KnowledgeGraph): string => { +const generateRelationCSV = (graph: KnowledgeGraph): string => { const headers = ['from', 'to', 'type']; const rows: string[] = [headers.join(',')]; for (const rel of graph.relationships) { - const row = [ + rows.push([ escapeCSVField(rel.sourceId), escapeCSVField(rel.targetId), escapeCSVField(rel.type), - ]; - - rows.push(row.join(',')); + ].join(',')); } return rows.join('\n'); }; + +// ============================================================================ +// MAIN CSV GENERATION FUNCTION +// ============================================================================ + +/** + * Generate all CSV data for hybrid schema bulk loading + * Returns Maps of node table name -> CSV content, and single relation CSV + */ +export const generateAllCSVs = ( + graph: KnowledgeGraph, + fileContents: Map +): CSVData => { + const nodes = Array.from(graph.nodes); + + // Generate node CSVs + const nodeCSVs = new Map(); + nodeCSVs.set('File', generateFileCSV(nodes, fileContents)); + nodeCSVs.set('Folder', generateFolderCSV(nodes)); + nodeCSVs.set('Function', generateCodeElementCSV(nodes, 'Function', fileContents)); + nodeCSVs.set('Class', generateCodeElementCSV(nodes, 'Class', fileContents)); + nodeCSVs.set('Interface', generateCodeElementCSV(nodes, 'Interface', fileContents)); + nodeCSVs.set('Method', generateCodeElementCSV(nodes, 'Method', fileContents)); + nodeCSVs.set('CodeElement', generateCodeElementCSV(nodes, 'CodeElement', fileContents)); + + // Generate single relation CSV + const relCSV = generateRelationCSV(graph); + + return { nodes: nodeCSVs, relCSV }; +}; + diff --git a/src/core/kuzu/kuzu-adapter.ts b/src/core/kuzu/kuzu-adapter.ts index f470031f27..0ee8c5e593 100644 --- a/src/core/kuzu/kuzu-adapter.ts +++ b/src/core/kuzu/kuzu-adapter.ts @@ -4,12 +4,18 @@ * Manages the KuzuDB WASM instance for client-side graph database operations. * Uses the "Snapshot / Bulk Load" pattern with COPY FROM for performance. * - * Based on V1 implementation with dynamic import to handle Vite bundling. + * Multi-table schema: separate tables for File, Function, Class, etc. */ import { KnowledgeGraph } from '../graph/types'; -import { NODE_SCHEMA, EDGE_SCHEMA, EMBEDDING_SCHEMA, NODE_TABLE_NAME, EDGE_TABLE_NAME } from './schema'; -import { generateNodeCSV, generateEdgeCSV } from './csv-generator'; +import { + NODE_TABLES, + REL_TABLE_NAME, + SCHEMA_QUERIES, + EMBEDDING_TABLE_NAME, + NodeTableName, +} from './schema'; +import { generateAllCSVs } from './csv-generator'; // Holds the reference to the dynamically loaded module let kuzu: any = null; @@ -35,23 +41,25 @@ export const initKuzu = async () => { await kuzu.init(); // 4. Create Database with 512MB buffer pool - // Larger buffer needed for embedding storage (6K+ nodes ร— 384 floats) - // Constructor: Database(path, bufferPoolSize, maxNumThreads, enableCompression, readOnly) const BUFFER_POOL_SIZE = 512 * 1024 * 1024; // 512MB db = new kuzu.Database(':memory:', BUFFER_POOL_SIZE); conn = new kuzu.Connection(db); if (import.meta.env.DEV) console.log('โœ… KuzuDB WASM Initialized'); - // 5. Initialize Schema (wrap in try-catch for re-run scenario) - try { - await conn.query(NODE_SCHEMA); - await conn.query(EDGE_SCHEMA); - await conn.query(EMBEDDING_SCHEMA); - if (import.meta.env.DEV) console.log('โœ… KuzuDB Schema Created'); - } catch { - // Schema might already exist, skip + // 5. Initialize Schema (all node tables, then rel tables, then embedding table) + for (const schemaQuery of SCHEMA_QUERIES) { + try { + await conn.query(schemaQuery); + } catch (e) { + // Schema might already exist, skip + if (import.meta.env.DEV) { + console.warn('Schema creation skipped (may already exist):', e); + } + } } + + if (import.meta.env.DEV) console.log('โœ… KuzuDB Multi-Table Schema Created'); return { db, conn, kuzu }; } catch (error) { @@ -62,6 +70,7 @@ export const initKuzu = async () => { /** * Load a KnowledgeGraph into KuzuDB using COPY FROM (bulk load) + * Uses batched CSV writes and COPY statements for optimal performance */ export const loadGraphToKuzu = async ( graph: KnowledgeGraph, @@ -70,50 +79,114 @@ export const loadGraphToKuzu = async ( const { conn, kuzu } = await initKuzu(); try { - if (import.meta.env.DEV) console.log(`KuzuDB: Serializing ${graph.nodeCount} nodes...`); + if (import.meta.env.DEV) console.log(`KuzuDB: Generating CSVs for ${graph.nodeCount} nodes...`); - const nodesCSV = generateNodeCSV(graph, fileContents); - const edgesCSV = generateEdgeCSV(graph); + // 1. Generate all CSVs (per-table) + const csvData = generateAllCSVs(graph, fileContents); const fs = kuzu.FS; - const nodesPath = '/nodes.csv'; - const edgesPath = '/edges.csv'; - - // Cleanup old files if they exist - try { await fs.unlink(nodesPath); } catch {} - try { await fs.unlink(edgesPath); } catch {} - - // Write CSV files to virtual filesystem - await fs.writeFile(nodesPath, nodesCSV); - await fs.writeFile(edgesPath, edgesCSV); + // 2. Write all node CSVs to virtual filesystem + const nodeFiles: Array<{ table: NodeTableName; path: string }> = []; + for (const [tableName, csv] of csvData.nodes.entries()) { + // Skip empty CSVs (only header row) + if (csv.split('\n').length <= 1) continue; + + const path = `/${tableName.toLowerCase()}.csv`; + try { await fs.unlink(path); } catch {} + await fs.writeFile(path, csv); + nodeFiles.push({ table: tableName, path }); + } + + // 3. Parse relation CSV and prepare for INSERT (COPY FROM doesn't work with multi-pair tables) + const relLines = csvData.relCSV.split('\n').slice(1).filter(line => line.trim()); + const relCount = relLines.length; - // Use HEADER=true because the CSV generator adds headers - // Use PARALLEL=false because content field has quoted newlines - // Explicitly list columns since CSV doesn't include 'embedding' (populated later via UPDATE) - await conn.query(`COPY ${NODE_TABLE_NAME}(id, label, name, filePath, startLine, endLine, content) FROM "${nodesPath}" (HEADER=true, PARALLEL=false)`); - await conn.query(`COPY ${EDGE_TABLE_NAME} FROM "${edgesPath}" (HEADER=true, PARALLEL=false)`); + if (import.meta.env.DEV) { + console.log(`KuzuDB: Wrote ${nodeFiles.length} node CSVs, ${relCount} relations to insert`); + } - // Verify results - const countRes = await conn.query(`MATCH (n:${NODE_TABLE_NAME}) RETURN count(n) AS cnt`); - const countRow = await countRes.getNext(); - const nodeCount = countRow ? countRow.cnt || countRow[0] || 0 : 0; + // 4. COPY all node tables (must complete before rels due to FK constraints) + for (const { table, path } of nodeFiles) { + const copyQuery = getCopyQuery(table, path); + await conn.query(copyQuery); + } + + // 5. INSERT relations one by one (COPY doesn't work with multi-pair REL tables) + // Parse CSV format: "from","to","type" + let insertedRels = 0; + for (const line of relLines) { + try { + // Parse CSV - handle quoted fields + const match = line.match(/"([^"]*)","([^"]*)","([^"]*)"/); + if (!match) continue; + + const [, fromId, toId, relType] = match; + + // Extract labels from node IDs (format: Label:path:name) + const fromLabel = fromId.split(':')[0]; + const toLabel = toId.split(':')[0]; + + // INSERT with explicit node matching + const insertQuery = ` + MATCH (a:${fromLabel} {id: '${fromId.replace(/'/g, "''")}'}) + MATCH (b:${toLabel} {id: '${toId.replace(/'/g, "''")}'}) + CREATE (a)-[:${REL_TABLE_NAME} {type: '${relType}'}]->(b) + `; + await conn.query(insertQuery); + insertedRels++; + } catch { + // Skip failed insertions (nodes might not exist) + } + } + + if (import.meta.env.DEV) { + console.log(`KuzuDB: Inserted ${insertedRels}/${relCount} relations`); + } + + // 6. Verify results + let totalNodes = 0; + for (const tableName of NODE_TABLES) { + try { + const countRes = await conn.query(`MATCH (n:${tableName}) RETURN count(n) AS cnt`); + const countRow = await countRes.getNext(); + const count = countRow ? (countRow.cnt ?? countRow[0] ?? 0) : 0; + totalNodes += Number(count); + } catch { + // Table might be empty, skip + } + } - if (import.meta.env.DEV) console.log(`โœ… KuzuDB Bulk Load Complete. Nodes in DB: ${nodeCount}`); + if (import.meta.env.DEV) console.log(`โœ… KuzuDB Bulk Load Complete. Total nodes: ${totalNodes}, edges: ${insertedRels}`); - // Cleanup - try { await fs.unlink(nodesPath); } catch {} - try { await fs.unlink(edgesPath); } catch {} + // 7. Cleanup CSV files + for (const { path } of nodeFiles) { + try { await fs.unlink(path); } catch {} + } - return { success: true, count: Number(nodeCount) }; + return { success: true, count: totalNodes }; } catch (error) { if (import.meta.env.DEV) console.error('โŒ KuzuDB Bulk Load Failed:', error); - // Don't throw - let the app continue without KuzuDB return { success: false, count: 0 }; } }; +/** + * Get the COPY query for a node table with correct column mapping + */ +const getCopyQuery = (table: NodeTableName, path: string): string => { + // File and Folder have different columns than code elements + if (table === 'File') { + return `COPY File(id, name, filePath, content) FROM "${path}" (HEADER=true, PARALLEL=false)`; + } + if (table === 'Folder') { + return `COPY Folder(id, name, filePath) FROM "${path}" (HEADER=true, PARALLEL=false)`; + } + // All code element tables: Function, Class, Interface, Method, CodeElement + return `COPY ${table}(id, name, filePath, startLine, endLine, content) FROM "${path}" (HEADER=true, PARALLEL=false)`; +}; + /** * Execute a Cypher query against the database */ @@ -148,19 +221,29 @@ export const getKuzuStats = async (): Promise<{ nodes: number; edges: number }> } try { - const nodeResult = await conn.query(`MATCH (n:${NODE_TABLE_NAME}) RETURN count(n) AS cnt`); - const edgeResult = await conn.query(`MATCH ()-[r:${EDGE_TABLE_NAME}]->() RETURN count(r) AS cnt`); - - const nodeRow = await nodeResult.getNext(); - const edgeRow = await edgeResult.getNext(); + // Count nodes across all tables + let totalNodes = 0; + for (const tableName of NODE_TABLES) { + try { + const nodeResult = await conn.query(`MATCH (n:${tableName}) RETURN count(n) AS cnt`); + const nodeRow = await nodeResult.getNext(); + totalNodes += Number(nodeRow?.cnt ?? nodeRow?.[0] ?? 0); + } catch { + // Table might not exist or be empty + } + } - const nodeCount = nodeRow ? (nodeRow.cnt ?? nodeRow[0] ?? 0) : 0; - const edgeCount = edgeRow ? (edgeRow.cnt ?? edgeRow[0] ?? 0) : 0; + // Count edges from single relation table + let totalEdges = 0; + try { + const edgeResult = await conn.query(`MATCH ()-[r:${REL_TABLE_NAME}]->() RETURN count(r) AS cnt`); + const edgeRow = await edgeResult.getNext(); + totalEdges = Number(edgeRow?.cnt ?? edgeRow?.[0] ?? 0); + } catch { + // Table might not exist or be empty + } - return { - nodes: Number(nodeCount), - edges: Number(edgeCount) - }; + return { nodes: totalNodes, edges: totalEdges }; } catch (error) { if (import.meta.env.DEV) { console.warn('Failed to get Kuzu stats:', error); @@ -210,7 +293,6 @@ export const executePrepared = async ( } try { - // Note: conn.prepare is async in kuzu-wasm const stmt = await conn.prepare(cypher); if (!stmt.isSuccess()) { const errMsg = await stmt.getErrorMessage(); @@ -219,7 +301,6 @@ export const executePrepared = async ( const result = await conn.execute(stmt, params); - // Collect all rows const rows: any[] = []; while (await result.hasNext()) { const row = await result.getNext(); @@ -236,9 +317,6 @@ export const executePrepared = async ( /** * Execute a prepared statement with multiple parameter sets in small sub-batches - * Recreates statement every SUB_BATCH_SIZE executions to allow memory cleanup - * @param cypher - Cypher query with $param placeholders - * @param paramsList - Array of parameter objects to execute */ export const executeWithReusedStatement = async ( cypher: string, @@ -250,13 +328,11 @@ export const executeWithReusedStatement = async ( if (paramsList.length === 0) return; - // Small sub-batch to allow memory cleanup between statement recreations const SUB_BATCH_SIZE = 4; for (let i = 0; i < paramsList.length; i += SUB_BATCH_SIZE) { const subBatch = paramsList.slice(i, i + SUB_BATCH_SIZE); - // Create fresh statement for each sub-batch const stmt = await conn.prepare(cypher); if (!stmt.isSuccess()) { const errMsg = await stmt.getErrorMessage(); @@ -271,7 +347,6 @@ export const executeWithReusedStatement = async ( await stmt.close(); } - // Small delay to allow garbage collection between sub-batches if (i + SUB_BATCH_SIZE < paramsList.length) { await new Promise(r => setTimeout(r, 0)); } @@ -280,7 +355,6 @@ export const executeWithReusedStatement = async ( /** * Test if array parameters work with prepared statements - * This is a diagnostic function to check KuzuDB WASM capabilities */ export const testArrayParams = async (): Promise<{ success: boolean; error?: string }> => { if (!conn) { @@ -288,36 +362,38 @@ export const testArrayParams = async (): Promise<{ success: boolean; error?: str } try { - // Test with a simple array parameter const testEmbedding = new Array(384).fill(0).map((_, i) => i / 384); - // First, get any node ID to test with - const nodeResult = await conn.query(`MATCH (n:${NODE_TABLE_NAME}) RETURN n.id AS id LIMIT 1`); - const nodeRow = await nodeResult.getNext(); + // Get any node ID to test with (try File first, then others) + let testNodeId: string | null = null; + for (const tableName of NODE_TABLES) { + try { + const nodeResult = await conn.query(`MATCH (n:${tableName}) RETURN n.id AS id LIMIT 1`); + const nodeRow = await nodeResult.getNext(); + if (nodeRow) { + testNodeId = nodeRow.id ?? nodeRow[0]; + break; + } + } catch {} + } - if (!nodeRow) { + if (!testNodeId) { return { success: false, error: 'No nodes found to test with' }; } - const testNodeId = nodeRow.id ?? nodeRow[0]; - if (import.meta.env.DEV) { console.log('๐Ÿงช Testing array params with node:', testNodeId); - console.log('๐Ÿงช Embedding sample (first 5):', testEmbedding.slice(0, 5)); } - // Try using prepared statement with array param - // Note: conn.prepare is async in kuzu-wasm - const cypher = `MATCH (n:${NODE_TABLE_NAME} {id: $nodeId}) SET n.embedding = $embedding`; - const stmt = await conn.prepare(cypher); + // First create an embedding entry + const createQuery = `CREATE (e:${EMBEDDING_TABLE_NAME} {nodeId: $nodeId, embedding: $embedding})`; + const stmt = await conn.prepare(createQuery); - // In async API, isSuccess() returns boolean directly if (!stmt.isSuccess()) { const errMsg = await stmt.getErrorMessage(); return { success: false, error: `Prepare failed: ${errMsg}` }; } - // Execute with array parameter await conn.execute(stmt, { nodeId: testNodeId, embedding: testEmbedding, @@ -327,7 +403,7 @@ export const testArrayParams = async (): Promise<{ success: boolean; error?: str // Verify it was stored const verifyResult = await conn.query( - `MATCH (n:${NODE_TABLE_NAME} {id: '${testNodeId}'}) RETURN n.embedding AS emb` + `MATCH (e:${EMBEDDING_TABLE_NAME} {nodeId: '${testNodeId}'}) RETURN e.embedding AS emb` ); const verifyRow = await verifyResult.getNext(); const storedEmb = verifyRow?.emb ?? verifyRow?.[0]; diff --git a/src/core/kuzu/schema.ts b/src/core/kuzu/schema.ts index b849136add..67fb19f3af 100644 --- a/src/core/kuzu/schema.ts +++ b/src/core/kuzu/schema.ts @@ -1,26 +1,58 @@ /** * KuzuDB Schema Definitions * - * Using Polymorphic Schema (Single Table Inheritance): - * - All nodes go into ONE table (CodeNode) with a label column - * - All edges go into ONE table (CodeRelation) with a type column + * Hybrid Schema: + * - Separate node tables for each code element type (File, Function, Class, etc.) + * - Single CodeRelation table with 'type' property for all relationships * - * This simplifies querying for the AI agent. + * This allows LLMs to write natural Cypher queries like: + * MATCH (f:Function)-[r:CodeRelation {type: 'CALLS'}]->(g:Function) RETURN f, g */ -export const NODE_TABLE_NAME = 'CodeNode'; -export const EDGE_TABLE_NAME = 'CodeRelation'; +// ============================================================================ +// NODE TABLE NAMES +// ============================================================================ +export const NODE_TABLES = ['File', 'Folder', 'Function', 'Class', 'Interface', 'Method', 'CodeElement'] as const; +export type NodeTableName = typeof NODE_TABLES[number]; + +// ============================================================================ +// RELATION TABLE +// ============================================================================ +export const REL_TABLE_NAME = 'CodeRelation'; + +// Valid relation types +export const REL_TYPES = ['CONTAINS', 'DEFINES', 'IMPORTS', 'CALLS'] as const; +export type RelType = typeof REL_TYPES[number]; + +// ============================================================================ +// EMBEDDING TABLE +// ============================================================================ export const EMBEDDING_TABLE_NAME = 'CodeEmbedding'; -/** - * Node table schema - * Stores all code elements: Files, Functions, Classes, etc. - * Note: Embeddings stored separately to avoid copy-on-write overhead - */ -export const NODE_SCHEMA = ` -CREATE NODE TABLE ${NODE_TABLE_NAME} ( +// ============================================================================ +// NODE TABLE SCHEMAS +// ============================================================================ + +export const FILE_SCHEMA = ` +CREATE NODE TABLE File ( + id STRING, + name STRING, + filePath STRING, + content STRING, + PRIMARY KEY (id) +)`; + +export const FOLDER_SCHEMA = ` +CREATE NODE TABLE Folder ( + id STRING, + name STRING, + filePath STRING, + PRIMARY KEY (id) +)`; + +export const FUNCTION_SCHEMA = ` +CREATE NODE TABLE Function ( id STRING, - label STRING, name STRING, filePath STRING, startLine INT64, @@ -29,11 +61,78 @@ CREATE NODE TABLE ${NODE_TABLE_NAME} ( PRIMARY KEY (id) )`; -/** - * Separate embedding table - lightweight structure for vector storage - * This avoids copy-on-write issues when storing embeddings - * (UPDATEing nodes with large content fields would copy entire node) - */ +export const CLASS_SCHEMA = ` +CREATE NODE TABLE Class ( + id STRING, + name STRING, + filePath STRING, + startLine INT64, + endLine INT64, + content STRING, + PRIMARY KEY (id) +)`; + +export const INTERFACE_SCHEMA = ` +CREATE NODE TABLE Interface ( + id STRING, + name STRING, + filePath STRING, + startLine INT64, + endLine INT64, + content STRING, + PRIMARY KEY (id) +)`; + +export const METHOD_SCHEMA = ` +CREATE NODE TABLE Method ( + id STRING, + name STRING, + filePath STRING, + startLine INT64, + endLine INT64, + content STRING, + PRIMARY KEY (id) +)`; + +export const CODE_ELEMENT_SCHEMA = ` +CREATE NODE TABLE CodeElement ( + id STRING, + name STRING, + filePath STRING, + startLine INT64, + endLine INT64, + content STRING, + PRIMARY KEY (id) +)`; + +// ============================================================================ +// RELATION TABLE SCHEMA +// Single table with 'type' property - connects all node tables +// ============================================================================ + +export const RELATION_SCHEMA = ` +CREATE REL TABLE ${REL_TABLE_NAME} ( + FROM File TO File, + FROM File TO Folder, + FROM File TO Function, + FROM File TO Class, + FROM File TO Interface, + FROM File TO Method, + FROM File TO CodeElement, + FROM Folder TO Folder, + FROM Folder TO File, + FROM Function TO Function, + FROM Function TO Method, + FROM Class TO Method, + FROM Class TO Function, + type STRING +)`; + +// ============================================================================ +// EMBEDDING TABLE SCHEMA +// Separate table for vector storage to avoid copy-on-write overhead +// ============================================================================ + export const EMBEDDING_SCHEMA = ` CREATE NODE TABLE ${EMBEDDING_TABLE_NAME} ( nodeId STRING, @@ -49,18 +148,27 @@ export const CREATE_VECTOR_INDEX_QUERY = ` CALL CREATE_VECTOR_INDEX('${EMBEDDING_TABLE_NAME}', 'code_embedding_idx', 'embedding', metric := 'cosine') `; -/** - * Edge table schema - * Stores all relationships: CALLS, IMPORTS, CONTAINS, DEFINES - */ -export const EDGE_SCHEMA = ` -CREATE REL TABLE ${EDGE_TABLE_NAME} ( - FROM ${NODE_TABLE_NAME} TO ${NODE_TABLE_NAME}, - type STRING -)`; +// ============================================================================ +// ALL SCHEMA QUERIES IN ORDER +// Node tables must be created before relationship tables that reference them +// ============================================================================ -/** - * All schema creation queries in order - */ -export const SCHEMA_QUERIES = [NODE_SCHEMA, EDGE_SCHEMA, EMBEDDING_SCHEMA]; +export const NODE_SCHEMA_QUERIES = [ + FILE_SCHEMA, + FOLDER_SCHEMA, + FUNCTION_SCHEMA, + CLASS_SCHEMA, + INTERFACE_SCHEMA, + METHOD_SCHEMA, + CODE_ELEMENT_SCHEMA, +]; + +export const REL_SCHEMA_QUERIES = [ + RELATION_SCHEMA, +]; +export const SCHEMA_QUERIES = [ + ...NODE_SCHEMA_QUERIES, + ...REL_SCHEMA_QUERIES, + EMBEDDING_SCHEMA, +]; diff --git a/src/core/llm/agent.ts b/src/core/llm/agent.ts index 8b9dd7e285..00964e64c6 100644 --- a/src/core/llm/agent.ts +++ b/src/core/llm/agent.ts @@ -32,46 +32,42 @@ import type { */ const SYSTEM_PROMPT = `You are Nexus, an elite Code Analysis Agent powered by a Knowledge Graph. Your mission is to answer user questions with precision by exploring the codebase, verifying facts, and visualizing your findings. -You will always ground your answer with \`[[file:line]]\` citations. +Ground your answers with \`[[file:line]]\` citations. -### ๐Ÿง  CORE PROTOCOL (The Iterative Loop) -You are not a one-shot query engine. You are an investigator following this process: -1. **Plan:** Briefly state what you are looking for. -2. **Execute:** Run tools to gather evidence. -3. **Analyze & Pivot:** Look at the tool output. - * *Did it answer the question fully?* -> Proceed to Grounding. - * *Did it reveal new files/functions?* -> **LOOP BACK** and investigate them immediately. - * *Did it fail?* -> Correct the query and retry. -4. **Visualize:** Use \`highlight_in_graph\` continuously as you find relevant nodes. -5. **Ground:** Construct your final answer with \`[[file:line]]\` citations. +### ๐Ÿง  CORE PROTOCOL +You are an investigator, not a one-shot query engine: +1. **Plan:** State what you're looking for. +2. **Execute:** Run tools to gather evidence. +3. **Analyze:** Did it answer the question? Did it reveal new leads? Loop back if needed. +4. **Visualize:** Use \`highlight\` to show relevant nodes in the graph. +5. **Ground:** Cite with \`[[file:line]]\` format. -### ๐Ÿ› ๏ธ TOOL STRATEGY -- **Discovery:** Start with \`hybrid_search\` or \`semantic_search\` to find entry points. -- **Structure:** Use \`execute_cypher\` to trace relationships (e.g., "What calls this?", "What does this inherit from?"). -- **Verification:** Use \`read_file\` to confirm logic. **Do not guess behavior based on function names.** Read the code. -- **Pattern Matching:** Use \`grep_code\` for exact string matches (error codes, TODOs). +### ๐Ÿ› ๏ธ TOOLS (5 total) +- **\`search\`** โ€” Find code by keywords/concepts. Returns matches + their graph connections. +- **\`cypher\`** โ€” Run Cypher queries for structural analysis. Include \`{{QUERY_VECTOR}}\` for semantic+graph queries. +- **\`grep\`** โ€” Regex pattern search across files. Use for exact strings, TODOs, error codes. +- **\`read\`** โ€” Read file content. Use after finding files via search/grep. +- **\`highlight\`** โ€” Highlight nodes in the visual graph. -### ๐Ÿ“Š KUZUDB SCHEMA (Polymorphic) -All nodes are in table \`CodeNode\`. All edges are in table \`CodeRelation\`. -**Node Properties:** \`id\`, \`label\` (File, Function, Class, Interface), \`name\`, \`filePath\`, \`content\` -**Edge Properties:** \`type\` (CALLS, IMPORTS, CONTAINS, DEFINES, INHERITS) +### ๐Ÿ“Š GRAPH SCHEMA +**Node Tables:** File, Folder, Function, Class, Interface, Method, CodeElement +**Relation:** CodeRelation (single table with 'type' property: CONTAINS, DEFINES, IMPORTS, CALLS) -**Correct Cypher Patterns:** -- Find callers: \`MATCH (a)-[r:CodeRelation {type: 'CALLS'}]->(b {name: 'targetFunction'}) RETURN a\` -- Find usage: \`MATCH (a)-[r:CodeRelation]->(b {name: 'TargetClass'}) RETURN a, r.type\` -- Semantic Join: \`CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', {{QUERY_VECTOR}}, 10) YIELD node AS emb, distance WITH emb, distance WHERE distance < 0.5 MATCH (n:CodeNode {id: emb.nodeId}) RETURN n\` +**Cypher Examples:** +- All functions: \`MATCH (f:Function) RETURN f.name LIMIT 10\` +- What file defines: \`MATCH (f:File)-[r:CodeRelation {type: 'DEFINES'}]->(fn:Function) WHERE f.name = 'utils.ts' RETURN fn.name\` +- Get all connections: \`MATCH (f:File)-[r:CodeRelation]-(m) WHERE f.name = 'main.ts' RETURN m.name, r.type\` +- Semantic+graph: \`CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', {{QUERY_VECTOR}}, 10) YIELD node AS emb, distance WITH emb, distance WHERE distance < 0.5 MATCH (n:Function {id: emb.nodeId}) RETURN n\` -โŒ **NEVER** use \`MATCH (f:Function)\` or \`MATCH ()-[:CALLS]->()\`. Use properties. -### ๐Ÿ“ OUTPUT STANDARDS -1. **Citations:** Use \`[[file:line]]\` format. -2. **Visuals:** Use \`highlight_in_graph\` to show the user what you are looking at. -3. **Diagrams:** Use Mermaid (wrapped in \`\`\`mermaid) for Architecture, Logic Flow, or Class Structure. +### ๐Ÿ“ OUTPUT +1. **Citations:** \`[[file:line]]\` +2. **Diagrams:** Mermaid when useful +3. **Highlight:** Always highlight nodes you discuss -### ๐Ÿšซ CRITICAL CONSTRAINTS (NO LAZINESS) -- **Iterative Depth:** Do not stop at the surface. If Function A calls Function B, **read Function B**. Trace the logic all the way to the source. -- **Completeness:** Do not answer "I assume..." or "It likely does...". Keep calling tools until you **know**. -- **Error Recovery:** If a tool fails, analyze the error, fix the input, and **retry**. Never give up after one error. -- **REMINDER:** Your unique value is the visual graph. If you talk about a node, **highlight it**.`; +### ๐Ÿšซ RULES +- **Iterate:** Don't stop at surface. Trace logic to source. +- **Verify:** Don't guess. Read the code. +- **Retry:** If a tool fails, fix input and retry.`; /** * Create a chat model instance from provider configuration diff --git a/src/core/llm/tools.ts b/src/core/llm/tools.ts index 1f6d598829..50922e9ffb 100644 --- a/src/core/llm/tools.ts +++ b/src/core/llm/tools.ts @@ -1,18 +1,21 @@ /** * Graph RAG Tools for LangChain Agent * - * Custom tools that allow the agent to interact with the KuzuDB graph database - * for code analysis, semantic search, and graph traversal. + * Consolidated tools (5 total): + * - search: Hybrid search (BM25 + semantic + RRF) with 1-hop expansion + * - cypher: Execute Cypher queries (auto-embeds {{QUERY_VECTOR}} if present) + * - grep: Regex pattern search across files + * - read: Read file content by path + * - highlight: Highlight nodes in graph UI */ import { tool } from '@langchain/core/tools'; import { z } from 'zod'; -import { GRAPH_SCHEMA_DESCRIPTION } from './types'; +// Note: GRAPH_SCHEMA_DESCRIPTION from './types' is available if needed for additional context import { WebGPUNotAvailableError, embedText, embeddingToArray, initEmbedder, isEmbedderReady } from '../embeddings/embedder'; /** * Tool factory - creates tools bound to the KuzuDB query functions - * This is needed because the tools run in the worker and need access to the adapter */ export const createGraphRAGTools = ( executeQuery: (cypher: string) => Promise, @@ -23,419 +26,195 @@ export const createGraphRAGTools = ( isBM25Ready: () => boolean, fileContents: Map ) => { - const buildCypherSchemaHint = (message: string): string => { - const m = message.match(/Table\s+([A-Za-z_][A-Za-z0-9_]*)\s+does\s+not\s+exist/i); - const missing = (m?.[1] ?? '').toUpperCase(); - const knownRelTypes = new Set(['CALLS', 'IMPORTS', 'CONTAINS', 'DEFINES']); - if (!knownRelTypes.has(missing)) return ''; - - return [ - '', - 'Schema hint:', - `- There is NO relationship label/table named "${missing}".`, - "- All relationships use the single relationship label `CodeRelation` with a `type` property.", - `- Rewrite patterns like \`-[:${missing}]->\` to \`-[r:CodeRelation]->\` and add \`WHERE r.type = '${missing}'\`.`, - '', - 'Example:', - "MATCH (a:CodeNode)-[r:CodeRelation]->(b:CodeNode)", - `WHERE r.type = '${missing}'`, - 'RETURN a.id, b.id LIMIT 25', - ].join('\n'); - }; + // ============================================================================ + // TOOL 1: SEARCH (Hybrid + 1-hop expansion) + // ============================================================================ + /** - * Tool: Execute Cypher Query - * Allows the agent to run arbitrary Cypher queries against the graph + * Unified search tool: BM25 + Semantic + RRF, with 1-hop graph context */ - const executeCypherTool = tool( - async ({ query }: { query: string }) => { - try { - const results = await executeQuery(query); - - if (results.length === 0) { - return 'Query returned no results.'; - } - - // Format results nicely for the LLM - const formatted = results.slice(0, 50).map((row, i) => { - // Handle both object and array results - if (Array.isArray(row)) { - return `[${i + 1}] ${row.join(', ')}`; + const searchTool = tool( + async ({ query, limit }: { query: string; limit?: number }) => { + const k = limit ?? 10; + + // Step 1: Hybrid search (BM25 + semantic with RRF) + let searchResults: any[] = []; + + if (isBM25Ready()) { + try { + searchResults = await hybridSearch(query, k); + } catch (error) { + // Fallback to semantic-only if hybrid fails + if (isEmbeddingReady()) { + searchResults = await semanticSearch(query, k); } - return `[${i + 1}] ${JSON.stringify(row)}`; - }); - - const resultText = formatted.join('\n'); - const truncated = results.length > 50 ? `\n... (${results.length - 50} more results truncated)` : ''; - - return `Query returned ${results.length} results:\n${resultText}${truncated}`; - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - const hint = buildCypherSchemaHint(message); - return `Cypher query error: ${message}\n\nPlease check your query syntax and try again.${hint ? `\n\n${hint}` : ''}`; - } - }, - { - name: 'execute_cypher', - description: 'Execute a Cypher query against the code knowledge graph. Use this for structural queries like finding functions, tracing call graphs, or analyzing imports. Call get_graph_schema first if you need to see the database schema.', - schema: z.object({ - query: z.string().describe('The Cypher query to execute. Must be valid KuzuDB Cypher syntax.'), - }), - } - ); - - /** - * Tool: Execute Vector Cypher Query (Unified Vector + Graph in ONE query) - * - * Lets the LLM write a Cypher query that includes a vector index call, - * while this tool handles embedding the natural-language query and injecting - * the vector into the Cypher safely. - * - * IMPORTANT: - * - The provided Cypher MUST include the placeholder {{QUERY_VECTOR}} - * - The placeholder will be replaced with: CAST([..384 floats..] AS FLOAT[384]) - * - KuzuDB requires WITH after YIELD before using WHERE - * - * Example: - * CALL QUERY_VECTOR_INDEX('CodeEmbedding','code_embedding_idx', {{QUERY_VECTOR}}, 10) - * YIELD node AS emb, distance - * WITH emb, distance - * WHERE distance < 0.5 - * MATCH (match:CodeNode {id: emb.nodeId}) ... - */ - const executeVectorCypherTool = tool( - async ({ query, cypher }: { query: string; cypher: string }) => { - if (!isEmbeddingReady()) { - return 'Vector Cypher is not available. Embeddings have not been generated yet.'; + } + } else if (isEmbeddingReady()) { + // Semantic only if BM25 not ready + searchResults = await semanticSearch(query, k); + } else { + return 'Search is not available. Please load a repository first.'; } - - if (!cypher.includes('{{QUERY_VECTOR}}')) { - return "Invalid input: your Cypher must include the placeholder '{{QUERY_VECTOR}}' where a FLOAT[384] vector should go."; + + if (searchResults.length === 0) { + return `No code found matching "${query}". Try different terms or use grep for exact patterns.`; } - - try { - // Ensure embedder is loaded. If WebGPU isn't available, fall back to WASM. - if (!isEmbedderReady()) { + + // Step 2: Get 1-hop connections for each result + const resultsWithContext: string[] = []; + + for (let i = 0; i < Math.min(searchResults.length, k); i++) { + const r = searchResults[i]; + const nodeId = r.nodeId || r.id; + const name = r.name || r.filePath?.split('/').pop() || 'Unknown'; + const label = r.label || 'File'; + const filePath = r.filePath || ''; + const location = r.startLine ? ` (lines ${r.startLine}-${r.endLine})` : ''; + const sources = r.sources?.join('+') || 'hybrid'; + const score = r.score ? ` [score: ${r.score.toFixed(2)}]` : ''; + + // Get 1-hop connections using single CodeRelation table + let connections = ''; + if (nodeId) { try { - await initEmbedder(); - } catch (err) { - if (err instanceof WebGPUNotAvailableError) { - await initEmbedder(undefined, {}, 'wasm'); - } else { - throw err; + const nodeLabel = nodeId.split(':')[0]; + const connectionsQuery = ` + MATCH (n:${nodeLabel} {id: '${nodeId.replace(/'/g, "''")}'}) + OPTIONAL MATCH (n)-[r1:CodeRelation]->(dst) + OPTIONAL MATCH (src)-[r2:CodeRelation]->(n) + RETURN + collect(DISTINCT {name: dst.name, type: r1.type}) AS outgoing, + collect(DISTINCT {name: src.name, type: r2.type}) AS incoming + LIMIT 1 + `; + const connRes = await executeQuery(connectionsQuery); + if (connRes.length > 0) { + // Result is nested array: [[outgoing], [incoming]] or {outgoing: [], incoming: []} + const row = connRes[0]; + const rawOutgoing = Array.isArray(row) ? row[0] : (row.outgoing || []); + const rawIncoming = Array.isArray(row) ? row[1] : (row.incoming || []); + const outgoing = (rawOutgoing || []).filter((c: any) => c && c.name).slice(0, 3); + const incoming = (rawIncoming || []).filter((c: any) => c && c.name).slice(0, 3); + const outList = outgoing.map((c: any) => `-[${c.type}]-> ${c.name}`); + const inList = incoming.map((c: any) => `<-[${c.type}]- ${c.name}`); + if (outList.length || inList.length) { + connections = `\n Connections: ${[...outList, ...inList].join(', ')}`; + } } + } catch { + // Skip connections if query fails } } - - // Embed the natural language query and inject into Cypher - const queryEmbedding = await embedText(query); - const queryVec = embeddingToArray(queryEmbedding); - const queryVecStr = `CAST([${queryVec.join(',')}] AS FLOAT[384])`; - - const finalCypher = cypher.replace(/\{\{\s*QUERY_VECTOR\s*\}\}/g, queryVecStr); - const results = await executeQuery(finalCypher); - - if (results.length === 0) { - return 'Query returned no results.'; - } - - const formatted = results.slice(0, 50).map((row, i) => { - if (Array.isArray(row)) { - return `[${i + 1}] ${row.join(', ')}`; - } - return `[${i + 1}] ${JSON.stringify(row)}`; - }); - - const resultText = formatted.join('\n'); - const truncated = results.length > 50 ? `\n... (${results.length - 50} more results truncated)` : ''; - - return `Query returned ${results.length} results:\n${resultText}${truncated}`; - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - return `Vector Cypher error: ${message}\n\nTip: Ensure you're querying the vector index on CodeEmbedding and JOINing back to CodeNode via emb.nodeId.`; - } - }, - { - name: 'execute_vector_cypher', - description: - "Execute a single Cypher query that combines vector similarity search and graph traversal. Provide a natural-language 'query' to embed, and a 'cypher' string containing the placeholder {{QUERY_VECTOR}}. Use this to do semantic search + traversal in ONE Cypher query. Remember: KuzuDB requires 'WITH emb, distance' after 'YIELD node AS emb, distance' before you can use WHERE.", - schema: z.object({ - query: z.string().describe('Natural language query to embed (used to produce a FLOAT[384] vector)'), - cypher: z - .string() - .describe( - "Cypher query to execute. MUST contain {{QUERY_VECTOR}}. Pattern: CALL QUERY_VECTOR_INDEX('CodeEmbedding','code_embedding_idx', {{QUERY_VECTOR}}, 10) YIELD node AS emb, distance WITH emb, distance WHERE distance < 0.5 MATCH (n:CodeNode {id: emb.nodeId}) ..." - ), - }), - } - ); - - /** - * Tool: Hybrid Code Search - * Combines keyword (BM25) and semantic search for best results - */ - const searchTool = tool( - async ({ query, limit }: { query: string; limit?: number }) => { - if (!isBM25Ready()) { - return 'Search is not available. Please load a repository first.'; - } - - try { - const results = await hybridSearch(query, limit ?? 10); - - if (results.length === 0) { - return `No code found matching "${query}". Try different terms or use grep_code for exact patterns.`; - } - - const formatted = results.map((r: any, i: number) => { - const location = r.startLine ? ` (lines ${r.startLine}-${r.endLine})` : ''; - const label = r.label || 'File'; - const name = r.name || r.filePath.split('/').pop(); - const sources = r.sources?.join('+') || 'hybrid'; - return `[${i + 1}] ${label}: ${name}\n File: ${r.filePath}${location}\n Found by: ${sources}`; - }); - return `Found ${results.length} matches:\n\n${formatted.join('\n\n')}`; - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - return `Search error: ${message}`; + resultsWithContext.push( + `[${i + 1}] ${label}: ${name}${score}\n ID: ${nodeId}\n File: ${filePath}${location}\n Found by: ${sources}${connections}` + ); } + + return `Found ${searchResults.length} matches:\n\n${resultsWithContext.join('\n\n')}`; }, { name: 'search', - description: 'Search for code by keywords or concepts. Finds relevant files and functions.', + description: 'Search for code by keywords or concepts. Combines keyword matching and semantic understanding. Returns relevant code with their graph connections (what calls them, what they import, etc.).', schema: z.object({ - query: z.string().describe('What you are looking for'), - limit: z.number().optional().nullable().describe('Max results (default: 10)'), + query: z.string().describe('What you are looking for (e.g., "authentication middleware", "database connection")'), + limit: z.number().optional().nullable().describe('Max results to return (default: 10)'), }), } ); + // ============================================================================ + // TOOL 2: CYPHER (Raw Cypher, auto-embeds {{QUERY_VECTOR}} if present) + // ============================================================================ + /** - * Tool: Semantic Search with Graph Context - * Find similar code AND expand to connected nodes (flattened format with relationship types) + * Execute Cypher queries with optional vector embedding */ - const semanticSearchWithContextTool = tool( - async ({ query, limit }: { query: string; limit?: number }) => { - if (!isEmbeddingReady()) { - return 'Semantic search is not available. Embeddings have not been generated yet. Please use execute_cypher tool for structured queries instead.'; - } - + const cypherTool = tool( + async ({ query, cypher }: { query?: string; cypher: string }) => { try { - const results = await semanticSearchWithContext(query, limit ?? 5); + let finalCypher = cypher; - if (results.length === 0) { - return `No code found matching "${query}". Try a different search term.`; - } - - // Results are flattened: one row per (match โ†’ connected) pair - // Group by match for cleaner output - const grouped = new Map; - }>(); - - for (const r of results) { - const matchId = r.matchId ?? r[0]; - const matchName = r.matchName ?? r[1]; - const matchLabel = r.matchLabel ?? r[2]; - const matchPath = r.matchPath ?? r[3]; - const distance = r.distance ?? r[4]; - const connectedName = r.connectedName ?? r[6]; - const connectedLabel = r.connectedLabel ?? r[7]; - const relationType = r.relationType ?? r[8]; + // Auto-embed if {{QUERY_VECTOR}} placeholder is present + if (cypher.includes('{{QUERY_VECTOR}}')) { + if (!query) { + return "Error: Your Cypher contains {{QUERY_VECTOR}} but you didn't provide a 'query' to embed. Add a natural language query."; + } - if (!grouped.has(matchId)) { - grouped.set(matchId, { - matchId, - matchName, - matchLabel, - matchPath, - distance, - connections: [], - }); + if (!isEmbeddingReady()) { + // Try to init embedder + try { + await initEmbedder(); + } catch (err) { + if (err instanceof WebGPUNotAvailableError) { + await initEmbedder(undefined, {}, 'wasm'); + } else { + return 'Embeddings not available. Remove {{QUERY_VECTOR}} and use a non-vector query.'; + } + } } - grouped.get(matchId)!.connections.push({ - name: connectedName, - label: connectedLabel, - relType: relationType, - }); + const queryEmbedding = await embedText(query); + const queryVec = embeddingToArray(queryEmbedding); + const queryVecStr = `CAST([${queryVec.join(',')}] AS FLOAT[384])`; + finalCypher = cypher.replace(/\{\{\s*QUERY_VECTOR\s*\}\}/g, queryVecStr); } - // Format grouped results - const formatted = Array.from(grouped.values()).map((g, i) => { - const connectionsList = g.connections - .slice(0, 15) - .map(c => `${c.name} (${c.label}) via ${c.relType}`) - .join('\n '); - const more = g.connections.length > 15 ? `\n ... and ${g.connections.length - 15} more` : ''; - - return `[${i + 1}] ${g.matchLabel}: ${g.matchName}\n ID: ${g.matchId}\n File: ${g.matchPath}\n Relevance: ${(1 - g.distance).toFixed(2)}\n Connections:\n ${connectionsList}${more}`; - }); - - return `Found ${grouped.size} code elements with ${results.length} total connections (use ID with get_code_content to see source):\n\n${formatted.join('\n\n')}`; - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - return `Search with context error: ${message}`; - } - }, - { - name: 'semantic_search_with_context', - description: 'Search for code semantically AND show directly connected code elements with relationship types (CALLS, IMPORTS, DEFINES, CONTAINS). Shows what each match is connected to and how.', - schema: z.object({ - query: z.string().describe('Natural language description of what you are looking for'), - limit: z.number().optional().nullable().describe('Number of semantic matches to find (default: 5)'), - }), - } - ); - - /** - * Tool: Get Graph Schema - * Returns the schema for reference - LLM should call this before writing Cypher queries - */ - const getSchemaTool = tool( - async ({ includeExamples }: { includeExamples?: boolean }) => { - return GRAPH_SCHEMA_DESCRIPTION; - }, - { - name: 'get_graph_schema', - description: 'Get the graph database schema including node types, relationships, and Cypher query patterns. Call this before writing Cypher queries.', - schema: z.object({ - includeExamples: z.boolean().optional().nullable().describe('Whether to include query examples (default: true)'), - }), - } - ); - - /** - * Tool: Get Code Content - * Retrieve the source code for a specific node - * Uses fileContents Map for full content (not truncated DB content) - */ - const getCodeContentTool = tool( - async ({ nodeId }: { nodeId: string }) => { - try { - // Query graph for node metadata (fast, small data) - const results = await executeQuery( - `MATCH (n:CodeNode {id: '${nodeId.replace(/'/g, "''")}'}) - RETURN n.name AS name, n.label AS label, n.filePath AS filePath, - n.startLine AS startLine, n.endLine AS endLine` - ); + const results = await executeQuery(finalCypher); if (results.length === 0) { - return `No node found with ID: ${nodeId}`; - } - - const node = results[0]; - const name = node.name ?? node[0]; - const label = node.label ?? node[1]; - const filePath = node.filePath ?? node[2]; - const startLine = node.startLine ?? node[3]; - const endLine = node.endLine ?? node[4]; - - // Get FULL content from fileContents Map (not truncated DB) - const fileContent = fileContents.get(filePath); - - if (!fileContent) { - return `${label}: ${name}\nFile: ${filePath}\n(File content not available in memory)`; + return 'Query returned no results.'; } - // For File nodes, return full content (limited for very large files) - if (label === 'File' || label === 'Folder') { - const MAX_FILE_CONTENT = 30000; - if (fileContent.length > MAX_FILE_CONTENT) { - return `${label}: ${name}\nFile: ${filePath}\nTotal size: ${fileContent.length} characters\n\n\`\`\`\n${fileContent.slice(0, MAX_FILE_CONTENT)}\n\`\`\`\n\n... [truncated, use read_file for full content]`; + // Format results + const formatted = results.slice(0, 50).map((row, i) => { + if (Array.isArray(row)) { + return `[${i + 1}] ${row.join(', ')}`; } - return `${label}: ${name}\nFile: ${filePath}\n\n\`\`\`\n${fileContent}\n\`\`\``; - } - - // For Function/Class/Method nodes, extract specific lines with context - const lines = fileContent.split('\n'); - const contextBefore = 3; - const contextAfter = 20; // Show more after to capture full function body - - const start = Math.max(0, (startLine ?? 0) - contextBefore); - const end = Math.min(lines.length - 1, (endLine ?? startLine ?? 0) + contextAfter); + return `[${i + 1}] ${JSON.stringify(row)}`; + }); - const snippet = lines.slice(start, end + 1).join('\n'); + const resultText = formatted.join('\n'); + const truncated = results.length > 50 ? `\n... (${results.length - 50} more results)` : ''; - return `${label}: ${name}\nFile: ${filePath}\nLines: ${startLine + 1}-${endLine + 1}\n\n\`\`\`\n${snippet}\n\`\`\``; + return `Query returned ${results.length} results:\n${resultText}${truncated}`; } catch (error) { const message = error instanceof Error ? error.message : String(error); - return `Error retrieving code: ${message}`; + return `Cypher error: ${message}\n\nCheck your query syntax. Node tables: File, Folder, Function, Class, Interface, Method, CodeElement. Relation: CodeRelation with type property (CONTAINS, DEFINES, IMPORTS, CALLS). Example: MATCH (f:File)-[:CodeRelation {type: 'IMPORTS'}]->(g:File) RETURN f, g`; } }, { - name: 'get_code_content', - description: 'Retrieve the source code content for a specific node by its ID. Use this after finding relevant nodes to see the actual implementation.', - schema: z.object({ - nodeId: z.string().describe('The ID of the node to retrieve code for'), - }), - } - ); + name: 'cypher', + description: `Execute a Cypher query against the code graph. Use for structural queries like finding callers, tracing imports, or custom traversals. - /** - * Tool: Get Codebase Statistics - * Quick overview of what's in the graph - */ - const getStatsTool = tool( - async ({ verbose }: { verbose?: boolean }) => { - try { - const labelCounts = await executeQuery(` - MATCH (n:CodeNode) - RETURN n.label AS label, count(*) AS count - ORDER BY count DESC - `); - - const relCounts = await executeQuery(` - MATCH ()-[r:CodeRelation]->() - RETURN r.type AS type, count(*) AS count - ORDER BY count DESC - `); - - const nodeStats = labelCounts.map(r => { - const label = r.label ?? r[0]; - const count = r.count ?? r[1]; - return ` ${label}: ${count}`; - }).join('\n'); - - const relStats = relCounts.map(r => { - const type = r.type ?? r[0]; - const count = r.count ?? r[1]; - return ` ${type}: ${count}`; - }).join('\n'); - - const embeddingStatus = isEmbeddingReady() - ? 'Ready (semantic search available)' - : 'Not generated (use execute_cypher for queries)'; - - return `Codebase Statistics:\n\nNodes by type:\n${nodeStats}\n\nRelationships by type:\n${relStats}\n\nEmbeddings: ${embeddingStatus}\n\nFiles in memory: ${fileContents.size}`; - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - return `Error getting stats: ${message}`; - } - }, - { - name: 'get_codebase_stats', - description: 'Get an overview of the codebase including counts of different element types (files, functions, classes) and relationship types.', +Node tables: File, Folder, Function, Class, Interface, Method, CodeElement +Relation: CodeRelation (single table with 'type' property: CONTAINS, DEFINES, IMPORTS, CALLS) + +Example queries: +- Files importing a file: MATCH (f:File)-[:CodeRelation {type: 'IMPORTS'}]->(target:File) WHERE target.name = 'utils.ts' RETURN f.name +- Functions defined in file: MATCH (f:File {name: 'main.ts'})-[:CodeRelation {type: 'DEFINES'}]->(fn:Function) RETURN fn.name +- All connections: MATCH (f:File {name: 'index.ts'})-[r:CodeRelation]-(m) RETURN m.name, r.type + +For semantic+graph queries, include {{QUERY_VECTOR}} placeholder and provide a 'query' parameter: +CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', {{QUERY_VECTOR}}, 10) YIELD node AS emb, distance +WITH emb, distance WHERE distance < 0.5 +MATCH (n:Function {id: emb.nodeId}) RETURN n`, schema: z.object({ - verbose: z.boolean().optional().nullable().describe('Include detailed breakdown (default: false)'), + cypher: z.string().describe('The Cypher query to execute'), + query: z.string().optional().nullable().describe('Natural language query to embed (required if cypher contains {{QUERY_VECTOR}})'), }), } ); - /** - * Tool: Grep Code - * Search for patterns across all file contents using regex - */ - const grepCodeTool = tool( - async ({ pattern, filePattern, caseSensitive, maxResults }: { + // ============================================================================ + // TOOL 3: GREP (Regex pattern search) + // ============================================================================ + + const grepTool = tool( + async ({ pattern, fileFilter, caseSensitive, maxResults }: { pattern: string; - filePattern?: string; + fileFilter?: string; caseSensitive?: boolean; maxResults?: number; }) => { @@ -445,20 +224,14 @@ export const createGraphRAGTools = ( try { regex = new RegExp(pattern, flags); } catch (e) { - return `Invalid regex pattern: ${pattern}. Error: ${e instanceof Error ? e.message : String(e)}`; + return `Invalid regex: ${pattern}. Error: ${e instanceof Error ? e.message : String(e)}`; } - const results: Array<{ - file: string; - line: number; - content: string; - }> = []; - + const results: Array<{ file: string; line: number; content: string }> = []; const limit = maxResults ?? 100; for (const [filePath, content] of fileContents.entries()) { - // Optional file pattern filter - if (filePattern && !filePath.toLowerCase().includes(filePattern.toLowerCase())) { + if (fileFilter && !filePath.toLowerCase().includes(fileFilter.toLowerCase())) { continue; } @@ -468,105 +241,83 @@ export const createGraphRAGTools = ( results.push({ file: filePath, line: i + 1, - content: lines[i].trim().slice(0, 200), // Limit line length + content: lines[i].trim().slice(0, 150), }); - if (results.length >= limit) break; } - // Reset regex lastIndex for global flag regex.lastIndex = 0; } - if (results.length >= limit) break; } if (results.length === 0) { - return `No matches found for pattern: "${pattern}"${filePattern ? ` in files matching "${filePattern}"` : ''}`; + return `No matches for "${pattern}"${fileFilter ? ` in files matching "${fileFilter}"` : ''}`; } - const formatted = results.map(r => - `${r.file}:${r.line}: ${r.content}` - ).join('\n'); + const formatted = results.map(r => `${r.file}:${r.line}: ${r.content}`).join('\n'); + const truncatedMsg = results.length >= limit ? `\n\n(Showing first ${limit} results)` : ''; - const truncatedMsg = results.length >= limit - ? `\n\n(Showing first ${limit} results. Use filePattern to narrow search.)` - : ''; - - return `Found ${results.length} matches for "${pattern}":\n\n${formatted}${truncatedMsg}`; + return `Found ${results.length} matches:\n\n${formatted}${truncatedMsg}`; } catch (error) { - const message = error instanceof Error ? error.message : String(error); - return `Grep error: ${message}`; + return `Grep error: ${error instanceof Error ? error.message : String(error)}`; } }, { - name: 'grep_code', - description: 'Search for text patterns across all files in the codebase using regex. Use this to find exact strings, error messages, TODOs, specific variable names, or any text pattern. Returns file paths and line numbers of matches.', + name: 'grep', + description: 'Search for exact text patterns across all files using regex. Use for finding specific strings, error messages, TODOs, variable names, etc.', schema: z.object({ pattern: z.string().describe('Regex pattern to search for (e.g., "TODO", "console\\.log", "API_KEY")'), - filePattern: z.string().optional().nullable().describe('Optional filter - only search files whose path contains this string (e.g., ".ts", "src/api")'), - caseSensitive: z.boolean().optional().nullable().describe('Whether search is case-sensitive (default: false)'), - maxResults: z.number().optional().nullable().describe('Maximum number of results to return (default: 100)'), + fileFilter: z.string().optional().nullable().describe('Only search files containing this string (e.g., ".ts", "src/api")'), + caseSensitive: z.boolean().optional().nullable().describe('Case-sensitive search (default: false)'), + maxResults: z.number().optional().nullable().describe('Max results (default: 100)'), }), } ); - /** - * Tool: Read File - * Read the full content of a file by its path - */ - const readFileTool = tool( + // ============================================================================ + // TOOL 4: READ (Read file content) + // ============================================================================ + + const readTool = tool( async ({ filePath }: { filePath: string }) => { - // Normalize the requested path (handle Windows-style paths) const normalizedRequest = filePath.replace(/\\/g, '/').toLowerCase(); // Try exact match first let content = fileContents.get(filePath); let actualPath = filePath; - // If not found, try smarter matching + // Smart matching if not found if (!content) { - // Score each file path by how well it matches the request const candidates: Array<{ path: string; score: number }> = []; for (const [path] of fileContents.entries()) { const normalizedPath = path.toLowerCase(); - // Exact match (case-insensitive) if (normalizedPath === normalizedRequest) { candidates.push({ path, score: 1000 }); - continue; - } - - // Ends with the requested path (e.g., "README.md" matches "src/agent_service/README.md") - if (normalizedPath.endsWith(normalizedRequest)) { - // Score higher for shorter paths (more specific match) + } else if (normalizedPath.endsWith(normalizedRequest)) { candidates.push({ path, score: 100 + (200 - path.length) }); - continue; - } - - // Path contains all segments of the request in order - const requestSegments = normalizedRequest.split('/').filter(Boolean); - const pathSegments = normalizedPath.split('/'); - let matchScore = 0; - let lastMatchIdx = -1; - - for (const seg of requestSegments) { - const idx = pathSegments.findIndex((s, i) => i > lastMatchIdx && s.includes(seg)); - if (idx > lastMatchIdx) { - matchScore += 10; - lastMatchIdx = idx; + } else { + const requestSegments = normalizedRequest.split('/').filter(Boolean); + const pathSegments = normalizedPath.split('/'); + let matchScore = 0; + let lastMatchIdx = -1; + + for (const seg of requestSegments) { + const idx = pathSegments.findIndex((s, i) => i > lastMatchIdx && s.includes(seg)); + if (idx > lastMatchIdx) { + matchScore += 10; + lastMatchIdx = idx; + } + } + + if (matchScore >= requestSegments.length * 5) { + candidates.push({ path, score: matchScore }); } - } - - // Only include if we matched more than half the segments - if (matchScore >= requestSegments.length * 5) { - candidates.push({ path, score: matchScore }); } } - // Sort by score descending, pick best match candidates.sort((a, b) => b.score - a.score); - if (candidates.length > 0) { actualPath = candidates[0].path; content = fileContents.get(actualPath); @@ -574,84 +325,70 @@ export const createGraphRAGTools = ( } if (!content) { - // List similar files to help the user const fileName = filePath.split('/').pop()?.toLowerCase() || ''; - const similarFiles = Array.from(fileContents.keys()) + const similar = Array.from(fileContents.keys()) .filter(p => p.toLowerCase().includes(fileName)) .slice(0, 5); - if (similarFiles.length > 0) { - return `File not found: "${filePath}"\n\nDid you mean one of these?\n${similarFiles.map(f => ` - ${f}`).join('\n')}`; + if (similar.length > 0) { + return `File not found: "${filePath}"\n\nDid you mean:\n${similar.map(f => ` - ${f}`).join('\n')}`; } return `File not found: "${filePath}"`; } - // For very large files, truncate with a warning - const MAX_CONTENT = 50000; // ~50KB + // Truncate large files + const MAX_CONTENT = 50000; if (content.length > MAX_CONTENT) { - const truncated = content.slice(0, MAX_CONTENT); const lines = content.split('\n').length; - return `File: ${actualPath}\nTotal lines: ${lines}\n\n(Showing first ${MAX_CONTENT} characters, file is ${content.length} characters total)\n\n${truncated}\n\n... [truncated]`; + return `File: ${actualPath} (${lines} lines, truncated)\n\n${content.slice(0, MAX_CONTENT)}\n\n... [truncated]`; } const lines = content.split('\n').length; - return `File: ${actualPath}\nLines: ${lines}\n\n${content}`; + return `File: ${actualPath} (${lines} lines)\n\n${content}`; }, { - name: 'read_file', - description: 'Read the full content of a file by its path. Use this to see the complete source code of any file in the codebase.', + name: 'read', + description: 'Read the full content of a file. Use to see source code after finding files via search or grep.', schema: z.object({ - filePath: z.string().describe('The file path to read (can be partial path like "src/utils.ts")'), + filePath: z.string().describe('File path to read (can be partial like "src/utils.ts")'), }), } ); - /** - * Tool: Highlight in Graph - * Highlight specific nodes in the visual knowledge graph - * Returns a special marker that the UI parses to highlight nodes - */ - const highlightInGraphTool = tool( + // ============================================================================ + // TOOL 5: HIGHLIGHT (Highlight nodes in graph UI) + // ============================================================================ + + const highlightTool = tool( async ({ nodeIds, description }: { nodeIds: string[]; description?: string }) => { if (!nodeIds || nodeIds.length === 0) { - return 'No node IDs provided to highlight.'; + return 'No node IDs provided.'; } - // Return a special marker format that the UI will parse - // Format: [HIGHLIGHT_NODES:id1,id2,id3] const marker = `[HIGHLIGHT_NODES:${nodeIds.join(',')}]`; + const desc = description || `Highlighting ${nodeIds.length} node(s)`; - const desc = description || `Highlighting ${nodeIds.length} node(s) in the knowledge graph`; - return `${desc}\n\n${marker}\n\nThe nodes have been highlighted in the graph visualization on the left. You can click on them to see their details.`; + return `${desc}\n\n${marker}\n\nNodes highlighted in the graph.`; }, { - name: 'highlight_in_graph', - description: `Highlight specific nodes in the visual knowledge graph. Pass the EXACT node IDs from your query results. - -IMPORTANT: Node IDs include a label prefix! Format is: Label:filepath:name -Examples: -- Class:src/agents/base.py:BaseAgent -- Function:src/utils.ts:calculateSum -- File:src/main.py - -Copy the ID EXACTLY as it appears in query results (the "classId", "fnId", "fileId", etc. columns).`, + name: 'highlight', + description: 'Highlight nodes in the visual graph. Use node IDs from search/cypher results (format: Label:filepath:name).', schema: z.object({ - nodeIds: z.array(z.string()).describe('Array of EXACT node IDs to highlight - must include the label prefix like "Class:" or "Function:"'), - description: z.string().optional().nullable().describe('Brief description of what these nodes represent'), + nodeIds: z.array(z.string()).describe('Node IDs to highlight (e.g., ["Function:src/utils.ts:calculate"])'), + description: z.string().optional().nullable().describe('What these nodes represent'), }), } ); + // ============================================================================ + // RETURN ALL TOOLS + // ============================================================================ + return [ - executeCypherTool, - executeVectorCypherTool, searchTool, - semanticSearchWithContextTool, - getSchemaTool, - getCodeContentTool, - getStatsTool, - grepCodeTool, - readFileTool, - highlightInGraphTool, + cypherTool, + grepTool, + readTool, + highlightTool, ]; }; diff --git a/src/core/llm/types.ts b/src/core/llm/types.ts index b7b4087735..c01811933f 100644 --- a/src/core/llm/types.ts +++ b/src/core/llm/types.ts @@ -200,85 +200,127 @@ export interface AgentStep { * Graph schema information for LLM context */ export const GRAPH_SCHEMA_DESCRIPTION = ` -KUZU GRAPH DATABASE SCHEMA: - -โš ๏ธ CRITICAL: There is NO "File" table, NO "Function" table, etc! -โš ๏ธ ALL nodes use the SINGLE "CodeNode" table with a "label" property! -โŒ WRONG: MATCH (f:File) or MATCH (fn:Function) -โœ… RIGHT: MATCH (n:CodeNode {label: 'File'}) or MATCH (n:CodeNode {label: 'Function'}) +KUZU GRAPH DATABASE SCHEMA (Multi-Table): NODE TABLES: -1. CodeNode - All code elements (polymorphic) +1. File - Source files + - id: STRING (primary key) + - name: STRING + - filePath: STRING + - content: STRING + +2. Folder - Directories + - id: STRING (primary key) + - name: STRING + - filePath: STRING + +3. Function - Function definitions + - id: STRING (primary key) + - name: STRING + - filePath: STRING + - startLine: INT64 + - endLine: INT64 + - content: STRING + +4. Class - Class definitions + - id: STRING (primary key) + - name: STRING + - filePath: STRING + - startLine: INT64 + - endLine: INT64 + - content: STRING + +5. Interface - Interface/Type definitions - id: STRING (primary key) - - label: STRING (one of: File, Folder, Function, Class, Method, Interface) - - name: STRING (element name) - - filePath: STRING (path in project) - - startLine: INT64 (line number where element starts) - - endLine: INT64 (line number where element ends) - - content: STRING (source code snippet) + - name: STRING + - filePath: STRING + - startLine: INT64 + - endLine: INT64 + - content: STRING -2. CodeEmbedding - Vector embeddings (SEPARATE TABLE for memory efficiency) - - nodeId: STRING (primary key, references CodeNode.id) - - embedding: FLOAT[384] (semantic vector) +6. Method - Class methods + - id: STRING (primary key) + - name: STRING + - filePath: STRING + - startLine: INT64 + - endLine: INT64 + - content: STRING + +7. CodeElement - Other code elements (fallback) + - id: STRING (primary key) + - name: STRING + - filePath: STRING + - startLine: INT64 + - endLine: INT64 + - content: STRING + +8. CodeEmbedding - Vector embeddings (separate for efficiency) + - nodeId: STRING (primary key) + - embedding: FLOAT[384] RELATIONSHIP TABLE: -- CodeRelation (FROM CodeNode TO CodeNode) - - type: STRING (one of: CALLS, IMPORTS, CONTAINS, DEFINES) +CodeRelation - Single table with 'type' property connecting all node tables + - type: STRING (values: CONTAINS, DEFINES, IMPORTS, CALLS) -IMPORTANT QUERY PATTERNS: +Connection patterns: +- CONTAINS: Folder->Folder, Folder->File +- DEFINES: File->Function, File->Class, File->Interface, File->Method, File->CodeElement +- IMPORTS: File->File +- CALLS: File->Function, File->Method, Function->Function, Function->Method -1. Basic node queries: - MATCH (n:CodeNode {label: 'Function'}) RETURN n.name, n.filePath LIMIT 10 +QUERY PATTERNS: -2. Relationship traversal: - MATCH (f:CodeNode {label: 'File'})-[r:CodeRelation {type: 'DEFINES'}]->(fn:CodeNode {label: 'Function'}) - RETURN f.name AS file, fn.name AS function +1. Find all functions: + MATCH (f:Function) RETURN f.name, f.filePath LIMIT 10 -3. SEMANTIC SEARCH (embeddings in separate table - MUST JOIN): - CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', $queryVector, 10) - YIELD node AS emb, distance - WITH emb, distance -- KuzuDB requires WITH after YIELD before WHERE - WHERE distance < 0.4 - MATCH (n:CodeNode {id: emb.nodeId}) -- JOIN required! - RETURN n.name, n.label, n.filePath, distance - ORDER BY distance +2. Find what a file defines: + MATCH (f:File)-[:CodeRelation {type: 'DEFINES'}]->(fn:Function) + WHERE f.name = 'utils.ts' + RETURN fn.name -4. Semantic search + graph expansion: - CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', $queryVector, 5) - YIELD node AS emb, distance - WITH emb, distance - WHERE distance < 0.5 - MATCH (match:CodeNode {id: emb.nodeId}) - MATCH (match)-[r:CodeRelation*1..2]-(connected:CodeNode) - RETURN match.name, distance, collect(DISTINCT connected.name) AS related +3. Find function callers: + MATCH (caller:File)-[:CodeRelation {type: 'CALLS'}]->(fn:Function {name: 'myFunction'}) + RETURN caller.name, caller.filePath -5. Find callers of a function: - MATCH (caller:CodeNode)-[r:CodeRelation {type: 'CALLS'}]->(fn:CodeNode {name: $functionName}) - RETURN caller.name, caller.label, caller.filePath +4. Find imports: + MATCH (f:File {name: 'main.ts'})-[:CodeRelation {type: 'IMPORTS'}]->(imported:File) + RETURN imported.name -6. Import chain analysis: - MATCH (f:CodeNode {name: $fileName})-[r:CodeRelation {type: 'IMPORTS'}]->(imported:CodeNode) - RETURN imported.name AS imports +5. Find files that import a specific file: + MATCH (f:File)-[:CodeRelation {type: 'IMPORTS'}]->(target:File {name: 'utils.ts'}) + RETURN f.name, f.filePath -7. Unified vector + graph traversal in ONE query (recommended pattern): +6. SEMANTIC SEARCH (embeddings in separate table - MUST JOIN): CALL QUERY_VECTOR_INDEX('CodeEmbedding', 'code_embedding_idx', $queryVector, 10) YIELD node AS emb, distance WITH emb, distance - WHERE distance < 0.5 - MATCH (match:CodeNode {id: emb.nodeId}) - MATCH (match)-[r:CodeRelation*1..2]-(ctx:CodeNode) - RETURN match.name AS found, match.label AS label, match.filePath AS path, - distance, collect(DISTINCT ctx.name) AS context + WHERE distance < 0.4 + MATCH (n:Function {id: emb.nodeId}) + RETURN n.name, n.filePath, distance ORDER BY distance +7. Search across all code types (use UNION or separate queries): + MATCH (f:Function) WHERE f.name CONTAINS 'auth' RETURN f.id, f.name, 'Function' AS type + UNION ALL + MATCH (c:Class) WHERE c.name CONTAINS 'auth' RETURN c.id, c.name, 'Class' AS type + +8. Folder structure: + MATCH (parent:Folder)-[:CodeRelation {type: 'CONTAINS'}]->(child) + WHERE parent.name = 'src' + RETURN child.name, labels(child)[0] AS type + +9. Get all connections for a node: + MATCH (f:File {name: 'index.ts'})-[r:CodeRelation]-(m) + RETURN m.name, r.type + TOOLING NOTE (for execute_vector_cypher): -- When using the execute_vector_cypher tool, write Cypher containing {{QUERY_VECTOR}} where the vector should go. -- The tool will replace {{QUERY_VECTOR}} with a CAST([..] AS FLOAT[384]) literal. +- Write Cypher containing {{QUERY_VECTOR}} where the vector should go. +- The tool will replace {{QUERY_VECTOR}} with CAST([..] AS FLOAT[384]). NOTES: -- Always use WHERE clauses to filter by label when possible for performance +- Use proper table names: File, Folder, Function, Class, Interface, Method, CodeElement +- Use CodeRelation with type property: [:CodeRelation {type: 'DEFINES'}] +- For vector search, join CodeEmbedding.nodeId to the appropriate table's id - Use LIMIT to avoid returning too many results -- For semantic search, the vector index is on CodeEmbedding table, not CodeNode -- distance in vector search is cosine distance (0 = identical, 1 = orthogonal) `; diff --git a/src/services/git-clone.ts b/src/services/git-clone.ts index 7924462521..8d46273774 100644 --- a/src/services/git-clone.ts +++ b/src/services/git-clone.ts @@ -146,7 +146,15 @@ export const cloneRepository = async ( */ const readAllFiles = async (baseDir: string, currentDir: string): Promise => { const files: FileEntry[] = []; - const entries = await pfs.readdir(currentDir); + + let entries: string[]; + try { + entries = await pfs.readdir(currentDir); + } catch (err) { + // Directory might not exist or be inaccessible + console.warn(`Cannot read directory: ${currentDir}`); + return files; + } for (const entry of entries) { // Skip .git directory @@ -158,7 +166,17 @@ const readAllFiles = async (baseDir: string, currentDir: string): Promise