Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion gitnexus/src/core/graph/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ export type NodeLabel =
| 'Delegate'
| 'Annotation'
| 'Constructor'
| 'Template';
| 'Template'
| 'Section';


import { SupportedLanguages } from '../../config/supported-languages.js';
Expand Down Expand Up @@ -65,6 +66,8 @@ export type NodeProperties = {
entryPointReason?: string,
// Method signature (for MRO disambiguation)
parameterCount?: number,
// Section-specific (markdown heading level, 1-6)
level?: number,
returnType?: string,
}

Expand Down
157 changes: 157 additions & 0 deletions gitnexus/src/core/ingestion/markdown-processor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/**
* Markdown Processor
*
* Extracts structure from .md files using regex (no tree-sitter dependency).
* Creates Section nodes for headings with hierarchy, and IMPORTS edges for
* cross-file links.
*/

import path from 'node:path';
import { generateId } from '../../lib/utils.js';
import { KnowledgeGraph, GraphNode, GraphRelationship } from '../graph/types.js';

/** ATX heading: 1-6 `#` characters, whitespace, then the heading text. */
const HEADING_RE = /^(#{1,6})\s+(.+)$/;
/** Inline link or image: `[text](destination)`. Global, so `lastIndex` must be reset per file. */
const LINK_RE = /\[([^\]]*)\]\(([^)]+)\)/g;
/** Code fence line: up to 3 spaces of indent, then 3+ backticks or tildes, then an optional info string. */
const FENCE_RE = /^ {0,3}(`{3,}|~{3,})(.*)$/;
const MD_EXTENSIONS = new Set(['.md', '.mdx']);
/** Link destinations that never point at a file in the repo. */
const NON_FILE_PREFIXES = ['http://', 'https://', '#', 'mailto:'];

interface MdFile {
  path: string;
  content: string;
}

interface HeadingInfo {
  level: number;
  heading: string;
  lineNum: number;
}

/**
 * Collects ATX headings from the lines of a markdown document.
 *
 * Lines inside fenced code blocks (``` or ~~~) are ignored, so a shell
 * comment such as `# install` inside a fence is not reported as a heading.
 * An unclosed fence extends to the end of the document.
 */
const collectHeadings = (lines: string[]): HeadingInfo[] => {
  const headings: HeadingInfo[] = [];
  let openFence: { marker: string; length: number } | null = null;

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const fence = FENCE_RE.exec(line);

    if (openFence) {
      // A closing fence uses the same character, is at least as long as the
      // opening fence, and carries no info string.
      if (
        fence &&
        fence[1].charAt(0) === openFence.marker &&
        fence[1].length >= openFence.length &&
        fence[2].trim() === ''
      ) {
        openFence = null;
      }
      continue;
    }

    // A backtick fence may not have backticks in its info string; that rule
    // keeps one-line inline code spans from being mistaken for a fence.
    if (fence && (fence[1].charAt(0) === '~' || !fence[2].includes('`'))) {
      openFence = { marker: fence[1].charAt(0), length: fence[1].length };
      continue;
    }

    const match = HEADING_RE.exec(line);
    if (!match) continue;

    headings.push({
      level: match[1].length,
      heading: match[2].trim(),
      lineNum: i + 1, // 1-indexed
    });
  }

  return headings;
};

/** Percent-decodes a link path, returning it unchanged when it is not valid encoding. */
const safeDecode = (value: string): string => {
  try {
    return decodeURIComponent(value);
  } catch {
    return value;
  }
};

/**
 * Builds the repo paths a link destination may refer to, most literal first.
 *
 * The first candidate is the raw destination (minus any `#fragment`) resolved
 * against the linking file's directory. Later candidates additionally strip a
 * link title (`file.md "Title"`), `<...>` wrapping and `?query`, decode
 * percent-escapes, and treat a leading `/` as relative to the repo root.
 */
const linkTargetCandidates = (href: string, fileDir: string): string[] => {
  const trimmed = href.trim();
  const bracketed = /^<([^>]*)>/.exec(trimmed);
  const destination = bracketed ? bracketed[1] : trimmed.split(/\s+/)[0];
  // Strip the fragment/query BEFORE decoding so an encoded `%23` stays part of the name.
  const barePath = destination.split(/[?#]/)[0];
  const variants = [href.split('#')[0], barePath, safeDecode(barePath)];

  const candidates: string[] = [];
  const add = (candidate: string): void => {
    if (!candidates.includes(candidate)) candidates.push(candidate);
  };

  for (const variant of variants) {
    if (!variant) continue;
    add(path.posix.normalize(path.posix.join(fileDir, variant)));
    if (variant.startsWith('/')) {
      const fromRoot = variant.replace(/^\/+/, '');
      if (fromRoot) add(path.posix.normalize(fromRoot));
    }
  }

  return candidates;
};

/**
 * Adds markdown structure to the knowledge graph.
 *
 * For every `.md` / `.mdx` file that already has a File node:
 *  - each heading becomes a `Section` node linked by a `CONTAINS` edge to its
 *    nearest enclosing heading of a lower level, or to the File node;
 *  - each link to another file listed in `allPathSet` becomes one `IMPORTS`
 *    edge between the two File nodes (deduplicated per file pair).
 *
 * @param graph - Graph to add nodes and relationships to.
 * @param files - Markdown files with their contents; other extensions are skipped.
 * @param allPathSet - Every known repo path, used to validate link targets.
 * @returns Number of Section nodes and IMPORTS edges created.
 */
export const processMarkdown = (
  graph: KnowledgeGraph,
  files: MdFile[],
  allPathSet: Set<string>,
): { sections: number; links: number } => {
  let totalSections = 0;
  let totalLinks = 0;

  for (const file of files) {
    const ext = path.extname(file.path).toLowerCase();
    if (!MD_EXTENSIONS.has(ext)) continue;

    const fileNodeId = generateId('File', file.path);
    // Skip if file node doesn't exist (shouldn't happen, structure-processor creates it)
    if (!graph.getNode(fileNodeId)) continue;

    // Accept both LF and CRLF: a trailing '\r' would stop HEADING_RE from
    // matching, because '.' never matches a carriage return.
    const lines = file.content.split(/\r?\n/);

    // --- Extract headings and build hierarchy ---
    // First pass: collect all heading positions so we can compute endLine spans
    const headings = collectHeadings(lines);

    // Second pass: create nodes with proper endLine spans
    const sectionStack: { level: number; id: string }[] = [];

    for (let h = 0; h < headings.length; h++) {
      const { level, heading, lineNum } = headings[h];

      // endLine = line before next heading at same or higher level, or EOF
      let endLine = lines.length;
      for (let j = h + 1; j < headings.length; j++) {
        if (headings[j].level <= level) {
          endLine = headings[j].lineNum - 1;
          break;
        }
      }

      const sectionId = generateId('Section', `${file.path}:L${lineNum}:${heading}`);

      const node: GraphNode = {
        id: sectionId,
        label: 'Section',
        properties: {
          name: heading,
          filePath: file.path,
          startLine: lineNum,
          endLine,
          level,
          description: `h${level}`,
        },
      };
      graph.addNode(node);
      totalSections++;

      // Find parent: pop stack until we find a level strictly less than current
      while (sectionStack.length > 0 && sectionStack[sectionStack.length - 1].level >= level) {
        sectionStack.pop();
      }

      const parentId = sectionStack.length > 0
        ? sectionStack[sectionStack.length - 1].id
        : fileNodeId;

      graph.addRelationship({
        id: generateId('CONTAINS', `${parentId}->${sectionId}`),
        type: 'CONTAINS',
        sourceId: parentId,
        targetId: sectionId,
        confidence: 1.0,
        reason: 'markdown-heading',
      });

      sectionStack.push({ level, id: sectionId });
    }

    // --- Extract links to other files in the repo ---
    const fileDir = path.dirname(file.path);
    const seenLinks = new Set<string>();
    let linkMatch: RegExpExecArray | null;
    LINK_RE.lastIndex = 0;

    while ((linkMatch = LINK_RE.exec(file.content)) !== null) {
      const href = linkMatch[2];
      const trimmedHref = href.trim();

      // Skip external URLs, anchors, and mailto
      if (NON_FILE_PREFIXES.some(prefix => trimmedHref.startsWith(prefix))) continue;

      // First candidate path that is a known repo file, if any
      const resolved = linkTargetCandidates(href, fileDir).find(candidate => allPathSet.has(candidate));
      if (resolved === undefined) continue;

      const targetFileId = generateId('File', resolved);

      // Skip if target file node doesn't exist
      if (!graph.getNode(targetFileId)) continue;

      // Dedup: skip if we've already linked this file pair
      const linkKey = `${fileNodeId}->${targetFileId}`;
      if (seenLinks.has(linkKey)) continue;
      seenLinks.add(linkKey);

      graph.addRelationship({
        id: generateId('IMPORTS', linkKey),
        type: 'IMPORTS',
        sourceId: fileNodeId,
        targetId: targetFileId,
        confidence: 0.8,
        reason: 'markdown-link',
      });
      totalLinks++;
    }
  }

  return { sections: totalSections, links: totalLinks };
};
16 changes: 16 additions & 0 deletions gitnexus/src/core/ingestion/pipeline.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { createKnowledgeGraph } from '../graph/graph.js';
import { processStructure } from './structure-processor.js';
import { processMarkdown } from './markdown-processor.js';
import { processParsing } from './parsing-processor.js';
import {
processImports,
Expand Down Expand Up @@ -99,6 +100,21 @@ export const runPipelineFromRepo = async (
stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
});


// ── Phase 2.5: Markdown processing (headings + cross-links) ────────
const mdScanned = scannedFiles.filter(f => f.path.endsWith('.md') || f.path.endsWith('.mdx'));
if (mdScanned.length > 0) {
const mdContents = await readFileContents(repoPath, mdScanned.map(f => f.path));
const mdFiles = mdScanned
.filter(f => mdContents.has(f.path))
.map(f => ({ path: f.path, content: mdContents.get(f.path)! }));
const allPathSet = new Set(allPaths);
const mdResult = processMarkdown(graph, mdFiles, allPathSet);
if (isDev) {
console.log(` Markdown: ${mdResult.sections} sections, ${mdResult.links} cross-links from ${mdFiles.length} files`);
}
}

// ── Phase 3+4: Chunked read + parse ────────────────────────────────
// Group parseable files into byte-budget chunks so only ~20MB of source
// is in memory at a time. Each chunk is: read → parse → extract → free.
Expand Down
20 changes: 19 additions & 1 deletion gitnexus/src/core/lbug/csv-generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ export const streamAllCSVsToDisk = async (
const communityWriter = new BufferedCSVWriter(path.join(csvDir, 'community.csv'), 'id,label,heuristicLabel,keywords,description,enrichedBy,cohesion,symbolCount');
const processWriter = new BufferedCSVWriter(path.join(csvDir, 'process.csv'), 'id,label,heuristicLabel,processType,stepCount,communities,entryPointId,terminalId');

// Section nodes have an extra 'level' column
const sectionWriter = new BufferedCSVWriter(path.join(csvDir, 'section.csv'), 'id,name,filePath,startLine,endLine,level,content,description');

// Multi-language node types share the same CSV shape (no isExported column)
const multiLangHeader = 'id,name,filePath,startLine,endLine,content,description';
const MULTI_LANG_TYPES = ['Struct', 'Enum', 'Macro', 'Typedef', 'Union', 'Namespace', 'Trait', 'Impl',
Expand Down Expand Up @@ -324,6 +327,20 @@ export const streamAllCSVsToDisk = async (
].join(','));
break;
}
case 'Section': {
const content = await extractContent(node, contentCache);
await sectionWriter.addRow([
escapeCSVField(node.id),
escapeCSVField(node.properties.name || ''),
escapeCSVField(node.properties.filePath || ''),
escapeCSVNumber(node.properties.startLine, -1),
escapeCSVNumber(node.properties.endLine, -1),
escapeCSVNumber((node.properties as any).level, 1),
escapeCSVField(content),
escapeCSVField((node.properties as any).description || ''),
].join(','));
break;
}
default: {
// Code element nodes (Function, Class, Interface, CodeElement)
const writer = codeWriterMap[node.label];
Expand Down Expand Up @@ -361,7 +378,7 @@ export const streamAllCSVsToDisk = async (
}

// Finish all node writers
const allWriters = [fileWriter, folderWriter, functionWriter, classWriter, interfaceWriter, methodWriter, codeElemWriter, communityWriter, processWriter, ...multiLangWriters.values()];
const allWriters = [fileWriter, folderWriter, functionWriter, classWriter, interfaceWriter, methodWriter, codeElemWriter, communityWriter, processWriter, sectionWriter, ...multiLangWriters.values()];
await Promise.all(allWriters.map(w => w.finish()));

// --- Stream relationship CSV ---
Expand All @@ -387,6 +404,7 @@ export const streamAllCSVsToDisk = async (
['Interface', interfaceWriter], ['Method', methodWriter],
['CodeElement', codeElemWriter],
['Community', communityWriter], ['Process', processWriter],
['Section' as NodeTableName, sectionWriter],
...Array.from(multiLangWriters.entries()).map(([name, w]) => [name as NodeTableName, w] as [NodeTableName, BufferedCSVWriter]),
];
for (const [name, writer] of tableMap) {
Expand Down
15 changes: 15 additions & 0 deletions gitnexus/src/core/lbug/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,19 @@ export const ANNOTATION_SCHEMA = CODE_ELEMENT_BASE('Annotation');
export const CONSTRUCTOR_SCHEMA = CODE_ELEMENT_BASE('Constructor');
export const TEMPLATE_SCHEMA = CODE_ELEMENT_BASE('Template');
export const MODULE_SCHEMA = CODE_ELEMENT_BASE('Module');
// Node table for Markdown heading sections.
// Carries a `level` column (markdown heading level, 1-6) in addition to the
// name/filePath/startLine/endLine/content/description columns.
// Column order matches the header of section.csv written by the CSV generator
// (id,name,filePath,startLine,endLine,level,content,description); keep the two in sync.
export const SECTION_SCHEMA = `
CREATE NODE TABLE Section (
id STRING,
name STRING,
filePath STRING,
startLine INT64,
endLine INT64,
level INT64,
content STRING,
description STRING,
PRIMARY KEY (id)
)`;

// ============================================================================
// RELATION TABLE SCHEMA
Expand Down Expand Up @@ -289,6 +302,8 @@ CREATE REL TABLE ${REL_TABLE_NAME} (
FROM \`Template\` TO Interface,
FROM \`Template\` TO \`Constructor\`,
FROM \`Module\` TO \`Module\`,
FROM Section TO Section,
FROM Section TO File,
FROM CodeElement TO Community,
FROM Interface TO Community,
FROM Interface TO Function,
Expand Down
Loading