abhigyanpatwari · abhigyanpatwari · Mar 20, 2026 · Mar 20, 2026
@@ -32,7 +32,8 @@ export type NodeLabel =
   | 'Delegate'
   | 'Annotation'
   | 'Constructor'
-  | 'Template';
+  | 'Template'
+  | 'Section';
 
 
 import { SupportedLanguages } from '../../config/supported-languages.js';
@@ -65,6 +66,8 @@ export type NodeProperties = {
   entryPointReason?: string,
   // Method signature (for MRO disambiguation)
   parameterCount?: number,
+  // Section-specific (markdown heading level, 1-6)
+  level?: number,
   returnType?: string,
 }
 

@@ -0,0 +1,157 @@
+/**
+ * Markdown Processor
+ *
+ * Extracts structure from .md files using regex (no tree-sitter dependency).
+ * Creates Section nodes for headings with hierarchy, and IMPORTS edges for
+ * cross-file links.
+ */
+
+import path from 'node:path';
+import { generateId } from '../../lib/utils.js';
+import { KnowledgeGraph, GraphNode, GraphRelationship } from '../graph/types.js';
+
+/** ATX heading: 1-6 '#' characters, whitespace, then the heading text (capture 2). */
+const HEADING_RE = /^(#{1,6})\s+(.+)$/;
+/** Fenced-code delimiter: up to 3 spaces of indent, then 3+ backticks or tildes; capture 2 is the rest of the line. */
+const FENCE_RE = /^ {0,3}(`{3,}|~{3,})(.*)$/;
+/** Inline link `[text](destination)`; capture 2 is the raw destination, which may still carry a title. */
+const LINK_RE = /\[([^\]]*)\]\(([^)]+)\)/g;
+/** Link destination followed by a quoted title, e.g. `guide.md "The guide"`; capture 1 is the destination. */
+const LINK_TITLE_RE = /^(\S+)\s+(?:"[^"]*"|'[^']*')$/;
+/** Lower-cased file extensions that are treated as markdown. */
+const MD_EXTENSIONS = new Set(['.md', '.mdx']);
+
+/** A markdown file to process: its path (compared against `allPathSet`) and its full text. */
+interface MdFile {
+  path: string;
+  content: string;
+}
+
+/** A heading found in a markdown file. */
+interface MdHeading {
+  level: number;
+  heading: string;
+  /** 1-indexed line number of the heading line. */
+  lineNum: number;
+}
+
+/** Collects ATX headings with 1-indexed line numbers, ignoring lines inside fenced code blocks. */
+const collectHeadings = (lines: string[]): MdHeading[] => {
+  const headings: MdHeading[] = [];
+  // Delimiter of the fence we are currently inside, or null when outside any fence.
+  let openFence: { char: string; length: number } | null = null;
+
+  for (let i = 0; i < lines.length; i++) {
+    const fence = FENCE_RE.exec(lines[i]);
+    if (fence) {
+      const marker = fence[1];
+      if (openFence === null) {
+        openFence = { char: marker.charAt(0), length: marker.length };
+      } else if (
+        marker.charAt(0) === openFence.char &&
+        marker.length >= openFence.length &&
+        fence[2].trim() === ''
+      ) {
+        // A closing fence uses the same character, is at least as long, and has no info string.
+        openFence = null;
+      }
+      // A fence line is never a heading, whether it opens, closes, or sits inside a block.
+      continue;
+    }
+    // Lines such as `# install deps` inside a code block are code, not headings.
+    if (openFence !== null) continue;
+
+    const match = HEADING_RE.exec(lines[i]);
+    if (!match) continue;
+
+    headings.push({
+      level: match[1].length,
+      heading: match[2].trim(),
+      lineNum: i + 1, // 1-indexed
+    });
+  }
+
+  return headings;
+};
+
+/** Turns a raw link destination into a normalized path, or null when it is not a local file link. */
+const resolveLinkTarget = (fileDir: string, rawHref: string): string | null => {
+  let href = rawHref.trim();
+
+  // `[text](dest "title")`: keep only the destination part.
+  const titled = LINK_TITLE_RE.exec(href);
+  if (titled) href = titled[1];
+
+  // Skip external URLs, anchors, and mailto
+  if (href.startsWith('http://') || href.startsWith('https://') ||
+      href.startsWith('#') || href.startsWith('mailto:')) {
+    return null;
+  }
+
+  // Strip anchor fragments from local links
+  let cleanHref = href.split('#')[0];
+  if (!cleanHref) return null;
+
+  // Links are URLs, so a space in a file name is normally written as %20.
+  try {
+    cleanHref = decodeURIComponent(cleanHref);
+  } catch {
+    // Malformed escape sequence: fall back to the text as written.
+  }
+
+  // A leading slash is resolved from the repository root rather than the linking file's directory.
+  if (cleanHref.startsWith('/')) {
+    return path.posix.normalize(cleanHref.replace(/^\/+/, ''));
+  }
+
+  // Resolve relative to the file's directory, then normalize
+  return path.posix.normalize(path.posix.join(fileDir, cleanHref));
+};
+
+/**
+ * Adds markdown structure to the graph.
+ *
+ * For every `.md`/`.mdx` file that already has a File node, creates one
+ * Section node per heading, a CONTAINS edge from the nearest enclosing
+ * heading (or from the File node for top-level headings), and one IMPORTS
+ * edge per distinct linked file that is present in `allPathSet` and has a
+ * File node.
+ *
+ * @param graph - Graph to add nodes and relationships to.
+ * @param files - Candidate files; entries without a markdown extension are ignored.
+ * @param allPathSet - Known file paths; link targets are looked up here after
+ *   POSIX normalization (assumes forward-slash paths - TODO confirm against the caller).
+ * @returns Counts of Section nodes and IMPORTS edges created.
+ */
+export const processMarkdown = (
+  graph: KnowledgeGraph,
+  files: MdFile[],
+  allPathSet: Set<string>,
+): { sections: number; links: number } => {
+  let totalSections = 0;
+  let totalLinks = 0;
+
+  for (const file of files) {
+    const ext = path.extname(file.path).toLowerCase();
+    if (!MD_EXTENSIONS.has(ext)) continue;
+
+    const fileNodeId = generateId('File', file.path);
+    // Skip if file node doesn't exist (shouldn't happen, structure-processor creates it)
+    if (!graph.getNode(fileNodeId)) continue;
+
+    // Accept CRLF as well as LF: a trailing '\r' would stop HEADING_RE from matching.
+    const lines = file.content.split(/\r?\n/);
+    // A trailing newline produces a final empty element that is not a real line.
+    const lastLine = lines.length > 1 && lines[lines.length - 1] === ''
+      ? lines.length - 1
+      : lines.length;
+
+    // --- Extract headings and build hierarchy ---
+    // First pass: collect all heading positions so we can compute endLine spans
+    const headings = collectHeadings(lines);
+
+    // Second pass: create nodes with proper endLine spans
+    const sectionStack: { level: number; id: string }[] = [];
+
+    for (let h = 0; h < headings.length; h++) {
+      const { level, heading, lineNum } = headings[h];
+
+      // endLine = line before next heading at same or higher level, or EOF
+      let endLine = lastLine;
+      for (let j = h + 1; j < headings.length; j++) {
+        if (headings[j].level <= level) {
+          endLine = headings[j].lineNum - 1;
+          break;
+        }
+      }
+
+      // The line number is part of the id so repeated heading texts in one file stay distinct.
+      const sectionId = generateId('Section', `${file.path}:L${lineNum}:${heading}`);
+
+      const node: GraphNode = {
+        id: sectionId,
+        label: 'Section',
+        properties: {
+          name: heading,
+          filePath: file.path,
+          startLine: lineNum,
+          endLine,
+          level,
+          description: `h${level}`,
+        },
+      };
+      graph.addNode(node);
+      totalSections++;
+
+      // Find parent: pop stack until we find a level strictly less than current
+      while (sectionStack.length > 0 && sectionStack[sectionStack.length - 1].level >= level) {
+        sectionStack.pop();
+      }
+
+      const parentId = sectionStack.length > 0
+        ? sectionStack[sectionStack.length - 1].id
+        : fileNodeId;
+
+      graph.addRelationship({
+        id: generateId('CONTAINS', `${parentId}->${sectionId}`),
+        type: 'CONTAINS',
+        sourceId: parentId,
+        targetId: sectionId,
+        confidence: 1.0,
+        reason: 'markdown-heading',
+      });
+
+      sectionStack.push({ level, id: sectionId });
+    }
+
+    // --- Extract links to other files in the repo ---
+    // POSIX dirname keeps this consistent with the POSIX join/normalize used for resolution.
+    const fileDir = path.posix.dirname(file.path);
+    const seenLinks = new Set<string>();
+    let linkMatch: RegExpExecArray | null;
+    // LINK_RE is a shared global regex; reset its cursor before scanning a new file.
+    LINK_RE.lastIndex = 0;
+
+    while ((linkMatch = LINK_RE.exec(file.content)) !== null) {
+      const resolved = resolveLinkTarget(fileDir, linkMatch[2]);
+      if (resolved === null || !allPathSet.has(resolved)) continue;
+
+      const targetFileId = generateId('File', resolved);
+
+      // Skip if target file node doesn't exist
+      if (!graph.getNode(targetFileId)) continue;
+
+      // Dedup: skip if we've already linked this file pair
+      const linkKey = `${fileNodeId}->${targetFileId}`;
+      if (seenLinks.has(linkKey)) continue;
+      seenLinks.add(linkKey);
+
+      graph.addRelationship({
+        id: generateId('IMPORTS', linkKey),
+        type: 'IMPORTS',
+        sourceId: fileNodeId,
+        targetId: targetFileId,
+        confidence: 0.8,
+        reason: 'markdown-link',
+      });
+      totalLinks++;
+    }
+  }
+
+  return { sections: totalSections, links: totalLinks };
+};
@@ -1,5 +1,6 @@
 import { createKnowledgeGraph } from '../graph/graph.js';
 import { processStructure } from './structure-processor.js';
+import { processMarkdown } from './markdown-processor.js';
 import { processParsing } from './parsing-processor.js';
 import {
   processImports,
@@ -99,6 +100,21 @@ export const runPipelineFromRepo = async (
       stats: { filesProcessed: totalFiles, totalFiles, nodesCreated: graph.nodeCount },
     });
 
+
+    // ── Phase 2.5: Markdown processing (headings + cross-links) ────────
+    const mdScanned = scannedFiles.filter(f => f.path.endsWith('.md') || f.path.endsWith('.mdx'));
+    if (mdScanned.length > 0) {
+      const mdContents = await readFileContents(repoPath, mdScanned.map(f => f.path));
+      const mdFiles = mdScanned
+        .filter(f => mdContents.has(f.path))
+        .map(f => ({ path: f.path, content: mdContents.get(f.path)! }));
+      const allPathSet = new Set(allPaths);
+      const mdResult = processMarkdown(graph, mdFiles, allPathSet);
+      if (isDev) {
+        console.log(`  Markdown: ${mdResult.sections} sections, ${mdResult.links} cross-links from ${mdFiles.length} files`);
+      }
+    }
+
     // ── Phase 3+4: Chunked read + parse ────────────────────────────────
     // Group parseable files into byte-budget chunks so only ~20MB of source
     // is in memory at a time. Each chunk is: read → parse → extract → free.

@@ -238,6 +238,9 @@ export const streamAllCSVsToDisk = async (
   const communityWriter = new BufferedCSVWriter(path.join(csvDir, 'community.csv'), 'id,label,heuristicLabel,keywords,description,enrichedBy,cohesion,symbolCount');
   const processWriter = new BufferedCSVWriter(path.join(csvDir, 'process.csv'), 'id,label,heuristicLabel,processType,stepCount,communities,entryPointId,terminalId');
 
+  // Section nodes have an extra 'level' column
+  const sectionWriter = new BufferedCSVWriter(path.join(csvDir, 'section.csv'), 'id,name,filePath,startLine,endLine,level,content,description');
+
   // Multi-language node types share the same CSV shape (no isExported column)
   const multiLangHeader = 'id,name,filePath,startLine,endLine,content,description';
   const MULTI_LANG_TYPES = ['Struct', 'Enum', 'Macro', 'Typedef', 'Union', 'Namespace', 'Trait', 'Impl',
@@ -324,6 +327,20 @@ export const streamAllCSVsToDisk = async (
         ].join(','));
         break;
       }
+      case 'Section': {
+        const content = await extractContent(node, contentCache);
+        await sectionWriter.addRow([
+          escapeCSVField(node.id),
+          escapeCSVField(node.properties.name || ''),
+          escapeCSVField(node.properties.filePath || ''),
+          escapeCSVNumber(node.properties.startLine, -1),
+          escapeCSVNumber(node.properties.endLine, -1),
+          escapeCSVNumber((node.properties as any).level, 1),
+          escapeCSVField(content),
+          escapeCSVField((node.properties as any).description || ''),
+        ].join(','));
+        break;
+      }
       default: {
         // Code element nodes (Function, Class, Interface, CodeElement)
         const writer = codeWriterMap[node.label];
@@ -361,7 +378,7 @@ export const streamAllCSVsToDisk = async (
   }
 
   // Finish all node writers
-  const allWriters = [fileWriter, folderWriter, functionWriter, classWriter, interfaceWriter, methodWriter, codeElemWriter, communityWriter, processWriter, ...multiLangWriters.values()];
+  const allWriters = [fileWriter, folderWriter, functionWriter, classWriter, interfaceWriter, methodWriter, codeElemWriter, communityWriter, processWriter, sectionWriter, ...multiLangWriters.values()];
   await Promise.all(allWriters.map(w => w.finish()));
 
   // --- Stream relationship CSV ---
@@ -387,6 +404,7 @@ export const streamAllCSVsToDisk = async (
     ['Interface', interfaceWriter], ['Method', methodWriter],
     ['CodeElement', codeElemWriter],
     ['Community', communityWriter], ['Process', processWriter],
+    ['Section' as NodeTableName, sectionWriter],
     ...Array.from(multiLangWriters.entries()).map(([name, w]) => [name as NodeTableName, w] as [NodeTableName, BufferedCSVWriter]),
   ];
   for (const [name, writer] of tableMap) {

@@ -192,6 +192,19 @@ export const ANNOTATION_SCHEMA = CODE_ELEMENT_BASE('Annotation');
 export const CONSTRUCTOR_SCHEMA = CODE_ELEMENT_BASE('Constructor');
 export const TEMPLATE_SCHEMA = CODE_ELEMENT_BASE('Template');
 export const MODULE_SCHEMA = CODE_ELEMENT_BASE('Module');
+// Markdown heading sections: one row per heading; `level` is the markdown heading level (1-6), stored in addition to the columns shared with the other code-element tables
+export const SECTION_SCHEMA = `
+CREATE NODE TABLE Section (
+  id STRING,
+  name STRING,
+  filePath STRING,
+  startLine INT64,
+  endLine INT64,
+  level INT64,
+  content STRING,
+  description STRING,
+  PRIMARY KEY (id)
+)`;
 
 // ============================================================================
 // RELATION TABLE SCHEMA
@@ -289,6 +302,8 @@ CREATE REL TABLE ${REL_TABLE_NAME} (
   FROM \`Template\` TO Interface,
   FROM \`Template\` TO \`Constructor\`,
   FROM \`Module\` TO \`Module\`,
+  FROM Section TO Section,
+  FROM Section TO File,
   FROM CodeElement TO Community,
   FROM Interface TO Community,
   FROM Interface TO Function,