coleam00 · Wirasm · Aug 29, 2025 · Aug 25, 2025 · Aug 25, 2025 · Aug 25, 2025
diff --git a/migration/add_source_url_display_name.sql b/migration/add_source_url_display_name.sql
@@ -0,0 +1,36 @@
+-- =====================================================
+-- Add source_url and source_display_name columns
+-- =====================================================
+-- This migration adds two new columns to better identify sources:
+-- - source_url: The original URL that was crawled
+-- - source_display_name: Human-readable name for UI display
+--
+-- This fixes the race condition in which multiple crawls of the
+-- same domain conflicted because the domain was used as source_id
+-- =====================================================
+
+-- Add new columns to archon_sources table
+ALTER TABLE archon_sources 
+ADD COLUMN IF NOT EXISTS source_url TEXT,
+ADD COLUMN IF NOT EXISTS source_display_name TEXT;
+
+-- Add indexes for the new columns for better query performance
+CREATE INDEX IF NOT EXISTS idx_archon_sources_url ON archon_sources(source_url);
+CREATE INDEX IF NOT EXISTS idx_archon_sources_display_name ON archon_sources(source_display_name);
+
+-- Add comments to document the new columns
+COMMENT ON COLUMN archon_sources.source_url IS 'The original URL that was crawled to create this source';
+COMMENT ON COLUMN archon_sources.source_display_name IS 'Human-readable name for UI display (e.g., "GitHub - microsoft/typescript")';
+
+-- Backfill existing data
+-- For existing sources, copy source_id to both new fields as a fallback
+UPDATE archon_sources 
+SET 
+    source_url = COALESCE(source_url, source_id),
+    source_display_name = COALESCE(source_display_name, source_id)
+WHERE 
+    source_url IS NULL 
+    OR source_display_name IS NULL;
+
+-- Note: source_id will now contain a unique hash instead of the domain
+-- This ensures no conflicts when multiple sources from the same domain are crawled
diff --git a/migration/complete_setup.sql b/migration/complete_setup.sql
@@ -170,6 +170,8 @@ COMMENT ON TABLE archon_settings IS 'Stores application configuration including
 -- Create the sources table
 CREATE TABLE IF NOT EXISTS archon_sources (
     source_id TEXT PRIMARY KEY,
+    source_url TEXT,
+    source_display_name TEXT,
     summary TEXT,
     total_word_count INTEGER DEFAULT 0,
     title TEXT,
@@ -180,10 +182,15 @@ CREATE TABLE IF NOT EXISTS archon_sources (
 
 -- Create indexes for better query performance
 CREATE INDEX IF NOT EXISTS idx_archon_sources_title ON archon_sources(title);
+CREATE INDEX IF NOT EXISTS idx_archon_sources_url ON archon_sources(source_url);
+CREATE INDEX IF NOT EXISTS idx_archon_sources_display_name ON archon_sources(source_display_name);
 CREATE INDEX IF NOT EXISTS idx_archon_sources_metadata ON archon_sources USING GIN(metadata);
 CREATE INDEX IF NOT EXISTS idx_archon_sources_knowledge_type ON archon_sources((metadata->>'knowledge_type'));
 
--- Add comments to document the new columns
+-- Add comments to document the columns
+COMMENT ON COLUMN archon_sources.source_id IS 'Unique hash identifier for the source (16-char SHA256 hash of URL)';
+COMMENT ON COLUMN archon_sources.source_url IS 'The original URL that was crawled to create this source';
+COMMENT ON COLUMN archon_sources.source_display_name IS 'Human-readable name for UI display (e.g., "GitHub - microsoft/typescript")';
 COMMENT ON COLUMN archon_sources.title IS 'Descriptive title for the source (e.g., "Pydantic AI API Reference")';
 COMMENT ON COLUMN archon_sources.metadata IS 'JSONB field storing knowledge_type, tags, and other metadata';
 

diff --git a/python/src/server/services/crawling/code_extraction_service.py b/python/src/server/services/crawling/code_extraction_service.py
@@ -7,7 +7,6 @@
 import re
 from collections.abc import Callable
 from typing import Any
-from urllib.parse import urlparse
 
 from ...config.logfire_config import safe_logfire_error, safe_logfire_info
 from ...services.credential_service import credential_service
@@ -136,6 +135,7 @@ async def extract_and_store_code_examples(
         self,
         crawl_results: list[dict[str, Any]],
         url_to_full_document: dict[str, str],
+        source_id: str,
         progress_callback: Callable | None = None,
         start_progress: int = 0,
         end_progress: int = 100,
@@ -146,6 +146,7 @@ async def extract_and_store_code_examples(
         Args:
             crawl_results: List of crawled documents with url and markdown content
             url_to_full_document: Mapping of URLs to full document content
+            source_id: The unique source_id for all documents
             progress_callback: Optional async callback for progress updates
             start_progress: Starting progress percentage (default: 0)
             end_progress: Ending progress percentage (default: 100)
@@ -163,7 +164,7 @@ async def extract_and_store_code_examples(
 
         # Extract code blocks from all documents
         all_code_blocks = await self._extract_code_blocks_from_documents(
-            crawl_results, progress_callback, start_progress, extract_end
+            crawl_results, source_id, progress_callback, start_progress, extract_end
         )
 
         if not all_code_blocks:
@@ -201,13 +202,18 @@ async def extract_and_store_code_examples(
     async def _extract_code_blocks_from_documents(
         self,
         crawl_results: list[dict[str, Any]],
+        source_id: str,
         progress_callback: Callable | None = None,
         start_progress: int = 0,
         end_progress: int = 100,
     ) -> list[dict[str, Any]]:
         """
         Extract code blocks from all documents.
 
+        Args:
+            crawl_results: List of crawled documents
+            source_id: The unique source_id for all documents
+
         Returns:
             List of code blocks with metadata
         """
@@ -306,10 +312,7 @@ async def _extract_code_blocks_from_documents(
                     )
 
                 if code_blocks:
-                    # Always extract source_id from URL
-                    parsed_url = urlparse(source_url)
-                    source_id = parsed_url.netloc or parsed_url.path
-
+                    # Use the provided source_id for all code blocks
                     for block in code_blocks:
                         all_code_blocks.append({
                             "block": block,

diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
@@ -304,10 +304,12 @@ async def send_heartbeat_if_needed():
             url = str(request.get("url", ""))
             safe_logfire_info(f"Starting async crawl orchestration | url={url} | task_id={task_id}")
 
-            # Extract source_id from the original URL
-            parsed_original_url = urlparse(url)
-            original_source_id = parsed_original_url.netloc or parsed_original_url.path
-            safe_logfire_info(f"Using source_id '{original_source_id}' from original URL '{url}'")
+            # Generate unique source_id and display name from the original URL
+            original_source_id = self.url_handler.generate_unique_source_id(url)
+            source_display_name = self.url_handler.extract_display_name(url)
+            safe_logfire_info(
+                f"Generated unique source_id '{original_source_id}' and display name '{source_display_name}' from URL '{url}'"
+            )
 
             # Helper to update progress with mapper
             async def update_mapped_progress(
@@ -386,6 +388,8 @@ async def doc_storage_callback(
                 original_source_id,
                 doc_storage_callback,
                 self._check_cancellation,
+                source_url=url,
+                source_display_name=source_display_name,
             )
 
             # Check for cancellation after document storage
@@ -410,6 +414,7 @@ async def code_progress_callback(data: dict):
                 code_examples_count = await self.doc_storage_ops.extract_and_store_code_examples(
                     crawl_results,
                     storage_results["url_to_full_document"],
+                    storage_results["source_id"],
                     code_progress_callback,
                     85,
                     95,
@@ -558,7 +563,7 @@ async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple:
             max_depth = request.get("max_depth", 1)
             # Let the strategy handle concurrency from settings
             # This will use CRAWL_MAX_CONCURRENT from database (default: 10)
-            
+
             crawl_results = await self.crawl_recursive_with_progress(
                 [url],
                 max_depth=max_depth,