Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
47edbb1
Fix race condition in concurrent crawling with unique source IDs
Wirasm Aug 25, 2025
bc11a0c
Fix title generation to use source_display_name for better AI context
Wirasm Aug 25, 2025
40abaf9
Skip AI title generation when display name is available
Wirasm Aug 25, 2025
56220bd
Fix critical issues from code review
Wirasm Aug 25, 2025
b9d52fb
Add safety improvements from code review
Wirasm Aug 25, 2025
5e603ea
Fix code extraction to use hash-based source_ids and improve display …
Wirasm Aug 25, 2025
76bf0f0
Fix critical variable shadowing and source_type determination issues
Wirasm Aug 25, 2025
698e3b9
Fix URL canonicalization and document metrics calculation
Wirasm Aug 25, 2025
f5de76d
Fix synchronous extract_source_summary blocking async event loop
Wirasm Aug 25, 2025
353264d
Fix synchronous update_source_info blocking async event loop
Wirasm Aug 25, 2025
52187e2
Fix race condition in source creation using upsert
Wirasm Aug 25, 2025
fd9209c
Add migration detection UI components
Wirasm Aug 26, 2025
a7da288
Integrate migration banner into main app
Wirasm Aug 26, 2025
49f9280
Enhance backend startup error instructions
Wirasm Aug 26, 2025
a8b5a65
Add database schema caching to health endpoint
Wirasm Aug 26, 2025
3eda01e
Clean up knowledge API imports and logging
Wirasm Aug 26, 2025
f65c4ae
Remove unused instructions prop from MigrationBanner
Wirasm Aug 26, 2025
75958f4
Add schema_valid flag to migration_required health response
Wirasm Aug 26, 2025
7dca34b
Merge remote-tracking branch 'origin/main' into fix/source-id-archite…
Wirasm Aug 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions migration/add_source_url_display_name.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
-- =====================================================
-- Migration: add source_url and source_display_name
-- =====================================================
-- Extends archon_sources with two nullable TEXT columns:
--   source_url          - the original URL that was crawled
--   source_display_name - human-readable name for UI display
--
-- Why: source_id used to be the domain, so multiple crawls of the
-- same domain conflicted with each other (a race condition). These
-- columns keep the URL and a readable name separately from source_id.
--
-- ADD COLUMN / CREATE INDEX use IF NOT EXISTS and the backfill only
-- fills NULL values, so re-running this script is harmless.
-- =====================================================

-- 1. Schema: add the two new columns to archon_sources
ALTER TABLE archon_sources
    ADD COLUMN IF NOT EXISTS source_url TEXT,
    ADD COLUMN IF NOT EXISTS source_display_name TEXT;

-- 2. Document the new columns
COMMENT ON COLUMN archon_sources.source_url IS
    'The original URL that was crawled to create this source';
COMMENT ON COLUMN archon_sources.source_display_name IS
    'Human-readable name for UI display (e.g., "GitHub - microsoft/typescript")';

-- 3. Index the new columns for better query performance
CREATE INDEX IF NOT EXISTS idx_archon_sources_url
    ON archon_sources (source_url);
CREATE INDEX IF NOT EXISTS idx_archon_sources_display_name
    ON archon_sources (source_display_name);

-- 4. Backfill existing rows
-- Existing sources have no URL or display name recorded, so source_id
-- is copied into whichever of the two fields is still NULL as a
-- fallback. COALESCE leaves any value that is already set untouched.
UPDATE archon_sources
SET
    source_url = COALESCE(source_url, source_id),
    source_display_name = COALESCE(source_display_name, source_id)
WHERE
    source_display_name IS NULL
    OR source_url IS NULL;

-- Note: source_id will now contain a unique hash instead of the domain.
-- This ensures no conflicts when multiple sources from the same domain
-- are crawled.
9 changes: 8 additions & 1 deletion migration/complete_setup.sql
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ COMMENT ON TABLE archon_settings IS 'Stores application configuration including
-- Create the sources table
CREATE TABLE IF NOT EXISTS archon_sources (
source_id TEXT PRIMARY KEY,
source_url TEXT,
source_display_name TEXT,
summary TEXT,
Comment on lines +173 to 175

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Schema extension is correct; add an updated_at trigger for archon_sources.

You already use update_updated_at_column() for other tables; archon_sources lacks that trigger. Today, the app attempts to set updated_at to "now()" (string), which will store a literal string unless a trigger updates it.

Add this DDL (outside this hunk):

CREATE OR REPLACE TRIGGER update_archon_sources_updated_at
  BEFORE UPDATE ON archon_sources
  FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

Then remove manual "updated_at": "now()" writes in app code (see review in source_management_service.py).

🤖 Prompt for AI Agents
In migration/complete_setup.sql around lines 173 to 175, the archon_sources
table is missing the update_updated_at trigger. Add the statement
CREATE OR REPLACE TRIGGER update_archon_sources_updated_at BEFORE UPDATE ON
archon_sources FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
and place it after the table definitions, alongside the other trigger
creations and outside the shown hunk. Then remove the manual application code
in source_management_service.py that sets updated_at to the literal "now()",
so the trigger can correctly set the timestamp.

total_word_count INTEGER DEFAULT 0,
title TEXT,
Expand All @@ -180,10 +182,15 @@ CREATE TABLE IF NOT EXISTS archon_sources (

-- Create indexes for better query performance:
--   title, source_url, source_display_name - one index per column
--   metadata                               - GIN index over the whole field
--   knowledge_type                         - expression index on metadata->>'knowledge_type'
CREATE INDEX IF NOT EXISTS idx_archon_sources_title ON archon_sources(title);
CREATE INDEX IF NOT EXISTS idx_archon_sources_url ON archon_sources(source_url);
CREATE INDEX IF NOT EXISTS idx_archon_sources_display_name ON archon_sources(source_display_name);
CREATE INDEX IF NOT EXISTS idx_archon_sources_metadata ON archon_sources USING GIN(metadata);
CREATE INDEX IF NOT EXISTS idx_archon_sources_knowledge_type ON archon_sources((metadata->>'knowledge_type'));

-- Add comments to document the columns
COMMENT ON COLUMN archon_sources.source_id IS 'Unique hash identifier for the source (16-char SHA256 hash of URL)';
COMMENT ON COLUMN archon_sources.source_url IS 'The original URL that was crawled to create this source';
COMMENT ON COLUMN archon_sources.source_display_name IS 'Human-readable name for UI display (e.g., "GitHub - microsoft/typescript")';
COMMENT ON COLUMN archon_sources.title IS 'Descriptive title for the source (e.g., "Pydantic AI API Reference")';
COMMENT ON COLUMN archon_sources.metadata IS 'JSONB field storing knowledge_type, tags, and other metadata';

Expand Down
15 changes: 9 additions & 6 deletions python/src/server/services/crawling/code_extraction_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import re
from collections.abc import Callable
from typing import Any
from urllib.parse import urlparse

from ...config.logfire_config import safe_logfire_error, safe_logfire_info
from ...services.credential_service import credential_service
Expand Down Expand Up @@ -136,6 +135,7 @@ async def extract_and_store_code_examples(
self,
crawl_results: list[dict[str, Any]],
url_to_full_document: dict[str, str],
source_id: str,
progress_callback: Callable | None = None,
start_progress: int = 0,
end_progress: int = 100,
Expand All @@ -146,6 +146,7 @@ async def extract_and_store_code_examples(
Args:
crawl_results: List of crawled documents with url and markdown content
url_to_full_document: Mapping of URLs to full document content
source_id: The unique source_id for all documents
progress_callback: Optional async callback for progress updates
start_progress: Starting progress percentage (default: 0)
end_progress: Ending progress percentage (default: 100)
Expand All @@ -163,7 +164,7 @@ async def extract_and_store_code_examples(

# Extract code blocks from all documents
all_code_blocks = await self._extract_code_blocks_from_documents(
crawl_results, progress_callback, start_progress, extract_end
crawl_results, source_id, progress_callback, start_progress, extract_end
)

if not all_code_blocks:
Expand Down Expand Up @@ -201,13 +202,18 @@ async def extract_and_store_code_examples(
async def _extract_code_blocks_from_documents(
self,
crawl_results: list[dict[str, Any]],
source_id: str,
progress_callback: Callable | None = None,
start_progress: int = 0,
end_progress: int = 100,
) -> list[dict[str, Any]]:
"""
Extract code blocks from all documents.

Args:
crawl_results: List of crawled documents
source_id: The unique source_id for all documents

Returns:
List of code blocks with metadata
"""
Expand Down Expand Up @@ -306,10 +312,7 @@ async def _extract_code_blocks_from_documents(
)

if code_blocks:
# Always extract source_id from URL
parsed_url = urlparse(source_url)
source_id = parsed_url.netloc or parsed_url.path

# Use the provided source_id for all code blocks
for block in code_blocks:
all_code_blocks.append({
"block": block,
Expand Down
15 changes: 10 additions & 5 deletions python/src/server/services/crawling/crawling_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,10 +304,12 @@ async def send_heartbeat_if_needed():
url = str(request.get("url", ""))
safe_logfire_info(f"Starting async crawl orchestration | url={url} | task_id={task_id}")

# Extract source_id from the original URL
parsed_original_url = urlparse(url)
original_source_id = parsed_original_url.netloc or parsed_original_url.path
safe_logfire_info(f"Using source_id '{original_source_id}' from original URL '{url}'")
# Generate unique source_id and display name from the original URL
original_source_id = self.url_handler.generate_unique_source_id(url)
source_display_name = self.url_handler.extract_display_name(url)
safe_logfire_info(
f"Generated unique source_id '{original_source_id}' and display name '{source_display_name}' from URL '{url}'"
)

# Helper to update progress with mapper
async def update_mapped_progress(
Expand Down Expand Up @@ -386,6 +388,8 @@ async def doc_storage_callback(
original_source_id,
doc_storage_callback,
self._check_cancellation,
source_url=url,
source_display_name=source_display_name,
)

# Check for cancellation after document storage
Expand All @@ -410,6 +414,7 @@ async def code_progress_callback(data: dict):
code_examples_count = await self.doc_storage_ops.extract_and_store_code_examples(
crawl_results,
storage_results["url_to_full_document"],
storage_results["source_id"],
code_progress_callback,
85,
95,
Expand Down Expand Up @@ -558,7 +563,7 @@ async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple:
max_depth = request.get("max_depth", 1)
# Let the strategy handle concurrency from settings
# This will use CRAWL_MAX_CONCURRENT from database (default: 10)

crawl_results = await self.crawl_recursive_with_progress(
[url],
max_depth=max_depth,
Expand Down
Loading