diff --git a/python/src/server/api_routes/settings_api.py b/python/src/server/api_routes/settings_api.py
index 48e2d76479..e8990e063c 100644
--- a/python/src/server/api_routes/settings_api.py
+++ b/python/src/server/api_routes/settings_api.py
@@ -135,6 +135,9 @@ async def create_credential(request: CredentialRequest):
     "DISCONNECT_SCREEN_ENABLED": "true", # Show disconnect screen when server is unavailable
     "PROJECTS_ENABLED": "false", # Enable project management features
     "LOGFIRE_ENABLED": "false", # Enable Pydantic Logfire integration
+    "CRAWL_DISCOVERY_LLM_FILES": '["llms-full.txt", "llms-ctx.txt", "llms.md", "llms.txt"]', # LLM files to auto-discover (priority order)
+    "CRAWL_DISCOVERY_SITEMAP_FILES": '["sitemap.xml", "sitemap_index.xml", "sitemap-*.xml"]', # Sitemap files to auto-discover
+    "CRAWL_DISCOVERY_METADATA_FILES": '["robots.txt", ".well-known/security.txt", ".well-known/humans.txt", "humans.txt", "security.txt"]', # Metadata files to auto-discover
 }

diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index 5b5d43044e..eefa26e435 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -42,6 +42,7 @@ def _ensure_socketio_imports():
 # Import helpers
 from .helpers.url_handler import URLHandler
 from .helpers.site_config import SiteConfig
+from .helpers.file_discovery import FileDiscoveryService

 # Import operations
 from .document_storage_operations import DocumentStorageOperations
@@ -91,6 +92,7 @@ def __init__(self, crawler=None, supabase_client=None, progress_id=None):
         # Initialize helpers
         self.url_handler = URLHandler()
         self.site_config = SiteConfig()
+        self.file_discovery = FileDiscoveryService()
         self.markdown_generator = self.site_config.get_markdown_generator()

         # Initialize strategies
@@ -129,6 +131,45 @@ def _check_cancellation(self):
         if self._cancelled:
             raise asyncio.CancelledError("Crawl operation was cancelled by user")

+    async def auto_discover_files(self, base_url: str) -> Dict[str, List[str]]:
+        """
+        Automatically discover files using FileDiscoveryService.
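+        The result groups discovered URLs by category; an illustrative shape
+        (hypothetical URLs, assuming the site publishes an llms file and a
+        sitemap) is:
+
+            {
+                "robots_sitemaps": ["https://example.com/sitemap.xml"],
+                "llm_files": ["https://example.com/llms-full.txt"],
+                "sitemap_files": ["https://example.com/sitemap.xml"],
+                "metadata_files": ["https://example.com/robots.txt"],
+            }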
+ + Args: + base_url: Base URL of the website + + Returns: + Dictionary with discovered files categorized by type + """ + safe_logfire_info(f"🔥 DEBUG: auto_discover_files called with base_url: {base_url}") + try: + discovery_results = await self.file_discovery.discover_all_files(base_url) + + # Log discovery results + total_discovered = sum(len(files) for files in discovery_results.values()) + if total_discovered > 0: + safe_logfire_info( + f"File discovery completed for {base_url} | " + f"total_files={total_discovered} | " + f"llm_files={len(discovery_results.get('llm_files', []))} | " + f"sitemaps={len(discovery_results.get('sitemap_files', []))} | " + f"robots_sitemaps={len(discovery_results.get('robots_sitemaps', []))}" + ) + else: + safe_logfire_info(f"No discoverable files found for {base_url}") + + return discovery_results + + except Exception as e: + safe_logfire_error(f"File discovery failed for {base_url} | error={str(e)}") + # Return empty results on discovery failure to not block main crawl + return { + "robots_sitemaps": [], + "llm_files": [], + "sitemap_files": [], + "metadata_files": [] + } + async def _create_crawl_progress_callback( self, base_status: str ) -> Callable[[str, int, str], Awaitable[None]]: @@ -490,29 +531,141 @@ async def code_progress_callback(data: dict): async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple: """ Detect URL type and perform appropriate crawling. + Includes automatic file discovery before main crawling logic. Returns: Tuple of (crawl_results, crawl_type) """ _ensure_socketio_imports() + + # DEBUG: Check if this method is being called + safe_logfire_info(f"🚨 DEBUG: _crawl_by_url_type called with URL: {url}") crawl_results = [] crawl_type = None + + # Perform automatic file discovery before main crawling + try: + if self.progress_id: + self.progress_state.update({ + "status": "discovering", + "percentage": 5, + "log": "Discovering LLM files and sitemaps...", + }) + await update_crawl_progress(self.progress_id, self.progress_state) + + base_url = f"{urlparse(url).scheme}://{urlparse(url).netloc}" + discovery_results = await self.auto_discover_files(base_url) + + # Debug log the discovery results structure + safe_logfire_info(f"🔍 DEBUG: Discovery results = {discovery_results}") + + # If we discovered LLM files, prioritize them and STOP regular crawling + if discovery_results.get('llm_files'): + llm_files = discovery_results['llm_files'] + safe_logfire_info(f"🎯 DISCOVERED LLM FILES: {llm_files} - Will crawl these instead of regular website") + safe_logfire_info(f"📋 CRAWLING DECISION: Using LLM files for content - {llm_files}") + + if self.progress_id: + self.progress_state.update({ + "status": "crawling", + "percentage": 8, + "log": f"Crawling discovered LLM files ({len(llm_files)} files) - skipping regular crawl...", + }) + await update_crawl_progress(self.progress_id, self.progress_state) + + # Log exactly what we're about to crawl + safe_logfire_info(f"🚀 STARTING CRAWL: LLM files only - {llm_files}") + + # Crawl LLM files as batch + llm_crawl_results = await self.crawl_batch_with_progress( + llm_files, + progress_callback=await self._create_crawl_progress_callback("crawling"), + start_progress=8, + end_progress=95, # Complete the progress since we're done + ) + if llm_crawl_results: + crawl_results.extend(llm_crawl_results) + safe_logfire_info(f"🎉 SUCCESS: LLM files crawled successfully! Found {len(llm_crawl_results)} results. 
STOPPING regular crawl.") + return crawl_results, "llm_files_discovered" + else: + safe_logfire_info(f"⚠️ LLM files discovered but crawling failed, falling back to regular crawl") + # Continue with normal crawling logic below + + # If we discovered additional sitemaps from robots.txt, add them to processing + # (only if no LLM files were found - LLM files take priority) + if discovery_results.get('robots_sitemaps') and not discovery_results.get('llm_files'): + robots_sitemaps = discovery_results['robots_sitemaps'] + safe_logfire_info(f"Found sitemaps in robots.txt: {robots_sitemaps}") + + if self.progress_id: + self.progress_state.update({ + "status": "crawling", + "percentage": 12, + "log": f"Processing sitemaps from robots.txt ({len(robots_sitemaps)} sitemaps)...", + }) + await update_crawl_progress(self.progress_id, self.progress_state) + + # Process discovered sitemaps + sitemap_crawl_results = [] + for sitemap_url in robots_sitemaps: + sitemap_urls = self.parse_sitemap(sitemap_url) + if sitemap_urls: + batch_results = await self.crawl_batch_with_progress( + sitemap_urls, + progress_callback=await self._create_crawl_progress_callback("crawling"), + start_progress=12, + end_progress=90, + ) + if batch_results: + sitemap_crawl_results.extend(batch_results) + + if sitemap_crawl_results: + crawl_results.extend(sitemap_crawl_results) + safe_logfire_info(f"📋 CRAWLING DECISION: Using robots.txt sitemaps for content") + safe_logfire_info(f"🎉 SUCCESS: Sitemaps from robots.txt crawled successfully, skipping regular crawl. Found {len(sitemap_crawl_results)} results.") + return crawl_results, "robots_sitemaps_discovered" + + except Exception as e: + safe_logfire_error(f"File discovery integration failed: {e}") + # Continue with normal crawling if discovery fails + + # If no discovery results, log that we're falling back to regular crawling + if not crawl_results: + safe_logfire_info(f"📋 CRAWLING DECISION: No LLM files or sitemaps discovered, using regular website crawling for {url}") - if self.url_handler.is_txt(url): - # Handle text files + # Check if this is specifically an LLM file + if self.url_handler.is_llm_file(url): + # Handle LLM files specifically + if self.progress_id: + self.progress_state.update({ + "status": "crawling", + "percentage": 15, + "log": "Detected LLM file, fetching content...", + }) + await update_crawl_progress(self.progress_id, self.progress_state) + crawl_results.extend(await self.crawl_markdown_file( + url, + progress_callback=await self._create_crawl_progress_callback("crawling"), + start_progress=15, + end_progress=25, + )) + crawl_type = "llm_file" + + elif self.url_handler.is_txt(url): + # Handle other text files if self.progress_id: self.progress_state.update({ "status": "crawling", - "percentage": 10, + "percentage": 15, "log": "Detected text file, fetching content...", }) await update_crawl_progress(self.progress_id, self.progress_state) crawl_results = await self.crawl_markdown_file( url, progress_callback=await self._create_crawl_progress_callback("crawling"), - start_progress=10, - end_progress=20, + start_progress=15, + end_progress=25, ) crawl_type = "text_file" @@ -521,7 +674,7 @@ async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple: if self.progress_id: self.progress_state.update({ "status": "crawling", - "percentage": 10, + "percentage": 15, "log": "Detected sitemap, parsing URLs...", }) await update_crawl_progress(self.progress_id, self.progress_state) @@ -532,7 +685,7 @@ async def _crawl_by_url_type(self, url: str, request: 
Dict[str, Any]) -> tuple: if self.progress_id: self.progress_state.update({ "status": "crawling", - "percentage": 15, + "percentage": 20, "log": f"Starting batch crawl of {len(sitemap_urls)} URLs...", }) await update_crawl_progress(self.progress_id, self.progress_state) @@ -540,8 +693,8 @@ async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple: crawl_results = await self.crawl_batch_with_progress( sitemap_urls, progress_callback=await self._create_crawl_progress_callback("crawling"), - start_progress=15, - end_progress=20, + start_progress=20, + end_progress=25, ) crawl_type = "sitemap" @@ -550,7 +703,7 @@ async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple: if self.progress_id: self.progress_state.update({ "status": "crawling", - "percentage": 10, + "percentage": 15, "log": f"Starting recursive crawl with max depth {request.get('max_depth', 1)}...", }) await update_crawl_progress(self.progress_id, self.progress_state) @@ -564,8 +717,8 @@ async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple: max_depth=max_depth, max_concurrent=None, # Let strategy use settings progress_callback=await self._create_crawl_progress_callback("crawling"), - start_progress=10, - end_progress=20, + start_progress=15, + end_progress=25, ) crawl_type = "webpage" diff --git a/python/src/server/services/crawling/helpers/file_discovery.py b/python/src/server/services/crawling/helpers/file_discovery.py new file mode 100644 index 0000000000..cc482c44b8 --- /dev/null +++ b/python/src/server/services/crawling/helpers/file_discovery.py @@ -0,0 +1,275 @@ +""" +File Discovery Service + +Handles automatic discovery of llms.txt, sitemap.xml, and related files +using database-driven configuration with fallback defaults. +""" + +import asyncio +import json +import re +from urllib.parse import urljoin + +import aiohttp + +from ....config.logfire_config import get_logger +from ...credential_service import credential_service + +logger = get_logger(__name__) + + +class FileDiscoveryService: + """Service for discovering files on websites automatically.""" + + def __init__(self): + """Initialize the file discovery service.""" + self.timeout = aiohttp.ClientTimeout(total=10) # 10 second timeout for discovery + + # Hardcoded fallback defaults if database access fails + self._fallback_defaults = { + "CRAWL_DISCOVERY_LLM_FILES": ["llms-full.txt", "llms-ctx.txt", "llms.md", "llms.txt"], + "CRAWL_DISCOVERY_SITEMAP_FILES": ["sitemap.xml", "sitemap_index.xml", "sitemap-*.xml"], + "CRAWL_DISCOVERY_METADATA_FILES": ["robots.txt", ".well-known/security.txt", ".well-known/humans.txt", "humans.txt", "security.txt"] + } + + async def _get_file_list_from_db(self, setting_key: str) -> list[str]: + """ + Get file list from database settings with fallback to hardcoded defaults. 
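+        Settings are expected to be stored as JSON array strings; for example
+        (illustrative), a stored value of '["llms-full.txt", "llms.txt"]' for
+        CRAWL_DISCOVERY_LLM_FILES parses to ["llms-full.txt", "llms.txt"], and any
+        missing or unparsable value falls back to the hardcoded defaults above.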
+ + Args: + setting_key: The database setting key + + Returns: + List of files to discover + """ + try: + # Get setting from database via credential service + raw_value = await credential_service.get_credential(setting_key, decrypt=False) + + if raw_value is None: + logger.info(f"No database setting found for {setting_key}, using fallback defaults") + return self._fallback_defaults.get(setting_key, []) + + # Parse JSON string to list + if isinstance(raw_value, str): + file_list = json.loads(raw_value) + elif isinstance(raw_value, dict) and 'value' in raw_value: + file_list = json.loads(raw_value['value']) + else: + file_list = raw_value + + if not isinstance(file_list, list): + logger.warning(f"Setting {setting_key} is not a list, using fallback defaults") + return self._fallback_defaults.get(setting_key, []) + + logger.info(f"Loaded {len(file_list)} files from database setting {setting_key}") + return file_list + + except (json.JSONDecodeError, TypeError, KeyError) as e: + logger.error(f"Error parsing database setting {setting_key}: {e}, using fallback defaults") + return self._fallback_defaults.get(setting_key, []) + except Exception as e: + logger.error(f"Unexpected error getting database setting {setting_key}: {e}, using fallback defaults") + return self._fallback_defaults.get(setting_key, []) + + async def _check_file_exists(self, session: aiohttp.ClientSession, url: str) -> bool: + """ + Check if a file exists at the given URL using HEAD request. + + Args: + session: HTTP session + url: URL to check + + Returns: + True if file exists and is accessible + """ + try: + async with session.head(url, timeout=self.timeout) as response: + # Consider 2xx status codes as successful + return 200 <= response.status < 300 + except Exception as e: + logger.debug(f"File check failed for {url}: {e}") + return False + + async def discover_robots_sitemaps(self, base_url: str) -> list[str]: + """ + Extract sitemap URLs from robots.txt file. + + Args: + base_url: Base URL of the website + + Returns: + List of sitemap URLs found in robots.txt + """ + sitemaps = [] + robots_url = urljoin(base_url, "/robots.txt") + + try: + async with aiohttp.ClientSession() as session: + async with session.get(robots_url, timeout=self.timeout) as response: + if response.status == 200: + robots_content = await response.text() + + # Parse all Sitemap directives (case-insensitive) + sitemap_pattern = re.compile(r'^sitemap:\s*(.+)$', re.IGNORECASE | re.MULTILINE) + matches = sitemap_pattern.findall(robots_content) + + for match in matches: + sitemap_url = match.strip() + # Convert relative URLs to absolute + if not sitemap_url.startswith(('http://', 'https://')): + sitemap_url = urljoin(base_url, sitemap_url) + sitemaps.append(sitemap_url) + + logger.info(f"Found {len(sitemaps)} sitemaps in robots.txt: {sitemaps}") + else: + logger.debug(f"robots.txt not accessible at {robots_url} (status: {response.status})") + + except Exception as e: + logger.debug(f"Error fetching robots.txt from {robots_url}: {e}") + + return sitemaps + + async def discover_llm_files(self, base_url: str) -> list[str]: + """ + Discover LLM-specific files using database-configured file lists. + Returns the highest priority LLM file found, not all of them. 
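+        Priority follows the configured list order, so a site serving both
+        /llms-full.txt and /llms.txt would, with the default configuration,
+        yield only (illustrative):
+
+            ["https://example.com/llms-full.txt"]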
+ + Args: + base_url: Base URL of the website + + Returns: + List containing the single best LLM file URL, or empty list if none found + """ + discovered_files = [] + llm_files = await self._get_file_list_from_db("CRAWL_DISCOVERY_LLM_FILES") + + try: + async with aiohttp.ClientSession() as session: + # Check each LLM file pattern in priority order (first in list = highest priority) + for file_pattern in llm_files: + # Simple patterns without wildcards + if "*" not in file_pattern: + file_url = urljoin(base_url, f"/{file_pattern.lstrip('/')}") + if await self._check_file_exists(session, file_url): + logger.info(f"Discovered LLM file: {file_url}") + # Return immediately with the highest priority file found + return [file_url] + + except Exception as e: + logger.error(f"Error during LLM file discovery for {base_url}: {e}") + + return discovered_files + + async def discover_sitemap_files(self, base_url: str) -> list[str]: + """ + Discover sitemap files using database-configured patterns with wildcard support. + + Args: + base_url: Base URL of the website + + Returns: + List of discovered sitemap URLs + """ + discovered_files = [] + sitemap_files = await self._get_file_list_from_db("CRAWL_DISCOVERY_SITEMAP_FILES") + + try: + async with aiohttp.ClientSession() as session: + for file_pattern in sitemap_files: + if "*" in file_pattern: + # Handle wildcard patterns (simplified for now) + # For "sitemap-*.xml", try common numbered patterns + if file_pattern == "sitemap-*.xml": + for i in range(1, 6): # Try sitemap-1.xml to sitemap-5.xml + file_url = urljoin(base_url, f"/sitemap-{i}.xml") + if await self._check_file_exists(session, file_url): + discovered_files.append(file_url) + logger.info(f"Discovered numbered sitemap: {file_url}") + else: + # Simple file patterns + file_url = urljoin(base_url, f"/{file_pattern.lstrip('/')}") + if await self._check_file_exists(session, file_url): + discovered_files.append(file_url) + logger.info(f"Discovered sitemap file: {file_url}") + + except Exception as e: + logger.error(f"Error during sitemap discovery for {base_url}: {e}") + + return discovered_files + + async def discover_metadata_files(self, base_url: str) -> list[str]: + """ + Discover metadata files using database-configured lists. + + Args: + base_url: Base URL of the website + + Returns: + List of discovered metadata file URLs + """ + discovered_files = [] + metadata_files = await self._get_file_list_from_db("CRAWL_DISCOVERY_METADATA_FILES") + + try: + async with aiohttp.ClientSession() as session: + for file_pattern in metadata_files: + file_url = urljoin(base_url, f"/{file_pattern.lstrip('/')}") + if await self._check_file_exists(session, file_url): + discovered_files.append(file_url) + logger.info(f"Discovered metadata file: {file_url}") + + except Exception as e: + logger.error(f"Error during metadata file discovery for {base_url}: {e}") + + return discovered_files + + async def discover_all_files(self, base_url: str) -> dict[str, list[str]]: + """ + Perform comprehensive file discovery using all available methods. 
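+        Usage sketch (URLs are hypothetical; actual results depend on the site):
+
+            service = FileDiscoveryService()
+            results = await service.discover_all_files("https://example.com")
+            # e.g. results["llm_files"] == ["https://example.com/llms-full.txt"]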
+ + Args: + base_url: Base URL of the website + + Returns: + Dictionary with discovery results categorized by type + """ + logger.info(f"Starting file discovery for {base_url}") + + try: + # Run all discovery methods concurrently for performance + results = await asyncio.gather( + self.discover_robots_sitemaps(base_url), + self.discover_llm_files(base_url), + self.discover_sitemap_files(base_url), + self.discover_metadata_files(base_url), + return_exceptions=True + ) + + # Handle any exceptions from individual discovery methods + robots_sitemaps = results[0] if not isinstance(results[0], Exception) else [] + llm_files = results[1] if not isinstance(results[1], Exception) else [] + sitemap_files = results[2] if not isinstance(results[2], Exception) else [] + metadata_files = results[3] if not isinstance(results[3], Exception) else [] + + discovery_result = { + "robots_sitemaps": robots_sitemaps, + "llm_files": llm_files, + "sitemap_files": sitemap_files, + "metadata_files": metadata_files + } + + total_discovered = sum(len(files) for files in discovery_result.values()) + logger.info(f"File discovery completed for {base_url}: {total_discovered} files discovered") + + return discovery_result + + except Exception as e: + logger.error(f"Unexpected error during file discovery for {base_url}: {e}") + # Return empty results on failure + return { + "robots_sitemaps": [], + "llm_files": [], + "sitemap_files": [], + "metadata_files": [] + } diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py index d66a2a8281..8e3f0fb8f0 100644 --- a/python/src/server/services/crawling/helpers/url_handler.py +++ b/python/src/server/services/crawling/helpers/url_handler.py @@ -13,11 +13,12 @@ class URLHandler: """Helper class for URL operations.""" - + @staticmethod def is_sitemap(url: str) -> bool: """ Check if a URL is a sitemap with error handling. + Enhanced to detect more sitemap variations. Args: url: URL to check @@ -26,11 +27,62 @@ def is_sitemap(url: str) -> bool: True if URL is a sitemap, False otherwise """ try: - return url.endswith('sitemap.xml') or 'sitemap' in urlparse(url).path + parsed = urlparse(url) + path = parsed.path.lower() + + # Check for various sitemap patterns + sitemap_patterns = [ + 'sitemap.xml', + 'sitemap_index.xml', + 'sitemapindex.xml', + 'sitemap-', # For numbered sitemaps like sitemap-1.xml + '/sitemaps/', # For sitemaps in subdirectory + '/sitemap/', # For sitemap directory + ] + + # Also check if 'sitemap' is in the path and it's an XML file + has_sitemap_in_path = 'sitemap' in path and path.endswith('.xml') + + return any(pattern in path for pattern in sitemap_patterns) or path.endswith('.xml.gz') or has_sitemap_in_path + except Exception as e: logger.warning(f"Error checking if URL is sitemap: {e}") return False - + + @staticmethod + def is_llm_file(url: str) -> bool: + """ + Check if a URL points to an LLM-specific file with error handling. 
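+        For example (illustrative), "https://example.com/llms-full.txt" and
+        "https://docs.example.com/llms.txt" return True, while
+        "https://example.com/readme.txt" returns False.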
+ + Args: + url: URL to check + + Returns: + True if URL is an LLM file, False otherwise + """ + try: + parsed = urlparse(url) + path = parsed.path.lower() + + # LLM file patterns + llm_patterns = [ + 'llms.txt', + 'llms-full.txt', + 'llms.md', + 'llms-ctx.txt', + 'llms-context.txt', + '/llms.txt', + '/llms-full.txt', + '/llms.md', + '/llms-ctx.txt' + ] + + return any(path.endswith(pattern) for pattern in llm_patterns) + + except Exception as e: + logger.warning(f"Error checking if URL is LLM file: {e}") + return False + @staticmethod def is_txt(url: str) -> bool: """ @@ -47,7 +99,7 @@ def is_txt(url: str) -> bool: except Exception as e: logger.warning(f"Error checking if URL is text file: {e}") return False - + @staticmethod def is_binary_file(url: str) -> bool: """ @@ -63,7 +115,7 @@ def is_binary_file(url: str) -> bool: # Remove query parameters and fragments for cleaner extension checking parsed = urlparse(url) path = parsed.path.lower() - + # Comprehensive list of binary and non-HTML file extensions binary_extensions = { # Archives @@ -83,19 +135,19 @@ def is_binary_file(url: str) -> bool: # Development files (usually not meant to be crawled as pages) '.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib' } - + # Check if the path ends with any binary extension for ext in binary_extensions: if path.endswith(ext): logger.debug(f"Skipping binary file: {url} (matched extension: {ext})") return True - + return False except Exception as e: logger.warning(f"Error checking if URL is binary file: {e}") # In case of error, don't skip the URL (safer to attempt crawl than miss content) return False - + @staticmethod def transform_github_url(url: str) -> str: """ @@ -115,7 +167,7 @@ def transform_github_url(url: str) -> str: raw_url = f'https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}' logger.info(f"Transformed GitHub file URL to raw: {url} -> {raw_url}") return raw_url - + # Pattern for GitHub directory URLs github_dir_pattern = r'https://github\.com/([^/]+)/([^/]+)/tree/([^/]+)/(.+)' match = re.match(github_dir_pattern, url) @@ -123,5 +175,5 @@ def transform_github_url(url: str) -> str: # For directories, we can't directly get raw content # Return original URL but log a warning logger.warning(f"GitHub directory URL detected: {url} - consider using specific file URLs or GitHub API") - - return url \ No newline at end of file + + return url diff --git a/python/tests/test_file_discovery.py b/python/tests/test_file_discovery.py new file mode 100644 index 0000000000..9eef0b5e6c --- /dev/null +++ b/python/tests/test_file_discovery.py @@ -0,0 +1,402 @@ +"""Comprehensive test suite for FileDiscoveryService.""" + +import asyncio +import json +from unittest.mock import AsyncMock, Mock, patch, MagicMock +import pytest +import aiohttp +from aiohttp import ClientResponse + +from src.server.services.crawling.helpers.file_discovery import FileDiscoveryService + + +class TestFileDiscoveryService: + """Test suite for FileDiscoveryService class.""" + + @pytest.fixture + def discovery_service(self): + """Create a FileDiscoveryService instance for testing.""" + return FileDiscoveryService() + + @pytest.fixture + def mock_response(self): + """Create a mock HTTP response for testing.""" + response = AsyncMock(spec=ClientResponse) + response.status = 200 + response.text = AsyncMock() + return response + + def test_initialization(self, discovery_service): + """Test service initialization with proper defaults.""" + assert discovery_service.timeout.total == 10 + assert 
"CRAWL_DISCOVERY_LLM_FILES" in discovery_service._fallback_defaults + assert "CRAWL_DISCOVERY_SITEMAP_FILES" in discovery_service._fallback_defaults + assert "CRAWL_DISCOVERY_METADATA_FILES" in discovery_service._fallback_defaults + + @pytest.mark.asyncio + async def test_get_file_list_from_db_success(self, discovery_service): + """Test successful database settings retrieval.""" + with patch('src.server.services.crawling.helpers.file_discovery.credential_service') as mock_cred: + # Mock successful database response with async return + mock_cred.get_credential = AsyncMock(return_value='["llms.txt", "llms-full.txt"]') + + result = await discovery_service._get_file_list_from_db("CRAWL_DISCOVERY_LLM_FILES") + + assert result == ["llms.txt", "llms-full.txt"] + mock_cred.get_credential.assert_called_once_with("CRAWL_DISCOVERY_LLM_FILES", decrypt=False) + + @pytest.mark.asyncio + async def test_get_file_list_from_db_dict_format(self, discovery_service): + """Test database settings retrieval with dict format.""" + with patch('src.server.services.crawling.helpers.file_discovery.credential_service') as mock_cred: + # Mock database response in dict format with async return + mock_cred.get_credential = AsyncMock(return_value={"value": '["sitemap.xml", "sitemap_index.xml"]'}) + + result = await discovery_service._get_file_list_from_db("CRAWL_DISCOVERY_SITEMAP_FILES") + + assert result == ["sitemap.xml", "sitemap_index.xml"] + + @pytest.mark.asyncio + async def test_get_file_list_from_db_fallback_on_none(self, discovery_service): + """Test fallback to defaults when database returns None.""" + with patch('src.server.services.crawling.helpers.file_discovery.credential_service') as mock_cred: + mock_cred.get_credential = AsyncMock(return_value=None) + + result = await discovery_service._get_file_list_from_db("CRAWL_DISCOVERY_LLM_FILES") + + assert result == discovery_service._fallback_defaults["CRAWL_DISCOVERY_LLM_FILES"] + + @pytest.mark.asyncio + async def test_get_file_list_from_db_fallback_on_json_error(self, discovery_service): + """Test fallback to defaults on JSON parsing errors.""" + with patch('src.server.services.crawling.helpers.file_discovery.credential_service') as mock_cred: + mock_cred.get_credential = AsyncMock(return_value="invalid json[") + + result = await discovery_service._get_file_list_from_db("CRAWL_DISCOVERY_LLM_FILES") + + assert result == discovery_service._fallback_defaults["CRAWL_DISCOVERY_LLM_FILES"] + + @pytest.mark.asyncio + async def test_get_file_list_from_db_fallback_on_exception(self, discovery_service): + """Test fallback to defaults on unexpected exceptions.""" + with patch('src.server.services.crawling.helpers.file_discovery.credential_service') as mock_cred: + mock_cred.get_credential = AsyncMock(side_effect=Exception("Database error")) + + result = await discovery_service._get_file_list_from_db("CRAWL_DISCOVERY_LLM_FILES") + + assert result == discovery_service._fallback_defaults["CRAWL_DISCOVERY_LLM_FILES"] + + @pytest.mark.asyncio + async def test_check_file_exists_success(self, discovery_service, mock_response): + """Test successful file existence check.""" + mock_response.status = 200 + + with patch('aiohttp.ClientSession.head') as mock_head: + mock_head.return_value.__aenter__ = AsyncMock(return_value=mock_response) + mock_head.return_value.__aexit__ = AsyncMock(return_value=None) + + async with aiohttp.ClientSession() as session: + result = await discovery_service._check_file_exists(session, "https://example.com/llms.txt") + + assert result is True + + 
@pytest.mark.asyncio + async def test_check_file_exists_not_found(self, discovery_service, mock_response): + """Test file existence check with 404 response.""" + mock_response.status = 404 + + with patch('aiohttp.ClientSession.head', return_value=mock_response): + async with aiohttp.ClientSession() as session: + result = await discovery_service._check_file_exists(session, "https://example.com/nonexistent.txt") + + assert result is False + + @pytest.mark.asyncio + async def test_check_file_exists_exception(self, discovery_service): + """Test file existence check with network exception.""" + with patch('aiohttp.ClientSession.head', side_effect=aiohttp.ClientError("Network error")): + async with aiohttp.ClientSession() as session: + result = await discovery_service._check_file_exists(session, "https://example.com/test.txt") + + assert result is False + + @pytest.mark.asyncio + async def test_discover_robots_sitemaps_success(self, discovery_service, mock_response): + """Test successful robots.txt sitemap extraction.""" + robots_content = """User-agent: * +Allow: / + +Sitemap: https://example.com/sitemap.xml +Sitemap: https://example.com/news-sitemap.xml +sitemap: https://example.com/products-sitemap.xml +""" + mock_response.status = 200 + mock_response.text = AsyncMock(return_value=robots_content) + + with patch('aiohttp.ClientSession.get') as mock_get: + mock_get.return_value.__aenter__ = AsyncMock(return_value=mock_response) + mock_get.return_value.__aexit__ = AsyncMock(return_value=None) + + result = await discovery_service.discover_robots_sitemaps("https://example.com") + + expected = [ + "https://example.com/sitemap.xml", + "https://example.com/news-sitemap.xml", + "https://example.com/products-sitemap.xml" + ] + assert result == expected + + @pytest.mark.asyncio + async def test_discover_robots_sitemaps_relative_urls(self, discovery_service, mock_response): + """Test robots.txt with relative sitemap URLs.""" + robots_content = """User-agent: * +Sitemap: /sitemap.xml +Sitemap: /sitemaps/main.xml +""" + mock_response.status = 200 + mock_response.text.return_value = robots_content + + with patch('aiohttp.ClientSession.get', return_value=mock_response): + result = await discovery_service.discover_robots_sitemaps("https://example.com") + + expected = [ + "https://example.com/sitemap.xml", + "https://example.com/sitemaps/main.xml" + ] + assert result == expected + + @pytest.mark.asyncio + async def test_discover_robots_sitemaps_not_found(self, discovery_service, mock_response): + """Test robots.txt not found.""" + mock_response.status = 404 + + with patch('aiohttp.ClientSession.get', return_value=mock_response): + result = await discovery_service.discover_robots_sitemaps("https://example.com") + + assert result == [] + + @pytest.mark.asyncio + async def test_discover_robots_sitemaps_exception(self, discovery_service): + """Test robots.txt discovery with network exception.""" + with patch('aiohttp.ClientSession.get', side_effect=aiohttp.ClientError("Network error")): + result = await discovery_service.discover_robots_sitemaps("https://example.com") + + assert result == [] + + @pytest.mark.asyncio + async def test_discover_llm_files_success(self, discovery_service): + """Test successful LLM file discovery.""" + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["llms.txt", "llms-full.txt"]): + with patch.object(discovery_service, '_check_file_exists', side_effect=[True, False]) as mock_check: + result = await discovery_service.discover_llm_files("https://example.com") + 
+ assert result == ["https://example.com/llms.txt"] + assert mock_check.call_count == 2 + + @pytest.mark.asyncio + async def test_discover_llm_files_none_found(self, discovery_service): + """Test LLM file discovery when no files exist.""" + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["llms.txt"]): + with patch.object(discovery_service, '_check_file_exists', return_value=False): + result = await discovery_service.discover_llm_files("https://example.com") + + assert result == [] + + @pytest.mark.asyncio + async def test_discover_llm_files_exception(self, discovery_service): + """Test LLM file discovery with exception.""" + with patch.object(discovery_service, '_get_file_list_from_db', side_effect=Exception("Database error")): + result = await discovery_service.discover_llm_files("https://example.com") + + assert result == [] + + @pytest.mark.asyncio + async def test_discover_sitemap_files_standard(self, discovery_service): + """Test standard sitemap file discovery.""" + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["sitemap.xml", "sitemap_index.xml"]): + with patch.object(discovery_service, '_check_file_exists', side_effect=[True, False]): + result = await discovery_service.discover_sitemap_files("https://example.com") + + assert result == ["https://example.com/sitemap.xml"] + + @pytest.mark.asyncio + async def test_discover_sitemap_files_wildcard(self, discovery_service): + """Test sitemap discovery with wildcard patterns.""" + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["sitemap-*.xml"]): + # Mock that sitemap-1.xml and sitemap-3.xml exist + check_responses = [True, False, True, False, False] # 1 exists, 2 doesn't, 3 exists, 4&5 don't + with patch.object(discovery_service, '_check_file_exists', side_effect=check_responses): + result = await discovery_service.discover_sitemap_files("https://example.com") + + expected = ["https://example.com/sitemap-1.xml", "https://example.com/sitemap-3.xml"] + assert result == expected + + @pytest.mark.asyncio + async def test_discover_metadata_files_success(self, discovery_service): + """Test successful metadata file discovery.""" + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["robots.txt", ".well-known/security.txt"]): + with patch.object(discovery_service, '_check_file_exists', side_effect=[True, False]): + result = await discovery_service.discover_metadata_files("https://example.com") + + assert result == ["https://example.com/robots.txt"] + + @pytest.mark.asyncio + async def test_discover_all_files_comprehensive(self, discovery_service): + """Test comprehensive file discovery with all methods.""" + # Mock all the individual discovery methods + with patch.object(discovery_service, 'discover_robots_sitemaps', return_value=["https://example.com/sitemap.xml"]): + with patch.object(discovery_service, 'discover_llm_files', return_value=["https://example.com/llms.txt"]): + with patch.object(discovery_service, 'discover_sitemap_files', return_value=["https://example.com/sitemap_index.xml"]): + with patch.object(discovery_service, 'discover_metadata_files', return_value=["https://example.com/robots.txt"]): + result = await discovery_service.discover_all_files("https://example.com") + + expected = { + "robots_sitemaps": ["https://example.com/sitemap.xml"], + "llm_files": ["https://example.com/llms.txt"], + "sitemap_files": ["https://example.com/sitemap_index.xml"], + "metadata_files": ["https://example.com/robots.txt"] + } + assert 
result == expected + + @pytest.mark.asyncio + async def test_discover_all_files_with_exceptions(self, discovery_service): + """Test comprehensive discovery with some methods failing.""" + # Mock some methods to succeed and some to fail + with patch.object(discovery_service, 'discover_robots_sitemaps', return_value=["https://example.com/sitemap.xml"]): + with patch.object(discovery_service, 'discover_llm_files', side_effect=Exception("LLM discovery failed")): + with patch.object(discovery_service, 'discover_sitemap_files', return_value=["https://example.com/sitemap_index.xml"]): + with patch.object(discovery_service, 'discover_metadata_files', side_effect=Exception("Metadata discovery failed")): + result = await discovery_service.discover_all_files("https://example.com") + + expected = { + "robots_sitemaps": ["https://example.com/sitemap.xml"], + "llm_files": [], # Failed, so empty + "sitemap_files": ["https://example.com/sitemap_index.xml"], + "metadata_files": [] # Failed, so empty + } + assert result == expected + + @pytest.mark.asyncio + async def test_discover_all_files_complete_failure(self, discovery_service): + """Test discovery with complete failure returning empty results.""" + with patch.object(discovery_service, 'discover_robots_sitemaps', side_effect=Exception("Complete failure")): + result = await discovery_service.discover_all_files("https://example.com") + + expected = { + "robots_sitemaps": [], + "llm_files": [], + "sitemap_files": [], + "metadata_files": [] + } + assert result == expected + + @pytest.mark.asyncio + async def test_concurrent_discovery_performance(self, discovery_service): + """Test that discovery methods run concurrently for performance.""" + # This test verifies that asyncio.gather is used for concurrent execution + with patch('asyncio.gather') as mock_gather: + mock_gather.return_value = [[], [], [], []] # Empty results + + await discovery_service.discover_all_files("https://example.com") + + # Verify asyncio.gather was called (indicating concurrent execution) + mock_gather.assert_called_once() + + # Check that all 4 discovery methods were passed to gather + args = mock_gather.call_args[0] + assert len(args) == 4 # 4 discovery methods + + def test_fallback_defaults_completeness(self, discovery_service): + """Test that fallback defaults contain all required settings.""" + defaults = discovery_service._fallback_defaults + + # Check all required keys are present + required_keys = [ + "CRAWL_DISCOVERY_LLM_FILES", + "CRAWL_DISCOVERY_SITEMAP_FILES", + "CRAWL_DISCOVERY_METADATA_FILES" + ] + + for key in required_keys: + assert key in defaults + assert isinstance(defaults[key], list) + assert len(defaults[key]) > 0 # Should have at least one item + + def test_fallback_defaults_content(self, discovery_service): + """Test that fallback defaults contain expected file patterns.""" + defaults = discovery_service._fallback_defaults + + # Check LLM files + llm_files = defaults["CRAWL_DISCOVERY_LLM_FILES"] + assert "llms.txt" in llm_files + assert "llms-full.txt" in llm_files + + # Check sitemap files + sitemap_files = defaults["CRAWL_DISCOVERY_SITEMAP_FILES"] + assert "sitemap.xml" in sitemap_files + assert "sitemap_index.xml" in sitemap_files + + # Check metadata files + metadata_files = defaults["CRAWL_DISCOVERY_METADATA_FILES"] + assert "robots.txt" in metadata_files + assert ".well-known/security.txt" in metadata_files + + +class TestFileDiscoveryIntegration: + """Integration tests for FileDiscoveryService with real network scenarios.""" + + @pytest.fixture + def 
discovery_service(self): + """Create a FileDiscoveryService instance for integration testing.""" + return FileDiscoveryService() + + @pytest.mark.asyncio + async def test_timeout_handling(self, discovery_service): + """Test that timeouts are handled gracefully.""" + # Mock a slow response that times out + slow_response = AsyncMock() + slow_response.text.side_effect = asyncio.TimeoutError("Request timed out") + + with patch('aiohttp.ClientSession.get', return_value=slow_response): + result = await discovery_service.discover_robots_sitemaps("https://slow-site.com") + + assert result == [] # Should return empty list on timeout + + @pytest.mark.asyncio + async def test_malformed_robots_txt(self, discovery_service): + """Test handling of malformed robots.txt content.""" + malformed_content = """This is not a valid robots.txt +Random text here +Sitemap: https://example.com/sitemap.xml +More random content +sitemap: not-a-url +Sitemap: https://example.com/valid-sitemap.xml +""" + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text.return_value = malformed_content + + with patch('aiohttp.ClientSession.get', return_value=mock_response): + result = await discovery_service.discover_robots_sitemaps("https://example.com") + + # Should extract valid sitemap URLs and handle malformed ones gracefully + assert "https://example.com/sitemap.xml" in result + assert "https://example.com/valid-sitemap.xml" in result + # Should handle malformed entries without crashing + + @pytest.mark.asyncio + async def test_edge_case_url_handling(self, discovery_service): + """Test edge cases in URL handling and construction.""" + test_cases = [ + ("https://example.com/", "llms.txt", "https://example.com/llms.txt"), + ("https://example.com", "llms.txt", "https://example.com/llms.txt"), + ("https://example.com/path/", "llms.txt", "https://example.com/path/llms.txt"), + ("https://example.com/path", ".well-known/security.txt", "https://example.com/.well-known/security.txt"), + ] + + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["llms.txt"]): + with patch.object(discovery_service, '_check_file_exists', return_value=True) as mock_check: + for base_url, file_pattern, expected_url in test_cases: + await discovery_service.discover_llm_files(base_url) + # Verify the correct URL was constructed and checked + mock_check.assert_called_with(mock_check.call_args[0][0], expected_url) \ No newline at end of file
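
For a quick manual check of the new helper, the discovery service can also be exercised on its own; a minimal sketch (example.com is a placeholder URL, and database-backed settings fall back to the hardcoded defaults when unreachable):

    import asyncio

    from src.server.services.crawling.helpers.file_discovery import FileDiscoveryService

    async def main() -> None:
        # Probe the site's root for llms files, sitemaps, and metadata files
        service = FileDiscoveryService()
        results = await service.discover_all_files("https://example.com")
        for category, urls in results.items():
            print(category, urls)

    asyncio.run(main())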