diff --git a/python/src/server/api_routes/settings_api.py b/python/src/server/api_routes/settings_api.py
index 48e2d76479..e8990e063c 100644
--- a/python/src/server/api_routes/settings_api.py
+++ b/python/src/server/api_routes/settings_api.py
@@ -135,6 +135,9 @@ async def create_credential(request: CredentialRequest):
     "DISCONNECT_SCREEN_ENABLED": "true", # Show disconnect screen when server is unavailable
     "PROJECTS_ENABLED": "false", # Enable project management features
     "LOGFIRE_ENABLED": "false", # Enable Pydantic Logfire integration
+    "CRAWL_DISCOVERY_LLM_FILES": '["llms-full.txt", "llms-ctx.txt", "llms.md", "llms.txt"]', # LLM files to auto-discover (priority order)
+    "CRAWL_DISCOVERY_SITEMAP_FILES": '["sitemap.xml", "sitemap_index.xml", "sitemap-*.xml"]', # Sitemap files to auto-discover
+    "CRAWL_DISCOVERY_METADATA_FILES": '["robots.txt", ".well-known/security.txt", ".well-known/humans.txt", "humans.txt", "security.txt"]', # Metadata files to auto-discover
 }

diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index 5b5d43044e..eefa26e435 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -42,6 +42,7 @@ def _ensure_socketio_imports():
 # Import helpers
 from .helpers.url_handler import URLHandler
 from .helpers.site_config import SiteConfig
+from .helpers.file_discovery import FileDiscoveryService

 # Import operations
 from .document_storage_operations import DocumentStorageOperations
@@ -91,6 +92,7 @@ def __init__(self, crawler=None, supabase_client=None, progress_id=None):
         # Initialize helpers
         self.url_handler = URLHandler()
         self.site_config = SiteConfig()
+        self.file_discovery = FileDiscoveryService()
         self.markdown_generator = self.site_config.get_markdown_generator()

         # Initialize strategies
@@ -129,6 +131,45 @@ def _check_cancellation(self):
         if self._cancelled:
             raise asyncio.CancelledError("Crawl operation was cancelled by user")

+    async def auto_discover_files(self, base_url: str) -> Dict[str, List[str]]:
+        """
+        Automatically discover files using FileDiscoveryService.
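+        The result groups discovered URLs by category; an illustrative shape
+        (hypothetical URLs, assuming the site publishes an llms file and a
+        sitemap) is:
+
+            {
+                "robots_sitemaps": ["https://example.com/sitemap.xml"],
+                "llm_files": ["https://example.com/llms-full.txt"],
+                "sitemap_files": ["https://example.com/sitemap.xml"],
+                "metadata_files": ["https://example.com/robots.txt"],
+            }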
+ + Args: + base_url: Base URL of the website + + Returns: + Dictionary with discovered files categorized by type + """ + safe_logfire_info(f"🔥 DEBUG: auto_discover_files called with base_url: {base_url}") + try: + discovery_results = await self.file_discovery.discover_all_files(base_url) + + # Log discovery results + total_discovered = sum(len(files) for files in discovery_results.values()) + if total_discovered > 0: + safe_logfire_info( + f"File discovery completed for {base_url} | " + f"total_files={total_discovered} | " + f"llm_files={len(discovery_results.get('llm_files', []))} | " + f"sitemaps={len(discovery_results.get('sitemap_files', []))} | " + f"robots_sitemaps={len(discovery_results.get('robots_sitemaps', []))}" + ) + else: + safe_logfire_info(f"No discoverable files found for {base_url}") + + return discovery_results + + except Exception as e: + safe_logfire_error(f"File discovery failed for {base_url} | error={str(e)}") + # Return empty results on discovery failure to not block main crawl + return { + "robots_sitemaps": [], + "llm_files": [], + "sitemap_files": [], + "metadata_files": [] + } + async def _create_crawl_progress_callback( self, base_status: str ) -> Callable[[str, int, str], Awaitable[None]]: @@ -490,29 +531,141 @@ async def code_progress_callback(data: dict): async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple: """ Detect URL type and perform appropriate crawling. + Includes automatic file discovery before main crawling logic. Returns: Tuple of (crawl_results, crawl_type) """ _ensure_socketio_imports() + + # DEBUG: Check if this method is being called + safe_logfire_info(f"🚨 DEBUG: _crawl_by_url_type called with URL: {url}") crawl_results = [] crawl_type = None + + # Perform automatic file discovery before main crawling + try: + if self.progress_id: + self.progress_state.update({ + "status": "discovering", + "percentage": 5, + "log": "Discovering LLM files and sitemaps...", + }) + await update_crawl_progress(self.progress_id, self.progress_state) + + base_url = f"{urlparse(url).scheme}://{urlparse(url).netloc}" + discovery_results = await self.auto_discover_files(base_url) + + # Debug log the discovery results structure + safe_logfire_info(f"🔍 DEBUG: Discovery results = {discovery_results}") + + # If we discovered LLM files, prioritize them and STOP regular crawling + if discovery_results.get('llm_files'): + llm_files = discovery_results['llm_files'] + safe_logfire_info(f"🎯 DISCOVERED LLM FILES: {llm_files} - Will crawl these instead of regular website") + safe_logfire_info(f"📋 CRAWLING DECISION: Using LLM files for content - {llm_files}") + + if self.progress_id: + self.progress_state.update({ + "status": "crawling", + "percentage": 8, + "log": f"Crawling discovered LLM files ({len(llm_files)} files) - skipping regular crawl...", + }) + await update_crawl_progress(self.progress_id, self.progress_state) + + # Log exactly what we're about to crawl + safe_logfire_info(f"🚀 STARTING CRAWL: LLM files only - {llm_files}") + + # Crawl LLM files as batch + llm_crawl_results = await self.crawl_batch_with_progress( + llm_files, + progress_callback=await self._create_crawl_progress_callback("crawling"), + start_progress=8, + end_progress=95, # Complete the progress since we're done + ) + if llm_crawl_results: + crawl_results.extend(llm_crawl_results) + safe_logfire_info(f"🎉 SUCCESS: LLM files crawled successfully! Found {len(llm_crawl_results)} results. 
STOPPING regular crawl.") + return crawl_results, "llm_files_discovered" + else: + safe_logfire_info(f"⚠️ LLM files discovered but crawling failed, falling back to regular crawl") + # Continue with normal crawling logic below + + # If we discovered additional sitemaps from robots.txt, add them to processing + # (only if no LLM files were found - LLM files take priority) + if discovery_results.get('robots_sitemaps') and not discovery_results.get('llm_files'): + robots_sitemaps = discovery_results['robots_sitemaps'] + safe_logfire_info(f"Found sitemaps in robots.txt: {robots_sitemaps}") + + if self.progress_id: + self.progress_state.update({ + "status": "crawling", + "percentage": 12, + "log": f"Processing sitemaps from robots.txt ({len(robots_sitemaps)} sitemaps)...", + }) + await update_crawl_progress(self.progress_id, self.progress_state) + + # Process discovered sitemaps + sitemap_crawl_results = [] + for sitemap_url in robots_sitemaps: + sitemap_urls = self.parse_sitemap(sitemap_url) + if sitemap_urls: + batch_results = await self.crawl_batch_with_progress( + sitemap_urls, + progress_callback=await self._create_crawl_progress_callback("crawling"), + start_progress=12, + end_progress=90, + ) + if batch_results: + sitemap_crawl_results.extend(batch_results) + + if sitemap_crawl_results: + crawl_results.extend(sitemap_crawl_results) + safe_logfire_info(f"📋 CRAWLING DECISION: Using robots.txt sitemaps for content") + safe_logfire_info(f"🎉 SUCCESS: Sitemaps from robots.txt crawled successfully, skipping regular crawl. Found {len(sitemap_crawl_results)} results.") + return crawl_results, "robots_sitemaps_discovered" + + except Exception as e: + safe_logfire_error(f"File discovery integration failed: {e}") + # Continue with normal crawling if discovery fails + + # If no discovery results, log that we're falling back to regular crawling + if not crawl_results: + safe_logfire_info(f"📋 CRAWLING DECISION: No LLM files or sitemaps discovered, using regular website crawling for {url}") - if self.url_handler.is_txt(url): - # Handle text files + # Check if this is specifically an LLM file + if self.url_handler.is_llm_file(url): + # Handle LLM files specifically + if self.progress_id: + self.progress_state.update({ + "status": "crawling", + "percentage": 15, + "log": "Detected LLM file, fetching content...", + }) + await update_crawl_progress(self.progress_id, self.progress_state) + crawl_results.extend(await self.crawl_markdown_file( + url, + progress_callback=await self._create_crawl_progress_callback("crawling"), + start_progress=15, + end_progress=25, + )) + crawl_type = "llm_file" + + elif self.url_handler.is_txt(url): + # Handle other text files if self.progress_id: self.progress_state.update({ "status": "crawling", - "percentage": 10, + "percentage": 15, "log": "Detected text file, fetching content...", }) await update_crawl_progress(self.progress_id, self.progress_state) crawl_results = await self.crawl_markdown_file( url, progress_callback=await self._create_crawl_progress_callback("crawling"), - start_progress=10, - end_progress=20, + start_progress=15, + end_progress=25, ) crawl_type = "text_file" @@ -521,7 +674,7 @@ async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple: if self.progress_id: self.progress_state.update({ "status": "crawling", - "percentage": 10, + "percentage": 15, "log": "Detected sitemap, parsing URLs...", }) await update_crawl_progress(self.progress_id, self.progress_state) @@ -532,7 +685,7 @@ async def _crawl_by_url_type(self, url: str, request: 
Dict[str, Any]) -> tuple: if self.progress_id: self.progress_state.update({ "status": "crawling", - "percentage": 15, + "percentage": 20, "log": f"Starting batch crawl of {len(sitemap_urls)} URLs...", }) await update_crawl_progress(self.progress_id, self.progress_state) @@ -540,8 +693,8 @@ async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple: crawl_results = await self.crawl_batch_with_progress( sitemap_urls, progress_callback=await self._create_crawl_progress_callback("crawling"), - start_progress=15, - end_progress=20, + start_progress=20, + end_progress=25, ) crawl_type = "sitemap" @@ -550,7 +703,7 @@ async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple: if self.progress_id: self.progress_state.update({ "status": "crawling", - "percentage": 10, + "percentage": 15, "log": f"Starting recursive crawl with max depth {request.get('max_depth', 1)}...", }) await update_crawl_progress(self.progress_id, self.progress_state) @@ -564,8 +717,8 @@ async def _crawl_by_url_type(self, url: str, request: Dict[str, Any]) -> tuple: max_depth=max_depth, max_concurrent=None, # Let strategy use settings progress_callback=await self._create_crawl_progress_callback("crawling"), - start_progress=10, - end_progress=20, + start_progress=15, + end_progress=25, ) crawl_type = "webpage" diff --git a/python/src/server/services/crawling/helpers/file_discovery.py b/python/src/server/services/crawling/helpers/file_discovery.py new file mode 100644 index 0000000000..cc482c44b8 --- /dev/null +++ b/python/src/server/services/crawling/helpers/file_discovery.py @@ -0,0 +1,275 @@ +""" +File Discovery Service + +Handles automatic discovery of llms.txt, sitemap.xml, and related files +using database-driven configuration with fallback defaults. +""" + +import asyncio +import json +import re +from urllib.parse import urljoin + +import aiohttp + +from ....config.logfire_config import get_logger +from ...credential_service import credential_service + +logger = get_logger(__name__) + + +class FileDiscoveryService: + """Service for discovering files on websites automatically.""" + + def __init__(self): + """Initialize the file discovery service.""" + self.timeout = aiohttp.ClientTimeout(total=10) # 10 second timeout for discovery + + # Hardcoded fallback defaults if database access fails + self._fallback_defaults = { + "CRAWL_DISCOVERY_LLM_FILES": ["llms-full.txt", "llms-ctx.txt", "llms.md", "llms.txt"], + "CRAWL_DISCOVERY_SITEMAP_FILES": ["sitemap.xml", "sitemap_index.xml", "sitemap-*.xml"], + "CRAWL_DISCOVERY_METADATA_FILES": ["robots.txt", ".well-known/security.txt", ".well-known/humans.txt", "humans.txt", "security.txt"] + } + + async def _get_file_list_from_db(self, setting_key: str) -> list[str]: + """ + Get file list from database settings with fallback to hardcoded defaults. 
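+        Settings are expected to be stored as JSON array strings; for example
+        (illustrative), a stored value of '["llms-full.txt", "llms.txt"]' for
+        CRAWL_DISCOVERY_LLM_FILES parses to ["llms-full.txt", "llms.txt"], and any
+        missing or unparsable value falls back to the hardcoded defaults above.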
+ + Args: + setting_key: The database setting key + + Returns: + List of files to discover + """ + try: + # Get setting from database via credential service + raw_value = await credential_service.get_credential(setting_key, decrypt=False) + + if raw_value is None: + logger.info(f"No database setting found for {setting_key}, using fallback defaults") + return self._fallback_defaults.get(setting_key, []) + + # Parse JSON string to list + if isinstance(raw_value, str): + file_list = json.loads(raw_value) + elif isinstance(raw_value, dict) and 'value' in raw_value: + file_list = json.loads(raw_value['value']) + else: + file_list = raw_value + + if not isinstance(file_list, list): + logger.warning(f"Setting {setting_key} is not a list, using fallback defaults") + return self._fallback_defaults.get(setting_key, []) + + logger.info(f"Loaded {len(file_list)} files from database setting {setting_key}") + return file_list + + except (json.JSONDecodeError, TypeError, KeyError) as e: + logger.error(f"Error parsing database setting {setting_key}: {e}, using fallback defaults") + return self._fallback_defaults.get(setting_key, []) + except Exception as e: + logger.error(f"Unexpected error getting database setting {setting_key}: {e}, using fallback defaults") + return self._fallback_defaults.get(setting_key, []) + + async def _check_file_exists(self, session: aiohttp.ClientSession, url: str) -> bool: + """ + Check if a file exists at the given URL using HEAD request. + + Args: + session: HTTP session + url: URL to check + + Returns: + True if file exists and is accessible + """ + try: + async with session.head(url, timeout=self.timeout) as response: + # Consider 2xx status codes as successful + return 200 <= response.status < 300 + except Exception as e: + logger.debug(f"File check failed for {url}: {e}") + return False + + async def discover_robots_sitemaps(self, base_url: str) -> list[str]: + """ + Extract sitemap URLs from robots.txt file. + + Args: + base_url: Base URL of the website + + Returns: + List of sitemap URLs found in robots.txt + """ + sitemaps = [] + robots_url = urljoin(base_url, "/robots.txt") + + try: + async with aiohttp.ClientSession() as session: + async with session.get(robots_url, timeout=self.timeout) as response: + if response.status == 200: + robots_content = await response.text() + + # Parse all Sitemap directives (case-insensitive) + sitemap_pattern = re.compile(r'^sitemap:\s*(.+)$', re.IGNORECASE | re.MULTILINE) + matches = sitemap_pattern.findall(robots_content) + + for match in matches: + sitemap_url = match.strip() + # Convert relative URLs to absolute + if not sitemap_url.startswith(('http://', 'https://')): + sitemap_url = urljoin(base_url, sitemap_url) + sitemaps.append(sitemap_url) + + logger.info(f"Found {len(sitemaps)} sitemaps in robots.txt: {sitemaps}") + else: + logger.debug(f"robots.txt not accessible at {robots_url} (status: {response.status})") + + except Exception as e: + logger.debug(f"Error fetching robots.txt from {robots_url}: {e}") + + return sitemaps + + async def discover_llm_files(self, base_url: str) -> list[str]: + """ + Discover LLM-specific files using database-configured file lists. + Returns the highest priority LLM file found, not all of them. 
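+        Priority follows the configured list order, so a site serving both
+        /llms-full.txt and /llms.txt would, with the default configuration,
+        yield only (illustrative):
+
+            ["https://example.com/llms-full.txt"]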
+ + Args: + base_url: Base URL of the website + + Returns: + List containing the single best LLM file URL, or empty list if none found + """ + discovered_files = [] + llm_files = await self._get_file_list_from_db("CRAWL_DISCOVERY_LLM_FILES") + + try: + async with aiohttp.ClientSession() as session: + # Check each LLM file pattern in priority order (first in list = highest priority) + for file_pattern in llm_files: + # Simple patterns without wildcards + if "*" not in file_pattern: + file_url = urljoin(base_url, f"/{file_pattern.lstrip('/')}") + if await self._check_file_exists(session, file_url): + logger.info(f"Discovered LLM file: {file_url}") + # Return immediately with the highest priority file found + return [file_url] + + except Exception as e: + logger.error(f"Error during LLM file discovery for {base_url}: {e}") + + return discovered_files + + async def discover_sitemap_files(self, base_url: str) -> list[str]: + """ + Discover sitemap files using database-configured patterns with wildcard support. + + Args: + base_url: Base URL of the website + + Returns: + List of discovered sitemap URLs + """ + discovered_files = [] + sitemap_files = await self._get_file_list_from_db("CRAWL_DISCOVERY_SITEMAP_FILES") + + try: + async with aiohttp.ClientSession() as session: + for file_pattern in sitemap_files: + if "*" in file_pattern: + # Handle wildcard patterns (simplified for now) + # For "sitemap-*.xml", try common numbered patterns + if file_pattern == "sitemap-*.xml": + for i in range(1, 6): # Try sitemap-1.xml to sitemap-5.xml + file_url = urljoin(base_url, f"/sitemap-{i}.xml") + if await self._check_file_exists(session, file_url): + discovered_files.append(file_url) + logger.info(f"Discovered numbered sitemap: {file_url}") + else: + # Simple file patterns + file_url = urljoin(base_url, f"/{file_pattern.lstrip('/')}") + if await self._check_file_exists(session, file_url): + discovered_files.append(file_url) + logger.info(f"Discovered sitemap file: {file_url}") + + except Exception as e: + logger.error(f"Error during sitemap discovery for {base_url}: {e}") + + return discovered_files + + async def discover_metadata_files(self, base_url: str) -> list[str]: + """ + Discover metadata files using database-configured lists. + + Args: + base_url: Base URL of the website + + Returns: + List of discovered metadata file URLs + """ + discovered_files = [] + metadata_files = await self._get_file_list_from_db("CRAWL_DISCOVERY_METADATA_FILES") + + try: + async with aiohttp.ClientSession() as session: + for file_pattern in metadata_files: + file_url = urljoin(base_url, f"/{file_pattern.lstrip('/')}") + if await self._check_file_exists(session, file_url): + discovered_files.append(file_url) + logger.info(f"Discovered metadata file: {file_url}") + + except Exception as e: + logger.error(f"Error during metadata file discovery for {base_url}: {e}") + + return discovered_files + + async def discover_all_files(self, base_url: str) -> dict[str, list[str]]: + """ + Perform comprehensive file discovery using all available methods. 
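+        Usage sketch (URLs are hypothetical; actual results depend on the site):
+
+            service = FileDiscoveryService()
+            results = await service.discover_all_files("https://example.com")
+            # e.g. results["llm_files"] == ["https://example.com/llms-full.txt"]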
+ + Args: + base_url: Base URL of the website + + Returns: + Dictionary with discovery results categorized by type + """ + logger.info(f"Starting file discovery for {base_url}") + + try: + # Run all discovery methods concurrently for performance + results = await asyncio.gather( + self.discover_robots_sitemaps(base_url), + self.discover_llm_files(base_url), + self.discover_sitemap_files(base_url), + self.discover_metadata_files(base_url), + return_exceptions=True + ) + + # Handle any exceptions from individual discovery methods + robots_sitemaps = results[0] if not isinstance(results[0], Exception) else [] + llm_files = results[1] if not isinstance(results[1], Exception) else [] + sitemap_files = results[2] if not isinstance(results[2], Exception) else [] + metadata_files = results[3] if not isinstance(results[3], Exception) else [] + + discovery_result = { + "robots_sitemaps": robots_sitemaps, + "llm_files": llm_files, + "sitemap_files": sitemap_files, + "metadata_files": metadata_files + } + + total_discovered = sum(len(files) for files in discovery_result.values()) + logger.info(f"File discovery completed for {base_url}: {total_discovered} files discovered") + + return discovery_result + + except Exception as e: + logger.error(f"Unexpected error during file discovery for {base_url}: {e}") + # Return empty results on failure + return { + "robots_sitemaps": [], + "llm_files": [], + "sitemap_files": [], + "metadata_files": [] + } diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py index d66a2a8281..8e3f0fb8f0 100644 --- a/python/src/server/services/crawling/helpers/url_handler.py +++ b/python/src/server/services/crawling/helpers/url_handler.py @@ -13,11 +13,12 @@ class URLHandler: """Helper class for URL operations.""" - + @staticmethod def is_sitemap(url: str) -> bool: """ Check if a URL is a sitemap with error handling. + Enhanced to detect more sitemap variations. Args: url: URL to check @@ -26,11 +27,62 @@ def is_sitemap(url: str) -> bool: True if URL is a sitemap, False otherwise """ try: - return url.endswith('sitemap.xml') or 'sitemap' in urlparse(url).path + parsed = urlparse(url) + path = parsed.path.lower() + + # Check for various sitemap patterns + sitemap_patterns = [ + 'sitemap.xml', + 'sitemap_index.xml', + 'sitemapindex.xml', + 'sitemap-', # For numbered sitemaps like sitemap-1.xml + '/sitemaps/', # For sitemaps in subdirectory + '/sitemap/', # For sitemap directory + ] + + # Also check if 'sitemap' is in the path and it's an XML file + has_sitemap_in_path = 'sitemap' in path and path.endswith('.xml') + + return any(pattern in path for pattern in sitemap_patterns) or path.endswith('.xml.gz') or has_sitemap_in_path + except Exception as e: logger.warning(f"Error checking if URL is sitemap: {e}") return False - + + @staticmethod + def is_llm_file(url: str) -> bool: + """ + Check if a URL points to an LLM-specific file with error handling. 
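+        For example (illustrative), "https://example.com/llms-full.txt" and
+        "https://docs.example.com/llms.txt" return True, while
+        "https://example.com/readme.txt" returns False.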
+ + Args: + url: URL to check + + Returns: + True if URL is an LLM file, False otherwise + """ + try: + parsed = urlparse(url) + path = parsed.path.lower() + + # LLM file patterns + llm_patterns = [ + 'llms.txt', + 'llms-full.txt', + 'llms.md', + 'llms-ctx.txt', + 'llms-context.txt', + '/llms.txt', + '/llms-full.txt', + '/llms.md', + '/llms-ctx.txt' + ] + + return any(path.endswith(pattern) for pattern in llm_patterns) + + except Exception as e: + logger.warning(f"Error checking if URL is LLM file: {e}") + return False + @staticmethod def is_txt(url: str) -> bool: """ @@ -47,7 +99,7 @@ def is_txt(url: str) -> bool: except Exception as e: logger.warning(f"Error checking if URL is text file: {e}") return False - + @staticmethod def is_binary_file(url: str) -> bool: """ @@ -63,7 +115,7 @@ def is_binary_file(url: str) -> bool: # Remove query parameters and fragments for cleaner extension checking parsed = urlparse(url) path = parsed.path.lower() - + # Comprehensive list of binary and non-HTML file extensions binary_extensions = { # Archives @@ -83,19 +135,19 @@ def is_binary_file(url: str) -> bool: # Development files (usually not meant to be crawled as pages) '.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib' } - + # Check if the path ends with any binary extension for ext in binary_extensions: if path.endswith(ext): logger.debug(f"Skipping binary file: {url} (matched extension: {ext})") return True - + return False except Exception as e: logger.warning(f"Error checking if URL is binary file: {e}") # In case of error, don't skip the URL (safer to attempt crawl than miss content) return False - + @staticmethod def transform_github_url(url: str) -> str: """ @@ -115,7 +167,7 @@ def transform_github_url(url: str) -> str: raw_url = f'https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}' logger.info(f"Transformed GitHub file URL to raw: {url} -> {raw_url}") return raw_url - + # Pattern for GitHub directory URLs github_dir_pattern = r'https://github\.com/([^/]+)/([^/]+)/tree/([^/]+)/(.+)' match = re.match(github_dir_pattern, url) @@ -123,5 +175,5 @@ def transform_github_url(url: str) -> str: # For directories, we can't directly get raw content # Return original URL but log a warning logger.warning(f"GitHub directory URL detected: {url} - consider using specific file URLs or GitHub API") - - return url \ No newline at end of file + + return url diff --git a/python/tests/test_file_discovery.py b/python/tests/test_file_discovery.py new file mode 100644 index 0000000000..9eef0b5e6c --- /dev/null +++ b/python/tests/test_file_discovery.py @@ -0,0 +1,402 @@ +"""Comprehensive test suite for FileDiscoveryService.""" + +import asyncio +import json +from unittest.mock import AsyncMock, Mock, patch, MagicMock +import pytest +import aiohttp +from aiohttp import ClientResponse + +from src.server.services.crawling.helpers.file_discovery import FileDiscoveryService + + +class TestFileDiscoveryService: + """Test suite for FileDiscoveryService class.""" + + @pytest.fixture + def discovery_service(self): + """Create a FileDiscoveryService instance for testing.""" + return FileDiscoveryService() + + @pytest.fixture + def mock_response(self): + """Create a mock HTTP response for testing.""" + response = AsyncMock(spec=ClientResponse) + response.status = 200 + response.text = AsyncMock() + return response + + def test_initialization(self, discovery_service): + """Test service initialization with proper defaults.""" + assert discovery_service.timeout.total == 10 + assert 
"CRAWL_DISCOVERY_LLM_FILES" in discovery_service._fallback_defaults + assert "CRAWL_DISCOVERY_SITEMAP_FILES" in discovery_service._fallback_defaults + assert "CRAWL_DISCOVERY_METADATA_FILES" in discovery_service._fallback_defaults + + @pytest.mark.asyncio + async def test_get_file_list_from_db_success(self, discovery_service): + """Test successful database settings retrieval.""" + with patch('src.server.services.crawling.helpers.file_discovery.credential_service') as mock_cred: + # Mock successful database response with async return + mock_cred.get_credential = AsyncMock(return_value='["llms.txt", "llms-full.txt"]') + + result = await discovery_service._get_file_list_from_db("CRAWL_DISCOVERY_LLM_FILES") + + assert result == ["llms.txt", "llms-full.txt"] + mock_cred.get_credential.assert_called_once_with("CRAWL_DISCOVERY_LLM_FILES", decrypt=False) + + @pytest.mark.asyncio + async def test_get_file_list_from_db_dict_format(self, discovery_service): + """Test database settings retrieval with dict format.""" + with patch('src.server.services.crawling.helpers.file_discovery.credential_service') as mock_cred: + # Mock database response in dict format with async return + mock_cred.get_credential = AsyncMock(return_value={"value": '["sitemap.xml", "sitemap_index.xml"]'}) + + result = await discovery_service._get_file_list_from_db("CRAWL_DISCOVERY_SITEMAP_FILES") + + assert result == ["sitemap.xml", "sitemap_index.xml"] + + @pytest.mark.asyncio + async def test_get_file_list_from_db_fallback_on_none(self, discovery_service): + """Test fallback to defaults when database returns None.""" + with patch('src.server.services.crawling.helpers.file_discovery.credential_service') as mock_cred: + mock_cred.get_credential = AsyncMock(return_value=None) + + result = await discovery_service._get_file_list_from_db("CRAWL_DISCOVERY_LLM_FILES") + + assert result == discovery_service._fallback_defaults["CRAWL_DISCOVERY_LLM_FILES"] + + @pytest.mark.asyncio + async def test_get_file_list_from_db_fallback_on_json_error(self, discovery_service): + """Test fallback to defaults on JSON parsing errors.""" + with patch('src.server.services.crawling.helpers.file_discovery.credential_service') as mock_cred: + mock_cred.get_credential = AsyncMock(return_value="invalid json[") + + result = await discovery_service._get_file_list_from_db("CRAWL_DISCOVERY_LLM_FILES") + + assert result == discovery_service._fallback_defaults["CRAWL_DISCOVERY_LLM_FILES"] + + @pytest.mark.asyncio + async def test_get_file_list_from_db_fallback_on_exception(self, discovery_service): + """Test fallback to defaults on unexpected exceptions.""" + with patch('src.server.services.crawling.helpers.file_discovery.credential_service') as mock_cred: + mock_cred.get_credential = AsyncMock(side_effect=Exception("Database error")) + + result = await discovery_service._get_file_list_from_db("CRAWL_DISCOVERY_LLM_FILES") + + assert result == discovery_service._fallback_defaults["CRAWL_DISCOVERY_LLM_FILES"] + + @pytest.mark.asyncio + async def test_check_file_exists_success(self, discovery_service, mock_response): + """Test successful file existence check.""" + mock_response.status = 200 + + with patch('aiohttp.ClientSession.head') as mock_head: + mock_head.return_value.__aenter__ = AsyncMock(return_value=mock_response) + mock_head.return_value.__aexit__ = AsyncMock(return_value=None) + + async with aiohttp.ClientSession() as session: + result = await discovery_service._check_file_exists(session, "https://example.com/llms.txt") + + assert result is True + + 
@pytest.mark.asyncio + async def test_check_file_exists_not_found(self, discovery_service, mock_response): + """Test file existence check with 404 response.""" + mock_response.status = 404 + + with patch('aiohttp.ClientSession.head', return_value=mock_response): + async with aiohttp.ClientSession() as session: + result = await discovery_service._check_file_exists(session, "https://example.com/nonexistent.txt") + + assert result is False + + @pytest.mark.asyncio + async def test_check_file_exists_exception(self, discovery_service): + """Test file existence check with network exception.""" + with patch('aiohttp.ClientSession.head', side_effect=aiohttp.ClientError("Network error")): + async with aiohttp.ClientSession() as session: + result = await discovery_service._check_file_exists(session, "https://example.com/test.txt") + + assert result is False + + @pytest.mark.asyncio + async def test_discover_robots_sitemaps_success(self, discovery_service, mock_response): + """Test successful robots.txt sitemap extraction.""" + robots_content = """User-agent: * +Allow: / + +Sitemap: https://example.com/sitemap.xml +Sitemap: https://example.com/news-sitemap.xml +sitemap: https://example.com/products-sitemap.xml +""" + mock_response.status = 200 + mock_response.text = AsyncMock(return_value=robots_content) + + with patch('aiohttp.ClientSession.get') as mock_get: + mock_get.return_value.__aenter__ = AsyncMock(return_value=mock_response) + mock_get.return_value.__aexit__ = AsyncMock(return_value=None) + + result = await discovery_service.discover_robots_sitemaps("https://example.com") + + expected = [ + "https://example.com/sitemap.xml", + "https://example.com/news-sitemap.xml", + "https://example.com/products-sitemap.xml" + ] + assert result == expected + + @pytest.mark.asyncio + async def test_discover_robots_sitemaps_relative_urls(self, discovery_service, mock_response): + """Test robots.txt with relative sitemap URLs.""" + robots_content = """User-agent: * +Sitemap: /sitemap.xml +Sitemap: /sitemaps/main.xml +""" + mock_response.status = 200 + mock_response.text.return_value = robots_content + + with patch('aiohttp.ClientSession.get', return_value=mock_response): + result = await discovery_service.discover_robots_sitemaps("https://example.com") + + expected = [ + "https://example.com/sitemap.xml", + "https://example.com/sitemaps/main.xml" + ] + assert result == expected + + @pytest.mark.asyncio + async def test_discover_robots_sitemaps_not_found(self, discovery_service, mock_response): + """Test robots.txt not found.""" + mock_response.status = 404 + + with patch('aiohttp.ClientSession.get', return_value=mock_response): + result = await discovery_service.discover_robots_sitemaps("https://example.com") + + assert result == [] + + @pytest.mark.asyncio + async def test_discover_robots_sitemaps_exception(self, discovery_service): + """Test robots.txt discovery with network exception.""" + with patch('aiohttp.ClientSession.get', side_effect=aiohttp.ClientError("Network error")): + result = await discovery_service.discover_robots_sitemaps("https://example.com") + + assert result == [] + + @pytest.mark.asyncio + async def test_discover_llm_files_success(self, discovery_service): + """Test successful LLM file discovery.""" + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["llms.txt", "llms-full.txt"]): + with patch.object(discovery_service, '_check_file_exists', side_effect=[True, False]) as mock_check: + result = await discovery_service.discover_llm_files("https://example.com") + 
+ assert result == ["https://example.com/llms.txt"] + assert mock_check.call_count == 2 + + @pytest.mark.asyncio + async def test_discover_llm_files_none_found(self, discovery_service): + """Test LLM file discovery when no files exist.""" + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["llms.txt"]): + with patch.object(discovery_service, '_check_file_exists', return_value=False): + result = await discovery_service.discover_llm_files("https://example.com") + + assert result == [] + + @pytest.mark.asyncio + async def test_discover_llm_files_exception(self, discovery_service): + """Test LLM file discovery with exception.""" + with patch.object(discovery_service, '_get_file_list_from_db', side_effect=Exception("Database error")): + result = await discovery_service.discover_llm_files("https://example.com") + + assert result == [] + + @pytest.mark.asyncio + async def test_discover_sitemap_files_standard(self, discovery_service): + """Test standard sitemap file discovery.""" + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["sitemap.xml", "sitemap_index.xml"]): + with patch.object(discovery_service, '_check_file_exists', side_effect=[True, False]): + result = await discovery_service.discover_sitemap_files("https://example.com") + + assert result == ["https://example.com/sitemap.xml"] + + @pytest.mark.asyncio + async def test_discover_sitemap_files_wildcard(self, discovery_service): + """Test sitemap discovery with wildcard patterns.""" + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["sitemap-*.xml"]): + # Mock that sitemap-1.xml and sitemap-3.xml exist + check_responses = [True, False, True, False, False] # 1 exists, 2 doesn't, 3 exists, 4&5 don't + with patch.object(discovery_service, '_check_file_exists', side_effect=check_responses): + result = await discovery_service.discover_sitemap_files("https://example.com") + + expected = ["https://example.com/sitemap-1.xml", "https://example.com/sitemap-3.xml"] + assert result == expected + + @pytest.mark.asyncio + async def test_discover_metadata_files_success(self, discovery_service): + """Test successful metadata file discovery.""" + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["robots.txt", ".well-known/security.txt"]): + with patch.object(discovery_service, '_check_file_exists', side_effect=[True, False]): + result = await discovery_service.discover_metadata_files("https://example.com") + + assert result == ["https://example.com/robots.txt"] + + @pytest.mark.asyncio + async def test_discover_all_files_comprehensive(self, discovery_service): + """Test comprehensive file discovery with all methods.""" + # Mock all the individual discovery methods + with patch.object(discovery_service, 'discover_robots_sitemaps', return_value=["https://example.com/sitemap.xml"]): + with patch.object(discovery_service, 'discover_llm_files', return_value=["https://example.com/llms.txt"]): + with patch.object(discovery_service, 'discover_sitemap_files', return_value=["https://example.com/sitemap_index.xml"]): + with patch.object(discovery_service, 'discover_metadata_files', return_value=["https://example.com/robots.txt"]): + result = await discovery_service.discover_all_files("https://example.com") + + expected = { + "robots_sitemaps": ["https://example.com/sitemap.xml"], + "llm_files": ["https://example.com/llms.txt"], + "sitemap_files": ["https://example.com/sitemap_index.xml"], + "metadata_files": ["https://example.com/robots.txt"] + } + assert 
result == expected + + @pytest.mark.asyncio + async def test_discover_all_files_with_exceptions(self, discovery_service): + """Test comprehensive discovery with some methods failing.""" + # Mock some methods to succeed and some to fail + with patch.object(discovery_service, 'discover_robots_sitemaps', return_value=["https://example.com/sitemap.xml"]): + with patch.object(discovery_service, 'discover_llm_files', side_effect=Exception("LLM discovery failed")): + with patch.object(discovery_service, 'discover_sitemap_files', return_value=["https://example.com/sitemap_index.xml"]): + with patch.object(discovery_service, 'discover_metadata_files', side_effect=Exception("Metadata discovery failed")): + result = await discovery_service.discover_all_files("https://example.com") + + expected = { + "robots_sitemaps": ["https://example.com/sitemap.xml"], + "llm_files": [], # Failed, so empty + "sitemap_files": ["https://example.com/sitemap_index.xml"], + "metadata_files": [] # Failed, so empty + } + assert result == expected + + @pytest.mark.asyncio + async def test_discover_all_files_complete_failure(self, discovery_service): + """Test discovery with complete failure returning empty results.""" + with patch.object(discovery_service, 'discover_robots_sitemaps', side_effect=Exception("Complete failure")): + result = await discovery_service.discover_all_files("https://example.com") + + expected = { + "robots_sitemaps": [], + "llm_files": [], + "sitemap_files": [], + "metadata_files": [] + } + assert result == expected + + @pytest.mark.asyncio + async def test_concurrent_discovery_performance(self, discovery_service): + """Test that discovery methods run concurrently for performance.""" + # This test verifies that asyncio.gather is used for concurrent execution + with patch('asyncio.gather') as mock_gather: + mock_gather.return_value = [[], [], [], []] # Empty results + + await discovery_service.discover_all_files("https://example.com") + + # Verify asyncio.gather was called (indicating concurrent execution) + mock_gather.assert_called_once() + + # Check that all 4 discovery methods were passed to gather + args = mock_gather.call_args[0] + assert len(args) == 4 # 4 discovery methods + + def test_fallback_defaults_completeness(self, discovery_service): + """Test that fallback defaults contain all required settings.""" + defaults = discovery_service._fallback_defaults + + # Check all required keys are present + required_keys = [ + "CRAWL_DISCOVERY_LLM_FILES", + "CRAWL_DISCOVERY_SITEMAP_FILES", + "CRAWL_DISCOVERY_METADATA_FILES" + ] + + for key in required_keys: + assert key in defaults + assert isinstance(defaults[key], list) + assert len(defaults[key]) > 0 # Should have at least one item + + def test_fallback_defaults_content(self, discovery_service): + """Test that fallback defaults contain expected file patterns.""" + defaults = discovery_service._fallback_defaults + + # Check LLM files + llm_files = defaults["CRAWL_DISCOVERY_LLM_FILES"] + assert "llms.txt" in llm_files + assert "llms-full.txt" in llm_files + + # Check sitemap files + sitemap_files = defaults["CRAWL_DISCOVERY_SITEMAP_FILES"] + assert "sitemap.xml" in sitemap_files + assert "sitemap_index.xml" in sitemap_files + + # Check metadata files + metadata_files = defaults["CRAWL_DISCOVERY_METADATA_FILES"] + assert "robots.txt" in metadata_files + assert ".well-known/security.txt" in metadata_files + + +class TestFileDiscoveryIntegration: + """Integration tests for FileDiscoveryService with real network scenarios.""" + + @pytest.fixture + def 
discovery_service(self): + """Create a FileDiscoveryService instance for integration testing.""" + return FileDiscoveryService() + + @pytest.mark.asyncio + async def test_timeout_handling(self, discovery_service): + """Test that timeouts are handled gracefully.""" + # Mock a slow response that times out + slow_response = AsyncMock() + slow_response.text.side_effect = asyncio.TimeoutError("Request timed out") + + with patch('aiohttp.ClientSession.get', return_value=slow_response): + result = await discovery_service.discover_robots_sitemaps("https://slow-site.com") + + assert result == [] # Should return empty list on timeout + + @pytest.mark.asyncio + async def test_malformed_robots_txt(self, discovery_service): + """Test handling of malformed robots.txt content.""" + malformed_content = """This is not a valid robots.txt +Random text here +Sitemap: https://example.com/sitemap.xml +More random content +sitemap: not-a-url +Sitemap: https://example.com/valid-sitemap.xml +""" + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text.return_value = malformed_content + + with patch('aiohttp.ClientSession.get', return_value=mock_response): + result = await discovery_service.discover_robots_sitemaps("https://example.com") + + # Should extract valid sitemap URLs and handle malformed ones gracefully + assert "https://example.com/sitemap.xml" in result + assert "https://example.com/valid-sitemap.xml" in result + # Should handle malformed entries without crashing + + @pytest.mark.asyncio + async def test_edge_case_url_handling(self, discovery_service): + """Test edge cases in URL handling and construction.""" + test_cases = [ + ("https://example.com/", "llms.txt", "https://example.com/llms.txt"), + ("https://example.com", "llms.txt", "https://example.com/llms.txt"), + ("https://example.com/path/", "llms.txt", "https://example.com/path/llms.txt"), + ("https://example.com/path", ".well-known/security.txt", "https://example.com/.well-known/security.txt"), + ] + + with patch.object(discovery_service, '_get_file_list_from_db', return_value=["llms.txt"]): + with patch.object(discovery_service, '_check_file_exists', return_value=True) as mock_check: + for base_url, file_pattern, expected_url in test_cases: + await discovery_service.discover_llm_files(base_url) + # Verify the correct URL was constructed and checked + mock_check.assert_called_with(mock_check.call_args[0][0], expected_url) \ No newline at end of file
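
For a quick manual check of the new helper, the discovery service can also be exercised on its own; a minimal sketch (example.com is a placeholder URL, and database-backed settings fall back to the hardcoded defaults when unreachable):

    import asyncio

    from src.server.services.crawling.helpers.file_discovery import FileDiscoveryService

    async def main() -> None:
        # Probe the site's root for llms files, sitemaps, and metadata files
        service = FileDiscoveryService()
        results = await service.discover_all_files("https://example.com")
        for category, urls in results.items():
            print(category, urls)

    asyncio.run(main())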