From 33abb6033741e0db13cde3c28e7a008850a12af6 Mon Sep 17 00:00:00 2001 From: Steve Antonakakis Date: Wed, 26 Nov 2025 22:33:41 -0500 Subject: [PATCH] fix(discovery): Detect soft 404s via content-type validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Servers sometimes return HTTP 200 with text/html for missing files instead of a proper 404 status. This caused the crawler to incorrectly detect llms.txt files that don't exist. Added content-type validation to _check_url_exists() that verifies: - .txt/.md files return text/plain or text/markdown (not text/html) - .xml files return text/xml or application/xml variants When a soft 404 is detected, the method now returns False and logs an informative message, allowing the crawler to fall back to HTML crawling correctly. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../services/crawling/discovery_service.py | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py index 103a277296..f1c3230f8f 100644 --- a/python/src/server/services/crawling/discovery_service.py +++ b/python/src/server/services/crawling/discovery_service.py @@ -290,16 +290,24 @@ def _resolve_and_validate_hostname(self, hostname: str) -> bool: logger.warning(f"Error resolving hostname {hostname}: {e}") return False + # Expected content types for discovery files (to detect soft 404s) + EXPECTED_CONTENT_TYPES = { + '.txt': ['text/plain', 'text/markdown', 'text/x-markdown'], + '.md': ['text/plain', 'text/markdown', 'text/x-markdown'], + '.xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/atom+xml'], + } + def _check_url_exists(self, url: str) -> bool: """ Check if a URL exists and returns a successful response. Includes SSRF protection by validating hostnames and blocking private IPs. 
+ Also validates content-type to detect soft 404s (HTML pages returned for missing files). Args: url: URL to check Returns: - True if URL returns 200, False otherwise + True if URL returns 200 and is not a soft 404 (text/html served for a .txt/.md/.xml path), False otherwise """ try: # Parse URL to extract hostname @@ -358,9 +366,28 @@ def _check_url_exists(self, url: str) -> bool: return False # Check response status - success = resp.status_code == 200 - logger.debug(f"URL check: {url} -> {resp.status_code} ({'exists' if success else 'not found'})") - return success + if resp.status_code != 200: + logger.debug(f"URL check: {url} -> {resp.status_code} (not found)") + return False + + # Validate content-type to detect soft 404s + content_type = resp.headers.get('content-type', '').lower().split(';')[0].strip() + url_path = parsed.path.lower() + + # Check if URL has an extension we should validate + for ext, valid_types in self.EXPECTED_CONTENT_TYPES.items(): + if url_path.endswith(ext): + if content_type and content_type not in valid_types: + # Soft 404: server returned HTML for a text/xml file + if content_type == 'text/html': + logger.info(f"Soft 404 detected: {url} returned text/html instead of {valid_types}") + return False + # Log at debug level but allow other unexpected (non-HTML) content types + logger.debug(f"Unexpected content-type for {url}: {content_type} (expected {valid_types})") + break + + logger.debug(f"URL check: {url} -> 200 (exists, content-type: {content_type})") + return True finally: if hasattr(resp, 'close'):