diff --git a/python/pyproject.toml b/python/pyproject.toml index 128e433290..f382be688a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -39,7 +39,8 @@ server = [ "python-multipart>=0.0.20", "watchfiles>=0.18", # Web crawling - "crawl4ai==0.7.4", + "crawl4ai==0.7.6", # Updated from 0.7.4 for latest features and bug fixes (not required for robots.txt) + "protego>=0.3.1", # robots.txt parser - 40% faster than stdlib, supports wildcards # Database and storage "supabase==2.15.1", "asyncpg>=0.29.0", @@ -119,7 +120,8 @@ all = [ "uvicorn>=0.24.0", "python-multipart>=0.0.20", "watchfiles>=0.18", - "crawl4ai==0.7.4", + "crawl4ai==0.7.6", + "protego>=0.3.1", "supabase==2.15.1", "asyncpg>=0.29.0", "openai==1.71.0", diff --git a/python/src/server/api_routes/knowledge_api.py b/python/src/server/api_routes/knowledge_api.py index 052f75216e..97dd5a61db 100644 --- a/python/src/server/api_routes/knowledge_api.py +++ b/python/src/server/api_routes/knowledge_api.py @@ -712,6 +712,8 @@ async def _perform_refresh_with_semaphore(): safe_logfire_info( f"Cleaned up refresh task from registry | progress_id={progress_id}" ) + # Close crawl_service to release resources + await crawl_service.close() # Start the wrapper task - we don't need to track it since we'll track the actual crawl task asyncio.create_task(_perform_refresh_with_semaphore()) @@ -889,6 +891,8 @@ async def _perform_crawl_with_progress( safe_logfire_info( f"Cleaned up crawl task from registry | progress_id={progress_id}" ) + # Close orchestration_service to release resources + await orchestration_service.close() @router.post("/documents/upload") diff --git a/python/src/server/config/config.py b/python/src/server/config/config.py index d8104bb0ea..99843238b3 100644 --- a/python/src/server/config/config.py +++ b/python/src/server/config/config.py @@ -275,3 +275,79 @@ def str_to_bool(value: str | None) -> bool: enable_docker_socket=str_to_bool(os.getenv("ENABLE_DOCKER_SOCKET_MONITORING")), health_check_timeout=int(os.getenv("MCP_HEALTH_CHECK_TIMEOUT", "5")), ) + + +def get_crawler_config() -> dict: + """Get crawler configuration from environment with validation. + + Returns a dictionary with crawler settings including User-Agent, + robots.txt compliance settings, and caching configuration. + + Environment Variables: + CRAWLER_USER_AGENT: Custom User-Agent string (default: "Archon-Crawler/{version} (+{repo_url})") + ROBOTS_RESPECT: Whether to respect robots.txt (default: "true") + ROBOTS_DEFAULT_CRAWL_DELAY: Default delay between requests in seconds (default: "10.0", min: 0.0) + ROBOTS_CACHE_SIZE: Max number of domains to cache (default: "1000", min: 1) + ROBOTS_CACHE_TTL: Cache TTL in seconds (default: "86400" = 24 hours, min: 1) + + Returns: + dict with keys: user_agent, respect_robots, default_crawl_delay, + robots_cache_size, robots_cache_ttl + + Raises: + ConfigurationError: If environment variable values are invalid or out of bounds + """ + from .version import ARCHON_VERSION, GITHUB_REPO_NAME, GITHUB_REPO_OWNER + + repo_url = f"https://github.com/{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}" + default_ua = f"Archon-Crawler/{ARCHON_VERSION} (+{repo_url})" + + # Parse and validate ROBOTS_DEFAULT_CRAWL_DELAY + crawl_delay_str = os.getenv("ROBOTS_DEFAULT_CRAWL_DELAY", "10.0") + try: + default_crawl_delay = float(crawl_delay_str) + if default_crawl_delay < 0.0: + raise ConfigurationError( + f"ROBOTS_DEFAULT_CRAWL_DELAY must be >= 0.0, got: {default_crawl_delay}. " + f"Use 0.0 to disable delays." 
+ ) + except ValueError as e: + raise ConfigurationError( + f"ROBOTS_DEFAULT_CRAWL_DELAY must be a valid number, got: '{crawl_delay_str}'" + ) from e + + # Parse and validate ROBOTS_CACHE_SIZE + cache_size_str = os.getenv("ROBOTS_CACHE_SIZE", "1000") + try: + robots_cache_size = int(cache_size_str) + if robots_cache_size < 1: + raise ConfigurationError( + f"ROBOTS_CACHE_SIZE must be >= 1, got: {robots_cache_size}. " + f"Recommended: 100-10000" + ) + except ValueError as e: + raise ConfigurationError( + f"ROBOTS_CACHE_SIZE must be a valid integer, got: '{cache_size_str}'" + ) from e + + # Parse and validate ROBOTS_CACHE_TTL + cache_ttl_str = os.getenv("ROBOTS_CACHE_TTL", "86400") + try: + robots_cache_ttl = int(cache_ttl_str) + if robots_cache_ttl < 1: + raise ConfigurationError( + f"ROBOTS_CACHE_TTL must be >= 1 second, got: {robots_cache_ttl}. " + f"RFC 9309 recommends max 86400 (24 hours)" + ) + except ValueError as e: + raise ConfigurationError( + f"ROBOTS_CACHE_TTL must be a valid integer, got: '{cache_ttl_str}'" + ) from e + + return { + "user_agent": os.getenv("CRAWLER_USER_AGENT", default_ua), + "respect_robots": os.getenv("ROBOTS_RESPECT", "true").lower() == "true", + "default_crawl_delay": default_crawl_delay, + "robots_cache_size": robots_cache_size, + "robots_cache_ttl": robots_cache_ttl, + } diff --git a/python/src/server/services/crawler_manager.py b/python/src/server/services/crawler_manager.py index 522c4f71d7..6e065ed83e 100644 --- a/python/src/server/services/crawler_manager.py +++ b/python/src/server/services/crawler_manager.py @@ -14,6 +14,7 @@ AsyncWebCrawler = None BrowserConfig = None +from ..config.config import get_crawler_config from ..config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info logger = get_logger(__name__) @@ -59,14 +60,15 @@ async def initialize(self): # Initialize browser config - same for Docker and local # crawl4ai/Playwright will handle Docker-specific settings internally + crawler_config = get_crawler_config() browser_config = BrowserConfig( headless=True, verbose=False, # Set viewport for proper rendering viewport_width=1920, viewport_height=1080, - # Add user agent to appear as a real browser - user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + # Use proper bot identification + user_agent=crawler_config["user_agent"], # Set browser type browser_type="chromium", # Extra args for Chromium - optimized for speed diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index 01122704d8..58fb109b5b 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -13,6 +13,7 @@ import tldextract +from ...config.config import get_crawler_config from ...config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info from ...utils import get_supabase_client from ...utils.progress.progress_tracker import ProgressTracker @@ -28,6 +29,7 @@ from .helpers.url_handler import URLHandler from .page_storage_operations import PageStorageOperations from .progress_mapper import ProgressMapper +from .robots_checker import RobotsChecker from .strategies.batch import BatchCrawlStrategy from .strategies.recursive import RecursiveCrawlStrategy from .strategies.single_page import SinglePageCrawlStrategy @@ -133,6 +135,10 @@ def __init__(self, crawler=None, supabase_client=None, progress_id=None): 
self.discovery_service = DiscoveryService() self.page_storage_ops = PageStorageOperations(self.supabase_client) + # Initialize robots.txt checker + crawler_config = get_crawler_config() + self.robots_checker = RobotsChecker(crawler_config) if crawler_config.get("respect_robots") else None + # Track progress state across all stages to prevent UI resets self.progress_state = {"progressId": self.progress_id} if self.progress_id else {} # Initialize progress mapper to prevent backwards jumps @@ -162,6 +168,35 @@ def _check_cancellation(self): if self._cancelled: raise asyncio.CancelledError("Crawl operation was cancelled by user") + async def _can_fetch_url(self, url: str) -> bool: + """ + Check if URL is allowed by robots.txt. + + Note: This method only validates URLs, it does NOT enforce crawl delays. + Crawl delays are handled by Crawl4AI's internal rate limiting and + concurrency controls. Enforcing delays during validation would cause + unacceptable performance (e.g., 540 seconds to validate 54 sitemap URLs). + + Args: + url: URL to check + + Returns: + True if crawling is allowed, False if blocked by robots.txt + + Raises: + No exceptions - errors result in allowing the crawl (fail open) + """ + if not self.robots_checker: + return True # Robots checking disabled + + try: + # Check if URL is allowed (no delay enforcement during validation) + return await self.robots_checker.can_fetch(url) + except Exception as e: + # Log error but allow crawl (fail open) + logger.warning(f"robots.txt check failed for {url}: {e}, allowing crawl") + return True + async def _create_crawl_progress_callback( self, base_status: str ) -> Callable[[str, int, str], Awaitable[None]]: @@ -278,6 +313,7 @@ async def crawl_recursive_with_progress( max_concurrent, progress_callback, self._check_cancellation, # Pass cancellation check + self.robots_checker, # Pass robots checker for URL validation ) # Orchestration methods @@ -909,6 +945,20 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs): url_to_link_text = dict(same_domain_links) extracted_urls = [link for link, _ in same_domain_links] + # Filter URLs with robots.txt validation + if self.robots_checker: + original_count = len(extracted_urls) + allowed_urls = [] + for url_to_check in extracted_urls: + if await self._can_fetch_url(url_to_check): + allowed_urls.append(url_to_check) + else: + logger.info(f"Skipped (robots.txt): {url_to_check}") + extracted_urls = allowed_urls + robots_filtered = original_count - len(extracted_urls) + if robots_filtered > 0: + logger.info(f"Filtered out {robots_filtered} URLs by robots.txt from llms.txt links") + logger.info(f"Following {len(extracted_urls)} same-domain links from llms.txt") # Notify user about linked files being crawled @@ -979,6 +1029,20 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs): url_to_link_text = dict(extracted_links_with_text) extracted_links = [link for link, _ in extracted_links_with_text] + # Filter URLs with robots.txt validation + if self.robots_checker: + original_count = len(extracted_links) + allowed_links = [] + for url_to_check in extracted_links: + if await self._can_fetch_url(url_to_check): + allowed_links.append(url_to_check) + else: + logger.info(f"Skipped (robots.txt): {url_to_check}") + extracted_links = allowed_links + robots_filtered = original_count - len(extracted_links) + if robots_filtered > 0: + logger.info(f"Filtered out {robots_filtered} URLs by robots.txt from extracted links") + # For discovery targets, respect max_depth 
for same-domain links max_depth = request.get('max_depth', 2) if request.get("is_discovery_target") else request.get('max_depth', 1) @@ -1035,6 +1099,20 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs): sitemap_urls = self.parse_sitemap(url) if sitemap_urls: + # Filter URLs with robots.txt validation + if self.robots_checker: + original_count = len(sitemap_urls) + allowed_sitemap_urls = [] + for url_to_check in sitemap_urls: + if await self._can_fetch_url(url_to_check): + allowed_sitemap_urls.append(url_to_check) + else: + logger.info(f"Skipped (robots.txt): {url_to_check}") + sitemap_urls = allowed_sitemap_urls + robots_filtered = original_count - len(sitemap_urls) + if robots_filtered > 0: + logger.info(f"Filtered out {robots_filtered} URLs by robots.txt from sitemap") + # Update progress before starting batch crawl await update_crawl_progress( 75, # 75% of crawling stage @@ -1069,6 +1147,15 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs): return crawl_results, crawl_type + async def close(self) -> None: + """ + Close resources and cleanup. + + Note: robots_checker uses a shared HTTP client that is not closed per-instance. + This method is kept for API compatibility and future cleanup needs. + """ + pass # No per-instance cleanup needed currently + # Alias for backward compatibility CrawlOrchestrationService = CrawlingService diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py index 103a277296..7034c3bf8c 100644 --- a/python/src/server/services/crawling/discovery_service.py +++ b/python/src/server/services/crawling/discovery_service.py @@ -61,8 +61,6 @@ class DiscoveryService: "llms-full.txt", # Part of llms.txt spec - comprehensive content # Sitemap files (structural crawling guidance) "sitemap.xml", # Universal standard for site structure - # Robots file (basic crawling rules) - "robots.txt", # Universal standard for crawl directives # Well-known variants (alternative locations per RFC 8615) ".well-known/ai.txt", ".well-known/llms.txt", diff --git a/python/src/server/services/crawling/robots_checker.py b/python/src/server/services/crawling/robots_checker.py new file mode 100644 index 0000000000..ae078e2f5f --- /dev/null +++ b/python/src/server/services/crawling/robots_checker.py @@ -0,0 +1,393 @@ +""" +robots.txt Checker Service + +This module provides robots.txt compliance checking for the Archon web crawler. +It fetches, parses, caches, and enforces robots.txt rules including: +- Allow/Disallow rules with wildcard support +- Crawl-delay directives +- Per-domain caching with 24-hour TTL +- Thread-safe concurrent access + +Uses Protego library for fast, spec-compliant robots.txt parsing. 
+""" + +import asyncio +import logging +import time +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import Dict, Optional +from urllib.parse import urlparse + +import httpx +from protego import Protego + +logger = logging.getLogger(__name__) + +# Shared HTTP client for all RobotsChecker instances to prevent connection leaks +# This client is created once and reused across all crawler instances +_shared_http_client: Optional[httpx.AsyncClient] = None + + +def _get_shared_http_client() -> httpx.AsyncClient: + """Get or create shared HTTP client for robots.txt fetching.""" + global _shared_http_client + if _shared_http_client is None: + _shared_http_client = httpx.AsyncClient(timeout=10.0, follow_redirects=True) + return _shared_http_client + + +@dataclass +class CachedRobotsEntry: + """Cache entry for robots.txt parser with TTL tracking.""" + + parser: Protego + expires_at: datetime + + +class RobotsChecker: + """ + Thread-safe robots.txt checker with caching and crawl delay enforcement. + + This service: + - Fetches and caches robots.txt for each domain (24-hour TTL) + - Validates URLs against robots.txt Allow/Disallow rules + - Enforces per-domain crawl delays + - Handles errors gracefully per RFC 9309 (404 = allow, 5xx = disallow) + + Attributes: + _config: Crawler configuration dict + _cache: TTLCache for storing parsed robots.txt by domain + _locks: Per-domain locks for thread-safe access + _last_crawl_time: Tracks last crawl timestamp per domain for delay enforcement + _client: Shared httpx.AsyncClient for fetching robots.txt + """ + + def __init__(self, config: dict): + """ + Initialize the RobotsChecker. + + Args: + config: Crawler configuration dict with keys: + - user_agent: User-Agent string for requests + - robots_cache_size: Maximum domains to cache (default: 1000) + - robots_cache_ttl: Cache TTL in seconds (default: 86400 = 24h) + - default_crawl_delay: Default delay between requests (default: 10.0) + """ + self._config = config + + # Manual TTL cache for parsed robots.txt (domain -> CachedRobotsEntry) + self._cache: Dict[str, CachedRobotsEntry] = {} + self._cache_ttl = timedelta(seconds=config.get("robots_cache_ttl", 86400)) # 24 hours + self._max_cache_size = config.get("robots_cache_size", 1000) + + # Per-domain locks for thread-safe cache access + self._locks: Dict[str, asyncio.Lock] = {} + + # Separate locks for delay tracking to avoid deadlock + self._delay_locks: Dict[str, asyncio.Lock] = {} + + # Track last crawl time per domain for delay enforcement + self._last_crawl_time: Dict[str, float] = {} + + # Use shared HTTP client for fetching robots.txt (prevents connection leaks) + self._client = _get_shared_http_client() + + def _get_domain_key(self, url: str) -> str: + """ + Extract domain key from URL for caching. + + Args: + url: Full URL to extract domain from + + Returns: + Domain key in format "scheme://netloc" (e.g., "https://example.com") + + Raises: + ValueError: If URL is malformed or missing scheme/netloc + """ + parsed = urlparse(url) + if not parsed.scheme or not parsed.netloc: + raise ValueError(f"Invalid URL - missing scheme or netloc: {url}") + return f"{parsed.scheme}://{parsed.netloc}" + + def _get_domain_lock(self, domain: str) -> asyncio.Lock: + """ + Get or create asyncio.Lock for domain cache access. + + Thread-safe lock creation for concurrent access control. 
+ + Args: + domain: Domain key to get lock for + + Returns: + asyncio.Lock for the specified domain + """ + if domain not in self._locks: + self._locks[domain] = asyncio.Lock() + return self._locks[domain] + + def _get_delay_lock(self, domain: str) -> asyncio.Lock: + """ + Get or create asyncio.Lock for domain delay tracking. + + Separate from cache locks to avoid deadlock when wait_if_needed + calls get_crawl_delay which calls get_robots_parser. + + Args: + domain: Domain key to get lock for + + Returns: + asyncio.Lock for delay tracking + """ + if domain not in self._delay_locks: + self._delay_locks[domain] = asyncio.Lock() + return self._delay_locks[domain] + + async def can_fetch(self, url: str) -> bool: + """ + Check if URL can be fetched according to robots.txt. + + This is the main entry point for robots.txt validation. + + Args: + url: URL to check + + Returns: + True if crawling is allowed, False if disallowed + + Raises: + No exceptions raised - errors result in "allow" (fail open) + """ + try: + domain = self._get_domain_key(url) + parser = await self.get_robots_parser(domain) + + # Use configured user agent + user_agent = self._config.get("user_agent", "*") + + # Protego.can_fetch expects (url, user_agent) - note reversed order from urllib + allowed = parser.can_fetch(url, user_agent) + + if not allowed: + logger.info(f"URL blocked by robots.txt: {url}") + + return allowed + + except Exception as e: + # Fail open - allow crawling on error + logger.warning(f"Error checking robots.txt for {url}: {e}, allowing crawl") + return True + + async def get_robots_parser(self, domain: str) -> Protego: + """ + Get cached or fetch robots.txt parser for domain. + + Implements manual TTL caching with thread-safe access. + Cache key is domain only (scheme + netloc). + + Args: + domain: Domain key (e.g., "https://example.com") + + Returns: + Protego parser instance for the domain + + Raises: + No exceptions raised - errors result in permissive parser + """ + # Get or create lock for this domain + async with self._get_domain_lock(domain): + # Check cache first + if domain in self._cache: + entry = self._cache[domain] + # Check if entry is still valid + if datetime.now() < entry.expires_at: + logger.debug(f"robots.txt cache hit for {domain}") + return entry.parser + else: + # Expired - remove from cache + logger.debug(f"robots.txt cache expired for {domain}, refetching...") + del self._cache[domain] + + # Cache miss or expired - fetch robots.txt + robots_content = await self._fetch_robots_txt(domain) + parser = Protego.parse(robots_content) + + # Evict oldest entry if cache is full + if len(self._cache) >= self._max_cache_size: + oldest_domain = min(self._cache.keys(), key=lambda k: self._cache[k].expires_at) + del self._cache[oldest_domain] + logger.debug(f"robots.txt cache full, evicted oldest entry: {oldest_domain}") + + # Store in cache + self._cache[domain] = CachedRobotsEntry( + parser=parser, expires_at=datetime.now() + self._cache_ttl + ) + + # Log one clear message that robots.txt is being respected + has_rules = bool(robots_content.strip()) + if has_rules: + logger.info(f"Respecting robots.txt for {domain} (cached for 24h)") + else: + logger.debug(f"No robots.txt found for {domain} - allowing all URLs") + + return parser + + async def _fetch_robots_txt(self, domain: str) -> str: + """ + Fetch robots.txt content with proper error handling per RFC 9309. 
+ + Error handling: + - 404: Returns empty string (allow all) + - 5xx: Returns disallow-all rules (conservative) + - Timeout: Returns disallow-all rules (conservative) + - Other errors: Returns empty string (fail open) + + Args: + domain: Domain to fetch robots.txt from + + Returns: + robots.txt content as string + """ + robots_url = f"{domain}/robots.txt" + + try: + # Use configured user agent for robots.txt request + headers = {"User-Agent": self._config.get("user_agent", "Archon-Crawler/1.0")} + + response = await self._client.get(robots_url, headers=headers) + + if response.status_code == 404: + # No robots.txt = allow all (logged in get_robots_parser) + return "" + + elif response.status_code >= 500: + # Server error = disallow all (conservative per RFC 9309) + logger.warning( + f"Server error fetching robots.txt for {domain} (HTTP {response.status_code}), disallowing all" + ) + return "User-agent: *\nDisallow: /" + + elif response.status_code == 200: + # Success - return content (logged in get_robots_parser) + return response.text + + else: + # Other status codes (3xx after redirect handling, 4xx) - allow all + logger.debug( + f"Unexpected status fetching robots.txt for {domain} (HTTP {response.status_code}), allowing all" + ) + return "" + + except httpx.TimeoutException: + # Timeout = disallow all (conservative) + logger.warning(f"Timeout fetching robots.txt for {domain}, disallowing all") + return "User-agent: *\nDisallow: /" + + except Exception as e: + # Other errors = allow all (fail open) + logger.error(f"Error fetching robots.txt for {domain}: {e}, allowing all") + return "" + + async def get_crawl_delay(self, domain: str) -> float: + """ + Get crawl delay for domain from robots.txt or default. + + Extracts Crawl-delay directive from robots.txt. Falls back to + configured default if not specified. + + Args: + domain: Domain to get crawl delay for + + Returns: + Crawl delay in seconds (float) + """ + try: + parser = await self.get_robots_parser(domain) + user_agent = self._config.get("user_agent", "*") + + # Get crawl delay from robots.txt + delay = parser.crawl_delay(user_agent) + + if delay is not None: + logger.debug(f"Crawl delay for {domain}: {delay}s (from robots.txt)") + return float(delay) + + # Fall back to default + default_delay = self._config.get("default_crawl_delay", 10.0) + logger.debug(f"Crawl delay for {domain}: {default_delay}s (default)") + return default_delay + + except Exception as e: + # On error, use default delay + default_delay = self._config.get("default_crawl_delay", 10.0) + logger.warning(f"Error getting crawl delay for {domain}: {e}, using default {default_delay}s") + return default_delay + + async def wait_if_needed(self, domain: str) -> None: + """ + Wait for crawl delay if needed before next request to domain. + + Enforces minimum delay between requests to the same domain. + Uses asyncio.sleep() for non-blocking waits. 
+ + Args: + domain: Domain key (e.g., "https://example.com") to check/enforce delay for + + Returns: + None (blocks until delay is satisfied) + """ + async with self._get_delay_lock(domain): + # Get required delay + delay = await self.get_crawl_delay(domain) + + # If delay is 0 or negative, no wait needed + if delay <= 0: + return + + # Check time since last crawl + last_time = self._last_crawl_time.get(domain, 0) + elapsed = time.time() - last_time + + # Wait if needed + if elapsed < delay: + wait_time = delay - elapsed + logger.debug(f"Crawl delay: waiting {wait_time:.1f}s for {domain}") + await asyncio.sleep(wait_time) + + # Update last crawl time + self._last_crawl_time[domain] = time.time() + + async def wait_if_needed_for_url(self, url: str) -> None: + """ + Wait for crawl delay if needed before next request to URL. + + Convenience method that extracts domain from URL and enforces delay. + + Args: + url: Full URL to check/enforce delay for + + Returns: + None (blocks until delay is satisfied) + """ + domain = self._get_domain_key(url) + await self.wait_if_needed(domain) + + async def close(self) -> None: + """ + Cleanup resources. + + Note: HTTP client is shared across all instances and should not be closed per-instance. + This method is kept for API compatibility but doesn't close the shared client. + """ + pass # Shared client is not closed per-instance + + def clear_cache(self) -> None: + """ + Clear all cached robots.txt parsers. + + Useful for testing or forcing refresh. + """ + self._cache.clear() + self._last_crawl_time.clear() + logger.info("Robots.txt cache cleared") diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py index 3cdee7506a..75c50a9460 100644 --- a/python/src/server/services/crawling/strategies/recursive.py +++ b/python/src/server/services/crawling/strategies/recursive.py @@ -42,6 +42,7 @@ async def crawl_recursive_with_progress( max_concurrent: int | None = None, progress_callback: Callable[..., Awaitable[None]] | None = None, cancellation_check: Callable[[], None] | None = None, + robots_checker=None, ) -> list[dict[str, Any]]: """ Recursively crawl internal links from start URLs up to a maximum depth with progress reporting. 
@@ -304,6 +305,16 @@ def normalize_url(url): # Skip binary files and already visited URLs is_binary = self.url_handler.is_binary_file(next_url) if next_url not in visited and not is_binary: + # Check robots.txt if enabled + if robots_checker: + try: + allowed = await robots_checker.can_fetch(next_url) + if not allowed: + logger.info(f"Skipped (robots.txt): {next_url}") + continue + except Exception as e: + logger.warning(f"robots.txt check failed for {next_url}: {e}, allowing crawl") + if next_url not in next_level_urls: next_level_urls.add(next_url) total_discovered += 1 # Increment when we discover a new URL diff --git a/python/uv.lock b/python/uv.lock index 27c43c2e22..acbbab8e60 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -196,6 +196,7 @@ all = [ { name = "mcp" }, { name = "openai" }, { name = "pdfplumber" }, + { name = "protego" }, { name = "pydantic" }, { name = "pydantic-ai" }, { name = "pypdf2" }, @@ -246,6 +247,7 @@ server = [ { name = "markdown" }, { name = "openai" }, { name = "pdfplumber" }, + { name = "protego" }, { name = "pydantic" }, { name = "pypdf2" }, { name = "pytest" }, @@ -292,7 +294,7 @@ agents = [ ] all = [ { name = "asyncpg", specifier = ">=0.29.0" }, - { name = "crawl4ai", specifier = "==0.7.4" }, + { name = "crawl4ai", specifier = "==0.7.6" }, { name = "cryptography", specifier = ">=41.0.0" }, { name = "factory-boy", specifier = ">=3.3.0" }, { name = "fastapi", specifier = ">=0.104.0" }, @@ -302,6 +304,7 @@ all = [ { name = "mcp", specifier = "==1.12.2" }, { name = "openai", specifier = "==1.71.0" }, { name = "pdfplumber", specifier = ">=0.11.6" }, + { name = "protego", specifier = ">=0.3.1" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pydantic-ai", specifier = ">=0.0.13" }, { name = "pypdf2", specifier = ">=3.0.1" }, @@ -344,7 +347,7 @@ mcp = [ ] server = [ { name = "asyncpg", specifier = ">=0.29.0" }, - { name = "crawl4ai", specifier = "==0.7.4" }, + { name = "crawl4ai", specifier = "==0.7.6" }, { name = "cryptography", specifier = ">=41.0.0" }, { name = "fastapi", specifier = ">=0.104.0" }, { name = "httpx", specifier = ">=0.24.0" }, @@ -352,6 +355,7 @@ server = [ { name = "markdown", specifier = ">=3.8" }, { name = "openai", specifier = "==1.71.0" }, { name = "pdfplumber", specifier = ">=0.11.6" }, + { name = "protego", specifier = ">=0.3.1" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pypdf2", specifier = ">=3.0.1" }, { name = "pytest", specifier = ">=8.0.0" }, @@ -708,7 +712,7 @@ wheels = [ [[package]] name = "crawl4ai" -version = "0.7.4" +version = "0.7.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiofiles" }, @@ -720,6 +724,7 @@ dependencies = [ { name = "brotli" }, { name = "chardet" }, { name = "click" }, + { name = "cssselect" }, { name = "fake-useragent" }, { name = "httpx", extra = ["http2"] }, { name = "humanize" }, @@ -744,9 +749,9 @@ dependencies = [ { name = "tf-playwright-stealth" }, { name = "xxhash" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e3/85/39761e1b269d30ddd5c5ee59e74e03605308f304a1a7d7e4f9d12cac1923/crawl4ai-0.7.4.tar.gz", hash = "sha256:68974cab5ef318c45f58657b0b23741e9cdd3df61b5824f024e506fee12bf99f", size = 437139 } +sdist = { url = "https://files.pythonhosted.org/packages/c2/13/304d1ecef51554c87265b890a491aa8266e4e36b1f4f9135150be316e148/crawl4ai-0.7.6.tar.gz", hash = "sha256:cdcf86db45863ee0c155b9969be292fbe50dbc8756e6ddae2cbc7e919656892a", size = 447509 } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/1a/7e/0681b76f4b59e5b7d54c16595fe5642972ab1bbbdf6dd6ac1013a526d2a5/crawl4ai-0.7.4-py3-none-any.whl", hash = "sha256:d845b062a989cf43338d30cc8efdcd2701304cea7e3e15122c826d92eee88334", size = 426242 }, + { url = "https://files.pythonhosted.org/packages/d0/cc/3b5f524a30df883a52910f6ebde2c6d13a6bd3b56a1329c96a2c6dfc7bdb/crawl4ai-0.7.6-py3-none-any.whl", hash = "sha256:02a12bd91d032d51f21d764646bd33be9f392bebba4ebd8c110bccee70e0e2cc", size = 431342 }, ] [[package]] @@ -784,6 +789,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c9/ad/51f212198681ea7b0deaaf8846ee10af99fba4e894f67b353524eab2bbe5/cryptography-44.0.3-cp39-abi3-win_amd64.whl", hash = "sha256:5d186f32e52e66994dce4f766884bcb9c68b8da62d61d9d215bfe5fb56d21334", size = 3210375 }, ] +[[package]] +name = "cssselect" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/0a/c3ea9573b1dc2e151abfe88c7fe0c26d1892fe6ed02d0cdb30f0d57029d5/cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7", size = 42870 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ee/58/257350f7db99b4ae12b614a36256d9cc870d71d9e451e79c2dc3b23d7c3c/cssselect-1.3.0-py3-none-any.whl", hash = "sha256:56d1bf3e198080cc1667e137bc51de9cadfca259f03c2d4e09037b3e01e30f0d", size = 18786 }, +] + [[package]] name = "deprecated" version = "1.2.18" @@ -2047,6 +2061,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b8/d3/c3cb8f1d6ae3b37f83e1de806713a9b3642c5895f0215a62e1a4bd6e5e34/propcache-0.3.1-py3-none-any.whl", hash = "sha256:9a8ecf38de50a7f518c21568c80f985e776397b902f1ce0b01f799aba1608b40", size = 12376 }, ] +[[package]] +name = "protego" +version = "0.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/9b/9c3a649167c7e43a0818df515d515e66d95a261fdfdf2a6afd45be9db696/protego-0.5.0.tar.gz", hash = "sha256:225dee0acfcc71de8c6f7cef9c618e5a9d3e7baa7ae1470b8d076a064033c463", size = 3137494 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/cb/4347985f89ca3e4beb5d0cb85f8b951c9e339564bd2a3f388d6fb78382cc/protego-0.5.0-py3-none-any.whl", hash = "sha256:4237227840a67fdeec289a9b89652455b5657806388c17e1a556e160435f8fc5", size = 10356 }, +] + [[package]] name = "protobuf" version = "5.29.5"