python/pyproject.toml (6 changes: 4 additions, 2 deletions)
@@ -39,7 +39,8 @@ server = [
"python-multipart>=0.0.20",
"watchfiles>=0.18",
# Web crawling
"crawl4ai==0.7.4",
"crawl4ai==0.7.6", # Updated from 0.7.4 for latest features and bug fixes (not required for robots.txt)
"protego>=0.3.1", # robots.txt parser - 40% faster than stdlib, supports wildcards
# Database and storage
"supabase==2.15.1",
"asyncpg>=0.29.0",
@@ -119,7 +120,8 @@ all = [
"uvicorn>=0.24.0",
"python-multipart>=0.0.20",
"watchfiles>=0.18",
"crawl4ai==0.7.4",
"crawl4ai==0.7.6",
"protego>=0.3.1",
"supabase==2.15.1",
"asyncpg>=0.29.0",
"openai==1.71.0",
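For context on the new dependency: a minimal, illustrative sketch of how Protego evaluates robots.txt rules. This snippet is not part of the PR; the actual integration lives in the new RobotsChecker used further down.

```python
# Illustrative only; not code from this PR.
from protego import Protego

robots_txt = """
User-agent: *
Disallow: /private/
Crawl-delay: 5
"""

rp = Protego.parse(robots_txt)
print(rp.can_fetch("https://example.com/private/page", "Archon-Crawler"))    # False
print(rp.can_fetch("https://example.com/docs/index.html", "Archon-Crawler"))  # True
print(rp.crawl_delay("Archon-Crawler"))                                       # 5.0
```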
python/src/server/api_routes/knowledge_api.py (4 changes: 4 additions, 0 deletions)
@@ -712,6 +712,8 @@ async def _perform_refresh_with_semaphore():
safe_logfire_info(
f"Cleaned up refresh task from registry | progress_id={progress_id}"
)
# Close crawl_service to release resources
await crawl_service.close()

# Start the wrapper task - we don't need to track it since we'll track the actual crawl task
asyncio.create_task(_perform_refresh_with_semaphore())
@@ -889,6 +891,8 @@ async def _perform_crawl_with_progress(
safe_logfire_info(
f"Cleaned up crawl task from registry | progress_id={progress_id}"
)
# Close orchestration_service to release resources
await orchestration_service.close()


@router.post("/documents/upload")
python/src/server/config/config.py (76 changes: 76 additions, 0 deletions)
@@ -275,3 +275,79 @@ def str_to_bool(value: str | None) -> bool:
enable_docker_socket=str_to_bool(os.getenv("ENABLE_DOCKER_SOCKET_MONITORING")),
health_check_timeout=int(os.getenv("MCP_HEALTH_CHECK_TIMEOUT", "5")),
)


def get_crawler_config() -> dict:
"""Get crawler configuration from environment with validation.

Returns a dictionary with crawler settings including User-Agent,
robots.txt compliance settings, and caching configuration.

Environment Variables:
CRAWLER_USER_AGENT: Custom User-Agent string (default: "Archon-Crawler/{version} (+{repo_url})")
ROBOTS_RESPECT: Whether to respect robots.txt (default: "true")
ROBOTS_DEFAULT_CRAWL_DELAY: Default delay between requests in seconds (default: "10.0", min: 0.0)
ROBOTS_CACHE_SIZE: Max number of domains to cache (default: "1000", min: 1)
ROBOTS_CACHE_TTL: Cache TTL in seconds (default: "86400" = 24 hours, min: 1)

Returns:
dict with keys: user_agent, respect_robots, default_crawl_delay,
robots_cache_size, robots_cache_ttl

Raises:
ConfigurationError: If environment variable values are invalid or out of bounds
"""
from .version import ARCHON_VERSION, GITHUB_REPO_NAME, GITHUB_REPO_OWNER

repo_url = f"https://github.com/{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
default_ua = f"Archon-Crawler/{ARCHON_VERSION} (+{repo_url})"

# Parse and validate ROBOTS_DEFAULT_CRAWL_DELAY
crawl_delay_str = os.getenv("ROBOTS_DEFAULT_CRAWL_DELAY", "10.0")
try:
default_crawl_delay = float(crawl_delay_str)
if default_crawl_delay < 0.0:
raise ConfigurationError(
f"ROBOTS_DEFAULT_CRAWL_DELAY must be >= 0.0, got: {default_crawl_delay}. "
f"Use 0.0 to disable delays."
)
except ValueError as e:
raise ConfigurationError(
f"ROBOTS_DEFAULT_CRAWL_DELAY must be a valid number, got: '{crawl_delay_str}'"
) from e

# Parse and validate ROBOTS_CACHE_SIZE
cache_size_str = os.getenv("ROBOTS_CACHE_SIZE", "1000")
try:
robots_cache_size = int(cache_size_str)
if robots_cache_size < 1:
raise ConfigurationError(
f"ROBOTS_CACHE_SIZE must be >= 1, got: {robots_cache_size}. "
f"Recommended: 100-10000"
)
except ValueError as e:
raise ConfigurationError(
f"ROBOTS_CACHE_SIZE must be a valid integer, got: '{cache_size_str}'"
) from e

# Parse and validate ROBOTS_CACHE_TTL
cache_ttl_str = os.getenv("ROBOTS_CACHE_TTL", "86400")
try:
robots_cache_ttl = int(cache_ttl_str)
if robots_cache_ttl < 1:
raise ConfigurationError(
f"ROBOTS_CACHE_TTL must be >= 1 second, got: {robots_cache_ttl}. "
f"RFC 9309 recommends max 86400 (24 hours)"
)
except ValueError as e:
raise ConfigurationError(
f"ROBOTS_CACHE_TTL must be a valid integer, got: '{cache_ttl_str}'"
) from e

return {
"user_agent": os.getenv("CRAWLER_USER_AGENT", default_ua),
"respect_robots": os.getenv("ROBOTS_RESPECT", "true").lower() == "true",
"default_crawl_delay": default_crawl_delay,
"robots_cache_size": robots_cache_size,
"robots_cache_ttl": robots_cache_ttl,
}
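A hedged usage sketch of the new helper follows. The import path is assumed from the file location, the values are made up for illustration, and the environment variable names come from the docstring above.

```python
# Illustrative only; import path assumed from python/src/server/config/config.py.
import os

os.environ["CRAWLER_USER_AGENT"] = "MyBot/1.0 (+https://example.com/bot)"
os.environ["ROBOTS_DEFAULT_CRAWL_DELAY"] = "2.5"
os.environ["ROBOTS_CACHE_SIZE"] = "500"
# ROBOTS_RESPECT and ROBOTS_CACHE_TTL are left unset, so the defaults apply.

from server.config.config import get_crawler_config  # path assumed

cfg = get_crawler_config()
# cfg == {
#     "user_agent": "MyBot/1.0 (+https://example.com/bot)",
#     "respect_robots": True,
#     "default_crawl_delay": 2.5,
#     "robots_cache_size": 500,
#     "robots_cache_ttl": 86400,
# }

# Out-of-range values fail fast:
os.environ["ROBOTS_CACHE_TTL"] = "0"
try:
    get_crawler_config()
except Exception as exc:  # ConfigurationError in the real module
    print(exc)  # ROBOTS_CACHE_TTL must be >= 1 second, got: 0. ...
```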
python/src/server/services/crawler_manager.py (6 changes: 4 additions, 2 deletions)
@@ -14,6 +14,7 @@
AsyncWebCrawler = None
BrowserConfig = None

from ..config.config import get_crawler_config
from ..config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info

logger = get_logger(__name__)
@@ -59,14 +60,15 @@ async def initialize(self):

# Initialize browser config - same for Docker and local
# crawl4ai/Playwright will handle Docker-specific settings internally
crawler_config = get_crawler_config()
browser_config = BrowserConfig(
headless=True,
verbose=False,
# Set viewport for proper rendering
viewport_width=1920,
viewport_height=1080,
# Add user agent to appear as a real browser
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
# Use proper bot identification
user_agent=crawler_config["user_agent"],
# Set browser type
browser_type="chromium",
# Extra args for Chromium - optimized for speed
python/src/server/services/crawling/crawling_service.py (87 changes: 87 additions, 0 deletions)
@@ -13,6 +13,7 @@

import tldextract

from ...config.config import get_crawler_config
from ...config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info
from ...utils import get_supabase_client
from ...utils.progress.progress_tracker import ProgressTracker
@@ -28,6 +29,7 @@
from .helpers.url_handler import URLHandler
from .page_storage_operations import PageStorageOperations
from .progress_mapper import ProgressMapper
from .robots_checker import RobotsChecker
from .strategies.batch import BatchCrawlStrategy
from .strategies.recursive import RecursiveCrawlStrategy
from .strategies.single_page import SinglePageCrawlStrategy
@@ -133,6 +135,10 @@ def __init__(self, crawler=None, supabase_client=None, progress_id=None):
self.discovery_service = DiscoveryService()
self.page_storage_ops = PageStorageOperations(self.supabase_client)

# Initialize robots.txt checker
crawler_config = get_crawler_config()
self.robots_checker = RobotsChecker(crawler_config) if crawler_config.get("respect_robots") else None

# Track progress state across all stages to prevent UI resets
self.progress_state = {"progressId": self.progress_id} if self.progress_id else {}
# Initialize progress mapper to prevent backwards jumps
@@ -162,6 +168,35 @@ def _check_cancellation(self):
if self._cancelled:
raise asyncio.CancelledError("Crawl operation was cancelled by user")

async def _can_fetch_url(self, url: str) -> bool:
"""
Check if URL is allowed by robots.txt.

Note: This method only validates URLs; it does NOT enforce crawl delays.
Crawl delays are handled by Crawl4AI's internal rate limiting and
concurrency controls. Enforcing delays during validation would cause
unacceptable performance (e.g., 540 seconds to validate 54 sitemap URLs).

Args:
url: URL to check

Returns:
True if crawling is allowed, False if blocked by robots.txt

Raises:
No exceptions - errors result in allowing the crawl (fail open)
"""
if not self.robots_checker:
return True # Robots checking disabled

try:
# Check if URL is allowed (no delay enforcement during validation)
return await self.robots_checker.can_fetch(url)
except Exception as e:
# Log error but allow crawl (fail open)
logger.warning(f"robots.txt check failed for {url}: {e}, allowing crawl")
return True

async def _create_crawl_progress_callback(
self, base_status: str
) -> Callable[[str, int, str], Awaitable[None]]:
@@ -278,6 +313,7 @@ async def crawl_recursive_with_progress(
max_concurrent,
progress_callback,
self._check_cancellation, # Pass cancellation check
self.robots_checker, # Pass robots checker for URL validation
)

# Orchestration methods
@@ -909,6 +945,20 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs):
url_to_link_text = dict(same_domain_links)
extracted_urls = [link for link, _ in same_domain_links]

# Filter URLs with robots.txt validation
if self.robots_checker:
original_count = len(extracted_urls)
allowed_urls = []
for url_to_check in extracted_urls:
if await self._can_fetch_url(url_to_check):
allowed_urls.append(url_to_check)
else:
logger.info(f"Skipped (robots.txt): {url_to_check}")
extracted_urls = allowed_urls
robots_filtered = original_count - len(extracted_urls)
if robots_filtered > 0:
logger.info(f"Filtered out {robots_filtered} URLs by robots.txt from llms.txt links")

logger.info(f"Following {len(extracted_urls)} same-domain links from llms.txt")

# Notify user about linked files being crawled
@@ -979,6 +1029,20 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs):
url_to_link_text = dict(extracted_links_with_text)
extracted_links = [link for link, _ in extracted_links_with_text]

# Filter URLs with robots.txt validation
if self.robots_checker:
original_count = len(extracted_links)
allowed_links = []
for url_to_check in extracted_links:
if await self._can_fetch_url(url_to_check):
allowed_links.append(url_to_check)
else:
logger.info(f"Skipped (robots.txt): {url_to_check}")
extracted_links = allowed_links
robots_filtered = original_count - len(extracted_links)
if robots_filtered > 0:
logger.info(f"Filtered out {robots_filtered} URLs by robots.txt from extracted links")

# For discovery targets, respect max_depth for same-domain links
max_depth = request.get('max_depth', 2) if request.get("is_discovery_target") else request.get('max_depth', 1)

@@ -1035,6 +1099,20 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs):
sitemap_urls = self.parse_sitemap(url)

if sitemap_urls:
# Filter URLs with robots.txt validation
if self.robots_checker:
original_count = len(sitemap_urls)
allowed_sitemap_urls = []
for url_to_check in sitemap_urls:
if await self._can_fetch_url(url_to_check):
allowed_sitemap_urls.append(url_to_check)
else:
logger.info(f"Skipped (robots.txt): {url_to_check}")
sitemap_urls = allowed_sitemap_urls
robots_filtered = original_count - len(sitemap_urls)
if robots_filtered > 0:
logger.info(f"Filtered out {robots_filtered} URLs by robots.txt from sitemap")

# Update progress before starting batch crawl
await update_crawl_progress(
75, # 75% of crawling stage
@@ -1069,6 +1147,15 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs):

return crawl_results, crawl_type

async def close(self) -> None:
"""
Close resources and cleanup.

Note: robots_checker uses a shared HTTP client that is not closed per-instance.
This method is kept for API compatibility and future cleanup needs.
"""
pass # No per-instance cleanup needed currently


# Alias for backward compatibility
CrawlOrchestrationService = CrawlingService
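The robots_checker module itself is not shown in this excerpt. For orientation, here is a hypothetical sketch of the shape implied by the calls above: the constructor takes the dict from get_crawler_config, can_fetch is async, and parsed rules are cached per origin with the configured size and TTL. The real implementation may differ, and httpx is only an assumption for the fetch.

```python
# Hypothetical sketch; the PR's actual robots_checker.py is not shown in this diff.
import time
from urllib.parse import urlsplit

import httpx
from protego import Protego

# Shared client, matching the close() note above that the HTTP client is not per-instance.
_shared_client = httpx.AsyncClient(timeout=10)


class RobotsCheckerSketch:
    def __init__(self, config: dict):
        self.user_agent = config["user_agent"]
        self.cache_size = config["robots_cache_size"]
        self.cache_ttl = config["robots_cache_ttl"]
        self._cache: dict[str, tuple[float, Protego]] = {}  # origin -> (fetched_at, parser)

    async def can_fetch(self, url: str) -> bool:
        parts = urlsplit(url)
        origin = f"{parts.scheme}://{parts.netloc}"
        entry = self._cache.get(origin)
        if entry is None or time.monotonic() - entry[0] > self.cache_ttl:
            resp = await _shared_client.get(f"{origin}/robots.txt")
            body = resp.text if resp.status_code == 200 else ""  # missing file => allow all
            if len(self._cache) >= self.cache_size:
                self._cache.pop(next(iter(self._cache)))  # naive eviction; real code may use LRU
            entry = (time.monotonic(), Protego.parse(body))
            self._cache[origin] = entry
        return entry[1].can_fetch(url, self.user_agent)
```

In the diff above, _can_fetch_url wraps this call in a try/except and fails open, so a fetch or parse error never blocks a crawl.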
python/src/server/services/crawling/discovery_service.py (2 changes: 0 additions, 2 deletions)
@@ -61,8 +61,6 @@ class DiscoveryService:
"llms-full.txt", # Part of llms.txt spec - comprehensive content
# Sitemap files (structural crawling guidance)
"sitemap.xml", # Universal standard for site structure
# Robots file (basic crawling rules)
"robots.txt", # Universal standard for crawl directives
# Well-known variants (alternative locations per RFC 8615)
".well-known/ai.txt",
".well-known/llms.txt",