diff --git a/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx b/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx index bcf01bdd76..efcdd76302 100644 --- a/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx +++ b/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx @@ -4,17 +4,19 @@ */ import { Globe, Loader2, Upload } from "lucide-react"; -import { useId, useState } from "react"; +import { useEffect, useId, useState } from "react"; import { useToast } from "@/features/shared/hooks/useToast"; +import { callAPIWithETag } from "@/features/shared/api/apiClient"; import { Button, Input, Label } from "../../ui/primitives"; import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "../../ui/primitives/dialog"; import { cn, glassCard } from "../../ui/primitives/styles"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "../../ui/primitives/tabs"; import { useCrawlUrl, useUploadDocument } from "../hooks"; -import type { CrawlRequest, UploadMetadata } from "../types"; +import type { CrawlRequest, UploadMetadata, LinkPreviewResponse } from "../types"; import { KnowledgeTypeSelector } from "./KnowledgeTypeSelector"; import { LevelSelector } from "./LevelSelector"; import { TagInput } from "./TagInput"; +import { LinkReviewModal } from "./LinkReviewModal"; interface AddKnowledgeDialogProps { open: boolean; @@ -44,21 +46,83 @@ export const AddKnowledgeDialog: React.FC = ({ const [maxDepth, setMaxDepth] = useState("2"); const [tags, setTags] = useState([]); + // Glob pattern filtering state (unified field with ! prefix for exclusions) + const [urlPatterns, setUrlPatterns] = useState(""); + const [reviewLinksEnabled, setReviewLinksEnabled] = useState(true); + + // Link review modal state + const [showLinkReviewModal, setShowLinkReviewModal] = useState(false); + const [previewData, setPreviewData] = useState(null); + // Upload form state const [selectedFile, setSelectedFile] = useState(null); const [uploadType, setUploadType] = useState<"technical" | "business">("technical"); const [uploadTags, setUploadTags] = useState([]); + // Auto-detect GitHub repositories and populate smart defaults + useEffect(() => { + // Only auto-populate if the URL has changed and patterns are empty + if (!crawlUrl) return; + + // Detect GitHub URL (supports https://, http://, or just github.com) + const githubUrlPattern = /^(?:https?:\/\/)?(?:www\.)?github\.com\/([^\/]+)\/([^\/\?#]+)/i; + const match = crawlUrl.match(githubUrlPattern); + + if (match) { + // Only auto-populate if patterns are currently empty (don't override user edits) + if (!urlPatterns) { + // Use code-only patterns: only crawl tree (directories) and blob (files) pages + setUrlPatterns("**/tree/**, **/blob/**"); + } + + // Auto-add "GitHub Repo" tag if not already present + if (!tags.includes("GitHub Repo")) { + setTags((prevTags) => [...prevTags, "GitHub Repo"]); + } + + // Set max depth to 3 for GitHub repos (to traverse nested directories) + if (maxDepth === "2") { + setMaxDepth("3"); + } + } + }, [crawlUrl]); // Only depend on crawlUrl to avoid infinite loops + const resetForm = () => { setCrawlUrl(""); setCrawlType("technical"); setMaxDepth("2"); setTags([]); + setUrlPatterns(""); + setReviewLinksEnabled(true); setSelectedFile(null); setUploadType("technical"); setUploadTags([]); }; + // Parse unified pattern string into separate include/exclude arrays. + // Patterns starting with ! are exclusions, others are inclusions. 
+ // Example: "path1, path2, !exclude1" -> { include: ["path1", "path2"], exclude: ["exclude1"] } + const parseUrlPatterns = (patterns: string): { include: string[]; exclude: string[] } => { + const include: string[] = []; + const exclude: string[] = []; + + patterns + .split(",") + .map((p) => p.trim()) + .filter((p) => p.length > 0) + .forEach((pattern) => { + if (pattern.startsWith("!")) { + // Exclude pattern - remove the ! prefix + exclude.push(pattern.substring(1).trim()); + } else { + // Include pattern + include.push(pattern); + } + }); + + return { include, exclude }; + }; + const handleCrawl = async () => { if (!crawlUrl) { showToast("Please enter a URL to crawl", "error"); @@ -66,11 +130,40 @@ export const AddKnowledgeDialog: React.FC = ({ } try { + // Parse unified pattern string into include/exclude arrays + const { include: includePatternArray, exclude: excludePatternArray } = parseUrlPatterns(urlPatterns); + + // If review is enabled, call preview endpoint first + if (reviewLinksEnabled) { + const previewData = await callAPIWithETag("/crawl/preview-links", { + method: "POST", + body: JSON.stringify({ + url: crawlUrl, + url_include_patterns: includePatternArray, + url_exclude_patterns: excludePatternArray, + }), + }); + + // If it's a link collection, show the review modal + if (previewData.is_link_collection) { + setPreviewData(previewData); + setShowLinkReviewModal(true); + return; // Don't proceed with crawl yet + } + + // Not a link collection - proceed with normal crawl + showToast("Not a link collection - proceeding with normal crawl", "info"); + } + + // Build crawl request (for non-link collections or when review is disabled) const request: CrawlRequest = { url: crawlUrl, knowledge_type: crawlType, max_depth: parseInt(maxDepth, 10), tags: tags.length > 0 ? tags : undefined, + url_include_patterns: includePatternArray.length > 0 ? includePatternArray : undefined, + url_exclude_patterns: excludePatternArray.length > 0 ? excludePatternArray : undefined, + skip_link_review: !reviewLinksEnabled, }; const response = await crawlMutation.mutateAsync(request); @@ -91,6 +184,42 @@ export const AddKnowledgeDialog: React.FC = ({ } }; + // Handle link review modal submission + const handleLinkReviewSubmit = async (selectedUrls: string[]) => { + try { + // Parse unified pattern string into include/exclude arrays + const { include: includePatternArray, exclude: excludePatternArray } = parseUrlPatterns(urlPatterns); + + const request: CrawlRequest = { + url: crawlUrl, + knowledge_type: crawlType, + max_depth: parseInt(maxDepth, 10), + tags: tags.length > 0 ? tags : undefined, + url_include_patterns: includePatternArray.length > 0 ? includePatternArray : undefined, + url_exclude_patterns: excludePatternArray.length > 0 ? excludePatternArray : undefined, + selected_urls: selectedUrls, + skip_link_review: false, + }; + + const response = await crawlMutation.mutateAsync(request); + + // Notify parent about the new crawl operation + if (response?.progressId && onCrawlStarted) { + onCrawlStarted(response.progressId); + } + + showToast(`Crawl started with ${selectedUrls.length} selected links`, "success"); + resetForm(); + setShowLinkReviewModal(false); + setPreviewData(null); + onSuccess(); + onOpenChange(false); + } catch (error) { + const message = error instanceof Error ? 
error.message : "Failed to start crawl"; + showToast(message, "error"); + } + }; + const handleUpload = async () => { if (!selectedFile) { showToast("Please select a file to upload", "error"); @@ -161,7 +290,7 @@ export const AddKnowledgeDialog: React.FC = ({ setCrawlUrl(e.target.value)} disabled={isProcessing} @@ -175,6 +304,69 @@ export const AddKnowledgeDialog: React.FC = ({ + {/* Glob Pattern Filtering Section */} +
+ {/* GitHub Auto-Configuration Notice */} + {crawlUrl.match(/^(?:https?:\/\/)?(?:www\.)?github\.com\/([^\/]+)\/([^\/\?#]+)/i) && ( +
+
+ +
+
+ GitHub Repository Detected: Pattern auto-configured to crawl only this repository (depth=3). + Add exclusions with !**/issues** if needed. +
+
+ )} + + {/* Review Links Checkbox */} +
+ setReviewLinksEnabled(e.target.checked)} + disabled={isProcessing} + className="h-4 w-4 text-cyan-600 focus:ring-cyan-500 border-gray-300 rounded" + /> + +
+
+ When enabled, you'll preview and select links from llms.txt or sitemap files before crawling starts +
+ + {/* Unified URL Patterns Input */} +
+ + setUrlPatterns(e.target.value)} + disabled={isProcessing} + className={cn( + "h-10", + glassCard.blur.sm, + glassCard.transparency.medium, + "border-gray-300/60 dark:border-gray-600/60 focus:border-cyan-400/70", + )} + /> +
+ Glob patterns: Include URLs with patterns like **/en/**. + Exclude with !**/api/** prefix (like .gitignore). + Leave empty to crawl all discovered links. +
+
+
+
@@ -301,6 +493,20 @@ export const AddKnowledgeDialog: React.FC = ({ + + {/* Link Review Modal */} + {showLinkReviewModal && previewData && ( + { + setShowLinkReviewModal(false); + setPreviewData(null); + }} + /> + )} ); }; diff --git a/archon-ui-main/src/features/knowledge/components/LinkReviewModal.tsx b/archon-ui-main/src/features/knowledge/components/LinkReviewModal.tsx new file mode 100644 index 0000000000..244fdf3f0a --- /dev/null +++ b/archon-ui-main/src/features/knowledge/components/LinkReviewModal.tsx @@ -0,0 +1,299 @@ +/** + * Link Review Modal Component + * Displays links from link collections (llms.txt, sitemap.xml) for user review before crawling + */ + +import { CheckCircle2, Filter, XCircle } from "lucide-react"; +import { useState, useEffect } from "react"; +import { Button, Input, Label } from "../../ui/primitives"; +import { Dialog, DialogContent, DialogHeader, DialogTitle } from "../../ui/primitives/dialog"; +import { cn, glassCard } from "../../ui/primitives/styles"; +import type { LinkPreviewResponse, PreviewLink } from "../types"; + +interface LinkReviewModalProps { + open: boolean; + previewData: LinkPreviewResponse | null; + initialIncludePatterns: string; + initialExcludePatterns: string; + onProceed: (selectedUrls: string[]) => void; + onCancel: () => void; +} + +export const LinkReviewModal: React.FC = ({ + open, + previewData, + initialIncludePatterns, + initialExcludePatterns, + onProceed, + onCancel, +}) => { + const [selectedUrls, setSelectedUrls] = useState>(new Set()); + const [includePatterns, setIncludePatterns] = useState(initialIncludePatterns); + const [excludePatterns, setExcludePatterns] = useState(initialExcludePatterns); + const [filteredLinks, setFilteredLinks] = useState([]); + const [searchTerm, setSearchTerm] = useState(""); + + // Initialize selected URLs when modal opens + useEffect(() => { + if (previewData && previewData.links) { + // Auto-select links that match filters + const initialSelection = new Set( + previewData.links.filter((link) => link.matches_filter).map((link) => link.url) + ); + setSelectedUrls(initialSelection); + setFilteredLinks(previewData.links); + } + }, [previewData]); + + // Apply search filter + useEffect(() => { + if (!previewData) return; + + const filtered = previewData.links.filter((link) => { + if (!searchTerm) return true; + const searchLower = searchTerm.toLowerCase(); + return ( + link.url.toLowerCase().includes(searchLower) || + link.text.toLowerCase().includes(searchLower) || + link.path.toLowerCase().includes(searchLower) + ); + }); + + setFilteredLinks(filtered); + }, [searchTerm, previewData]); + + const handleToggleLink = (url: string) => { + setSelectedUrls((prev) => { + const next = new Set(prev); + if (next.has(url)) { + next.delete(url); + } else { + next.add(url); + } + return next; + }); + }; + + const handleSelectAll = () => { + setSelectedUrls(new Set(filteredLinks.map((link) => link.url))); + }; + + const handleDeselectAll = () => { + setSelectedUrls(new Set()); + }; + + const handleInvertSelection = () => { + setSelectedUrls((prev) => { + const next = new Set(); + filteredLinks.forEach((link) => { + if (!prev.has(link.url)) { + next.add(link.url); + } + }); + return next; + }); + }; + + const handleApplyFilters = async () => { + if (!previewData) return; + + try { + // Parse patterns + const includePatternArray = includePatterns + .split(",") + .map((p) => p.trim()) + .filter((p) => p.length > 0); + const excludePatternArray = excludePatterns + .split(",") + .map((p) => p.trim()) + .filter((p) 
=> p.length > 0); + + // Re-fetch preview with new patterns + const response = await fetch("http://localhost:8181/api/crawl/preview-links", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + url: previewData.source_url, + url_include_patterns: includePatternArray, + url_exclude_patterns: excludePatternArray, + }), + }); + + if (!response.ok) { + throw new Error("Failed to apply filters"); + } + + const updatedData = await response.json(); + + // Update filtered links and auto-select matching ones + setFilteredLinks(updatedData.links); + const newSelection = new Set( + updatedData.links.filter((link: PreviewLink) => link.matches_filter).map((link: PreviewLink) => link.url) + ); + setSelectedUrls(newSelection); + } catch (error) { + console.error("Failed to apply filters:", error); + } + }; + + const handleProceed = () => { + onProceed(Array.from(selectedUrls)); + }; + + if (!previewData) return null; + + const selectedCount = selectedUrls.size; + const totalCount = filteredLinks.length; + + return ( + !isOpen && onCancel()}> + +
+
+ + Review Links - {previewData.collection_type} + +
+
{previewData.source_url}
+
+ {selectedCount} of {totalCount} links selected +
+
+
+ +
+ {/* Filter Section */} +
+
+ + Filter Patterns +
+ +
+
+ + setIncludePatterns(e.target.value)} + placeholder="**/en/**" + className="h-8 text-sm" + /> +
+ +
+ + setExcludePatterns(e.target.value)} + placeholder="**/fr/**, **/de/**" + className="h-8 text-sm" + /> +
+
+ + +
+ + {/* Bulk Actions Bar */} +
+
+ + + +
+ + setSearchTerm(e.target.value)} + className="w-64 h-8 text-sm" + /> +
+ + {/* Link List (scrollable) */} +
+
+ {filteredLinks.map((link) => ( +
handleToggleLink(link.url)} + > + handleToggleLink(link.url)} + className="mt-1 h-4 w-4 text-cyan-600 focus:ring-cyan-500 border-gray-300 rounded" + /> + +
+
+
+

{link.text || "Untitled"}

+

{link.url}

+

Path: {link.path}

+
+ + {link.matches_filter && ( + + Matches Filter + + )} +
+
+
+ ))} + + {filteredLinks.length === 0 && ( +
+

No links found matching your search.

+
+ )} +
+
+
+ + {/* Footer Actions - Sticky */} +
+ + + +
+
+
+
+ ); +}; diff --git a/archon-ui-main/src/features/knowledge/components/index.ts b/archon-ui-main/src/features/knowledge/components/index.ts index a7f9ff55e0..19d99a6d33 100644 --- a/archon-ui-main/src/features/knowledge/components/index.ts +++ b/archon-ui-main/src/features/knowledge/components/index.ts @@ -4,3 +4,4 @@ export * from "./KnowledgeList"; export * from "./KnowledgeTypeSelector"; export * from "./LevelSelector"; export * from "./TagInput"; +export * from "./LinkReviewModal"; diff --git a/archon-ui-main/src/features/knowledge/types/knowledge.ts b/archon-ui-main/src/features/knowledge/types/knowledge.ts index 571cb6192e..b16380d52e 100644 --- a/archon-ui-main/src/features/knowledge/types/knowledge.ts +++ b/archon-ui-main/src/features/knowledge/types/knowledge.ts @@ -140,6 +140,36 @@ export interface CrawlRequest { update_frequency?: number; max_depth?: number; extract_code_examples?: boolean; + // Glob pattern filtering + url_include_patterns?: string[]; + url_exclude_patterns?: string[]; + // Link review mode + selected_urls?: string[]; + skip_link_review?: boolean; +} + +// Link preview request/response types +export interface LinkPreviewRequest { + url: string; + url_include_patterns?: string[]; + url_exclude_patterns?: string[]; +} + +export interface PreviewLink { + url: string; + text: string; + path: string; + matches_filter: boolean; +} + +export interface LinkPreviewResponse { + is_link_collection: boolean; + collection_type: string | null; + source_url: string; + total_links: number; + matching_links: number; + links: PreviewLink[]; + message?: string; } export interface UploadMetadata { diff --git a/archon-ui-main/src/features/ui/primitives/dialog.tsx b/archon-ui-main/src/features/ui/primitives/dialog.tsx index 27947ebdbf..7ae52522fb 100644 --- a/archon-ui-main/src/features/ui/primitives/dialog.tsx +++ b/archon-ui-main/src/features/ui/primitives/dialog.tsx @@ -62,7 +62,7 @@ export const DialogContent = React.forwardRef< )} {...props} > -
{children}
+
{children}
{showCloseButton && ( list[dict[str, Any]]: """Recursively crawl internal links from start URLs.""" return await self.recursive_strategy.crawl_recursive_with_progress( @@ -278,6 +280,8 @@ async def crawl_recursive_with_progress( max_concurrent, progress_callback, self._check_cancellation, # Pass cancellation check + include_patterns, + exclude_patterns, ) # Orchestration methods @@ -346,6 +350,15 @@ async def send_heartbeat_if_needed(): url = str(request.get("url", "")) safe_logfire_info(f"Starting async crawl orchestration | url={url} | task_id={task_id}") + # Log crawl parameters for debugging + max_depth = request.get("max_depth", 1) + url_include_patterns = request.get("url_include_patterns", []) + url_exclude_patterns = request.get("url_exclude_patterns", []) + logger.info( + f"Crawl parameters: url={url} | max_depth={max_depth} | " + f"include_patterns={url_include_patterns} | exclude_patterns={url_exclude_patterns}" + ) + # Start the progress tracker if available if self.progress_tracker: await self.progress_tracker.start({ @@ -904,6 +917,43 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs): same_domain_links.append((link, text)) logger.debug(f"Found same-domain link: {link}") + # Apply glob pattern filtering or selected URLs + if same_domain_links: + original_count = len(same_domain_links) + + # Extract filtering parameters from request + include_patterns = request.get("url_include_patterns", []) + exclude_patterns = request.get("url_exclude_patterns", []) + selected_urls = request.get("selected_urls") + + # Option 1: Use selected_urls from review modal (takes precedence) + if selected_urls: + selected_urls_set = set(selected_urls) + same_domain_links = [ + (link, text) for link, text in same_domain_links + if link in selected_urls_set + ] + logger.info( + f"Applied selected_urls filter: {original_count} → {len(same_domain_links)} links " + f"({original_count - len(same_domain_links)} filtered)" + ) + + # Option 2: Apply glob pattern filtering + elif include_patterns or exclude_patterns: + filtered_links = [] + for link, text in same_domain_links: + if self.url_handler.matches_glob_patterns(link, include_patterns, exclude_patterns): + filtered_links.append((link, text)) + + filtered_count = original_count - len(filtered_links) + same_domain_links = filtered_links + + logger.info( + f"Applied glob pattern filter: {original_count} → {len(same_domain_links)} links " + f"({filtered_count} filtered) | " + f"include={include_patterns} | exclude={exclude_patterns}" + ) + if same_domain_links: # Build mapping and extract just URLs url_to_link_text = dict(same_domain_links) @@ -1035,6 +1085,41 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs): sitemap_urls = self.parse_sitemap(url) if sitemap_urls: + original_count = len(sitemap_urls) + + # Apply glob pattern filtering or selected URLs + include_patterns = request.get("url_include_patterns", []) + exclude_patterns = request.get("url_exclude_patterns", []) + selected_urls = request.get("selected_urls") + + # Option 1: Use selected_urls from review modal (takes precedence) + if selected_urls: + selected_urls_set = set(selected_urls) + sitemap_urls = [ + url for url in sitemap_urls + if url in selected_urls_set + ] + logger.info( + f"Applied selected_urls filter to sitemap: {original_count} → {len(sitemap_urls)} URLs " + f"({original_count - len(sitemap_urls)} filtered)" + ) + + # Option 2: Apply glob pattern filtering + elif include_patterns or exclude_patterns: + filtered_urls = [] 
+ for sitemap_url in sitemap_urls: + if self.url_handler.matches_glob_patterns(sitemap_url, include_patterns, exclude_patterns): + filtered_urls.append(sitemap_url) + + filtered_count = original_count - len(filtered_urls) + sitemap_urls = filtered_urls + + logger.info( + f"Applied glob pattern filter to sitemap: {original_count} → {len(sitemap_urls)} URLs " + f"({filtered_count} filtered) | " + f"include={include_patterns} | exclude={exclude_patterns}" + ) + # Update progress before starting batch crawl await update_crawl_progress( 75, # 75% of crawling stage @@ -1057,14 +1142,23 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs): ) max_depth = request.get("max_depth", 1) - # Let the strategy handle concurrency from settings - # This will use CRAWL_MAX_CONCURRENT from database (default: 10) + include_patterns = request.get("url_include_patterns", []) + exclude_patterns = request.get("url_exclude_patterns", []) + + # Log pattern configuration for debugging + if include_patterns or exclude_patterns: + logger.info( + f"Recursive crawl with glob patterns | " + f"include={include_patterns} | exclude={exclude_patterns}" + ) crawl_results = await self.crawl_recursive_with_progress( [url], max_depth=max_depth, max_concurrent=None, # Let strategy use settings progress_callback=await self._create_crawl_progress_callback("crawling"), + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, ) return crawl_results, crawl_type diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py index f243c2ab00..e9c157cbc4 100644 --- a/python/src/server/services/crawling/helpers/url_handler.py +++ b/python/src/server/services/crawling/helpers/url_handler.py @@ -705,3 +705,75 @@ def get_base_url(url: str) -> str: except Exception as e: logger.warning(f"Error extracting base URL from {url}: {e}", exc_info=True) return url + + @staticmethod + def matches_glob_patterns( + url: str, + include_patterns: list[str] | None = None, + exclude_patterns: list[str] | None = None + ) -> bool: + """ + Check if URL path matches glob patterns. + + Filtering logic: + 1. If exclude patterns exist and URL matches any → reject (False) + 2. If include patterns exist and URL matches at least one → accept (True) + 3. If include patterns exist but URL matches none → reject (False) + 4. 
If no patterns specified → accept (True) + + Args: + url: The URL to check + include_patterns: List of glob patterns to include (e.g., ["**/en/**", "**/docs/**"]) + exclude_patterns: List of glob patterns to exclude (e.g., ["**/fr/**", "**/de/**"]) + + Returns: + True if URL should be included, False if it should be filtered out + + Examples: + >>> matches_glob_patterns("https://docs.example.com/en/intro", ["**/en/**"]) + True + >>> matches_glob_patterns("https://docs.example.com/fr/intro", ["**/en/**"]) + False + >>> matches_glob_patterns("https://docs.example.com/en/intro", ["**/en/**"], ["**/api/**"]) + True + >>> matches_glob_patterns("https://docs.example.com/en/api/intro", ["**/en/**"], ["**/api/**"]) + False + """ + try: + from fnmatch import fnmatch + + # Parse URL to get path + parsed = urlparse(url) + path = parsed.path + + # Normalize path (ensure it starts with / for consistent matching) + if not path.startswith('/'): + path = '/' + path + + # Check exclude patterns first (fast rejection) + if exclude_patterns: + for pattern in exclude_patterns: + if fnmatch(path, pattern): + logger.debug(f"URL excluded by pattern '{pattern}': {url}") + return False + + # Check include patterns (if specified) + if include_patterns: + matched = False + for pattern in include_patterns: + if fnmatch(path, pattern): + logger.debug(f"URL included by pattern '{pattern}': {url}") + matched = True + break + + if not matched: + logger.debug(f"URL does not match any include patterns: {url}") + return False + + # No patterns or passed all checks + return True + + except Exception as e: + logger.warning(f"Error checking glob patterns for {url}: {e}", exc_info=True) + # On error, default to including the URL (safer than filtering) + return True diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py index 3cdee7506a..7795119412 100644 --- a/python/src/server/services/crawling/strategies/recursive.py +++ b/python/src/server/services/crawling/strategies/recursive.py @@ -42,6 +42,8 @@ async def crawl_recursive_with_progress( max_concurrent: int | None = None, progress_callback: Callable[..., Awaitable[None]] | None = None, cancellation_check: Callable[[], None] | None = None, + include_patterns: list[str] | None = None, + exclude_patterns: list[str] | None = None, ) -> list[dict[str, Any]]: """ Recursively crawl internal links from start URLs up to a maximum depth with progress reporting. 
@@ -54,6 +56,8 @@ async def crawl_recursive_with_progress( max_concurrent: Maximum concurrent crawls progress_callback: Optional callback for progress updates cancellation_check: Optional function to check for cancellation + include_patterns: Optional list of glob patterns to include (e.g., ["**/en/**"]) + exclude_patterns: Optional list of glob patterns to exclude (e.g., ["**/fr/**"]) Returns: List of crawl results @@ -166,6 +170,13 @@ def normalize_url(url): total_discovered = len(current_urls) # Track total URLs discovered (normalized & de-duped) cancelled = False + # Log pattern filtering configuration + if include_patterns or exclude_patterns: + logger.info( + f"Recursive crawl with glob filtering enabled | " + f"include={include_patterns} | exclude={exclude_patterns}" + ) + for depth in range(max_depth): # Check for cancellation at the start of each depth level if cancellation_check: @@ -301,14 +312,31 @@ def normalize_url(url): links = getattr(result, "links", {}) or {} for link in links.get("internal", []): next_url = normalize_url(link["href"]) - # Skip binary files and already visited URLs + + # Skip binary files is_binary = self.url_handler.is_binary_file(next_url) - if next_url not in visited and not is_binary: - if next_url not in next_level_urls: - next_level_urls.add(next_url) - total_discovered += 1 # Increment when we discover a new URL - elif is_binary: + if is_binary: logger.debug(f"Skipping binary file from crawl queue: {next_url}") + continue + + # Skip already visited URLs + if next_url in visited: + continue + + # Apply glob pattern filtering + if include_patterns or exclude_patterns: + if not self.url_handler.matches_glob_patterns( + next_url, include_patterns, exclude_patterns + ): + logger.debug( + f"Skipping URL (glob filter) from crawl queue: {next_url}" + ) + continue + + # Add to next level queue + if next_url not in next_level_urls: + next_level_urls.add(next_url) + total_discovered += 1 # Increment when we discover a new URL else: logger.warning( f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}"
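Note (editor's illustration, not part of the diff): the filtering added above has two halves: the unified pattern field parsed by parseUrlPatterns in AddKnowledgeDialog.tsx (comma-separated, with "!"-prefixed entries treated as exclusions), and the path-based matching in URLHandler.matches_glob_patterns (exclusions reject first, then at least one include pattern must match; no patterns at all means accept). The sketch below reimplements both in plain Python to show the expected behavior under those rules; parse_url_patterns and should_crawl are hypothetical stand-in names, and the URLs and patterns are made-up examples. It uses only the standard-library fnmatch and urllib.parse modules the diff already relies on. One nuance of fnmatch worth noting: it has no special "**", and its "*" also matches "/", so "**/en/**" effectively behaves like "*/en/*" rather than strict .gitignore-style globbing.

```python
# Standalone sketch of the glob filtering flow (illustration only, not in the PR).
from fnmatch import fnmatch
from urllib.parse import urlparse


def parse_url_patterns(raw: str) -> tuple[list[str], list[str]]:
    """Split a unified pattern string into (include, exclude) lists, as the UI field does."""
    include: list[str] = []
    exclude: list[str] = []
    for token in (p.strip() for p in raw.split(",")):
        if not token:
            continue
        if token.startswith("!"):
            exclude.append(token[1:].strip())  # "!" prefix marks an exclusion
        else:
            include.append(token)
    return include, exclude


def should_crawl(url: str, include: list[str] | None, exclude: list[str] | None) -> bool:
    """Mirror of matches_glob_patterns: match glob patterns against the URL path only."""
    path = urlparse(url).path
    if not path.startswith("/"):
        path = "/" + path
    if exclude and any(fnmatch(path, p) for p in exclude):
        return False  # any matching exclude pattern rejects the URL
    if include and not any(fnmatch(path, p) for p in include):
        return False  # when include patterns exist, at least one must match
    return True  # no patterns, or all checks passed


if __name__ == "__main__":
    # GitHub auto-config defaults plus a manual exclusion, e.g. "**/tree/**, **/blob/**, !**/issues/**"
    include, exclude = parse_url_patterns("**/tree/**, **/blob/**, !**/issues/**")
    assert include == ["**/tree/**", "**/blob/**"]
    assert exclude == ["**/issues/**"]
    assert should_crawl("https://github.com/owner/repo/blob/main/README.md", include, exclude)
    assert not should_crawl("https://github.com/owner/repo/issues/42", include, exclude)
    assert should_crawl("https://example.com/anything", [], [])  # empty patterns accept everything
    print("glob filtering sketch behaves as documented")
```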