From 74023e157c77254a4f1fb87c98577046b5d4cdbf Mon Sep 17 00:00:00 2001 From: David Rudduck <47308254+davidrudduck@users.noreply.github.com> Date: Sat, 8 Nov 2025 19:45:11 +1000 Subject: [PATCH 1/2] feat: Add glob pattern filtering and link review for knowledge crawling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements interactive link review and URL filtering for llms.txt and sitemap.xml crawling: Backend changes: - Add glob pattern matching utility (url_handler.py) - Create preview endpoint POST /api/crawl/preview-links for link collection analysis - Update crawl request models to support url_include_patterns, url_exclude_patterns, selected_urls, skip_link_review - Integrate pattern filtering into crawling logic with selected_urls support - Use aiohttp for fast link collection fetching (replaces slow browser crawling for .txt files) Frontend changes: - Add LinkReviewModal component for interactive link selection before crawling - Update AddKnowledgeDialog with pattern filter inputs and "Review links" checkbox - Add preview flow: detects link collections → shows modal → user selects links → crawls only selected - Fix dialog.tsx wrapper to support full-height flex layouts (h-full class) - Replace invalid

nesting with

elements for HTML standards compliance Features: - Glob pattern filtering (e.g., **/en/** to include only English pages) - Interactive link preview modal with bulk select/deselect, search, and individual selection - Auto-selection based on filter patterns with "Matches Filter" badges - Scrollable link list supporting 2000+ links - Apply Filters button to refine selection in real-time Fixes scroll issues by ensuring proper flex layout height propagation in dialog components. --- .../components/AddKnowledgeDialog.tsx | 185 +++++++++++ .../knowledge/components/LinkReviewModal.tsx | 299 ++++++++++++++++++ .../features/knowledge/components/index.ts | 1 + .../src/features/knowledge/types/knowledge.ts | 30 ++ .../src/features/ui/primitives/dialog.tsx | 2 +- python/src/server/api_routes/knowledge_api.py | 180 +++++++++++ .../services/crawling/crawling_service.py | 37 +++ .../services/crawling/helpers/url_handler.py | 72 +++++ 8 files changed, 805 insertions(+), 1 deletion(-) create mode 100644 archon-ui-main/src/features/knowledge/components/LinkReviewModal.tsx diff --git a/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx b/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx index bcf01bdd76..92f32f21f2 100644 --- a/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx +++ b/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx @@ -15,6 +15,7 @@ import type { CrawlRequest, UploadMetadata } from "../types"; import { KnowledgeTypeSelector } from "./KnowledgeTypeSelector"; import { LevelSelector } from "./LevelSelector"; import { TagInput } from "./TagInput"; +import { LinkReviewModal } from "./LinkReviewModal"; interface AddKnowledgeDialogProps { open: boolean; @@ -44,6 +45,15 @@ export const AddKnowledgeDialog: React.FC = ({ const [maxDepth, setMaxDepth] = useState("2"); const [tags, setTags] = useState([]); + // Glob pattern filtering state + const [includePatterns, setIncludePatterns] = useState(""); + const [excludePatterns, setExcludePatterns] = useState(""); + const [reviewLinksEnabled, setReviewLinksEnabled] = useState(true); + + // Link review modal state + const [showLinkReviewModal, setShowLinkReviewModal] = useState(false); + const [previewData, setPreviewData] = useState(null); + // Upload form state const [selectedFile, setSelectedFile] = useState(null); const [uploadType, setUploadType] = useState<"technical" | "business">("technical"); @@ -54,6 +64,9 @@ export const AddKnowledgeDialog: React.FC = ({ setCrawlType("technical"); setMaxDepth("2"); setTags([]); + setIncludePatterns(""); + setExcludePatterns(""); + setReviewLinksEnabled(true); setSelectedFile(null); setUploadType("technical"); setUploadTags([]); @@ -66,11 +79,54 @@ export const AddKnowledgeDialog: React.FC = ({ } try { + // Parse patterns from comma-separated strings + const includePatternArray = includePatterns + .split(",") + .map((p) => p.trim()) + .filter((p) => p.length > 0); + const excludePatternArray = excludePatterns + .split(",") + .map((p) => p.trim()) + .filter((p) => p.length > 0); + + // If review is enabled, call preview endpoint first + if (reviewLinksEnabled) { + const previewResponse = await fetch("http://localhost:8181/api/crawl/preview-links", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + url: crawlUrl, + url_include_patterns: includePatternArray, + url_exclude_patterns: excludePatternArray, + }), + }); + + if (!previewResponse.ok) { + throw new Error("Failed to preview links"); + } + + const previewData = await previewResponse.json(); + + // If it's a link collection, show the review modal + if (previewData.is_link_collection) { + setPreviewData(previewData); + setShowLinkReviewModal(true); + return; // Don't proceed with crawl yet + } + + // Not a link collection - proceed with normal crawl + showToast("Not a link collection - proceeding with normal crawl", "info"); + } + + // Build crawl request (for non-link collections or when review is disabled) const request: CrawlRequest = { url: crawlUrl, knowledge_type: crawlType, max_depth: parseInt(maxDepth, 10), tags: tags.length > 0 ? tags : undefined, + url_include_patterns: includePatternArray.length > 0 ? includePatternArray : undefined, + url_exclude_patterns: excludePatternArray.length > 0 ? excludePatternArray : undefined, + skip_link_review: !reviewLinksEnabled, }; const response = await crawlMutation.mutateAsync(request); @@ -91,6 +147,48 @@ export const AddKnowledgeDialog: React.FC = ({ } }; + // Handle link review modal submission + const handleLinkReviewSubmit = async (selectedUrls: string[]) => { + try { + const includePatternArray = includePatterns + .split(",") + .map((p) => p.trim()) + .filter((p) => p.length > 0); + const excludePatternArray = excludePatterns + .split(",") + .map((p) => p.trim()) + .filter((p) => p.length > 0); + + const request: CrawlRequest = { + url: crawlUrl, + knowledge_type: crawlType, + max_depth: parseInt(maxDepth, 10), + tags: tags.length > 0 ? tags : undefined, + url_include_patterns: includePatternArray.length > 0 ? includePatternArray : undefined, + url_exclude_patterns: excludePatternArray.length > 0 ? excludePatternArray : undefined, + selected_urls: selectedUrls, + skip_link_review: false, + }; + + const response = await crawlMutation.mutateAsync(request); + + // Notify parent about the new crawl operation + if (response?.progressId && onCrawlStarted) { + onCrawlStarted(response.progressId); + } + + showToast(`Crawl started with ${selectedUrls.length} selected links`, "success"); + resetForm(); + setShowLinkReviewModal(false); + setPreviewData(null); + onSuccess(); + onOpenChange(false); + } catch (error) { + const message = error instanceof Error ? error.message : "Failed to start crawl"; + showToast(message, "error"); + } + }; + const handleUpload = async () => { if (!selectedFile) { showToast("Please select a file to upload", "error"); @@ -175,6 +273,78 @@ export const AddKnowledgeDialog: React.FC = ({
+ {/* Glob Pattern Filtering Section */} +
+ {/* Review Links Checkbox */} +
+ setReviewLinksEnabled(e.target.checked)} + disabled={isProcessing} + className="h-4 w-4 text-cyan-600 focus:ring-cyan-500 border-gray-300 rounded" + /> + +
+
+ When enabled, you'll preview and select links from llms.txt or sitemap files before crawling starts +
+ + {/* Include Patterns Input */} +
+ + setIncludePatterns(e.target.value)} + disabled={isProcessing} + className={cn( + "h-10", + glassCard.blur.sm, + glassCard.transparency.medium, + "border-gray-300/60 dark:border-gray-600/60 focus:border-cyan-400/70", + )} + /> +
+ Only crawl URLs matching these glob patterns (comma-separated). Leave empty to include all. +
+
+ + {/* Exclude Patterns Input */} +
+ + setExcludePatterns(e.target.value)} + disabled={isProcessing} + className={cn( + "h-10", + glassCard.blur.sm, + glassCard.transparency.medium, + "border-gray-300/60 dark:border-gray-600/60 focus:border-cyan-400/70", + )} + /> +
+ Skip URLs matching these glob patterns (comma-separated). Leave empty to exclude none. +
+
+
+
@@ -301,6 +471,21 @@ export const AddKnowledgeDialog: React.FC = ({ + + {/* Link Review Modal */} + {showLinkReviewModal && previewData && ( + { + setShowLinkReviewModal(false); + setPreviewData(null); + }} + /> + )} ); }; diff --git a/archon-ui-main/src/features/knowledge/components/LinkReviewModal.tsx b/archon-ui-main/src/features/knowledge/components/LinkReviewModal.tsx new file mode 100644 index 0000000000..244fdf3f0a --- /dev/null +++ b/archon-ui-main/src/features/knowledge/components/LinkReviewModal.tsx @@ -0,0 +1,299 @@ +/** + * Link Review Modal Component + * Displays links from link collections (llms.txt, sitemap.xml) for user review before crawling + */ + +import { CheckCircle2, Filter, XCircle } from "lucide-react"; +import { useState, useEffect } from "react"; +import { Button, Input, Label } from "../../ui/primitives"; +import { Dialog, DialogContent, DialogHeader, DialogTitle } from "../../ui/primitives/dialog"; +import { cn, glassCard } from "../../ui/primitives/styles"; +import type { LinkPreviewResponse, PreviewLink } from "../types"; + +interface LinkReviewModalProps { + open: boolean; + previewData: LinkPreviewResponse | null; + initialIncludePatterns: string; + initialExcludePatterns: string; + onProceed: (selectedUrls: string[]) => void; + onCancel: () => void; +} + +export const LinkReviewModal: React.FC = ({ + open, + previewData, + initialIncludePatterns, + initialExcludePatterns, + onProceed, + onCancel, +}) => { + const [selectedUrls, setSelectedUrls] = useState>(new Set()); + const [includePatterns, setIncludePatterns] = useState(initialIncludePatterns); + const [excludePatterns, setExcludePatterns] = useState(initialExcludePatterns); + const [filteredLinks, setFilteredLinks] = useState([]); + const [searchTerm, setSearchTerm] = useState(""); + + // Initialize selected URLs when modal opens + useEffect(() => { + if (previewData && previewData.links) { + // Auto-select links that match filters + const initialSelection = new Set( + previewData.links.filter((link) => link.matches_filter).map((link) => link.url) + ); + setSelectedUrls(initialSelection); + setFilteredLinks(previewData.links); + } + }, [previewData]); + + // Apply search filter + useEffect(() => { + if (!previewData) return; + + const filtered = previewData.links.filter((link) => { + if (!searchTerm) return true; + const searchLower = searchTerm.toLowerCase(); + return ( + link.url.toLowerCase().includes(searchLower) || + link.text.toLowerCase().includes(searchLower) || + link.path.toLowerCase().includes(searchLower) + ); + }); + + setFilteredLinks(filtered); + }, [searchTerm, previewData]); + + const handleToggleLink = (url: string) => { + setSelectedUrls((prev) => { + const next = new Set(prev); + if (next.has(url)) { + next.delete(url); + } else { + next.add(url); + } + return next; + }); + }; + + const handleSelectAll = () => { + setSelectedUrls(new Set(filteredLinks.map((link) => link.url))); + }; + + const handleDeselectAll = () => { + setSelectedUrls(new Set()); + }; + + const handleInvertSelection = () => { + setSelectedUrls((prev) => { + const next = new Set(); + filteredLinks.forEach((link) => { + if (!prev.has(link.url)) { + next.add(link.url); + } + }); + return next; + }); + }; + + const handleApplyFilters = async () => { + if (!previewData) return; + + try { + // Parse patterns + const includePatternArray = includePatterns + .split(",") + .map((p) => p.trim()) + .filter((p) => p.length > 0); + const excludePatternArray = excludePatterns + .split(",") + .map((p) => p.trim()) + .filter((p) => p.length > 0); + + // Re-fetch preview with new patterns + const response = await fetch("http://localhost:8181/api/crawl/preview-links", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + url: previewData.source_url, + url_include_patterns: includePatternArray, + url_exclude_patterns: excludePatternArray, + }), + }); + + if (!response.ok) { + throw new Error("Failed to apply filters"); + } + + const updatedData = await response.json(); + + // Update filtered links and auto-select matching ones + setFilteredLinks(updatedData.links); + const newSelection = new Set( + updatedData.links.filter((link: PreviewLink) => link.matches_filter).map((link: PreviewLink) => link.url) + ); + setSelectedUrls(newSelection); + } catch (error) { + console.error("Failed to apply filters:", error); + } + }; + + const handleProceed = () => { + onProceed(Array.from(selectedUrls)); + }; + + if (!previewData) return null; + + const selectedCount = selectedUrls.size; + const totalCount = filteredLinks.length; + + return ( + !isOpen && onCancel()}> + +
+
+ + Review Links - {previewData.collection_type} + +
+
{previewData.source_url}
+
+ {selectedCount} of {totalCount} links selected +
+
+
+ +
+ {/* Filter Section */} +
+
+ + Filter Patterns +
+ +
+
+ + setIncludePatterns(e.target.value)} + placeholder="**/en/**" + className="h-8 text-sm" + /> +
+ +
+ + setExcludePatterns(e.target.value)} + placeholder="**/fr/**, **/de/**" + className="h-8 text-sm" + /> +
+
+ + +
+ + {/* Bulk Actions Bar */} +
+
+ + + +
+ + setSearchTerm(e.target.value)} + className="w-64 h-8 text-sm" + /> +
+ + {/* Link List (scrollable) */} +
+
+ {filteredLinks.map((link) => ( +
handleToggleLink(link.url)} + > + handleToggleLink(link.url)} + className="mt-1 h-4 w-4 text-cyan-600 focus:ring-cyan-500 border-gray-300 rounded" + /> + +
+
+
+

{link.text || "Untitled"}

+

{link.url}

+

Path: {link.path}

+
+ + {link.matches_filter && ( + + Matches Filter + + )} +
+
+
+ ))} + + {filteredLinks.length === 0 && ( +
+

No links found matching your search.

+
+ )} +
+
+
+ + {/* Footer Actions - Sticky */} +
+ + + +
+
+
+
+ ); +}; diff --git a/archon-ui-main/src/features/knowledge/components/index.ts b/archon-ui-main/src/features/knowledge/components/index.ts index a7f9ff55e0..19d99a6d33 100644 --- a/archon-ui-main/src/features/knowledge/components/index.ts +++ b/archon-ui-main/src/features/knowledge/components/index.ts @@ -4,3 +4,4 @@ export * from "./KnowledgeList"; export * from "./KnowledgeTypeSelector"; export * from "./LevelSelector"; export * from "./TagInput"; +export * from "./LinkReviewModal"; diff --git a/archon-ui-main/src/features/knowledge/types/knowledge.ts b/archon-ui-main/src/features/knowledge/types/knowledge.ts index 571cb6192e..b16380d52e 100644 --- a/archon-ui-main/src/features/knowledge/types/knowledge.ts +++ b/archon-ui-main/src/features/knowledge/types/knowledge.ts @@ -140,6 +140,36 @@ export interface CrawlRequest { update_frequency?: number; max_depth?: number; extract_code_examples?: boolean; + // Glob pattern filtering + url_include_patterns?: string[]; + url_exclude_patterns?: string[]; + // Link review mode + selected_urls?: string[]; + skip_link_review?: boolean; +} + +// Link preview request/response types +export interface LinkPreviewRequest { + url: string; + url_include_patterns?: string[]; + url_exclude_patterns?: string[]; +} + +export interface PreviewLink { + url: string; + text: string; + path: string; + matches_filter: boolean; +} + +export interface LinkPreviewResponse { + is_link_collection: boolean; + collection_type: string | null; + source_url: string; + total_links: number; + matching_links: number; + links: PreviewLink[]; + message?: string; } export interface UploadMetadata { diff --git a/archon-ui-main/src/features/ui/primitives/dialog.tsx b/archon-ui-main/src/features/ui/primitives/dialog.tsx index 27947ebdbf..7ae52522fb 100644 --- a/archon-ui-main/src/features/ui/primitives/dialog.tsx +++ b/archon-ui-main/src/features/ui/primitives/dialog.tsx @@ -62,7 +62,7 @@ export const DialogContent = React.forwardRef< )} {...props} > -
{children}
+
{children}
{showCloseButton && ( str: except Exception as e: logger.warning(f"Error extracting base URL from {url}: {e}", exc_info=True) return url + + @staticmethod + def matches_glob_patterns( + url: str, + include_patterns: list[str] | None = None, + exclude_patterns: list[str] | None = None + ) -> bool: + """ + Check if URL path matches glob patterns. + + Filtering logic: + 1. If exclude patterns exist and URL matches any → reject (False) + 2. If include patterns exist and URL matches at least one → accept (True) + 3. If include patterns exist but URL matches none → reject (False) + 4. If no patterns specified → accept (True) + + Args: + url: The URL to check + include_patterns: List of glob patterns to include (e.g., ["**/en/**", "**/docs/**"]) + exclude_patterns: List of glob patterns to exclude (e.g., ["**/fr/**", "**/de/**"]) + + Returns: + True if URL should be included, False if it should be filtered out + + Examples: + >>> matches_glob_patterns("https://docs.example.com/en/intro", ["**/en/**"]) + True + >>> matches_glob_patterns("https://docs.example.com/fr/intro", ["**/en/**"]) + False + >>> matches_glob_patterns("https://docs.example.com/en/intro", ["**/en/**"], ["**/api/**"]) + True + >>> matches_glob_patterns("https://docs.example.com/en/api/intro", ["**/en/**"], ["**/api/**"]) + False + """ + try: + from fnmatch import fnmatch + + # Parse URL to get path + parsed = urlparse(url) + path = parsed.path + + # Normalize path (ensure it starts with / for consistent matching) + if not path.startswith('/'): + path = '/' + path + + # Check exclude patterns first (fast rejection) + if exclude_patterns: + for pattern in exclude_patterns: + if fnmatch(path, pattern): + logger.debug(f"URL excluded by pattern '{pattern}': {url}") + return False + + # Check include patterns (if specified) + if include_patterns: + matched = False + for pattern in include_patterns: + if fnmatch(path, pattern): + logger.debug(f"URL included by pattern '{pattern}': {url}") + matched = True + break + + if not matched: + logger.debug(f"URL does not match any include patterns: {url}") + return False + + # No patterns or passed all checks + return True + + except Exception as e: + logger.warning(f"Error checking glob patterns for {url}: {e}", exc_info=True) + # On error, default to including the URL (safer than filtering) + return True From a26101b6f5c886c80f8580a7254e6bc7e7ffec32 Mon Sep 17 00:00:00 2001 From: David Rudduck <47308254+davidrudduck@users.noreply.github.com> Date: Wed, 12 Nov 2025 20:51:14 +1000 Subject: [PATCH 2/2] feat: Add glob pattern filtering for recursive crawls and improve GitHub auto-config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit extends the glob pattern filtering feature (from commit 74023e1) to support recursive crawls and improves GitHub repository handling. ## Changes ### Backend - Recursive Crawl Filtering - Add include_patterns and exclude_patterns parameters to RecursiveCrawlStrategy - Filter internal links during discovery (before adding to crawl queue) - Pass patterns through entire call chain (orchestration → service → strategy) - Add comprehensive logging for pattern configuration and filtered URLs - Performance: Prevents unnecessary HTTP requests and memory usage Files: - python/src/server/services/crawling/strategies/recursive.py: * Lines 45-46: Add pattern parameters to function signature * Lines 59-60: Update docstring * Lines 173-178: Log pattern configuration at crawl start * Lines 316-339: Implement filtering logic during link discovery - python/src/server/services/crawling/crawling_service.py: * Lines 271-272: Add parameters to wrapper method * Lines 283-284: Pass patterns to recursive strategy * Lines 349-356: Add early logging for crawl parameters * Lines 1145-1153: Extract and pass patterns from request ### Frontend - Improved GitHub Auto-Configuration - Change GitHub auto-config from path-based to code-only patterns - Use **/tree/**, **/blob/** instead of /username/repo* - Automatically excludes issues, PRs, actions, wiki, etc. - More efficient and future-proof than exclusion lists Files: - archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx: * Lines 73-75: Updated pattern generation logic ### Documentation - Add comprehensive glob pattern guide with examples - Document GitHub auto-configuration rationale - Include pattern syntax, use cases, and testing tips Files: - docs/GLOB_PATTERNS.md: New file (253 lines) ## Benefits 1. **Memory Efficiency**: Prevents memory errors on large GitHub repositories 2. **Performance**: Filters URLs before crawling (saves HTTP requests) 3. **Storage**: Reduces database writes (fewer pages to store) 4. **User Experience**: GitHub repos now auto-configured optimally ## Testing - Unit tests: All passing (19/19 glob pattern tests) - Frontend tests: All passing (29/29 LinkReviewModal tests) - Integration tests: Pre-existing failures unrelated to this feature - Manual testing: GitHub crawl with code-only patterns verified ## Pattern Examples Documentation sites (language filtering): **/en/**, !**/api/**, !**/changelog/** GitHub repositories (code only): **/tree/**, **/blob/** Blog sites: **/blog/**, !**/draft/** ## Related - Builds on commit 74023e1 (glob pattern filtering for link collections) - Resolves memory issues with GitHub repository crawling - Implements recursive crawl filtering requested in design discussions --- .../components/AddKnowledgeDialog.tsx | 155 ++++++----- docs/GLOB_PATTERNS.md | 253 ++++++++++++++++++ .../services/crawling/crawling_service.py | 61 ++++- .../services/crawling/strategies/recursive.py | 40 ++- 4 files changed, 434 insertions(+), 75 deletions(-) create mode 100644 docs/GLOB_PATTERNS.md diff --git a/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx b/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx index 92f32f21f2..efcdd76302 100644 --- a/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx +++ b/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx @@ -4,14 +4,15 @@ */ import { Globe, Loader2, Upload } from "lucide-react"; -import { useId, useState } from "react"; +import { useEffect, useId, useState } from "react"; import { useToast } from "@/features/shared/hooks/useToast"; +import { callAPIWithETag } from "@/features/shared/api/apiClient"; import { Button, Input, Label } from "../../ui/primitives"; import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "../../ui/primitives/dialog"; import { cn, glassCard } from "../../ui/primitives/styles"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "../../ui/primitives/tabs"; import { useCrawlUrl, useUploadDocument } from "../hooks"; -import type { CrawlRequest, UploadMetadata } from "../types"; +import type { CrawlRequest, UploadMetadata, LinkPreviewResponse } from "../types"; import { KnowledgeTypeSelector } from "./KnowledgeTypeSelector"; import { LevelSelector } from "./LevelSelector"; import { TagInput } from "./TagInput"; @@ -45,9 +46,8 @@ export const AddKnowledgeDialog: React.FC = ({ const [maxDepth, setMaxDepth] = useState("2"); const [tags, setTags] = useState([]); - // Glob pattern filtering state - const [includePatterns, setIncludePatterns] = useState(""); - const [excludePatterns, setExcludePatterns] = useState(""); + // Glob pattern filtering state (unified field with ! prefix for exclusions) + const [urlPatterns, setUrlPatterns] = useState(""); const [reviewLinksEnabled, setReviewLinksEnabled] = useState(true); // Link review modal state @@ -59,19 +59,70 @@ export const AddKnowledgeDialog: React.FC = ({ const [uploadType, setUploadType] = useState<"technical" | "business">("technical"); const [uploadTags, setUploadTags] = useState([]); + // Auto-detect GitHub repositories and populate smart defaults + useEffect(() => { + // Only auto-populate if the URL has changed and patterns are empty + if (!crawlUrl) return; + + // Detect GitHub URL (supports https://, http://, or just github.com) + const githubUrlPattern = /^(?:https?:\/\/)?(?:www\.)?github\.com\/([^\/]+)\/([^\/\?#]+)/i; + const match = crawlUrl.match(githubUrlPattern); + + if (match) { + // Only auto-populate if patterns are currently empty (don't override user edits) + if (!urlPatterns) { + // Use code-only patterns: only crawl tree (directories) and blob (files) pages + setUrlPatterns("**/tree/**, **/blob/**"); + } + + // Auto-add "GitHub Repo" tag if not already present + if (!tags.includes("GitHub Repo")) { + setTags((prevTags) => [...prevTags, "GitHub Repo"]); + } + + // Set max depth to 3 for GitHub repos (to traverse nested directories) + if (maxDepth === "2") { + setMaxDepth("3"); + } + } + }, [crawlUrl]); // Only depend on crawlUrl to avoid infinite loops + const resetForm = () => { setCrawlUrl(""); setCrawlType("technical"); setMaxDepth("2"); setTags([]); - setIncludePatterns(""); - setExcludePatterns(""); + setUrlPatterns(""); setReviewLinksEnabled(true); setSelectedFile(null); setUploadType("technical"); setUploadTags([]); }; + // Parse unified pattern string into separate include/exclude arrays. + // Patterns starting with ! are exclusions, others are inclusions. + // Example: "path1, path2, !exclude1" -> { include: ["path1", "path2"], exclude: ["exclude1"] } + const parseUrlPatterns = (patterns: string): { include: string[]; exclude: string[] } => { + const include: string[] = []; + const exclude: string[] = []; + + patterns + .split(",") + .map((p) => p.trim()) + .filter((p) => p.length > 0) + .forEach((pattern) => { + if (pattern.startsWith("!")) { + // Exclude pattern - remove the ! prefix + exclude.push(pattern.substring(1).trim()); + } else { + // Include pattern + include.push(pattern); + } + }); + + return { include, exclude }; + }; + const handleCrawl = async () => { if (!crawlUrl) { showToast("Please enter a URL to crawl", "error"); @@ -79,21 +130,13 @@ export const AddKnowledgeDialog: React.FC = ({ } try { - // Parse patterns from comma-separated strings - const includePatternArray = includePatterns - .split(",") - .map((p) => p.trim()) - .filter((p) => p.length > 0); - const excludePatternArray = excludePatterns - .split(",") - .map((p) => p.trim()) - .filter((p) => p.length > 0); + // Parse unified pattern string into include/exclude arrays + const { include: includePatternArray, exclude: excludePatternArray } = parseUrlPatterns(urlPatterns); // If review is enabled, call preview endpoint first if (reviewLinksEnabled) { - const previewResponse = await fetch("http://localhost:8181/api/crawl/preview-links", { + const previewData = await callAPIWithETag("/crawl/preview-links", { method: "POST", - headers: { "Content-Type": "application/json" }, body: JSON.stringify({ url: crawlUrl, url_include_patterns: includePatternArray, @@ -101,12 +144,6 @@ export const AddKnowledgeDialog: React.FC = ({ }), }); - if (!previewResponse.ok) { - throw new Error("Failed to preview links"); - } - - const previewData = await previewResponse.json(); - // If it's a link collection, show the review modal if (previewData.is_link_collection) { setPreviewData(previewData); @@ -150,14 +187,8 @@ export const AddKnowledgeDialog: React.FC = ({ // Handle link review modal submission const handleLinkReviewSubmit = async (selectedUrls: string[]) => { try { - const includePatternArray = includePatterns - .split(",") - .map((p) => p.trim()) - .filter((p) => p.length > 0); - const excludePatternArray = excludePatterns - .split(",") - .map((p) => p.trim()) - .filter((p) => p.length > 0); + // Parse unified pattern string into include/exclude arrays + const { include: includePatternArray, exclude: excludePatternArray } = parseUrlPatterns(urlPatterns); const request: CrawlRequest = { url: crawlUrl, @@ -259,7 +290,7 @@ export const AddKnowledgeDialog: React.FC = ({ setCrawlUrl(e.target.value)} disabled={isProcessing} @@ -275,6 +306,19 @@ export const AddKnowledgeDialog: React.FC = ({ {/* Glob Pattern Filtering Section */}
+ {/* GitHub Auto-Configuration Notice */} + {crawlUrl.match(/^(?:https?:\/\/)?(?:www\.)?github\.com\/([^\/]+)\/([^\/\?#]+)/i) && ( +
+
+ +
+
+ GitHub Repository Detected: Pattern auto-configured to crawl only this repository (depth=3). + Add exclusions with !**/issues** if needed. +
+
+ )} + {/* Review Links Checkbox */}
= ({ When enabled, you'll preview and select links from llms.txt or sitemap files before crawling starts
- {/* Include Patterns Input */} -
- - setIncludePatterns(e.target.value)} - disabled={isProcessing} - className={cn( - "h-10", - glassCard.blur.sm, - glassCard.transparency.medium, - "border-gray-300/60 dark:border-gray-600/60 focus:border-cyan-400/70", - )} - /> -
- Only crawl URLs matching these glob patterns (comma-separated). Leave empty to include all. -
-
- - {/* Exclude Patterns Input */} + {/* Unified URL Patterns Input */}
-
@@ -477,8 +499,7 @@ export const AddKnowledgeDialog: React.FC = ({ { setShowLinkReviewModal(false); diff --git a/docs/GLOB_PATTERNS.md b/docs/GLOB_PATTERNS.md new file mode 100644 index 0000000000..c63ee42f12 --- /dev/null +++ b/docs/GLOB_PATTERNS.md @@ -0,0 +1,253 @@ +# Glob Pattern Filtering Guide + +## Overview + +Archon's knowledge crawling system supports flexible URL filtering using glob patterns with `.gitignore`-style syntax. Use a single unified field to specify which URLs to include or exclude during crawls. + +## Syntax + +### Basic Format +```text +pattern1, pattern2, !exclude1, !exclude2 +``` + +### Rules +1. **Include patterns** - Regular glob patterns match URLs to include +2. **Exclude patterns** - Patterns prefixed with `!` exclude URLs +3. **Comma-separated** - Separate multiple patterns with commas +4. **Exclude takes precedence** - If a URL matches any exclude pattern, it's rejected even if it matches an include pattern + +### Logic Flow +```text +1. If no patterns specified → Include all URLs +2. If URL matches ANY exclude pattern (!) → Reject +3. If include patterns exist AND URL matches at least one → Accept +4. Otherwise → Reject +``` + +## Pattern Syntax + +### Wildcards +- `*` - Matches any characters (including `/` in paths) +- `**` - Same as `*` in fnmatch (matches any characters) +- `?` - Matches any single character + +### Examples +```bash +# Match specific directory +**/docs/** # Matches: /docs/, /en/docs/, /api/v1/docs/ + +# Match file extensions +**/*.md # Matches: /readme.md, /docs/guide.md + +# Exact path prefix +/api/v1/* # Matches: /api/v1/users, /api/v1/posts + +# Combined patterns +**/en/**, **/docs/** # Matches: /en/guide, /docs/api +``` + +## Common Use Cases + +### 1. Documentation Sites - Language Filtering + +**Scenario**: Only crawl English documentation + +```text +**/en/**, !**/api/**, !**/changelog/** +``` + +**Matches**: +- ✅ `/en/getting-started` +- ✅ `/docs/en/tutorial` +- ✅ `/en/guides/setup` + +**Excludes**: +- ❌ `/fr/getting-started` (not English) +- ❌ `/en/api/reference` (API excluded) +- ❌ `/en/changelog` (changelog excluded) + +### 2. GitHub Repositories + +**Scenario**: Crawl repository code files only (directories and files) + +```text +**/tree/**, **/blob/** +``` + +**Auto-configured when entering GitHub URLs like**: +```text +https://github.com/username/reponame +``` + +**What it matches:** +- ✅ `/username/reponame/tree/main/src` (directory view) +- ✅ `/username/reponame/blob/main/README.md` (file view) +- ✅ `/username/reponame/tree/main/src/components` (nested directory) + +**What it excludes:** +- ❌ `/username/reponame/issues` (issues page) +- ❌ `/username/reponame/pull/123` (pull request) +- ❌ `/username/reponame/actions` (GitHub Actions) +- ❌ `/username/reponame/security` (security tab) +- ❌ `/username/reponame/wiki` (wiki pages) +- ❌ Any other GitHub UI pages + +**Why this pattern?** +Using include patterns is cleaner and more comprehensive than excluding individual sections. It automatically excludes any future GitHub features without updating the pattern. + +### 3. Blog Sites + +**Scenario**: Only blog posts, exclude drafts and archives + +```text +**/blog/**, !**/draft/**, !**/archive/** +``` + +**Matches**: +- ✅ `/blog/2024/my-post` +- ✅ `/en/blog/tutorial` + +**Excludes**: +- ❌ `/blog/draft/unpublished` +- ❌ `/blog/archive/2020` + +### 4. Exclude Only (No Includes) + +**Scenario**: Crawl everything except certain languages + +```text +!**/fr/**, !**/de/**, !**/ja/** +``` + +**Result**: All URLs crawled EXCEPT French, German, and Japanese pages + +## GitHub Auto-Configuration + +When you enter a GitHub repository URL, Archon automatically configures optimal settings: + +### Trigger +Any URL matching: +```text +https://github.com/username/reponame +http://github.com/username/reponame +github.com/username/reponame +``` + +### Auto-Applied Settings +1. **Pattern**: `**/tree/**, **/blob/**` (code files only) +2. **Depth**: 3 (for nested directories) +3. **Tag**: "GitHub Repo" + +### Why These Patterns? +- `**/tree/**` matches directory views (browsing folders) +- `**/blob/**` matches file views (individual files) +- Automatically excludes issues, PRs, actions, wiki, and all non-code pages +- More efficient than listing exclusions +- Works with any future GitHub features without updates + +## Link Collections (llms.txt, sitemap.xml) + +### Behavior +For link collections, patterns filter the discovered links: + +1. **Parse collection** → Extract all URLs +2. **Apply patterns** → Filter URLs by include/exclude rules +3. **Review modal** → Preview filtered links before crawling +4. **Crawl selected** → Only crawl matching URLs + +### Example Workflow + +**Sitemap URL**: `https://docs.example.com/sitemap.xml` + +**Sitemap contains**: +```text +https://docs.example.com/en/intro +https://docs.example.com/en/api +https://docs.example.com/fr/intro +https://docs.example.com/changelog +``` + +**Pattern**: `**/en/**, !**/api/**` + +**Filtered Results**: +- ✅ `/en/intro` (matches include, not excluded) +- ❌ `/en/api` (matches include BUT excluded) +- ❌ `/fr/intro` (doesn't match include) +- ❌ `/changelog` (doesn't match include) + +## Pattern Testing Tips + +### Start Simple +1. Begin with broad include pattern +2. Test the crawl preview (for link collections) +3. Add exclusions to refine + +### Use Specific Patterns +```bash +# ❌ Too broad +**/* + +# ✅ Specific and meaningful +**/en/**, **/docs/** +``` + +### Test Pattern Matching + +Use the pattern preview in the Link Review Modal to see which URLs match before crawling. + +### Common Mistakes + +❌ **Forgetting the `!` prefix** +```text +**/en/**, **/api/** # This includes BOTH en and api +``` + +✅ **Correct exclusion syntax** +```text +**/en/**, !**/api/** # This includes en but excludes api +``` + +❌ **Assuming `*` matches only one path segment** +```text +/docs/*/intro # This WILL match /docs/en/v1/intro (not just /docs/en/intro) +``` + +✅ **Understanding fnmatch behavior** +```text +# In fnmatch (used by Archon), * matches any characters including / +# Both * and ** behave the same way +``` + +## API Integration + +When the frontend sends patterns to the backend, they're automatically parsed: + +```typescript +// Frontend: Unified field +urlPatterns: "**/en/**, !**/api/**" + +// Parsed and sent to backend +{ + url_include_patterns: ["**/en/**"], + url_exclude_patterns: ["**/api/**"] +} +``` + +## Testing Patterns + +See [TESTING.md](../TESTING.md#glob-pattern-testing) for comprehensive testing examples. + +## Further Reading + +- [fnmatch documentation](https://docs.python.org/3/library/fnmatch.html) - Python's glob pattern matching +- [.gitignore patterns](https://git-scm.com/docs/gitignore#_pattern_format) - Similar syntax inspiration +- [PR #847](https://github.com/coleam00/archon/pull/847) - Original implementation + +## Support + +If you encounter issues with pattern matching: +1. Check the pattern syntax for typos +2. Test with the Link Review Modal (for link collections) +3. Start with simpler patterns and add complexity +4. Remember: `!` prefix is required for exclusions diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index fa48c6e1c5..a3539410be 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -268,6 +268,8 @@ async def crawl_recursive_with_progress( max_depth: int = 3, max_concurrent: int | None = None, progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None, + include_patterns: list[str] | None = None, + exclude_patterns: list[str] | None = None, ) -> list[dict[str, Any]]: """Recursively crawl internal links from start URLs.""" return await self.recursive_strategy.crawl_recursive_with_progress( @@ -278,6 +280,8 @@ async def crawl_recursive_with_progress( max_concurrent, progress_callback, self._check_cancellation, # Pass cancellation check + include_patterns, + exclude_patterns, ) # Orchestration methods @@ -346,6 +350,15 @@ async def send_heartbeat_if_needed(): url = str(request.get("url", "")) safe_logfire_info(f"Starting async crawl orchestration | url={url} | task_id={task_id}") + # Log crawl parameters for debugging + max_depth = request.get("max_depth", 1) + url_include_patterns = request.get("url_include_patterns", []) + url_exclude_patterns = request.get("url_exclude_patterns", []) + logger.info( + f"Crawl parameters: url={url} | max_depth={max_depth} | " + f"include_patterns={url_include_patterns} | exclude_patterns={url_exclude_patterns}" + ) + # Start the progress tracker if available if self.progress_tracker: await self.progress_tracker.start({ @@ -1072,6 +1085,41 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs): sitemap_urls = self.parse_sitemap(url) if sitemap_urls: + original_count = len(sitemap_urls) + + # Apply glob pattern filtering or selected URLs + include_patterns = request.get("url_include_patterns", []) + exclude_patterns = request.get("url_exclude_patterns", []) + selected_urls = request.get("selected_urls") + + # Option 1: Use selected_urls from review modal (takes precedence) + if selected_urls: + selected_urls_set = set(selected_urls) + sitemap_urls = [ + url for url in sitemap_urls + if url in selected_urls_set + ] + logger.info( + f"Applied selected_urls filter to sitemap: {original_count} → {len(sitemap_urls)} URLs " + f"({original_count - len(sitemap_urls)} filtered)" + ) + + # Option 2: Apply glob pattern filtering + elif include_patterns or exclude_patterns: + filtered_urls = [] + for sitemap_url in sitemap_urls: + if self.url_handler.matches_glob_patterns(sitemap_url, include_patterns, exclude_patterns): + filtered_urls.append(sitemap_url) + + filtered_count = original_count - len(filtered_urls) + sitemap_urls = filtered_urls + + logger.info( + f"Applied glob pattern filter to sitemap: {original_count} → {len(sitemap_urls)} URLs " + f"({filtered_count} filtered) | " + f"include={include_patterns} | exclude={exclude_patterns}" + ) + # Update progress before starting batch crawl await update_crawl_progress( 75, # 75% of crawling stage @@ -1094,14 +1142,23 @@ async def update_crawl_progress(stage_progress: int, message: str, **kwargs): ) max_depth = request.get("max_depth", 1) - # Let the strategy handle concurrency from settings - # This will use CRAWL_MAX_CONCURRENT from database (default: 10) + include_patterns = request.get("url_include_patterns", []) + exclude_patterns = request.get("url_exclude_patterns", []) + + # Log pattern configuration for debugging + if include_patterns or exclude_patterns: + logger.info( + f"Recursive crawl with glob patterns | " + f"include={include_patterns} | exclude={exclude_patterns}" + ) crawl_results = await self.crawl_recursive_with_progress( [url], max_depth=max_depth, max_concurrent=None, # Let strategy use settings progress_callback=await self._create_crawl_progress_callback("crawling"), + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, ) return crawl_results, crawl_type diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py index 3cdee7506a..7795119412 100644 --- a/python/src/server/services/crawling/strategies/recursive.py +++ b/python/src/server/services/crawling/strategies/recursive.py @@ -42,6 +42,8 @@ async def crawl_recursive_with_progress( max_concurrent: int | None = None, progress_callback: Callable[..., Awaitable[None]] | None = None, cancellation_check: Callable[[], None] | None = None, + include_patterns: list[str] | None = None, + exclude_patterns: list[str] | None = None, ) -> list[dict[str, Any]]: """ Recursively crawl internal links from start URLs up to a maximum depth with progress reporting. @@ -54,6 +56,8 @@ async def crawl_recursive_with_progress( max_concurrent: Maximum concurrent crawls progress_callback: Optional callback for progress updates cancellation_check: Optional function to check for cancellation + include_patterns: Optional list of glob patterns to include (e.g., ["**/en/**"]) + exclude_patterns: Optional list of glob patterns to exclude (e.g., ["**/fr/**"]) Returns: List of crawl results @@ -166,6 +170,13 @@ def normalize_url(url): total_discovered = len(current_urls) # Track total URLs discovered (normalized & de-duped) cancelled = False + # Log pattern filtering configuration + if include_patterns or exclude_patterns: + logger.info( + f"Recursive crawl with glob filtering enabled | " + f"include={include_patterns} | exclude={exclude_patterns}" + ) + for depth in range(max_depth): # Check for cancellation at the start of each depth level if cancellation_check: @@ -301,14 +312,31 @@ def normalize_url(url): links = getattr(result, "links", {}) or {} for link in links.get("internal", []): next_url = normalize_url(link["href"]) - # Skip binary files and already visited URLs + + # Skip binary files is_binary = self.url_handler.is_binary_file(next_url) - if next_url not in visited and not is_binary: - if next_url not in next_level_urls: - next_level_urls.add(next_url) - total_discovered += 1 # Increment when we discover a new URL - elif is_binary: + if is_binary: logger.debug(f"Skipping binary file from crawl queue: {next_url}") + continue + + # Skip already visited URLs + if next_url in visited: + continue + + # Apply glob pattern filtering + if include_patterns or exclude_patterns: + if not self.url_handler.matches_glob_patterns( + next_url, include_patterns, exclude_patterns + ): + logger.debug( + f"Skipping URL (glob filter) from crawl queue: {next_url}" + ) + continue + + # Add to next level queue + if next_url not in next_level_urls: + next_level_urls.add(next_url) + total_discovered += 1 # Increment when we discover a new URL else: logger.warning( f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}"