diff --git a/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx b/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx index f6c7bc2af9..db526faf4e 100644 --- a/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx +++ b/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx @@ -10,8 +10,9 @@ import { Button, Input, Label } from "../../ui/primitives"; import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "../../ui/primitives/dialog"; import { cn } from "../../ui/primitives/styles"; import { Tabs, TabsContent } from "../../ui/primitives/tabs"; -import { useCrawlUrl, useUploadDocument } from "../hooks"; -import type { CrawlRequest, UploadMetadata } from "../types"; +import { useCrawlUrl, useCrawlUrlV2, useUploadDocument } from "../hooks"; +import type { CrawlConfig, CrawlRequest, CrawlRequestV2, UploadMetadata } from "../types"; +import { AdvancedCrawlConfig } from "./AdvancedCrawlConfig"; import { KnowledgeTypeSelector } from "./KnowledgeTypeSelector"; import { LevelSelector } from "./LevelSelector"; import { TagInput } from "./TagInput"; @@ -32,6 +33,7 @@ export const AddKnowledgeDialog: React.FC = ({ const [activeTab, setActiveTab] = useState<"crawl" | "upload">("crawl"); const { showToast } = useToast(); const crawlMutation = useCrawlUrl(); + const crawlV2Mutation = useCrawlUrlV2(); const uploadMutation = useUploadDocument(); // Generate unique IDs for form elements @@ -43,6 +45,7 @@ export const AddKnowledgeDialog: React.FC = ({ const [crawlType, setCrawlType] = useState<"technical" | "business">("technical"); const [maxDepth, setMaxDepth] = useState("2"); const [tags, setTags] = useState([]); + const [crawlConfig, setCrawlConfig] = useState({}); // Upload form state const [selectedFile, setSelectedFile] = useState(null); @@ -54,6 +57,7 @@ export const AddKnowledgeDialog: React.FC = ({ setCrawlType("technical"); setMaxDepth("2"); setTags([]); + setCrawlConfig({}); setSelectedFile(null); setUploadType("technical"); setUploadTags([]); @@ -66,21 +70,42 @@ export const AddKnowledgeDialog: React.FC = ({ } try { - const request: CrawlRequest = { - url: crawlUrl, - knowledge_type: crawlType, - max_depth: parseInt(maxDepth, 10), - tags: tags.length > 0 ? tags : undefined, - }; + // Check if we have any domain filtering configuration + const hasCrawlConfig = + (crawlConfig.allowed_domains && crawlConfig.allowed_domains.length > 0) || + (crawlConfig.excluded_domains && crawlConfig.excluded_domains.length > 0) || + (crawlConfig.include_patterns && crawlConfig.include_patterns.length > 0) || + (crawlConfig.exclude_patterns && crawlConfig.exclude_patterns.length > 0); + + let response; - const response = await crawlMutation.mutateAsync(request); + if (hasCrawlConfig) { + // Use v2 endpoint with domain filtering + const requestV2: CrawlRequestV2 = { + url: crawlUrl, + knowledge_type: crawlType, + max_depth: parseInt(maxDepth, 10), + tags: tags.length > 0 ? tags : undefined, + crawl_config: crawlConfig, + }; + response = await crawlV2Mutation.mutateAsync(requestV2); + } else { + // Use regular endpoint + const request: CrawlRequest = { + url: crawlUrl, + knowledge_type: crawlType, + max_depth: parseInt(maxDepth, 10), + tags: tags.length > 0 ? 
tags : undefined, + }; + response = await crawlMutation.mutateAsync(request); + } // Notify parent about the new crawl operation if (response?.progressId && onCrawlStarted) { onCrawlStarted(response.progressId); } - showToast("Crawl started successfully", "success"); + showToast(hasCrawlConfig ? "Crawl started with domain filtering" : "Crawl started successfully", "success"); resetForm(); onSuccess(); onOpenChange(false); @@ -123,19 +148,19 @@ export const AddKnowledgeDialog: React.FC = ({ } }; - const isProcessing = crawlMutation.isPending || uploadMutation.isPending; + const isProcessing = crawlMutation.isPending || crawlV2Mutation.isPending || uploadMutation.isPending; return ( - - + + Add Knowledge Crawl websites or upload documents to expand your knowledge base. - setActiveTab(v as "crawl" | "upload")}> + setActiveTab(v as "crawl" | "upload")} className="flex-1 flex flex-col min-h-0"> {/* Enhanced Tab Buttons */} -
+
{/* Crawl Website Tab */}
{/* Crawl Tab */} - + +
+
{/* Enhanced URL Input Section */}
+ {/* Advanced Configuration - positioned directly below URL */} + +
@@ -233,7 +270,7 @@ export const AddKnowledgeDialog: React.FC = ({ disabled={isProcessing || !crawlUrl} className="w-full bg-gradient-to-r from-cyan-500 to-cyan-600 hover:from-cyan-600 hover:to-cyan-700 backdrop-blur-md border border-cyan-400/50 shadow-[0_0_20px_rgba(6,182,212,0.25)] hover:shadow-[0_0_30px_rgba(6,182,212,0.35)] transition-all duration-200" > - {crawlMutation.isPending ? ( + {(crawlMutation.isPending || crawlV2Mutation.isPending) ? ( <> Starting Crawl... @@ -245,10 +282,21 @@ export const AddKnowledgeDialog: React.FC = ({ )} +
+
{/* Upload Tab */} - + +
+
{/* Enhanced File Input Section */}
+
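Note on the dialog change above: `handleCrawl` only routes to the v2 endpoint when at least one of the four filter lists is non-empty; otherwise it keeps using the existing crawl endpoint. A minimal sketch of the body that v2 path produces, using the field names from `CrawlRequestV2` / `CrawlConfig` in `types/knowledge.ts` — the URL, domains, and patterns below are illustrative values only, not part of this change.

```typescript
// Illustrative payload for /api/knowledge-items/crawl-v2 when domain filtering
// is configured. Field names come from CrawlRequestV2 / CrawlConfig; the
// concrete values are examples only.
import type { CrawlRequestV2 } from "../types";

const exampleRequest: CrawlRequestV2 = {
  url: "https://docs.example.com",
  knowledge_type: "technical",
  max_depth: 2,
  tags: ["docs"],
  crawl_config: {
    allowed_domains: ["docs.example.com"],   // whitelist
    excluded_domains: ["blog.example.com"],  // blacklist wins over whitelist
    include_patterns: ["*/docs/*"],          // glob patterns, matched server-side
    exclude_patterns: ["*.pdf", "*/deprecated/*"],
  },
};

// With none of the four lists populated, the dialog falls back to the original
// CrawlRequest and the existing /api/knowledge-items/crawl endpoint.
```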
diff --git a/archon-ui-main/src/features/knowledge/components/AdvancedCrawlConfig.tsx b/archon-ui-main/src/features/knowledge/components/AdvancedCrawlConfig.tsx new file mode 100644 index 0000000000..b763cf4f4d --- /dev/null +++ b/archon-ui-main/src/features/knowledge/components/AdvancedCrawlConfig.tsx @@ -0,0 +1,315 @@ +/** + * Advanced Crawl Configuration Component + * Provides UI for configuring domain filtering and URL patterns + */ + +import { ChevronDown, Info, Plus, X } from "lucide-react"; +import React, { useEffect, useState } from "react"; +import type { CrawlConfig } from "../types"; + +interface Props { + config: CrawlConfig; + onChange: (config: CrawlConfig) => void; +} + +export const AdvancedCrawlConfig: React.FC = ({ config, onChange }) => { + const [isExpanded, setIsExpanded] = useState(false); + const [newDomain, setNewDomain] = useState(""); + const [newPattern, setNewPattern] = useState(""); + const [activeTab, setActiveTab] = useState<"allowed" | "excluded">("allowed"); + const [patternTab, setPatternTab] = useState<"include" | "exclude">("include"); + + const handleAddDomain = (type: "allowed" | "excluded") => { + if (!newDomain.trim()) return; + + const domain = newDomain.trim().toLowerCase().replace(/^https?:\/\//, "").replace(/\/$/, ""); + const key = `${type}_domains` as keyof CrawlConfig; + const current = config[key] || []; + + if (!current.includes(domain)) { + onChange({ + ...config, + [key]: [...current, domain], + }); + } + + setNewDomain(""); + }; + + const handleRemoveDomain = (type: "allowed" | "excluded", domain: string) => { + const key = `${type}_domains` as keyof CrawlConfig; + onChange({ + ...config, + [key]: (config[key] || []).filter(d => d !== domain), + }); + }; + + const handleAddPattern = (type: "include" | "exclude") => { + if (!newPattern.trim()) return; + + const key = `${type}_patterns` as keyof CrawlConfig; + const current = config[key] || []; + + if (!current.includes(newPattern)) { + onChange({ + ...config, + [key]: [...current, newPattern], + }); + } + + setNewPattern(""); + }; + + const handleRemovePattern = (type: "include" | "exclude", pattern: string) => { + const key = `${type}_patterns` as keyof CrawlConfig; + onChange({ + ...config, + [key]: (config[key] || []).filter(p => p !== pattern), + }); + }; + + const hasAnyConfig = + (config.allowed_domains && config.allowed_domains.length > 0) || + (config.excluded_domains && config.excluded_domains.length > 0) || + (config.include_patterns && config.include_patterns.length > 0) || + (config.exclude_patterns && config.exclude_patterns.length > 0); + + // Auto-expand when there's existing configuration + useEffect(() => { + if (hasAnyConfig && !isExpanded) { + setIsExpanded(true); + } + }, [hasAnyConfig]); + + return ( +
+ + + {isExpanded && ( +
+ {/* Domain Filters Section */} +
+
+

Domain Filters

+
+ +
+ Control which domains are crawled. Blacklist takes priority over whitelist. +
+
+
+ + {/* Domain Tabs */} +
+ + +
+ + {/* Domain Input */} +
+ setNewDomain(e.target.value)} + onKeyDown={(e) => { + if (e.key === "Enter") { + handleAddDomain(activeTab); + } + }} + placeholder={`Add ${activeTab} domain (e.g., docs.example.com)`} + className="flex-1 px-3 py-2 bg-gray-800 border border-gray-700 rounded text-sm text-gray-200 + placeholder-gray-500 focus:outline-none focus:border-blue-500 transition-colors" + /> + +
+ + {/* Domain List */} +
+ {activeTab === "allowed" && config.allowed_domains?.map(domain => ( +
+ {domain} + +
+ ))} + {activeTab === "excluded" && config.excluded_domains?.map(domain => ( +
+ {domain} + +
+ ))} +
+
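The tooltip in this section states the rule the backend enforces: the blacklist always takes priority over the whitelist, and (per the `CrawlConfig` documentation later in this diff) `*.example.com`-style wildcards are supported. A rough TypeScript sketch of that decision order, written only to illustrate the documented semantics — the actual check lives in the Python `DomainFilter`, and whether a bare apex domain should match a wildcard entry is an assumption of this sketch.

```typescript
// Illustration of the documented domain precedence, not the backend code itself.
// A rule is either an exact domain or a "*.example.com" wildcard for subdomains.
const domainMatches = (domain: string, rule: string): boolean =>
  rule.startsWith("*.")
    ? domain === rule.slice(2) || domain.endsWith(rule.slice(1)) // apex match is an assumption
    : domain === rule;

function isDomainAllowed(
  domain: string,
  allowed: string[] = [],
  excluded: string[] = [],
): boolean {
  // 1. Blacklist always blocks, even if the domain also matches the whitelist.
  if (excluded.some((rule) => domainMatches(domain, rule))) return false;
  // 2. If a whitelist exists, the domain must match one of its entries.
  if (allowed.length > 0) return allowed.some((rule) => domainMatches(domain, rule));
  // 3. No whitelist configured: allow by default.
  return true;
}

// isDomainAllowed("blog.example.com", ["*.example.com"], ["blog.example.com"]) === false
// isDomainAllowed("docs.example.com", ["*.example.com"], ["blog.example.com"]) === true
```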
+ + {/* URL Patterns Section */} +
+
+

URL Patterns

+
+ +
+ Use glob patterns to filter URLs. Example: */docs/* or *.pdf +
+
+
+ + {/* Pattern Tabs */} +
+ + +
+ + {/* Pattern Input */} +
+ setNewPattern(e.target.value)} + onKeyDown={(e) => { + if (e.key === "Enter") { + handleAddPattern(patternTab); + } + }} + placeholder={`Add ${patternTab} pattern (e.g., */api/* or *.pdf)`} + className="flex-1 px-3 py-2 bg-gray-800 border border-gray-700 rounded text-sm text-gray-200 + placeholder-gray-500 focus:outline-none focus:border-blue-500 transition-colors" + /> + +
+ + {/* Pattern List */} +
+ {patternTab === "include" && config.include_patterns?.map(pattern => ( +
+ {pattern} + +
+ ))} + {patternTab === "exclude" && config.exclude_patterns?.map(pattern => ( +
+ {pattern} + +
+ ))} +
+
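The example patterns shown in this section (`*/docs/*`, `*.pdf`) are evaluated server-side with Python's `fnmatch` against the full URL, and exclude patterns take priority over include patterns. As a rough illustration of those glob semantics in TypeScript — not code from this PR, and covering only `*` and `?`:

```typescript
// Rough illustration of the documented glob behavior; the backend uses fnmatch.
const globToRegExp = (pattern: string): RegExp =>
  new RegExp(
    "^" +
      pattern
        .replace(/[.+^${}()|[\]\\]/g, "\\$&") // escape regex metacharacters
        .replace(/\*/g, ".*")
        .replace(/\?/g, ".") +
      "$",
  );

function isUrlAllowedByPatterns(
  url: string,
  includePatterns: string[] = [],
  excludePatterns: string[] = [],
): boolean {
  // Exclude patterns always win.
  if (excludePatterns.some((p) => globToRegExp(p).test(url))) return false;
  // If include patterns exist, the URL must match at least one of them.
  if (includePatterns.length > 0) {
    return includePatterns.some((p) => globToRegExp(p).test(url));
  }
  return true;
}

// isUrlAllowedByPatterns("https://example.com/docs/intro", ["*/docs/*"]) === true
// isUrlAllowedByPatterns("https://example.com/guide.pdf", [], ["*.pdf"]) === false
```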
+ + {/* Clear All Button */} + {hasAnyConfig && ( + + )} +
+ )} +
+ ); +}; \ No newline at end of file diff --git a/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx b/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx index 6da79b18d1..20621df689 100644 --- a/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx +++ b/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx @@ -3,13 +3,14 @@ * Shows document chunks and code examples for a knowledge item */ -import { ChevronDown, ChevronRight, Code, FileText, Search } from "lucide-react"; -import { useState } from "react"; +import { ChevronDown, ChevronRight, Code, ExternalLink, FileText, Globe, Search, X } from "lucide-react"; +import { useMemo, useState } from "react"; import { Input } from "../../ui/primitives"; import { Dialog, DialogContent, DialogHeader, DialogTitle } from "../../ui/primitives/dialog"; import { cn } from "../../ui/primitives/styles"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "../../ui/primitives/tabs"; -import { useCodeExamples, useKnowledgeItemChunks } from "../hooks"; +import { useCodeExamples, useKnowledgeItem, useKnowledgeItemChunks } from "../hooks"; +import { extractDomain } from "../utils/knowledge-utils"; interface DocumentBrowserProps { sourceId: string; @@ -21,7 +22,9 @@ export const DocumentBrowser: React.FC = ({ sourceId, open const [activeTab, setActiveTab] = useState<"documents" | "code">("documents"); const [searchQuery, setSearchQuery] = useState(""); const [expandedChunks, setExpandedChunks] = useState>(new Set()); + const [selectedDomains, setSelectedDomains] = useState>(new Set()); + const { data: sourceItem } = useKnowledgeItem(sourceId); const { data: chunksData, isLoading: chunksLoading, @@ -33,22 +36,46 @@ export const DocumentBrowser: React.FC = ({ sourceId, open const chunks = chunksData?.chunks || []; const codeExamples = codeData?.code_examples || []; - // Filter chunks based on search - const filteredChunks = chunks.filter( - (chunk) => + // Extract unique domains from chunks + const domainStats = useMemo(() => { + const stats = new Map(); + chunks.forEach((chunk) => { + const url = chunk.url || chunk.metadata?.url; + if (url) { + const domain = extractDomain(url); + stats.set(domain, (stats.get(domain) || 0) + 1); + } + }); + + return Array.from(stats.entries()) + .sort((a, b) => b[1] - a[1]) // Sort by count descending + .map(([domain, count]) => ({ domain, count })); + }, [chunks]); + + // Filter chunks based on search and domain + const filteredChunks = useMemo(() => chunks.filter((chunk) => { + // Search filter + const matchesSearch = + !searchQuery || chunk.content.toLowerCase().includes(searchQuery.toLowerCase()) || - chunk.metadata?.title?.toLowerCase().includes(searchQuery.toLowerCase()), - ); + chunk.metadata?.title?.toLowerCase().includes(searchQuery.toLowerCase()); + + // Domain filter + const url = chunk.url || chunk.metadata?.url; + const matchesDomain = selectedDomains.size === 0 || (url && selectedDomains.has(extractDomain(url))); + + return matchesSearch && matchesDomain; + }), [chunks, searchQuery, selectedDomains]); // Filter code examples based on search - const filteredCode = codeExamples.filter((example) => { + const filteredCode = useMemo(() => codeExamples.filter((example) => { const codeContent = example.code || example.content || ""; return ( codeContent.toLowerCase().includes(searchQuery.toLowerCase()) || example.summary?.toLowerCase().includes(searchQuery.toLowerCase()) || example.language?.toLowerCase().includes(searchQuery.toLowerCase()) ); - }); 
+ }), [codeExamples, searchQuery]); const toggleChunk = (chunkId: string) => { setExpandedChunks((prev) => { @@ -66,9 +93,30 @@ export const DocumentBrowser: React.FC = ({ sourceId, open - Document Browser -
-
+ +
+ Document Browser + {chunksData && ( + + ({chunks.length} documents from {domainStats.length} domain{domainStats.length !== 1 ? "s" : ""}) + + )} +
+ {sourceItem && sourceItem.url && ( + + + View Source + + )} +
+
+ {/* Search Bar */} +
= ({ sourceId, open className="pl-10 bg-black/30 border-white/10 focus:border-cyan-500/50" />
+ + {/* Domain Filter */} + {domainStats.length > 0 && ( +
+
+ + Domain Filter + {selectedDomains.size > 0 && ( + + )} +
+
+ {domainStats.map(({ domain, count }) => { + const isSelected = selectedDomains.has(domain); + return ( + + ); + })} +
+
+ )}
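The per-domain chips in this filter bar toggle membership in the `selectedDomains` set (an empty set means "show all domains", and the Clear filter button resets it). A hypothetical sketch of that click handler, since the exact chip markup is not shown here:

```typescript
// Hypothetical chip click handler; assumes the setSelectedDomains state setter
// declared earlier in DocumentBrowser.
const toggleDomain = (domain: string) => {
  setSelectedDomains((prev) => {
    const next = new Set(prev);
    if (next.has(domain)) {
      next.delete(domain); // clicking an active chip removes that domain filter
    } else {
      next.add(domain); // clicking an inactive chip adds it to the filter
    }
    return next;
  });
};
```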
@@ -123,8 +226,9 @@ export const DocumentBrowser: React.FC = ({ sourceId, open key={chunk.id} className="bg-black/30 rounded-lg border border-white/10 p-4 hover:border-cyan-500/30 transition-colors" > - {chunk.metadata?.title && ( -

+
+ {chunk.metadata?.title && ( +

{needsExpansion && (

- )} + )} + {(() => { + // Extract URL and domain once to avoid repeated computation + const resolvedUrl = chunk.url || chunk.metadata?.url; + if (!resolvedUrl) return null; + + const resolvedDomain = extractDomain(resolvedUrl); + + return ( + + {resolvedDomain} + + + ); + })()} +
{isExpanded || !needsExpansion ? ( diff --git a/archon-ui-main/src/features/knowledge/components/EditCrawlConfigDialog.tsx b/archon-ui-main/src/features/knowledge/components/EditCrawlConfigDialog.tsx new file mode 100644 index 0000000000..afb51e3a1b --- /dev/null +++ b/archon-ui-main/src/features/knowledge/components/EditCrawlConfigDialog.tsx @@ -0,0 +1,250 @@ +/** + * Edit Crawl Configuration Dialog + * Allows editing existing crawler configuration for knowledge items + */ + +import { AlertCircle, Globe, Loader2 } from "lucide-react"; +import { useEffect, useState } from "react"; +import { useToast } from "../../ui/hooks/useToast"; +import { Button, Input, Label } from "../../ui/primitives"; +import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "../../ui/primitives/dialog"; +import { useKnowledgeItem, useUpdateCrawlConfig } from "../hooks"; +import type { CrawlConfig } from "../types"; +import { AdvancedCrawlConfig } from "./AdvancedCrawlConfig"; +import { KnowledgeTypeSelector } from "./KnowledgeTypeSelector"; +import { LevelSelector } from "./LevelSelector"; +import { TagInput } from "./TagInput"; + +interface EditCrawlConfigDialogProps { + sourceId: string; + open: boolean; + onOpenChange: (open: boolean) => void; + onSuccess?: () => void; +} + +export const EditCrawlConfigDialog: React.FC = ({ + sourceId, + open, + onOpenChange, + onSuccess, +}) => { + const { showToast } = useToast(); + const { data: item, isLoading: itemLoading, error: itemError } = useKnowledgeItem(open ? sourceId : null); + const updateMutation = useUpdateCrawlConfig(); + + // Form state + const [url, setUrl] = useState(""); + const [knowledgeType, setKnowledgeType] = useState<"technical" | "business">("technical"); + const [maxDepth, setMaxDepth] = useState("2"); + const [tags, setTags] = useState([]); + const [crawlConfig, setCrawlConfig] = useState({}); + + // Reset form when dialog opens + useEffect(() => { + if (open && !item) { + // Reset to defaults while loading + setUrl(""); + setKnowledgeType("technical"); + setMaxDepth("2"); + setTags([]); + setCrawlConfig({}); + } + }, [open, item]); + + // Load existing configuration when item loads + useEffect(() => { + if (item && open) { + // Use original_url if available (the actual crawled URL), otherwise fall back to url + const urlToEdit = item.metadata?.original_url || item.url || ""; + setUrl(urlToEdit); + + // Knowledge type is also a required field + setKnowledgeType(item.knowledge_type || "technical"); + + // Check for max_depth at various locations + const depthValue = + item.max_depth || + item.metadata?.max_depth || + item.metadata?.crawl_config?.max_depth || + 2; + setMaxDepth(depthValue.toString()); + + // Check for tags at various locations + const tagsValue = + item.tags || + item.metadata?.tags || + []; + setTags(Array.isArray(tagsValue) ? tagsValue : []); + + // Load existing crawl config if available + // It could be at top level or nested in metadata + const configValue = + item.crawl_config || + item.metadata?.crawl_config || + {}; + + console.log("EditCrawlConfigDialog - Loading item:", item); + console.log("EditCrawlConfigDialog - Config value:", configValue); + + // Ensure the config has the right shape with proper defaults + const finalConfig = { + allowed_domains: Array.isArray(configValue.allowed_domains) ? configValue.allowed_domains : [], + excluded_domains: Array.isArray(configValue.excluded_domains) ? configValue.excluded_domains : [], + include_patterns: Array.isArray(configValue.include_patterns) ? 
configValue.include_patterns : [], + exclude_patterns: Array.isArray(configValue.exclude_patterns) ? configValue.exclude_patterns : [] + }; + + console.log("EditCrawlConfigDialog - Setting config to:", finalConfig); + setCrawlConfig(finalConfig); + } + }, [item, open]); + + const handleSave = async () => { + if (!url) { + showToast("URL is required", "error"); + return; + } + + try { + const updateData = { + sourceId, + url, + knowledge_type: knowledgeType, + max_depth: parseInt(maxDepth, 10), + tags: tags.length > 0 ? tags : undefined, + crawl_config: crawlConfig, + }; + + console.log("EditCrawlConfigDialog - Saving config:", updateData); + await updateMutation.mutateAsync(updateData); + + showToast("Configuration updated. Recrawl initiated.", "success"); + onSuccess?.(); + onOpenChange(false); + } catch (error) { + const message = error instanceof Error ? error.message : "Failed to update configuration"; + showToast(message, "error"); + } + }; + + const isProcessing = updateMutation.isPending; + + return ( +
e.stopPropagation()} + onKeyDown={(e) => e.stopPropagation()} + > + + + + Edit Crawler Configuration + + Update the crawler settings for this knowledge item + + + + {itemLoading ? ( +
+ +
+ ) : itemError ? ( +
+
+ +

Failed to load configuration

+

{itemError.message}

+
+
+ ) : ( +
+
+ {/* Warning Alert */} +
+ +
+ Saving changes will trigger a recrawl with the new configuration. + Existing documents will be replaced with newly crawled content. +
+
+ + {/* URL Input */} +
+ +
+
+ +
+ setUrl(e.target.value)} + disabled={isProcessing} + className="pl-10 h-12 backdrop-blur-md bg-gradient-to-r from-white/60 to-white/50 dark:from-black/60 dark:to-black/50" + /> +
+
+ + {/* Advanced Configuration */} + + + {/* Knowledge Type */} + + + {/* Crawl Depth */} + + + {/* Tags */} + + + {/* Action Buttons */} +
+ + +
+
+
+ )} +
+
+
+ ); +}; \ No newline at end of file diff --git a/archon-ui-main/src/features/knowledge/components/KnowledgeCard.tsx b/archon-ui-main/src/features/knowledge/components/KnowledgeCard.tsx index bb49edd9a1..6ceed6c31d 100644 --- a/archon-ui-main/src/features/knowledge/components/KnowledgeCard.tsx +++ b/archon-ui-main/src/features/knowledge/components/KnowledgeCard.tsx @@ -18,6 +18,7 @@ import { SimpleTooltip } from "../../ui/primitives/tooltip"; import { useDeleteKnowledgeItem, useRefreshKnowledgeItem } from "../hooks"; import type { KnowledgeItem } from "../types"; import { extractDomain } from "../utils/knowledge-utils"; +import { EditCrawlConfigDialog } from "./EditCrawlConfigDialog"; import { KnowledgeCardActions } from "./KnowledgeCardActions"; import { KnowledgeCardTags } from "./KnowledgeCardTags"; import { KnowledgeCardTitle } from "./KnowledgeCardTitle"; @@ -43,6 +44,7 @@ export const KnowledgeCard: React.FC = ({ onRefreshStarted, }) => { const [isHovered, setIsHovered] = useState(false); + const [showEditConfigDialog, setShowEditConfigDialog] = useState(false); const deleteMutation = useDeleteKnowledgeItem(); const refreshMutation = useRefreshKnowledgeItem(); @@ -221,6 +223,7 @@ export const KnowledgeCard: React.FC = ({ hasCodeExamples={codeExamplesCount > 0} onViewDocuments={onViewDocument} onViewCodeExamples={codeExamplesCount > 0 ? onViewCodeExamples : undefined} + onEditConfig={isUrl ? () => setShowEditConfigDialog(true) : undefined} onRefresh={isUrl ? handleRefresh : undefined} onDelete={handleDelete} onExport={onExport} @@ -343,6 +346,19 @@ export const KnowledgeCard: React.FC = ({

+ + {/* Edit Configuration Dialog */} + { + // The refresh will be handled by the update itself + if (onRefreshStarted) { + // We'll get the progressId from the update response + } + }} + /> ); }; diff --git a/archon-ui-main/src/features/knowledge/components/KnowledgeCardActions.tsx b/archon-ui-main/src/features/knowledge/components/KnowledgeCardActions.tsx index 9f07e2f50d..12241d4c53 100644 --- a/archon-ui-main/src/features/knowledge/components/KnowledgeCardActions.tsx +++ b/archon-ui-main/src/features/knowledge/components/KnowledgeCardActions.tsx @@ -4,7 +4,7 @@ * Following the pattern from ProjectCardActions */ -import { Code, Download, Eye, MoreHorizontal, RefreshCw, Trash2 } from "lucide-react"; +import { Code, Download, Edit, Eye, MoreHorizontal, RefreshCw, Trash2 } from "lucide-react"; import { useState } from "react"; import { DeleteConfirmModal } from "../../ui/components/DeleteConfirmModal"; import { Button } from "../../ui/primitives/button"; @@ -24,6 +24,7 @@ interface KnowledgeCardActionsProps { hasCodeExamples: boolean; onViewDocuments: () => void; onViewCodeExamples?: () => void; + onEditConfig?: () => void; onRefresh?: () => Promise; onDelete?: () => Promise; onExport?: () => void; @@ -36,6 +37,7 @@ export const KnowledgeCardActions: React.FC = ({ hasCodeExamples, onViewDocuments, onViewCodeExamples, + onEditConfig, onRefresh, onDelete, onExport, @@ -91,6 +93,11 @@ export const KnowledgeCardActions: React.FC = ({ onExport?.(); }; + const handleEditConfig = (e: React.MouseEvent) => { + e.stopPropagation(); + onEditConfig?.(); + }; + return ( <> @@ -123,6 +130,16 @@ export const KnowledgeCardActions: React.FC = ({ )} + {isUrl && onEditConfig && ( + <> + + + + Edit Configuration + + + )} + {isUrl && onRefresh && ( <> diff --git a/archon-ui-main/src/features/knowledge/components/index.ts b/archon-ui-main/src/features/knowledge/components/index.ts index e9174d5b13..3173213947 100644 --- a/archon-ui-main/src/features/knowledge/components/index.ts +++ b/archon-ui-main/src/features/knowledge/components/index.ts @@ -1,4 +1,5 @@ export * from "./AddKnowledgeDialog"; +export * from "./AdvancedCrawlConfig"; export * from "./DocumentBrowser"; export * from "./KnowledgeCard"; export * from "./KnowledgeList"; diff --git a/archon-ui-main/src/features/knowledge/hooks/useKnowledgeQueries.ts b/archon-ui-main/src/features/knowledge/hooks/useKnowledgeQueries.ts index 874499e275..a1dc3fe3db 100644 --- a/archon-ui-main/src/features/knowledge/hooks/useKnowledgeQueries.ts +++ b/archon-ui-main/src/features/knowledge/hooks/useKnowledgeQueries.ts @@ -14,7 +14,9 @@ import { useSmartPolling } from "../../ui/hooks"; import { useToast } from "../../ui/hooks/useToast"; import { knowledgeService } from "../services"; import type { + CrawlConfig, CrawlRequest, + CrawlRequestV2, CrawlStartResponse, KnowledgeItem, KnowledgeItemsFilter, @@ -298,6 +300,189 @@ export function useCrawlUrl() { }); } +/** + * Crawl URL mutation with domain filtering (v2) with optimistic updates + * Returns the progressId that can be used to track crawl progress + */ +export function useCrawlUrlV2() { + const queryClient = useQueryClient(); + const { showToast } = useToast(); + + return useMutation< + CrawlStartResponse, + Error, + CrawlRequestV2, + { + previousKnowledge?: KnowledgeItem[]; + previousSummaries?: Array<[readonly unknown[], KnowledgeItemsResponse | undefined]>; + previousOperations?: ActiveOperationsResponse; + tempProgressId: string; + tempItemId: string; + } + >({ + mutationFn: (request: CrawlRequestV2) => 
knowledgeService.crawlUrlV2(request), + onMutate: async (request) => { + // Cancel any outgoing refetches to prevent race conditions + await queryClient.cancelQueries({ queryKey: knowledgeKeys.summariesPrefix() }); + await queryClient.cancelQueries({ queryKey: progressKeys.active() }); + + // Snapshot the previous values for rollback + const previousSummaries = queryClient.getQueriesData({ + queryKey: knowledgeKeys.summariesPrefix(), + }); + const previousOperations = queryClient.getQueryData(progressKeys.active()); + + // Generate temporary progress ID and optimistic entity + const tempProgressId = createOptimisticId(); + const optimisticItem = createOptimisticEntity({ + title: (() => { + try { + return new URL(request.url).hostname || "New crawl"; + } catch { + return "New crawl"; + } + })(), + url: request.url, + source_id: tempProgressId, + source_type: "url", + knowledge_type: request.knowledge_type || "technical", + status: "processing", + document_count: 0, + code_examples_count: 0, + metadata: { + knowledge_type: request.knowledge_type || "technical", + tags: request.tags || [], + source_type: "url", + status: "processing", + description: `Crawling ${request.url} with domain filters`, + crawl_config: request.crawl_config, + }, + created_at: new Date().toISOString(), + updated_at: new Date().toISOString(), + } as Omit); + const tempItemId = optimisticItem.id; + + // Update all summaries caches with optimistic data + const entries = queryClient.getQueriesData({ + queryKey: knowledgeKeys.summariesPrefix(), + }); + for (const [qk, old] of entries) { + const filter = qk[qk.length - 1] as KnowledgeItemsFilter | undefined; + const matchesType = !filter?.knowledge_type || optimisticItem.knowledge_type === filter.knowledge_type; + const matchesTags = + !filter?.tags || filter.tags.every((t) => (optimisticItem.metadata?.tags ?? []).includes(t)); + if (!(matchesType && matchesTags)) continue; + if (!old) { + queryClient.setQueryData(qk, { + items: [optimisticItem], + total: 1, + page: 1, + per_page: 100, + }); + } else { + queryClient.setQueryData(qk, { + ...old, + items: [optimisticItem, ...old.items], + total: (old.total ?? 
old.items.length) + 1, + }); + } + } + + // Add optimistic progress entry + if (!previousOperations) { + queryClient.setQueryData(progressKeys.active(), { + operations: [ + { + operation_id: tempProgressId, + operation_type: "crawl", + status: "starting", + progress: 0, + message: `Starting crawl of ${request.url} with domain filtering`, + started_at: new Date().toISOString(), + progressId: tempProgressId, + } as ActiveOperation, + ], + count: 1, + timestamp: new Date().toISOString(), + }); + } else { + queryClient.setQueryData(progressKeys.active(), { + operations: [ + { + operation_id: tempProgressId, + operation_type: "crawl", + status: "starting", + progress: 0, + message: `Starting crawl of ${request.url} with domain filtering`, + started_at: new Date().toISOString(), + progressId: tempProgressId, + } as ActiveOperation, + ...(previousOperations.operations || []), + ], + count: (previousOperations.count || 0) + 1, + timestamp: new Date().toISOString(), + }); + } + + return { previousSummaries, previousOperations, tempProgressId, tempItemId }; + }, + onSuccess: async (response, _variables, context) => { + // Show success message + showToast("Crawl started with domain filtering", "success"); + + // Update the temporary progress ID with the real one + if (context) { + const activeOps = queryClient.getQueryData(progressKeys.active()); + if (activeOps) { + const updated: ActiveOperationsResponse = { + ...activeOps, // Preserve count, timestamp, and any other fields + operations: activeOps.operations.map((op) => + op.progressId === context.tempProgressId ? { ...op, progressId: response.progressId } : op, + ), + }; + queryClient.setQueryData(progressKeys.active(), updated); + } + + // Update item in all summaries caches + const entries = queryClient.getQueriesData({ + queryKey: knowledgeKeys.summariesPrefix(), + }); + for (const [qk, data] of entries) { + if (data) { + const updated = { + ...data, + items: data.items.map((item) => + item.source_id === context.tempProgressId ? 
{ ...item, source_id: response.progressId } : item, + ), + }; + queryClient.setQueryData(qk, updated); + } + } + } + + // Invalidate to get fresh data + queryClient.invalidateQueries({ queryKey: progressKeys.active() }); + + // Return the response so caller can access progressId + return response; + }, + onError: (error, _variables, context) => { + // Rollback optimistic updates on error + if (context?.previousSummaries) { + for (const [queryKey, data] of context.previousSummaries) { + queryClient.setQueryData(queryKey, data); + } + } + if (context?.previousOperations) { + queryClient.setQueryData(progressKeys.active(), context.previousOperations); + } + + const errorMessage = getProviderErrorMessage(error) || "Failed to start crawl with filters"; + showToast(errorMessage, "error"); + }, + }); +} + /** * Upload document mutation with optimistic updates */ @@ -810,3 +995,77 @@ export function useKnowledgeCodeExamples( staleTime: STALE_TIMES.normal, }); } + +/** + * Update crawler configuration for existing knowledge item + * Triggers a recrawl with the new configuration + */ +export function useUpdateCrawlConfig() { + const queryClient = useQueryClient(); + const { showToast } = useToast(); + + return useMutation< + CrawlStartResponse, + Error, + { + sourceId: string; + url: string; + knowledge_type: "technical" | "business"; + max_depth: number; + tags?: string[]; + crawl_config?: CrawlConfig; + } + >({ + mutationFn: (request) => knowledgeService.updateCrawlConfig(request), + onMutate: async ({ sourceId }) => { + // Update item status to processing in all caches + await queryClient.cancelQueries({ queryKey: knowledgeKeys.detail(sourceId) }); + await queryClient.cancelQueries({ queryKey: knowledgeKeys.summariesPrefix() }); + + const previousItem = queryClient.getQueryData(knowledgeKeys.detail(sourceId)); + const previousSummaries = queryClient.getQueriesData({ queryKey: knowledgeKeys.summariesPrefix() }); + + // Optimistically update status to processing + if (previousItem) { + queryClient.setQueryData(knowledgeKeys.detail(sourceId), { + ...previousItem, + status: "processing", + }); + } + + // Update summaries cache + queryClient.setQueriesData({ queryKey: knowledgeKeys.summariesPrefix() }, (old) => { + if (!old?.items) return old; + return { + ...old, + items: old.items.map((item) => + item.source_id === sourceId ? { ...item, status: "processing" } : item + ), + }; + }); + + return { previousItem, previousSummaries }; + }, + onSuccess: (response) => { + // Invalidate to get fresh data + queryClient.invalidateQueries({ queryKey: knowledgeKeys.summariesPrefix() }); + queryClient.invalidateQueries({ queryKey: progressKeys.active() }); + + return response; + }, + onError: (error, { sourceId }, context) => { + // Rollback on error + if (context?.previousItem) { + queryClient.setQueryData(knowledgeKeys.detail(sourceId), context.previousItem); + } + if (context?.previousSummaries) { + for (const [queryKey, data] of context.previousSummaries) { + queryClient.setQueryData(queryKey, data); + } + } + + const errorMessage = error instanceof Error ? 
error.message : "Failed to update configuration"; + showToast(errorMessage, "error"); + }, + }); +} diff --git a/archon-ui-main/src/features/knowledge/inspector/components/ContentViewer.tsx b/archon-ui-main/src/features/knowledge/inspector/components/ContentViewer.tsx index d3f91a3a56..7a19cf9f00 100644 --- a/archon-ui-main/src/features/knowledge/inspector/components/ContentViewer.tsx +++ b/archon-ui-main/src/features/knowledge/inspector/components/ContentViewer.tsx @@ -3,9 +3,11 @@ * Displays the selected document or code content */ -import { Check, Code, Copy, FileText, Layers } from "lucide-react"; +import { Check, ChevronDown, ChevronRight, Code, Copy, ExternalLink, FileText, Info, Layers } from "lucide-react"; +import { useState } from "react"; import { Button } from "../../../ui/primitives"; import type { InspectorSelectedItem } from "../../types"; +import { extractDomain } from "../../utils/knowledge-utils"; interface ContentViewerProps { selectedItem: InspectorSelectedItem | null; @@ -14,6 +16,7 @@ interface ContentViewerProps { } export const ContentViewer: React.FC = ({ selectedItem, onCopy, copiedId }) => { + const [showMetadata, setShowMetadata] = useState(false); if (!selectedItem) { return (
@@ -102,26 +105,57 @@ export const ContentViewer: React.FC = ({ selectedItem, onCo
- {/* Content Body */} -
- {selectedItem.type === "document" ? ( -
-
-              {selectedItem.content || "No content available"}
-            
-
- ) : ( -
-
-              
-                {selectedItem.content || "// No code content available"}
-              
-            
+ {/* Main Content Area with Metadata Panel */} +
+ {/* Content Body */} +
+ {selectedItem.type === "document" ? ( +
+
+                {selectedItem.content || "No content available"}
+              
+
+ ) : ( +
+
+                
+                  {selectedItem.content || "// No code content available"}
+                
+              
+
+ )} +
+ + {/* Metadata Section - Always visible as collapsible at bottom */} + {selectedItem.metadata && Object.keys(selectedItem.metadata).length > 0 && ( +
+ + + {showMetadata && ( +
+
+
+                    {JSON.stringify(selectedItem.metadata, null, 2)}
+                  
+
+
+ )}
)}
- {/* Content Footer - Show metadata */} + {/* Content Footer - Show quick info */}
@@ -131,15 +165,21 @@ export const ContentViewer: React.FC = ({ selectedItem, onCo {(selectedItem.metadata.relevance_score * 100).toFixed(0)}% )} - {selectedItem.type === "document" && "url" in selectedItem.metadata && selectedItem.metadata.url && ( - - View Source - + {selectedItem.type === "document" && selectedItem.metadata && "url" in selectedItem.metadata && selectedItem.metadata.url && ( + <> + + Domain: {extractDomain(selectedItem.metadata.url)} + + + + View Source + + )}
{selectedItem.type === "document" ? "Document Chunk" : "Code Example"} diff --git a/archon-ui-main/src/features/knowledge/inspector/components/InspectorSidebar.tsx b/archon-ui-main/src/features/knowledge/inspector/components/InspectorSidebar.tsx index 09b9e441e3..b822539174 100644 --- a/archon-ui-main/src/features/knowledge/inspector/components/InspectorSidebar.tsx +++ b/archon-ui-main/src/features/knowledge/inspector/components/InspectorSidebar.tsx @@ -4,10 +4,13 @@ */ import { motion } from "framer-motion"; -import { Code, FileText, Hash, Loader2, Search } from "lucide-react"; +import { Code, FileText, Globe, Hash, Loader2, Search } from "lucide-react"; +import { useMemo } from "react"; import { Button, Input } from "../../../ui/primitives"; +import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "../../../ui/primitives/select"; import { cn } from "../../../ui/primitives/styles"; import type { CodeExample, DocumentChunk } from "../../types"; +import { extractDomain } from "../../utils/knowledge-utils"; interface InspectorSidebarProps { viewMode: "documents" | "code"; @@ -20,6 +23,8 @@ interface InspectorSidebarProps { hasNextPage: boolean; onLoadMore: () => void; isFetchingNextPage: boolean; + selectedDomain?: string; + onDomainChange?: (domain: string) => void; } export const InspectorSidebar: React.FC = ({ @@ -33,7 +38,41 @@ export const InspectorSidebar: React.FC = ({ hasNextPage, onLoadMore, isFetchingNextPage, + selectedDomain = "all", + onDomainChange, }) => { + + // Extract unique domains from documents + const domainStats = useMemo(() => { + if (viewMode !== "documents") return []; + + const stats = new Map(); + (items as DocumentChunk[]).forEach((doc) => { + const url = doc.url || doc.metadata?.url; + if (url) { + const domain = extractDomain(url); + stats.set(domain, (stats.get(domain) || 0) + 1); + } + }); + + return Array.from(stats.entries()) + .sort((a, b) => b[1] - a[1]) + .map(([domain, count]) => ({ domain, count })); + }, [items, viewMode]); + + // Filter items by selected domain + const filteredItems = useMemo(() => { + if (viewMode !== "documents" || selectedDomain === "all") { + return items; + } + + return (items as DocumentChunk[]).filter((doc) => { + const url = doc.url || doc.metadata?.url; + if (!url) return false; + return extractDomain(url) === selectedDomain; + }); + }, [items, selectedDomain, viewMode]); + const getItemTitle = (item: DocumentChunk | CodeExample) => { const idSuffix = String(item.id).slice(-6); if (viewMode === "documents") { @@ -62,8 +101,9 @@ export const InspectorSidebar: React.FC = ({ return ( ); -}; +}; \ No newline at end of file diff --git a/archon-ui-main/src/features/knowledge/inspector/components/KnowledgeInspector.tsx b/archon-ui-main/src/features/knowledge/inspector/components/KnowledgeInspector.tsx index 69e8f05062..b2c7504319 100644 --- a/archon-ui-main/src/features/knowledge/inspector/components/KnowledgeInspector.tsx +++ b/archon-ui-main/src/features/knowledge/inspector/components/KnowledgeInspector.tsx @@ -31,6 +31,7 @@ export const KnowledgeInspector: React.FC = ({ const [searchQuery, setSearchQuery] = useState(""); const [selectedItem, setSelectedItem] = useState(null); const [copiedId, setCopiedId] = useState(null); + const [selectedDomain, setSelectedDomain] = useState("all"); // Reset view mode when item or initialTab changes useEffect(() => { @@ -68,11 +69,12 @@ export const KnowledgeInspector: React.FC = ({ id: firstDoc.id, content: firstDoc.content || "", metadata: { + // Include all metadata 
from the backend + ...firstDoc.metadata, + // Also include top-level fields that might be useful title: firstDoc.title || firstDoc.metadata?.title, section: firstDoc.section || firstDoc.metadata?.section, - relevance_score: firstDoc.metadata?.relevance_score, url: firstDoc.url || firstDoc.metadata?.url, - tags: firstDoc.metadata?.tags, }, }); } else { @@ -82,10 +84,12 @@ export const KnowledgeInspector: React.FC = ({ id: String(firstCode.id || ""), content: firstCode.content || firstCode.code || "", metadata: { + // Include all metadata from the backend + ...firstCode.metadata, + // Also include top-level fields that might be useful language: firstCode.language, file_path: firstCode.file_path, summary: firstCode.summary, - relevance_score: firstCode.metadata?.relevance_score, title: firstCode.title || firstCode.example_name, }, }); @@ -111,11 +115,12 @@ export const KnowledgeInspector: React.FC = ({ id: doc.id || "", content: doc.content || "", metadata: { + // Include all metadata from the backend + ...doc.metadata, + // Also include top-level fields that might be useful title: doc.title || doc.metadata?.title, section: doc.section || doc.metadata?.section, - relevance_score: doc.metadata?.relevance_score, url: doc.url || doc.metadata?.url, - tags: doc.metadata?.tags, }, }); } else { @@ -125,10 +130,12 @@ export const KnowledgeInspector: React.FC = ({ id: String(code.id), content: code.content || code.code || "", metadata: { + // Include all metadata from the backend + ...code.metadata, + // Also include top-level fields that might be useful language: code.language, file_path: code.file_path, summary: code.summary, - relevance_score: code.metadata?.relevance_score, title: code.title || code.example_name, }, }); @@ -141,6 +148,7 @@ export const KnowledgeInspector: React.FC = ({ setViewMode(mode); setSelectedItem(null); setSearchQuery(""); + setSelectedDomain("all"); // Reset domain filter when switching modes }, []); return ( @@ -175,6 +183,8 @@ export const KnowledgeInspector: React.FC = ({ hasNextPage={hasNextPage} onLoadMore={fetchNextPage} isFetchingNextPage={isFetchingNextPage} + selectedDomain={selectedDomain} + onDomainChange={setSelectedDomain} /> {/* Content Viewer */} diff --git a/archon-ui-main/src/features/knowledge/services/knowledgeService.ts b/archon-ui-main/src/features/knowledge/services/knowledgeService.ts index b9d6af0662..1ad63e6701 100644 --- a/archon-ui-main/src/features/knowledge/services/knowledgeService.ts +++ b/archon-ui-main/src/features/knowledge/services/knowledgeService.ts @@ -8,7 +8,9 @@ import { APIServiceError } from "../../shared/errors"; import type { ChunksResponse, CodeExamplesResponse, + CrawlConfig, CrawlRequest, + CrawlRequestV2, CrawlStartResponse, KnowledgeItem, KnowledgeItemsFilter, @@ -89,6 +91,18 @@ export const knowledgeService = { return response; }, + /** + * Start crawling a URL with domain filtering (v2) + */ + async crawlUrlV2(request: CrawlRequestV2): Promise { + const response = await callAPIWithETag("/api/knowledge-items/crawl-v2", { + method: "POST", + body: JSON.stringify(request), + }); + + return response; + }, + /** * Refresh an existing knowledge item */ @@ -217,4 +231,26 @@ export const knowledgeService = { async getKnowledgeSources(): Promise { return callAPIWithETag("/api/knowledge-items/sources"); }, + + /** + * Update crawler configuration for an existing knowledge item + * This will trigger a recrawl with the new configuration + */ + async updateCrawlConfig(request: { + sourceId: string; + url: string; + knowledge_type: 
"technical" | "business"; + max_depth: number; + tags?: string[]; + crawl_config?: CrawlConfig; + }): Promise { + const { sourceId, ...crawlData } = request; + + const response = await callAPIWithETag(`/api/knowledge-items/${sourceId}/update-config`, { + method: "POST", + body: JSON.stringify(crawlData), + }); + + return response; + }, }; diff --git a/archon-ui-main/src/features/knowledge/types/knowledge.ts b/archon-ui-main/src/features/knowledge/types/knowledge.ts index 571cb6192e..fc8ac74910 100644 --- a/archon-ui-main/src/features/knowledge/types/knowledge.ts +++ b/archon-ui-main/src/features/knowledge/types/knowledge.ts @@ -21,6 +21,9 @@ export interface KnowledgeItemMetadata { original_url?: string; document_count?: number; // Number of documents in this knowledge item code_examples_count?: number; // Number of code examples found + max_depth?: number; // Crawl depth configuration + crawl_config?: CrawlConfig; // Advanced crawl configuration + [key: string]: any; // Allow additional untyped fields from backend } export interface KnowledgeItem { @@ -36,6 +39,10 @@ export interface KnowledgeItem { metadata: KnowledgeItemMetadata; created_at: string; updated_at: string; + // Additional fields that might be at top level + max_depth?: number; + tags?: string[]; + crawl_config?: CrawlConfig; } export interface CodeExampleMetadata { @@ -133,6 +140,65 @@ export interface KnowledgeItemsFilter { per_page?: number; } +/** + * Advanced crawler configuration for domain and URL pattern filtering. + * + * Precedence Rules (highest to lowest priority): + * 1. excluded_domains - Always blocks, takes highest priority + * 2. allowed_domains - If specified, only these domains are crawled + * 3. exclude_patterns - Blocks matching URL patterns + * 4. include_patterns - If specified, only matching patterns are crawled + * + * Pattern Syntax: + * - Domain patterns: Support wildcards ([star].example.com) and exact matches + * - URL patterns: Use glob syntax with fnmatch ([star], ?, [seq], [!seq]) + * + * Common Examples: + * + * Example 1 - Crawl only docs subdomain, excluding API references: + * allowed_domains: ["docs.example.com"] + * exclude_patterns: [[star]/api-reference/[star], [star]/deprecated/[star]] + * + * Example 2 - Crawl all subdomains except blog, only documentation paths: + * allowed_domains: [[star].example.com] + * excluded_domains: ["blog.example.com"] + * include_patterns: [[star]/docs/[star], [star]/guide/[star], [star]/tutorial/[star]] + * + * Example 3 - Block specific file types across all domains: + * exclude_patterns: [[star].pdf, [star].zip, [star]/downloads/[star]] + */ +export interface CrawlConfig { + /** + * Whitelist of domains to crawl. Supports exact matches and wildcards. + * Examples: docs.example.com, [star].example.com, api.example.com + * If specified, ONLY these domains will be crawled (unless blocked by excluded_domains). + */ + allowed_domains?: string[]; + + /** + * Blacklist of domains to never crawl. Takes precedence over allowed_domains. + * Examples: blog.example.com, [star].internal.example.com + * These domains are ALWAYS blocked, even if they match allowed_domains. + */ + excluded_domains?: string[]; + + /** + * URL patterns that must match for pages to be crawled. Uses glob syntax. + * Examples: [star]/docs/[star], [star]/api/v2/[star], [star]tutorial[star] + * If specified, ONLY URLs matching at least one pattern will be crawled. + * Patterns are matched against the full URL. 
+ */ + include_patterns?: string[]; + + /** + * URL patterns to exclude from crawling. Uses glob syntax. Takes precedence over include_patterns. + * Examples: [star]/admin/[star], [star].pdf, [star]/temp/[star], [star]test[star] + * URLs matching these patterns are ALWAYS blocked. + * Patterns are matched against the full URL. + */ + exclude_patterns?: string[]; +} + export interface CrawlRequest { url: string; knowledge_type?: "technical" | "business"; @@ -142,6 +208,10 @@ export interface CrawlRequest { extract_code_examples?: boolean; } +export interface CrawlRequestV2 extends CrawlRequest { + crawl_config?: CrawlConfig; +} + export interface UploadMetadata { knowledge_type?: "technical" | "business"; tags?: string[]; diff --git a/python/src/server/api_routes/knowledge_api.py b/python/src/server/api_routes/knowledge_api.py index 5672583859..fb72872c9e 100644 --- a/python/src/server/api_routes/knowledge_api.py +++ b/python/src/server/api_routes/knowledge_api.py @@ -20,6 +20,8 @@ # Import unified logging from ..config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info +# Import crawl models for type validation +from ..models.crawl_models import CrawlRequestV2 from ..services.crawler_manager import get_crawler from ..services.crawling import CrawlingService from ..services.credential_service import credential_service @@ -29,6 +31,7 @@ from ..services.storage import DocumentStorageService from ..utils import get_supabase_client from ..utils.document_processing import extract_text_from_document +from ..utils.progress.progress_tracker import ProgressTracker # Get logger for this module logger = get_logger(__name__) @@ -324,6 +327,32 @@ async def delete_knowledge_item(source_id: str): raise HTTPException(status_code=500, detail={"error": str(e)}) +@router.get("/knowledge-items/{source_id}") +async def get_knowledge_item(source_id: str): + """ + Get a single knowledge item by its source ID. + + Args: + source_id: The unique source ID of the knowledge item + + Returns: + The knowledge item with all its metadata + """ + try: + service = KnowledgeItemService(get_supabase_client()) + item = await service.get_item(source_id) + + if not item: + raise HTTPException(status_code=404, detail={"error": f"Knowledge item with source_id {source_id} not found"}) + + return item + except HTTPException: + raise + except Exception as e: + safe_logfire_error(f"Failed to get knowledge item | error={str(e)} | source_id={source_id}") + raise HTTPException(status_code=500, detail={"error": str(e)}) + + @router.get("/knowledge-items/{source_id}/chunks") async def get_knowledge_item_chunks( source_id: str, @@ -573,6 +602,92 @@ async def get_knowledge_item_code_examples( raise HTTPException(status_code=500, detail={"error": str(e)}) +@router.post("/knowledge-items/{source_id}/update-config") +async def update_crawl_config(source_id: str, request: CrawlRequestV2): + """ + Update crawler configuration for an existing knowledge item and trigger a recrawl. + + This endpoint allows users to edit existing crawler configuration including: + - URL + - Knowledge type + - Max depth + - Tags + - Advanced crawl configuration (domain filters, patterns, etc.) 
+ """ + + # Validate API key before starting expensive operation + logger.info("🔍 About to validate API key for config update...") + provider_config = await credential_service.get_active_provider("embedding") + provider = provider_config.get("provider", "openai") + await _validate_provider_api_key(provider) + logger.info("✅ API key validation completed successfully for config update") + + try: + safe_logfire_info(f"Starting knowledge item config update | source_id={source_id}") + + # Get the existing knowledge item to verify it exists + service = KnowledgeItemService(get_supabase_client()) + existing_item = await service.get_item(source_id) + + if not existing_item: + raise HTTPException( + status_code=404, detail={"error": f"Knowledge item {source_id} not found"} + ) + + # Use the validated request directly + crawl_request = request + + # Generate unique progress ID for the recrawl + progress_id = str(uuid.uuid4()) + + # Create progress tracker for HTTP polling + tracker = ProgressTracker(progress_id, operation_type="crawl") + await tracker.start({ + "status": "starting", + "url": crawl_request.url, + "source_id": source_id, + "operation": "update_and_recrawl", + "has_filters": crawl_request.crawl_config is not None + }) + + # First delete the existing knowledge item and its documents + safe_logfire_info(f"Deleting existing knowledge item before recrawl | source_id={source_id}") + try: + from ..services.source_management_service import SourceManagementService + source_service = SourceManagementService(get_supabase_client()) + success, result_data = source_service.delete_source(source_id) + + if not success: + safe_logfire_error(f"Failed to delete existing item | error={result_data.get('error', 'Unknown error')}") + # Continue anyway - we'll overwrite + else: + safe_logfire_info(f"Successfully deleted existing knowledge item and documents | source_id={source_id}") + except Exception as e: + safe_logfire_error(f"Failed to delete existing item | error={str(e)}") + # Continue anyway - we'll overwrite + + # Create async task for crawling with updated configuration + crawl_task = asyncio.create_task( + _run_crawl_v2(request_dict=crawl_request.dict(), progress_id=progress_id) + ) + active_crawl_tasks[progress_id] = crawl_task + + safe_logfire_info( + f"Config update crawl task created | progress_id={progress_id} | url={crawl_request.url}" + ) + + return { + "success": True, + "progressId": progress_id, + "message": "Configuration updated. Recrawl initiated.", + "estimatedDuration": "2-10 minutes depending on site size" + } + + except Exception as e: + safe_logfire_error(f"Failed to update config and recrawl | error={str(e)}") + raise HTTPException(status_code=500, detail={"error": str(e)}) + + @router.post("/knowledge-items/{source_id}/refresh") async def refresh_knowledge_item(source_id: str): """Refresh a knowledge item by re-crawling its URL with the same metadata.""" @@ -855,6 +970,142 @@ async def _perform_crawl_with_progress( ) +@router.post("/knowledge-items/crawl-v2") +async def crawl_knowledge_item_v2(request: CrawlRequestV2): + """ + Crawl a URL with advanced domain filtering configuration. + + This is version 2 of the crawl endpoint that supports domain filtering. 
+ """ + # Use the validated request directly + crawl_request = request + + # Validate API key before starting expensive operation + logger.info("🔍 About to validate API key for crawl-v2...") + provider_config = await credential_service.get_active_provider("embedding") + provider = provider_config.get("provider", "openai") + await _validate_provider_api_key(provider) + logger.info("✅ API key validation completed successfully") + + try: + safe_logfire_info( + f"Starting knowledge item crawl v2 | url={crawl_request.url} | " + f"knowledge_type={crawl_request.knowledge_type} | " + f"has_crawl_config={crawl_request.crawl_config is not None}" + ) + + # Generate unique progress ID + progress_id = str(uuid.uuid4()) + + # Create progress tracker for HTTP polling + tracker = ProgressTracker(progress_id, operation_type="crawl") + await tracker.start({ + "status": "starting", + "url": crawl_request.url, + "has_filters": crawl_request.crawl_config is not None + }) + + # Create async task for crawling + crawl_task = asyncio.create_task(_run_crawl_v2(request_dict=crawl_request.dict(), progress_id=progress_id)) + active_crawl_tasks[progress_id] = crawl_task + + safe_logfire_info( + f"Crawl v2 task created | progress_id={progress_id} | url={crawl_request.url}" + ) + + return { + "success": True, + "progressId": progress_id, + "message": "Crawl started with domain filtering", + "estimatedDuration": "2-10 minutes depending on site size" + } + + except Exception as e: + safe_logfire_error(f"Failed to start crawl v2 | error={str(e)}") + raise HTTPException(status_code=500, detail={"error": str(e)}) + + +async def _run_crawl_v2(request_dict: dict, progress_id: str): + """Run the crawl v2 with domain filtering in background.""" + tracker = ProgressTracker(progress_id, operation_type="crawl") + + try: + safe_logfire_info( + f"Starting crawl v2 with progress tracking | progress_id={progress_id} | url={request_dict['url']}" + ) + + # Get crawler from CrawlerManager + try: + crawler = await get_crawler() + if crawler is None: + raise Exception("Crawler not available - initialization may have failed") + except Exception as e: + safe_logfire_error(f"Failed to get crawler | error={str(e)}") + await tracker.error(f"Failed to initialize crawler: {str(e)}") + return + + supabase_client = get_supabase_client() + + # Extract crawl_config if present + crawl_config_dict = request_dict.get("crawl_config") + crawl_config = None + if crawl_config_dict: + from ..models.crawl_models import CrawlConfig + crawl_config = CrawlConfig(**crawl_config_dict) + + # Create orchestration service with crawl_config + orchestration_service = CrawlingService( + crawler, + supabase_client, + crawl_config=crawl_config + ) + orchestration_service.set_progress_id(progress_id) + + # Add important fields to metadata for storage and later retrieval + request_dict["metadata"] = request_dict.get("metadata", {}) + + # Always store these fields in metadata + request_dict["metadata"]["knowledge_type"] = request_dict.get("knowledge_type", "technical") + request_dict["metadata"]["max_depth"] = request_dict.get("max_depth", 2) + request_dict["metadata"]["tags"] = request_dict.get("tags", []) + + # Store the original URL for later reference + request_dict["metadata"]["original_url"] = request_dict.get("url", "") + + # Add crawl_config to metadata if present + if crawl_config: + request_dict["metadata"]["crawl_config"] = crawl_config.dict() + + # Orchestrate the crawl - this returns immediately with task info + result = await 
orchestration_service.orchestrate_crawl(request_dict) + + # Store the actual crawl task for proper cancellation + crawl_task = result.get("task") + if crawl_task: + active_crawl_tasks[progress_id] = crawl_task + safe_logfire_info( + f"Stored actual crawl v2 task in active_crawl_tasks | progress_id={progress_id}" + ) + else: + safe_logfire_error(f"No task returned from orchestrate_crawl v2 | progress_id={progress_id}") + + safe_logfire_info( + f"Crawl v2 task started | progress_id={progress_id} | task_id={result.get('task_id')}" + ) + + except asyncio.CancelledError: + safe_logfire_info(f"Crawl v2 cancelled | progress_id={progress_id}") + raise + except Exception as e: + safe_logfire_error(f"Crawl v2 task failed | progress_id={progress_id} | error={str(e)}") + await tracker.error(str(e)) + finally: + # Clean up task from registry when done + if progress_id in active_crawl_tasks: + del active_crawl_tasks[progress_id] + safe_logfire_info(f"Cleaned up crawl v2 task from registry | progress_id={progress_id}") + + @router.post("/documents/upload") async def upload_document( file: UploadFile = File(...), diff --git a/python/src/server/models/__init__.py b/python/src/server/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/src/server/models/crawl_models.py b/python/src/server/models/crawl_models.py new file mode 100644 index 0000000000..fcc8a762ef --- /dev/null +++ b/python/src/server/models/crawl_models.py @@ -0,0 +1,63 @@ +""" +Crawling Models Module + +This module contains Pydantic models for crawling configuration, +specifically for domain filtering and URL pattern matching. +""" + + +from pydantic import BaseModel, Field, validator + + +class CrawlConfig(BaseModel): + """Configuration for domain filtering during crawl.""" + + allowed_domains: list[str] | None = Field(None, description="Whitelist of domains to crawl") + excluded_domains: list[str] | None = Field(None, description="Blacklist of domains to exclude") + include_patterns: list[str] | None = Field(None, description="URL patterns to include (glob-style)") + exclude_patterns: list[str] | None = Field(None, description="URL patterns to exclude (glob-style)") + + @validator("allowed_domains", "excluded_domains", pre=True) + def normalize_domains(cls, v): + """Normalize domain formats for consistent matching.""" + if v is None: + return v + return [d.lower().strip().replace("http://", "").replace("https://", "").rstrip("/") for d in v] + + @validator("include_patterns", "exclude_patterns", pre=True) + def validate_patterns(cls, v): + """Validate URL patterns are valid glob patterns.""" + if v is None: + return v + # Ensure patterns are strings and not empty + return [p.strip() for p in v if p and isinstance(p, str) and p.strip()] + + +class CrawlRequestV2(BaseModel): + """Extended crawl request with domain filtering.""" + + url: str = Field(..., description="URL to start crawling from") + knowledge_type: str | None = Field("technical", description="Type of knowledge (technical/business)") + tags: list[str] | None = Field(default_factory=list, description="Tags to apply to crawled content") + update_frequency: int | None = Field(None, description="Update frequency in days") + max_depth: int | None = Field(3, description="Maximum crawl depth") + crawl_config: CrawlConfig | None = Field(None, description="Domain filtering configuration") + crawl_options: dict | None = Field(None, description="Additional crawl options") + extract_code_examples: bool | None = Field(True, description="Whether to extract 
code examples") + + @validator("url") + def validate_url(cls, v): + """Ensure URL is properly formatted.""" + if not v or not v.strip(): + raise ValueError("URL cannot be empty") + # Add http:// if no protocol specified + if not v.startswith(("http://", "https://")): + v = f"https://{v}" + return v.strip() + + @validator("knowledge_type") + def validate_knowledge_type(cls, v): + """Ensure knowledge type is valid.""" + if v and v not in ["technical", "business"]: + return "technical" # Default to technical if invalid + return v or "technical" diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index 55cd6d924a..3c3efd3248 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -12,12 +12,14 @@ from typing import Any, Optional from ...config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info +from ...models.crawl_models import CrawlConfig from ...utils import get_supabase_client from ...utils.progress.progress_tracker import ProgressTracker # Import strategies # Import operations from .document_storage_operations import DocumentStorageOperations +from .domain_filter import DomainFilter from .helpers.site_config import SiteConfig # Import helpers @@ -56,7 +58,7 @@ class CrawlingService: Combines functionality from both CrawlingService and CrawlOrchestrationService. """ - def __init__(self, crawler=None, supabase_client=None, progress_id=None): + def __init__(self, crawler=None, supabase_client=None, progress_id=None, crawl_config=None): """ Initialize the crawling service. @@ -64,21 +66,25 @@ def __init__(self, crawler=None, supabase_client=None, progress_id=None): crawler: The Crawl4AI crawler instance supabase_client: The Supabase client for database operations progress_id: Optional progress ID for HTTP polling updates + crawl_config: Optional CrawlConfig for domain filtering """ self.crawler = crawler self.supabase_client = supabase_client or get_supabase_client() self.progress_id = progress_id self.progress_tracker = None + self.crawl_config = crawl_config # Initialize helpers self.url_handler = URLHandler() self.site_config = SiteConfig() self.markdown_generator = self.site_config.get_markdown_generator() self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator() + # DomainFilter doesn't need initialization params - it uses config passed to is_url_allowed + self.domain_filter = DomainFilter() # Initialize strategies - self.batch_strategy = BatchCrawlStrategy(crawler, self.link_pruning_markdown_generator) - self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator) + self.batch_strategy = BatchCrawlStrategy(crawler, self.link_pruning_markdown_generator, self.domain_filter) + self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator, self.domain_filter) self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator) self.sitemap_strategy = SitemapCrawlStrategy() @@ -207,6 +213,7 @@ async def crawl_batch_with_progress( max_concurrent, progress_callback, self._check_cancellation, # Pass cancellation check + self.crawl_config, # Pass crawl_config for domain filtering ) async def crawl_recursive_with_progress( @@ -225,6 +232,7 @@ async def crawl_recursive_with_progress( max_concurrent, progress_callback, self._check_cancellation, # Pass cancellation check + self.crawl_config, # Pass crawl 
config for domain filtering
         )
 
     # Orchestration methods
@@ -293,6 +301,13 @@ async def send_heartbeat_if_needed():
         url = str(request.get("url", ""))
         safe_logfire_info(f"Starting async crawl orchestration | url={url} | task_id={task_id}")
 
+        # Ensure crawl_config is available for strategies
+        if self.crawl_config is None and "crawl_config" in request:
+            raw_config = request.get("crawl_config")
+            # Coerce serialized dicts back into CrawlConfig so the strategies can rely on attribute access
+            self.crawl_config = CrawlConfig(**raw_config) if isinstance(raw_config, dict) else raw_config
+            safe_logfire_info(f"Crawl config attached | has_config={bool(self.crawl_config)}")
+
         # Start the progress tracker if available
         if self.progress_tracker:
             await self.progress_tracker.start({
diff --git a/python/src/server/services/crawling/domain_filter.py b/python/src/server/services/crawling/domain_filter.py
new file mode 100644
index 0000000000..70970755fd
--- /dev/null
+++ b/python/src/server/services/crawling/domain_filter.py
@@ -0,0 +1,183 @@
+"""
+Domain Filtering Module
+
+This module provides domain filtering utilities for web crawling,
+allowing users to control which domains and URL patterns are crawled.
+"""
+
+import fnmatch
+from urllib.parse import urljoin, urlparse
+
+from ...config.logfire_config import get_logger
+from ...models.crawl_models import CrawlConfig
+
+logger = get_logger(__name__)
+
+
+class DomainFilter:
+    """
+    Handles domain and URL pattern filtering for crawl operations.
+
+    Priority order:
+    1. Blacklist (excluded_domains) - always blocks
+    2. Whitelist (allowed_domains) - must match if specified
+    3. Exclude patterns - blocks matching URLs
+    4. Include patterns - must match if specified
+    """
+
+    def is_url_allowed(self, url: str, base_url: str, config: CrawlConfig | None) -> bool:
+        """
+        Check if a URL should be crawled based on domain filtering configuration.
+
+        Args:
+            url: The URL to check
+            base_url: The base URL of the crawl (for resolving relative URLs)
+            config: The crawl configuration with filtering rules
+
+        Returns:
+            True if the URL should be crawled, False otherwise
+        """
+        if not config:
+            # No filtering configured, allow all URLs
+            return True
+
+        try:
+            # Parse the URL
+            parsed = urlparse(url)
+
+            # Handle relative URLs by using base URL's domain
+            if not parsed.netloc:
+                base_parsed = urlparse(base_url)
+                domain = base_parsed.netloc.lower()
+                # Resolve the relative URL against the base so pattern matching sees a full URL
+                full_url = urljoin(base_url, url)
+            else:
+                domain = parsed.netloc.lower()
+                full_url = url
+
+            # Remove www. prefix for consistent matching
+            # Strip leading www. only (not from middle of domain)
+            normalized_domain = domain
+            if normalized_domain.startswith("www."):
+                normalized_domain = normalized_domain[4:]
+
+            # PRIORITY 1: Blacklist always wins
+            if config.excluded_domains:
+                for excluded in config.excluded_domains:
+                    if self._matches_domain(normalized_domain, excluded.lower()):
+                        logger.debug(f"URL blocked by excluded domain | url={url} | domain={normalized_domain} | excluded={excluded}")
+                        return False
+
+            # PRIORITY 2: If whitelist exists, URL must match
+            if config.allowed_domains:
+                allowed = False
+                for allowed_domain in config.allowed_domains:
+                    if self._matches_domain(normalized_domain, allowed_domain.lower()):
+                        allowed = True
+                        break
+
+                if not allowed:
+                    logger.debug(f"URL blocked - not in allowed domains | url={url} | domain={normalized_domain}")
+                    return False
+
+            # PRIORITY 3: Check exclude patterns (glob-style)
+            if config.exclude_patterns:
+                for pattern in config.exclude_patterns:
+                    if fnmatch.fnmatch(full_url, pattern):
+                        logger.debug(f"URL blocked by exclude pattern | url={url} | pattern={pattern}")
+                        return False
+
+            # PRIORITY 4: Check include patterns if specified
+            if config.include_patterns:
+                matched = False
+                for pattern in config.include_patterns:
+                    if fnmatch.fnmatch(full_url, pattern):
+                        matched = True
+                        break
+
+                if not matched:
+                    logger.debug(f"URL blocked - doesn't match include patterns | url={url}")
+                    return False
+
+            logger.debug(f"URL allowed | url={url} | domain={normalized_domain}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error filtering URL | url={url} | error={str(e)}")
+            # On error, be conservative and block the URL
+            return False
+
+    def _matches_domain(self, domain: str, pattern: str) -> bool:
+        """
+        Check if a domain matches a pattern.
+
+        Supports:
+        - Exact matches: example.com matches example.com
+        - Subdomain wildcards: *.example.com matches sub.example.com
+        - Subdomain matching: sub.example.com matches sub.example.com and subsub.sub.example.com
+
+        Args:
+            domain: The domain to check (already normalized and lowercase)
+            pattern: The pattern to match against
+
+        Returns:
+            True if the domain matches the pattern
+        """
+        # Normalize inputs
+        domain = (domain or "").lower()
+        pattern = (pattern or "").lower()
+
+        # Remove any remaining protocol or path from pattern
+        pattern = pattern.replace("http://", "").replace("https://", "").split("/")[0]
+        # Drop port if present on pattern (e.g., example.com:8080)
+        pattern = pattern.split(":", 1)[0]
+        # Strip leading www. only
+        if pattern.startswith("www."):
+            pattern = pattern[4:]
+
+        # Drop port from domain defensively
+        domain = domain.split(":", 1)[0]
+
+        # Exact match
+        if domain == pattern:
+            return True
+
+        # Wildcard subdomain match (*.example.com)
+        if pattern.startswith("*."):
+            base_pattern = pattern[2:]  # Remove *.
+            # Check if domain ends with the base pattern and has a subdomain
+            if domain.endswith(base_pattern):
+                # Make sure it's a proper subdomain, not just containing the pattern
+                prefix = domain[:-len(base_pattern)]
+                if prefix and prefix.endswith("."):
+                    return True
+
+        # Subdomain match (allow any subdomain of the pattern)
+        # e.g., pattern=example.com should match sub.example.com
+        if domain.endswith(f".{pattern}"):
+            return True
+
+        return False
+
+    def get_domains_from_urls(self, urls: list[str]) -> set[str]:
+        """
+        Extract unique domains from a list of URLs.
+ + Args: + urls: List of URLs to extract domains from + + Returns: + Set of unique domains (normalized and lowercase) + """ + domains = set() + for url in urls: + try: + parsed = urlparse(url) + if parsed.netloc: + domain = parsed.netloc.lower().replace("www.", "") + domains.add(domain) + except Exception as e: + logger.debug(f"Could not extract domain from URL | url={url} | error={str(e)}") + continue + + return domains diff --git a/python/src/server/services/crawling/strategies/batch.py b/python/src/server/services/crawling/strategies/batch.py index 1457fdca48..ecebe8c589 100644 --- a/python/src/server/services/crawling/strategies/batch.py +++ b/python/src/server/services/crawling/strategies/batch.py @@ -19,16 +19,18 @@ class BatchCrawlStrategy: """Strategy for crawling multiple URLs in batch.""" - def __init__(self, crawler, markdown_generator): + def __init__(self, crawler, markdown_generator, domain_filter=None): """ Initialize batch crawl strategy. Args: crawler (AsyncWebCrawler): The Crawl4AI crawler instance for web crawling operations markdown_generator (DefaultMarkdownGenerator): The markdown generator instance for converting HTML to markdown + domain_filter: Optional DomainFilter instance for URL filtering """ self.crawler = crawler self.markdown_generator = markdown_generator + self.domain_filter = domain_filter async def crawl_batch_with_progress( self, @@ -38,6 +40,7 @@ async def crawl_batch_with_progress( max_concurrent: int | None = None, progress_callback: Callable[..., Awaitable[None]] | None = None, cancellation_check: Callable[[], None] | None = None, + crawl_config=None, ) -> list[dict[str, Any]]: """ Batch crawl multiple URLs in parallel with progress reporting. @@ -149,11 +152,11 @@ async def report_progress(progress_val: int, message: str, status: str = "crawli **kwargs ) - total_urls = len(urls) + initial_urls_count = len(urls) await report_progress( 0, # Start at 0% progress - f"Starting to crawl {total_urls} URLs...", - total_pages=total_urls, + f"Starting to process {initial_urls_count} URLs...", + total_pages=initial_urls_count, processed_pages=0 ) @@ -162,14 +165,31 @@ async def report_progress(progress_val: int, message: str, status: str = "crawli processed = 0 cancelled = False - # Transform all URLs at the beginning + # Transform all URLs at the beginning and apply domain filtering url_mapping = {} # Map transformed URLs back to original transformed_urls = [] + filtered_count = 0 + for url in urls: + # Apply domain filtering if configured + if self.domain_filter and crawl_config: + # Use first URL as base for filtering (consistent with recursive strategy) + base_url = urls[0] if urls else url + if not self.domain_filter.is_url_allowed(url, base_url, crawl_config): + logger.debug(f"Filtering URL based on domain rules: {url}") + filtered_count += 1 + continue + transformed = transform_url_func(url) transformed_urls.append(transformed) url_mapping[transformed] = url + if filtered_count > 0: + logger.info(f"Filtered {filtered_count} URLs based on domain rules") + + # Update total count after filtering + total_urls = len(transformed_urls) + for i in range(0, total_urls, batch_size): # Check for cancellation before processing each batch if cancellation_check: diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py index d13b51d480..775ce41952 100644 --- a/python/src/server/services/crawling/strategies/recursive.py +++ b/python/src/server/services/crawling/strategies/recursive.py @@ 
-21,17 +21,19 @@ class RecursiveCrawlStrategy: """Strategy for recursive crawling of websites.""" - def __init__(self, crawler, markdown_generator): + def __init__(self, crawler, markdown_generator, domain_filter=None): """ Initialize recursive crawl strategy. Args: crawler (AsyncWebCrawler): The Crawl4AI crawler instance for web crawling operations markdown_generator (DefaultMarkdownGenerator): The markdown generator instance for converting HTML to markdown + domain_filter: Optional DomainFilter instance for URL filtering """ self.crawler = crawler self.markdown_generator = markdown_generator self.url_handler = URLHandler() + self.domain_filter = domain_filter async def crawl_recursive_with_progress( self, @@ -42,6 +44,7 @@ async def crawl_recursive_with_progress( max_concurrent: int | None = None, progress_callback: Callable[..., Awaitable[None]] | None = None, cancellation_check: Callable[[], None] | None = None, + crawl_config=None, ) -> list[dict[str, Any]]: """ Recursively crawl internal links from start URLs up to a maximum depth with progress reporting. @@ -291,6 +294,33 @@ def normalize_url(url): # Skip binary files and already visited URLs is_binary = self.url_handler.is_binary_file(next_url) if next_url not in visited and not is_binary: + # Apply domain filtering if configured + if self.domain_filter and crawl_config: + # Use next_url's origin for domain checks, fallback to original_url + # This ensures we're checking against the appropriate domain + base_url = original_url + if len(start_urls) > 0: + # If we have start_urls, use the first one + base_url = start_urls[0] + else: + # Try to use the current page's URL as base + # This handles relative links better + try: + from urllib.parse import urljoin + base_url = urljoin(original_url, next_url) + except Exception: + base_url = original_url + + # Wrap filter check in try/except to prevent crashes + try: + if not self.domain_filter.is_url_allowed(next_url, base_url, crawl_config): + logger.debug(f"Filtering URL based on domain rules: {next_url}") + continue + except Exception as e: + # Log error and conservatively skip the URL + logger.warning(f"Error checking domain filter for {next_url}: {str(e)}. 
Skipping URL.") + continue + if next_url not in next_level_urls: next_level_urls.add(next_url) total_discovered += 1 # Increment when we discover a new URL diff --git a/python/src/server/services/knowledge/knowledge_item_service.py b/python/src/server/services/knowledge/knowledge_item_service.py index de8c9e0a3a..a90f13fe23 100644 --- a/python/src/server/services/knowledge/knowledge_item_service.py +++ b/python/src/server/services/knowledge/knowledge_item_service.py @@ -375,6 +375,11 @@ async def _transform_source_to_item(self, source: dict[str, Any]) -> dict[str, A "url": first_page_url, "source_id": source_id, "code_examples": code_examples, + # Include important fields at top level for easy access + "knowledge_type": source_metadata.get("knowledge_type", "technical"), + "max_depth": source_metadata.get("max_depth"), + "tags": source_metadata.get("tags", []), + "crawl_config": source_metadata.get("crawl_config"), "metadata": { # Spread source_metadata first, then override with computed values **source_metadata, diff --git a/python/src/server/services/tests/__init__.py b/python/src/server/services/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/src/server/services/tests/test_domain_filter.py b/python/src/server/services/tests/test_domain_filter.py new file mode 100644 index 0000000000..104cc33064 --- /dev/null +++ b/python/src/server/services/tests/test_domain_filter.py @@ -0,0 +1,204 @@ +""" +Unit tests for domain filtering functionality +""" + +from src.server.models.crawl_models import CrawlConfig +from src.server.services.crawling.domain_filter import DomainFilter + + +class TestDomainFilter: + """Test suite for DomainFilter class.""" + + def setup_method(self): + """Set up test fixtures.""" + self.filter = DomainFilter() + + def test_no_config_allows_all(self): + """Test that no configuration allows all URLs.""" + assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", None) is True + assert self.filter.is_url_allowed("https://other.com/page", "https://example.com", None) is True + + def test_whitelist_only(self): + """Test whitelist-only configuration.""" + config = CrawlConfig( + allowed_domains=["example.com", "docs.example.com"] + ) + + # Should allow whitelisted domains + assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://docs.example.com/api", "https://example.com", config) is True + + # Should block non-whitelisted domains + assert self.filter.is_url_allowed("https://other.com/page", "https://example.com", config) is False + assert self.filter.is_url_allowed("https://evil.com", "https://example.com", config) is False + + def test_blacklist_only(self): + """Test blacklist-only configuration.""" + config = CrawlConfig( + excluded_domains=["evil.com", "ads.example.com"] + ) + + # Should block blacklisted domains + assert self.filter.is_url_allowed("https://evil.com/page", "https://example.com", config) is False + assert self.filter.is_url_allowed("https://ads.example.com/track", "https://example.com", config) is False + + # Should allow non-blacklisted domains + assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://docs.example.com/api", "https://example.com", config) is True + + def test_blacklist_overrides_whitelist(self): + """Test that blacklist takes priority over whitelist.""" + config = CrawlConfig( + 
allowed_domains=["example.com", "blog.example.com"], + excluded_domains=["blog.example.com"] + ) + + # Blacklist should override whitelist + assert self.filter.is_url_allowed("https://blog.example.com/post", "https://example.com", config) is False + + # Non-blacklisted whitelisted domain should work + assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True + + def test_subdomain_matching(self): + """Test subdomain matching patterns.""" + config = CrawlConfig( + allowed_domains=["example.com"] + ) + + # Should match subdomains of allowed domain + assert self.filter.is_url_allowed("https://docs.example.com/page", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://api.example.com/v1", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://sub.sub.example.com", "https://example.com", config) is True + + # Should not match different domains + assert self.filter.is_url_allowed("https://notexample.com", "https://example.com", config) is False + + def test_wildcard_subdomain_matching(self): + """Test wildcard subdomain patterns.""" + config = CrawlConfig( + allowed_domains=["*.example.com"] + ) + + # Should match subdomains + assert self.filter.is_url_allowed("https://docs.example.com/page", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://api.example.com/v1", "https://example.com", config) is True + + # Should NOT match the base domain without subdomain + assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is False + + def test_url_patterns_include(self): + """Test include URL patterns.""" + config = CrawlConfig( + include_patterns=["*/api/*", "*/docs/*"] + ) + + # Should match include patterns + assert self.filter.is_url_allowed("https://example.com/api/v1", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://example.com/docs/guide", "https://example.com", config) is True + + # Should not match URLs not in patterns + assert self.filter.is_url_allowed("https://example.com/blog/post", "https://example.com", config) is False + assert self.filter.is_url_allowed("https://example.com/", "https://example.com", config) is False + + def test_url_patterns_exclude(self): + """Test exclude URL patterns.""" + config = CrawlConfig( + exclude_patterns=["*/private/*", "*.pdf", "*/admin/*"] + ) + + # Should block excluded patterns + assert self.filter.is_url_allowed("https://example.com/private/data", "https://example.com", config) is False + assert self.filter.is_url_allowed("https://example.com/file.pdf", "https://example.com", config) is False + assert self.filter.is_url_allowed("https://example.com/admin/panel", "https://example.com", config) is False + + # Should allow non-excluded URLs + assert self.filter.is_url_allowed("https://example.com/public/page", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://example.com/file.html", "https://example.com", config) is True + + def test_combined_filters(self): + """Test combination of all filter types.""" + config = CrawlConfig( + allowed_domains=["example.com", "docs.example.com"], + excluded_domains=["ads.example.com"], + include_patterns=["*/api/*", "*/guide/*"], + exclude_patterns=["*/deprecated/*"] + ) + + # Should pass all filters + assert self.filter.is_url_allowed("https://docs.example.com/api/v2", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://example.com/guide/intro", 
"https://example.com", config) is True + + # Should fail on blacklist (highest priority) + assert self.filter.is_url_allowed("https://ads.example.com/api/track", "https://example.com", config) is False + + # Should fail on not in whitelist + assert self.filter.is_url_allowed("https://other.com/api/v1", "https://example.com", config) is False + + # Should fail on exclude pattern + assert self.filter.is_url_allowed("https://example.com/api/deprecated/old", "https://example.com", config) is False + + # Should fail on not matching include pattern + assert self.filter.is_url_allowed("https://example.com/blog/post", "https://example.com", config) is False + + def test_relative_urls(self): + """Test handling of relative URLs.""" + config = CrawlConfig( + allowed_domains=["example.com"] + ) + + # Relative URLs should use base URL's domain + assert self.filter.is_url_allowed("/page/path", "https://example.com", config) is True + assert self.filter.is_url_allowed("page.html", "https://example.com", config) is True + assert self.filter.is_url_allowed("../other/page", "https://example.com", config) is True + + def test_domain_normalization(self): + """Test that domains are properly normalized.""" + config = CrawlConfig( + allowed_domains=["EXAMPLE.COM", "https://docs.example.com/", "www.test.com"] + ) + + # Should handle different cases and formats + assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://EXAMPLE.COM/PAGE", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://docs.example.com/api", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://www.test.com/page", "https://example.com", config) is True + assert self.filter.is_url_allowed("https://test.com/page", "https://example.com", config) is True + + def test_edge_cases(self): + """Test edge cases and error handling.""" + config = CrawlConfig( + allowed_domains=["example.com"] + ) + + # Should handle malformed URLs gracefully + assert self.filter.is_url_allowed("not-a-url", "https://example.com", config) is True # Treated as relative + assert self.filter.is_url_allowed("", "https://example.com", config) is True # Empty URL + assert self.filter.is_url_allowed("//example.com/page", "https://example.com", config) is True # Protocol-relative + + def test_get_domains_from_urls(self): + """Test extracting domains from URL list.""" + urls = [ + "https://example.com/page1", + "https://docs.example.com/api", + "https://example.com/page2", + "https://other.com/resource", + "https://WWW.TEST.COM/page", + "/relative/path", # Should be skipped + "invalid-url", # Should be skipped + ] + + domains = self.filter.get_domains_from_urls(urls) + + assert domains == {"example.com", "docs.example.com", "other.com", "test.com"} + + def test_empty_filter_lists(self): + """Test that empty filter lists behave correctly.""" + config = CrawlConfig( + allowed_domains=[], + excluded_domains=[], + include_patterns=[], + exclude_patterns=[] + ) + + # Empty lists should be ignored (allow all) + assert self.filter.is_url_allowed("https://any.com/page", "https://example.com", config) is True