Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1a55d93
Implement priority-based automatic discovery of llms.txt and sitemap.…
leex279 Sep 8, 2025
43af7b7
fix: Update tests for single-file discovery and discovery stage integ…
leex279 Sep 8, 2025
d2adc15
fix: Address CodeRabbit critical issues for discovery service
leex279 Sep 8, 2025
77b0470
fix: Implement remaining CodeRabbit fixes for async and depth handling
leex279 Sep 8, 2025
8072066
Merge main into feature/automatic-discovery-llms-sitemap-430
leex279 Sep 20, 2025
2be44d1
fix: Resolve syntax error from merge conflict resolution
leex279 Sep 20, 2025
0a2c43f
fix: Update test assertions for proper rounding behavior in progress …
leex279 Sep 20, 2025
7f74aea
fix: Discovery now respects given URL path and fix method signature m…
leex279 Sep 20, 2025
c1677a9
fix: Skip discovery when user provides direct discovery file URLs
leex279 Sep 20, 2025
597fc86
fix: Skip link extraction for discovery targets (single-file mode)
leex279 Sep 20, 2025
d3cecd2
Merge branch 'main' into feature/automatic-discovery-llms-sitemap-430
leex279 Sep 22, 2025
d696918
Merge main into feature/automatic-discovery-llms-sitemap-430
leex279 Oct 11, 2025
968e5b7
Add SSL verification and response size limits to discovery service
leex279 Oct 14, 2025
e5160dd
fix: Address CodeRabbit feedback for discovery service
leex279 Oct 17, 2025
8777e94
feat: Prioritize same-directory discovery for llms.txt and sitemaps
leex279 Oct 17, 2025
a03ce1e
fix: Respect llms.txt priority over robots.txt sitemap declarations
leex279 Oct 17, 2025
cdf4323
feat: Implement llms.txt link following with discovery priority fix
leex279 Oct 17, 2025
8ab6c75
fix: Improve path detection and add progress validation
leex279 Oct 17, 2025
ddcd364
docs: Remove PRPs/llms-txt-link-following.md - not needed in repo
leex279 Oct 19, 2025
13796ab
feat: Improve discovery system with SSRF protection and optimize file…
leex279 Oct 19, 2025
957d8b9
fix: Update tests for requests.Session mock and cleanup URL validation
leex279 Oct 19, 2025
46ae553
fix: add tldextract to all dependency group
leex279 Oct 19, 2025
35c9ea9
fix: update test to use 'pages' terminology for llms.txt
leex279 Oct 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import { Button } from "../../ui/primitives";
import { cn } from "../../ui/primitives/styles";
import { useCrawlProgressPolling } from "../hooks";
import type { ActiveOperation } from "../types/progress";
import { isValidHttpUrl } from "../utils/urlValidation";

interface CrawlingProgressProps {
onSwitchToBrowse: () => void;
Expand Down Expand Up @@ -129,6 +130,7 @@ export const CrawlingProgress: React.FC<CrawlingProgressProps> = ({ onSwitchToBr
"in_progress",
"starting",
"initializing",
"discovery",
"analyzing",
"storing",
"source_creation",
Expand Down Expand Up @@ -245,6 +247,63 @@ export const CrawlingProgress: React.FC<CrawlingProgressProps> = ({ onSwitchToBr
)}
</div>

{/* Discovery Information */}
{operation.discovered_file && (
<div className="pt-2 border-t border-white/10">
<div className="flex items-center gap-2 mb-2">
<span className="text-xs font-semibold text-cyan-400">Discovery Result</span>
{operation.discovered_file_type && (
<span className="px-2 py-0.5 text-xs rounded bg-cyan-500/10 border border-cyan-500/20 text-cyan-300">
{operation.discovered_file_type}
</span>
)}
</div>
{isValidHttpUrl(operation.discovered_file) ? (
<a
href={operation.discovered_file}
target="_blank"
rel="noopener noreferrer"
className="text-sm text-gray-400 hover:text-cyan-400 transition-colors truncate block"
>
{operation.discovered_file}
</a>
) : (
<span className="text-sm text-gray-400 truncate block">
{operation.discovered_file}
</span>
)}
</div>
)}

{/* Linked Files */}
{operation.linked_files && operation.linked_files.length > 0 && (
<div className="pt-2 border-t border-white/10">
<div className="text-xs font-semibold text-cyan-400 mb-2">
Following {operation.linked_files.length} Linked File
{operation.linked_files.length > 1 ? "s" : ""}
</div>
<div className="space-y-1 max-h-32 overflow-y-auto">
{operation.linked_files.map((file: string, idx: number) => (
isValidHttpUrl(file) ? (
<a
key={idx}
href={file}
target="_blank"
rel="noopener noreferrer"
className="text-xs text-gray-400 hover:text-cyan-400 transition-colors truncate block"
>
• {file}
</a>
) : (
<span key={idx} className="text-xs text-gray-400 truncate block">
• {file}
</span>
)
))}
</div>
</div>
)}

{/* Current Action or Operation Type Info */}
{(operation.current_url || operation.operation_type) && (
<div className="pt-2 border-t border-white/10">
Expand Down
27 changes: 26 additions & 1 deletion archon-ui-main/src/features/progress/types/progress.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
export type ProgressStatus =
| "starting"
| "initializing"
| "discovery"
| "analyzing"
| "crawling"
| "processing"
Expand All @@ -24,7 +25,16 @@ export type ProgressStatus =
| "cancelled"
| "stopping";

export type CrawlType = "normal" | "sitemap" | "llms-txt" | "text_file" | "refresh";
/**
 * String-literal union of crawl type identifiers.
 *
 * Spelling must match exactly: "llms-txt" is hyphenated, while the other
 * multi-word members use underscores.
 *
 * NOTE(review): the last four members appear to belong to the llms.txt /
 * sitemap discovery flow — confirm against the values the backend emits.
 */
export type CrawlType =
| "normal"
| "sitemap"
| "llms-txt"
| "text_file"
| "refresh"
| "llms_txt_with_linked_files"
| "llms_txt_linked_files"
| "discovery_single_file"
| "discovery_sitemap";
export type UploadType = "document";

export interface BaseProgressData {
Expand All @@ -48,6 +58,10 @@ export interface CrawlProgressData extends BaseProgressData {
codeBlocksFound?: number;
totalSummaries?: number;
completedSummaries?: number;
// Discovery-related fields
discoveredFile?: string;
discoveredFileType?: string;
linkedFiles?: string[];
originalCrawlParams?: {
url: string;
knowledge_type?: string;
Expand Down Expand Up @@ -100,6 +114,10 @@ export interface ActiveOperation {
code_examples_found?: number;
current_operation?: string;
};
// Discovery information
discovered_file?: string;
discovered_file_type?: string;
linked_files?: string[];
}

export interface ActiveOperationsResponse {
Expand Down Expand Up @@ -127,6 +145,13 @@ export interface ProgressResponse {
codeBlocksFound?: number;
totalSummaries?: number;
completedSummaries?: number;
// Discovery-related fields
discoveredFile?: string;
discovered_file?: string; // Snake case from backend
discoveredFileType?: string;
discovered_file_type?: string; // Snake case from backend
linkedFiles?: string[];
linked_files?: string[]; // Snake case from backend
fileName?: string;
fileSize?: number;
chunksProcessed?: number;
Expand Down
44 changes: 44 additions & 0 deletions archon-ui-main/src/features/progress/utils/urlValidation.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/**
* Client-side URL validation utility for discovered files.
* Ensures only safe HTTP/HTTPS URLs are rendered as clickable links.
*/

/** Protocols that may be used as the target of a rendered link. */
const SAFE_PROTOCOLS: readonly string[] = ["http:", "https:"];

/**
 * Validates that a URL is safe to render as a clickable link.
 *
 * A URL is accepted only when all of the following hold:
 * - it is a non-empty string after trimming surrounding whitespace,
 * - it parses as an absolute URL,
 * - its protocol is `http:` or `https:`,
 * - its hostname is `localhost`, contains at least one dot, or is a
 *   bracketed IPv6 literal (e.g. `[::1]`).
 *
 * @param url - URL string to validate
 * @returns true if URL is safe (http/https), false otherwise
 */
export function isValidHttpUrl(url: string | undefined | null): boolean {
  if (!url || typeof url !== "string") {
    return false;
  }

  // Validate the trimmed form so stray whitespace does not cause a rejection.
  const trimmed = url.trim();
  if (!trimmed) {
    return false;
  }

  try {
    const { protocol, hostname } = new URL(trimmed);

    // Blocks javascript:, data:, file:, ftp:, etc.
    if (!SAFE_PROTOCOLS.includes(protocol)) {
      return false;
    }

    // The URL parser serializes IPv6 hosts in square brackets and has already
    // validated the address, so a bracketed hostname is a real IP literal.
    // Without this, IPv6 hosts would be rejected (they contain no dot) while
    // IPv4 literals such as 127.0.0.1 are accepted.
    const isIpv6Literal = hostname.startsWith("[") && hostname.endsWith("]");

    // Basic hostname validation: reject dotless single-label hosts other than
    // localhost.
    return isIpv6Literal || hostname === "localhost" || hostname.includes(".");
  } catch {
    // URL parsing failed - not a valid URL
    return false;
  }
}
2 changes: 2 additions & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ server = [
"pydantic>=2.0.0",
"python-dotenv>=1.0.0",
"docker>=6.1.0",
"tldextract>=5.0.0",
# Logging
"logfire>=0.30.0",
# Testing (needed for UI-triggered tests)
Expand Down Expand Up @@ -116,6 +117,7 @@ all = [
"cryptography>=41.0.0",
"slowapi>=0.1.9",
"docker>=6.1.0",
"tldextract>=5.0.0",
"logfire>=0.30.0",
# MCP specific (mcp version)
"mcp==1.12.2",
Expand Down
Loading