diff --git a/PRPs/fix-error-operation-visibility.md b/PRPs/fix-error-operation-visibility.md new file mode 100644 index 0000000000..36024c47ea --- /dev/null +++ b/PRPs/fix-error-operation-visibility.md @@ -0,0 +1,1226 @@ +name: "Fix Error Operation Visibility in Knowledge Base UI" +description: | + Fix the architectural issue where failed crawl operations disappear from the UI instead of showing error state with retry/remove actions. + +--- + +## Goal + +**Feature Goal**: Failed crawl operations must remain visible in the UI with clear error indication and actionable recovery options (retry, remove) instead of silently disappearing after 5-30 seconds. + +**Deliverable**: +1. "Failed Operations" section in Knowledge Base UI showing persistent error states +2. Backend API endpoint to retrieve failed operations with extended retention +3. Retry mechanism for failed crawls with original parameters +4. Remove action for explicit user dismissal of failed operations + +**Success Definition**: +- When a crawl fails with MemoryError or any error, it appears in "Failed Operations" section +- Error message is clearly displayed with operation details (URL, timestamp, error reason) +- User can click "Retry" to restart crawl with same parameters +- User can click "Remove" to explicitly dismiss the failed operation +- Failed operations persist for 5 minutes (not 30 seconds) before auto-cleanup +- Active operations list excludes failed operations (shown separately) + +## User Persona + +**Target User**: Archon end user attempting to add knowledge sources + +**Use Case**: User adds large documentation site (e.g., docs.mem0.ai/llms.txt) that fails due to memory constraints + +**User Journey**: +1. User clicks "Add Knowledge" and enters URL +2. Crawl starts successfully, shows in Active Operations +3. After 8 minutes, crawl fails with MemoryError +4. **CURRENT**: Operation disappears completely from UI +5. **NEW**: Operation moves to "Failed Operations" section with error badge +6. 
User sees error message: "Crawl failed: Memory usage exceeded threshold for 600.0 seconds" +7. User can click "Retry" to attempt again or "Remove" to dismiss +8. If user does nothing, operation auto-removes after 5 minutes + +**Pain Points Addressed**: +- Silent failures with no user visibility +- No way to diagnose why crawl failed +- No recovery path without re-entering URL +- Wasted time re-attempting same failing URLs +- Loss of trust in system reliability + +## Why + +- **Business Value**: Users can diagnose and recover from failures instead of experiencing silent errors +- **Integration**: Extends existing progress tracking system with persistent error handling +- **Problems Solved**: + - Issue #801: MemoryError causes operations to disappear + - User frustration with "black box" failures + - Support burden from users reporting "disappeared jobs" + - Data loss when users don't know which URLs failed + +## What + +### User-Visible Behavior + +**Active Operations Section** (existing, no change): +- Shows in-progress crawls (status: starting, analyzing, crawling, processing) +- Stop button to cancel active operations +- Real-time progress percentage and current page + +**Failed Operations Section** (NEW): +- Shows operations with status: error, failed +- Red error badge with count (e.g., "Failed (2)") +- Each failed operation displays: + - Original URL + - Error icon (red) + - Error message summary (first 100 chars) + - Timestamp of failure + - Action buttons: "View Details", "Retry", "Remove" +- Expandable details showing full error message and crawl logs +- Auto-cleanup after 5 minutes (not 30 seconds) + +**Backend Changes**: +- Failed operations persist in backend for 5 minutes (300 seconds) +- New optional query parameter on the existing endpoint: `GET /api/progress/?include_failed=true` (no separate endpoint is added) +- Error state includes original request parameters for retry +- Cleanup delay extended from 30s to 300s for error states + +### Success Criteria + +- [ ] Failed crawls appear in "Failed Operations" section within 
1 second of failure +- [ ] Error message is displayed accurately from backend +- [ ] Retry button restarts crawl with identical parameters +- [ ] Remove button clears operation from UI and backend memory +- [ ] Failed operations auto-remove after 5 minutes +- [ ] Active operations list excludes failed operations +- [ ] Failed operation count badge shows correct number +- [ ] Error details are expandable/collapsible +- [ ] No regression in active operation tracking + +## All Needed Context + +### Context Completeness Check + +✅ **Validation**: An AI agent unfamiliar with Archon can implement this successfully using: +- Exact file paths to existing patterns for error UI, retry logic, and action buttons +- Specific line numbers for backend changes (cleanup delay, API filtering) +- Complete TanStack Query patterns for error handling and cache management +- UI component patterns with styling conventions (glassmorphism, color system) +- Backend progress tracker modification points + +### Documentation & References + +```yaml +# CRITICAL BACKEND UNDERSTANDING +- file: /mnt/c/Users/Leex279/Documents/GitHub/YouTube/tmp/Archon/tmp/issue-801-analysis.md + why: Complete root cause analysis of why operations disappear + critical: | + - Backend sets error state correctly (line 735-741 of crawling_service.py) + - ProgressTracker.error() schedules cleanup after 30s (progress_tracker.py:219) + - Backend API excludes error states from active list (progress_api.py:118) + - Frontend removes error queries after 5s (useProgressQueries.ts:143) + pattern: "Understand the complete disappearance sequence before fixing" + +- file: /mnt/c/Users/Leex279/Documents/GitHub/YouTube/tmp/Archon/tmp/issue-801-memory-root-cause.md + why: Understanding why MemoryError occurs helps design retry mechanism + critical: | + - MemoryError comes from crawl4ai after 600 seconds above 80% memory + - Retry should suggest lower CRAWL_MAX_CONCURRENT setting + - Error message should include memory optimization 
hints + pattern: "Use for retry guidance and error message enhancement" + +# MUST READ - Frontend Error Handling Patterns +- file: archon-ui-main/src/features/knowledge/components/KnowledgeCard.tsx + why: Shows existing error display pattern with red edge indicator + pattern: | + Lines 35-50: isError prop → red border styling + className: bg-red-500/10 border-red-500/20 + gotcha: "Use conditional border, not wrapper component" + +- file: archon-ui-main/src/features/knowledge/components/KnowledgeList.tsx + why: Container pattern for list of items with error states + pattern: | + Lines 45-89: Maps over items, handles loading/error/empty states + Empty state uses text-gray-400 + gotcha: "Handle empty failed operations list with helpful message" + +- file: archon-ui-main/src/features/progress/components/CrawlingProgress.tsx + why: Active operations display pattern - adapt for failed operations + pattern: | + Lines 1-200: Complete component structure for operation lists + Shows operation cards with status badges + Uses smart polling with visibility awareness + gotcha: "Don't use smart polling for failed operations - they're terminal states; the failed list only needs a slow fixed-interval refetch" + +- file: archon-ui-main/src/features/ui/components/DeleteConfirmModal.tsx + why: Reusable confirmation modal pattern for Remove action + pattern: | + Lines 1-150: Type-aware modal with entity-specific messages + Supports size variants: compact, default, large + Red destructive styling with glassmorphism + gotcha: "Use 'knowledge_item' type for failed operation removal" + +# MUST READ - Action Button Patterns +- file: archon-ui-main/src/features/knowledge/components/KnowledgeCardActions.tsx + why: Dropdown action menu pattern for retry/remove buttons + pattern: | + Lines 80-180: DropdownMenu with Radix UI primitives + Ghost button trigger, separators before destructive actions + Icons from lucide-react, tooltips on buttons + critical: "Place destructive 'Remove' action after separator at bottom" + +- file: 
archon-ui-main/src/features/knowledge/hooks/useKnowledgeQueries.ts + why: Mutation patterns for retry and remove operations + pattern: | + Lines 100-150: useDeleteKnowledgeItem with optimistic updates + Lines 200-250: Service layer integration with TanStack Query + onSuccess invalidates related queries + gotcha: "Retry is not a mutation - it's re-calling the create source endpoint" + +# MUST READ - Progress Tracking & Terminal States +- file: archon-ui-main/src/features/progress/hooks/useProgressQueries.ts + why: Complete progress polling and cleanup logic + pattern: | + Lines 22-23: TERMINAL_STATES = ["completed", "error", "failed", "cancelled"] + Lines 83-94: refetchInterval stops polling on terminal states + Lines 136-149: Error cleanup timeout set to 5 seconds + critical: | + - PROBLEM: Lines 136-149 auto-remove error queries after 5s + - FIX: Create separate hook for failed operations that doesn't auto-remove + - KEEP: Auto-removal for completed/cancelled, REMOVE for error/failed + +- file: archon-ui-main/src/features/shared/config/queryPatterns.ts + why: Shared constants for query configuration + pattern: | + Lines 8-15: STALE_TIMES constants + Line 3: DISABLED_QUERY_KEY for conditional queries + gotcha: "Use STALE_TIMES.normal (30s) for failed operations list" + +- file: archon-ui-main/src/features/shared/hooks/useSmartPolling.ts + why: Visibility-aware polling for active operations only + pattern: | + Lines 1-50: Pauses when tab hidden, slows when unfocused + Returns refetchInterval for TanStack Query + gotcha: "Don't use smart polling for failed operations - they're static" + +# MUST READ - Backend Progress API +- file: python/src/server/api_routes/progress_api.py + why: Active operations endpoint that currently excludes errors + pattern: | + Lines 18-19: TERMINAL_STATES = {"completed", "failed", "error", "cancelled"} + Lines 115-118: Filters OUT terminal states from active operations + Lines 100-152: list_active_operations() endpoint implementation + 
critical: | + CURRENT PROBLEM (Line 118): + ```python + if status not in TERMINAL_STATES: # Excludes error/failed! + active_operations.append(operation_data) + ``` + FIX OPTIONS: + 1. Add query param: ?include_failed=true to include failed ops + 2. Create new endpoint: GET /api/progress/failed + 3. Modify filter to include failed but not completed + RECOMMENDED: Option 1 (query param) for simplicity + +- file: python/src/server/utils/progress/progress_tracker.py + why: Progress state storage and cleanup mechanism + pattern: | + Lines 61-73: _delayed_cleanup method with 30s delay + Lines 196-219: error() method that schedules cleanup + Lines 163-164: Update method schedules cleanup for failed/cancelled + critical: | + CURRENT PROBLEM (Line 61): + ```python + async def _delayed_cleanup(cls, progress_id: str, delay_seconds: int = 30): + ``` + FIX (Line 61): + ```python + async def _delayed_cleanup(cls, progress_id: str, delay_seconds: int = 300): + ``` + OR make delay_seconds configurable per status type + +- file: python/src/server/models/progress_models.py + why: Progress response schema for API + pattern: Check for existing error field structure + gotcha: "Error field should include full message, not truncated" + +# MUST READ - TanStack Query Error Handling +- file: archon-ui-main/src/features/shared/config/queryClient.ts + why: Global query client configuration for retries and errors + pattern: | + Lines 1-50: Default staleTime, gcTime, retry logic + Retry disabled for 4xx errors (client errors) + gotcha: "Failed operations endpoint should return 200, not 404" + +- file: archon-ui-main/src/features/shared/utils/optimistic.ts + why: Optimistic update utilities using nanoid + pattern: | + Lines 1-100: createOptimisticEntity, replaceOptimisticEntity + Uses _optimistic flag and _localId + gotcha: "Don't need optimistic updates for retry - it's a new operation" + +# UI Design System +- file: archon-ui-main/src/features/ui/primitives/ + why: Radix UI primitives used 
throughout Archon + pattern: | + Accordion, AlertDialog, Badge, Button, DropdownMenu + All styled with glassmorphism and Tron-inspired design + critical: "Use Badge for failed operation count, AlertDialog for errors" + +- docfile: PRPs/ai_docs/UI_STANDARDS.md + why: Comprehensive UI styling standards + section: "Section 2: Styling Patterns - Color System, Glassmorphism" + critical: | + - Error colors: bg-red-500/10, text-red-400, border-red-500/20 + - Never construct Tailwind classes dynamically + - Use conditional className strings + pattern: "Follow glassmorphism pattern with backdrop-blur" + +# External Best Practices +- url: https://docs.github.com/en/actions/monitoring-and-troubleshooting-workflows/viewing-workflow-run-history + why: GitHub Actions failed workflow UI pattern + critical: | + - Status badges with icon + text + - Expandable error logs + - Retry button prominently placed + - Clear timestamp and duration + pattern: "Status badge → Expandable details → Action buttons" + +- url: https://vercel.com/docs/deployments/managing-deployments#deployment-states + why: Vercel deployment states and error handling + critical: | + - Failed deployments stay in history + - Multi-channel notifications (web + email) + - One-click retry from failed state + pattern: "Persistent history with retry capability" + +- url: https://developer.mozilla.org/en-US/docs/Web/Accessibility/WCAG/Understanding_SC/2_2_1 + why: WCAG 2.1 - Don't auto-dismiss error messages + critical: "Errors must persist until user acknowledges - NEVER auto-dismiss" + pattern: "User-controlled dismissal only" +``` + +### Current Codebase Tree (Relevant Sections) + +```bash +archon-ui-main/src/features/ +├── knowledge/ +│ ├── components/ +│ │ ├── KnowledgeCard.tsx # Has error state pattern (red border) +│ │ ├── KnowledgeCardActions.tsx # Dropdown action menu pattern +│ │ ├── KnowledgeList.tsx # List container with states +│ │ └── AddKnowledgeDialog.tsx # Create knowledge source +│ ├── hooks/ +│ │ └── 
useKnowledgeQueries.ts # Query/mutation hooks +│ ├── services/ +│ │ └── knowledgeService.ts # API calls +│ └── views/ +│ └── KnowledgeView.tsx # Main view component +├── progress/ +│ ├── components/ +│ │ └── CrawlingProgress.tsx # Active operations display +│ ├── hooks/ +│ │ └── useProgressQueries.ts # Progress polling logic +│ └── services/ +│ └── progressService.ts # Progress API calls +└── ui/ + ├── components/ + │ └── DeleteConfirmModal.tsx # Confirmation modal pattern + └── primitives/ # Radix UI components + ├── Badge.tsx + ├── Button.tsx + └── DropdownMenu.tsx + +python/src/server/ +├── api_routes/ +│ └── progress_api.py # Lines 100-152: list_active_operations +├── utils/progress/ +│ └── progress_tracker.py # Lines 61-73: _delayed_cleanup +├── models/ +│ └── progress_models.py # Progress response schemas +└── services/crawling/ + └── crawling_service.py # Lines 728-751: Error handling +``` + +### Desired Codebase Tree (Files to Add/Modify) + +```bash +# NEW FILES +archon-ui-main/src/features/progress/components/ +└── FailedOperationsSection.tsx # NEW: Display failed operations with retry/remove + +archon-ui-main/src/features/knowledge/hooks/ +└── useRetryKnowledgeSource.ts # NEW: Retry failed crawl mutation + +# MODIFIED FILES +archon-ui-main/src/features/progress/hooks/ +└── useProgressQueries.ts # MODIFY: Add useFailedOperations hook + +archon-ui-main/src/features/progress/services/ +└── progressService.ts # MODIFY: Add listFailedOperations method + +archon-ui-main/src/features/knowledge/views/ +└── KnowledgeView.tsx # MODIFY: Add FailedOperationsSection + +python/src/server/api_routes/ +└── progress_api.py # MODIFY: Add ?include_failed param + +python/src/server/utils/progress/ +└── progress_tracker.py # MODIFY: Extend cleanup delay to 300s +``` + +### Known Gotchas & Library Quirks + +```typescript +// CRITICAL: TanStack Query - Auto-removal timing +// Current problem in useProgressQueries.ts:136-149 +// Error states are removed after 5 seconds via 
setTimeout +// This must be DISABLED for failed operations section + +// PATTERN: Don't auto-remove error queries +// INSTEAD: Let user explicitly remove via "Remove" button +if (status === "error" || status === "failed") { + // DON'T DO THIS for failed operations display: + // setTimeout(() => queryClient.removeQueries(...), 5000) + + // DO THIS instead: + // Only remove when user clicks "Remove" button + // Let _delayed_cleanup handle auto-removal after 5 minutes +} + +// CRITICAL: Backend cleanup timing +// python/src/server/utils/progress/progress_tracker.py:61 +// Default delay_seconds=30 is too short for user visibility +// Change to delay_seconds=300 (5 minutes) for error states + +// GOTCHA: Terminal state filtering +// Backend excludes ALL terminal states from active operations +// Need to include failed operations when requested +// Use query parameter: ?include_failed=true + +// CRITICAL: Retry mechanism +// Retry is NOT a mutation update - it's creating a new crawl operation +// Must store original request parameters in error state +// Then call knowledge service's create endpoint with same params + +// GOTCHA: Radix UI DropdownMenu +// Must wrap DropdownMenuItem with + + + + + + {/* Expanded Error Details */} + {expandedOps.has(op.progress_id) && ( +
+

Full Error Message:

+
+                      {op.error}
+                    
+ + {op.logs && op.logs.length > 0 && ( + <> +

Crawl Logs:

+
+ {op.logs.slice(-10).map((log: any, idx: number) => ( +
+ {new Date(log.timestamp).toLocaleTimeString()} + {" - "} + {log.message} +
+ ))} +
+ + )} +
+ )} + + ))} + + + {/* Remove Confirmation Modal */} + {confirmRemove && ( + { + removeMutation.mutate(confirmRemove.progress_id); + setConfirmRemove(null); + }} + onCancel={() => setConfirmRemove(null)} + /> + )} + + ); + } + ``` + FOLLOW PATTERNS: + - CrawlingProgress.tsx for operation list structure + - KnowledgeCard.tsx for error styling (red border, bg-red-500/10) + - KnowledgeCardActions.tsx for action buttons + - DeleteConfirmModal.tsx for remove confirmation + STYLING: + - Error container: border-red-500/20 bg-red-500/10 + - Text: text-red-400 for errors, text-gray-300 for normal + - Glassmorphism: backdrop-blur-sm + DEPENDENCIES: Tasks 5, 6, 7 (hooks exist) + VALIDATION: Component displays failed operations with retry/remove buttons + +Task 9: MODIFY archon-ui-main/src/features/knowledge/views/KnowledgeView.tsx + ACTION: Add FailedOperationsSection below active operations + FILE: archon-ui-main/src/features/knowledge/views/KnowledgeView.tsx + CHANGES: + - Import: import { FailedOperationsSection } from "../../progress/components/FailedOperationsSection"; + - Add section after CrawlingProgress component (find existing location) + - Place between active operations and knowledge sources list + PATTERN: + ```tsx + {/* Active Operations */} + + + {/* Failed Operations - NEW */} + + + {/* Knowledge Sources List */} + + ``` + WHY: Failed operations appear in same view as active operations for visibility + DEPENDENCIES: Task 8 (FailedOperationsSection component exists) + VALIDATION: Failed operations section appears in Knowledge Base view + +Task 10: ADD archon-ui-main/src/features/progress/types/index.ts types + ACTION: Add FailedOperation and FailedOperationsResponse types + FILE: archon-ui-main/src/features/progress/types/index.ts + ADD TYPES (as shown in Data Models section above): + - FailedOperation interface + - FailedOperationsResponse interface + DEPENDENCIES: None (pure types) + VALIDATION: TypeScript compiles without errors +``` + +### Implementation 
Patterns & Key Details + +```typescript +// PATTERN: Failed operations query hook (no auto-removal) +export function useFailedOperations() { + return useQuery({ + queryKey: progressKeys.failed(), + queryFn: () => progressService.listFailedOperations(), + enabled: true, + refetchInterval: 30000, // Static polling - failed ops don't change often + staleTime: STALE_TIMES.normal, + // CRITICAL: No setTimeout cleanup like error queries + // User must explicitly click "Remove" button + }); +} + +// PATTERN: Retry mutation (restarts crawl with original params) +export function useRetryKnowledgeSource() { + const queryClient = useQueryClient(); + const { toast } = useToast(); + + return useMutation({ + mutationFn: async (failedOp: FailedOperation) => { + // GOTCHA: Retry is NOT an update - it's a new CREATE + // Use the original request parameters stored in error state + if (!failedOp.original_request?.url) { + throw new Error("Cannot retry: Original request data missing"); + } + + return knowledgeService.createSource(failedOp.original_request); + }, + + onSuccess: (data, failedOp) => { + // Remove from failed list (new operation will appear in active) + queryClient.removeQueries({ + queryKey: progressKeys.detail(failedOp.progress_id), + exact: true + }); + + // PATTERN: Invalidate both lists to ensure UI consistency + queryClient.invalidateQueries({ queryKey: progressKeys.failed() }); + queryClient.invalidateQueries({ queryKey: progressKeys.active() }); + }, + }); +} + +// PATTERN: Error display component styling +
+ {/* CRITICAL: Red theme for errors */} + {/* border-red-500/20 = 20% opacity red border */} + {/* bg-red-500/10 = 10% opacity red background */} + {/* backdrop-blur-sm = glassmorphism effect */} + +
+ +

+ {/* Error text in red-400 */} +

+
+
+ +// PATTERN: Action buttons with appropriate variants + + + +``` + +```python +# PATTERN: Extended cleanup delay for error states +async def error(self, error_message: str, error_details: dict[str, Any] | None = None, cleanup_delay_seconds: int = 300): + """ + Mark progress as failed with error information. + + Args: + error_message: Error message + error_details: Optional additional error details + cleanup_delay_seconds: Seconds before cleanup (default 300 = 5 minutes) + """ + self.state.update({ + "status": "error", + "error": error_message, + "error_time": datetime.now().isoformat(), + }) + + if error_details: + self.state["error_details"] = error_details + + self._update_state() + safe_logfire_error( + f"Progress error | progress_id={self.progress_id} | type={self.operation_type} | error={error_message}" + ) + + # CRITICAL: Extended delay for error states (300s vs 30s) + # Gives users time to see and act on failed operations + asyncio.create_task(self._delayed_cleanup(self.progress_id, cleanup_delay_seconds)) + +# PATTERN: Query parameter for including failed operations +@router.get("/") +async def list_active_operations(include_failed: bool = Query(False)): + """ + List all active operations. + + Args: + include_failed: If True, include failed/error operations in response + """ + try: + logfire.info("Listing active operations") + active_operations = [] + + for op_id, operation in ProgressTracker.list_active().items(): + status = operation.get("status", "unknown") + + # CRITICAL: Include failed operations when requested + should_include = ( + status not in TERMINAL_STATES or # Active operations + (include_failed and status in {"error", "failed"}) # Failed if requested + ) + + if should_include: + # ... build operation_data ... 
+ active_operations.append(operation_data) + + # GOTCHA: Return 200 even when no operations (not 404) + return { + "operations": active_operations, + "count": len(active_operations), + "timestamp": datetime.utcnow().isoformat() + } +``` + +### Integration Points + +```yaml +FRONTEND ROUTES: + - no change: /knowledge route already exists + - component: FailedOperationsSection added to existing KnowledgeView + +BACKEND API: + - modify: GET /api/progress/ to accept ?include_failed=true parameter + - response: Returns failed operations when include_failed=true + +TANSTACK QUERY CACHE: + - new key: progressKeys.failed() for failed operations list + - existing keys: progressKeys.active(), progressKeys.detail(id) + - invalidation: Both failed() and active() after retry + +STATE MANAGEMENT: + - no global state: All state in TanStack Query cache + - local state: Component-level for expanded operations + - modal state: Component-level for confirmation dialogs + +STYLING: + - design system: Follow PRPs/ai_docs/UI_STANDARDS.md + - error colors: bg-red-500/10, text-red-400, border-red-500/20 + - glassmorphism: backdrop-blur-sm on all cards + - icons: lucide-react (AlertCircle, RotateCcw, X, ChevronDown/Up) +``` + +## Validation Loop + +### Level 1: Syntax & Style (Immediate Feedback) + +```bash +# Backend validation +cd python +uv run ruff check src/server/api_routes/progress_api.py --fix +uv run ruff check src/server/utils/progress/progress_tracker.py --fix +uv run ruff check src/server/services/crawling/crawling_service.py --fix +uv run mypy src/server/api_routes/progress_api.py +uv run mypy src/server/utils/progress/progress_tracker.py + +# Frontend validation +cd archon-ui-main +npm run biome:fix src/features/progress/ +npm run biome:fix src/features/knowledge/hooks/useRetryKnowledgeSource.ts +npx tsc --noEmit 2>&1 | grep "src/features/progress\|src/features/knowledge" + +# Expected: Zero errors before proceeding +``` + +### Level 2: Unit Tests (Component Validation) + 
+```bash +# Backend tests (if test files exist) +cd python +uv run pytest tests/server/api_routes/test_progress_api.py -v -k "test_list_operations" +uv run pytest tests/server/utils/test_progress_tracker.py -v -k "test_error" + +# Frontend tests +cd archon-ui-main +npm run test src/features/progress/hooks/useProgressQueries.test.ts +npm run test src/features/progress/components/FailedOperationsSection.test.tsx + +# Expected: All existing tests still pass, new tests added and passing +``` + +### Level 3: Integration Testing (System Validation) + +```bash +# Start backend +cd python +docker compose up -d +# OR: uv run python -m src.server.main + +# Verify backend API changes +curl -X GET "http://localhost:8181/api/progress/?include_failed=true" | jq . +# Expected: Returns operations with "error" or "failed" status + +# Start frontend +cd archon-ui-main +npm run dev + +# Manual testing steps: +# 1. Open http://localhost:3737/knowledge +# 2. Click "Add Knowledge" with URL that will fail (e.g., very large site) +# 3. Wait for operation to fail (or stop it manually) +# 4. Verify failed operation appears in "Failed Operations" section +# 5. Click "View Details" - error message should expand +# 6. Click "Retry" - new crawl should start +# 7. Click "Remove" on a failed operation - should show confirmation +# 8. Confirm removal - operation should disappear from list +# 9. Wait 5 minutes - failed operations should auto-cleanup + +# Backend logs validation +docker compose logs -f archon-server | grep "Progress state cleaned up after delay" +# Expected: Shows cleanup after 300s (not 30s) + +# Query cache validation (Chrome DevTools) +# 1. Open React Query DevTools +# 2. Find ["progress", "failed"] query +# 3. Verify it's not auto-removed after 5 seconds +# 4. Click "Remove" button in UI +# 5. 
Verify query is removed from cache +``` + +### Level 4: Accessibility & UX Validation + +```bash +# Accessibility validation +cd archon-ui-main + +# Run axe-core accessibility tests (if configured) +npm run test:a11y src/features/progress/components/FailedOperationsSection.tsx + +# Manual accessibility checks: +# 1. Keyboard navigation: Tab through all buttons, Enter to activate +# 2. Screen reader: NVDA/JAWS announces "Failed Operations, 2 items" +# 3. Error messages: Read in full, not truncated +# 4. Button labels: Clear and descriptive +# 5. Focus indicators: Visible on all interactive elements + +# Color contrast validation (WCAG 2.1 AA) +# - Red error text (text-red-400) on dark background: Must be 4.5:1 ratio +# - Use browser dev tools or contrast checker + +# Performance validation +# - Failed operations list with 10 items should render in <100ms +# - No memory leaks when operations are removed +# - Chrome DevTools Performance tab: No long tasks + +# UX validation checklist: +# [ ] Failed operations are immediately visible (< 1 second after failure) +# [ ] Error messages are readable and helpful +# [ ] Retry button clearly indicates it will restart the crawl +# [ ] Remove requires confirmation (no accidental dismissals) +# [ ] Badge count matches number of failed operations +# [ ] Expandable details work smoothly +# [ ] Operations auto-cleanup after 5 minutes (not intrusive) +``` + +## Final Validation Checklist + +### Technical Validation + +- [ ] All 4 validation levels completed successfully +- [ ] Backend tests pass: `uv run pytest tests/server/ -v` +- [ ] Frontend tests pass: `npm run test` +- [ ] No linting errors: `uv run ruff check src/` and `npm run biome` +- [ ] No type errors: `uv run mypy src/` and `npx tsc --noEmit` +- [ ] Backend cleanup delay is 300s (verified in logs) +- [ ] Frontend doesn't auto-remove error queries (verified in DevTools) + +### Feature Validation + +- [ ] Failed crawls appear in "Failed Operations" section within 1 second 
+- [ ] Error message displays accurately from backend +- [ ] Retry button starts new crawl with identical parameters +- [ ] Remove button requires confirmation and clears operation +- [ ] Failed operations auto-remove after 5 minutes (not 30 seconds) +- [ ] Active operations list excludes failed operations +- [ ] Failed operation count badge shows correct number +- [ ] Error details expand/collapse smoothly +- [ ] No regression in active operation tracking (existing functionality works) + +### Code Quality Validation + +- [ ] Follows existing patterns: KnowledgeCard for errors, CrawlingProgress for lists +- [ ] File placement matches desired tree structure +- [ ] Styling matches UI_STANDARDS.md (glassmorphism, error colors) +- [ ] TanStack Query patterns match existing hooks (queryKeys, invalidation) +- [ ] No anti-patterns: No auto-removal for failed ops, no sync in async context +- [ ] Backend uses existing progress tracker patterns +- [ ] Frontend uses existing Radix UI primitives + +### User Experience Validation + +- [ ] User persona journey is satisfied (see error, retry, remove) +- [ ] Error messages are actionable and clear +- [ ] Retry provides user feedback (toast notification) +- [ ] Remove confirmation prevents accidental dismissal +- [ ] Failed operations persist long enough for users to see (5 minutes) +- [ ] UI is accessible (keyboard navigation, screen reader support) +- [ ] Color contrast meets WCAG 2.1 AA standards + +### Documentation & Deployment + +- [ ] Code is self-documenting with clear component/function names +- [ ] Error logs are informative: "Progress state cleaned up after delay | progress_id=... 
| status=error" +- [ ] No new environment variables required +- [ ] Backend changes backward compatible (query param is optional) +- [ ] Frontend changes don't break existing Knowledge Base functionality + +--- + +## Anti-Patterns to Avoid + +- ❌ Don't auto-dismiss error states with frontend timers - accessibility concern; the user dismisses explicitly via "Remove", and the only automatic removal is the backend's 5-minute cleanup +- ❌ Don't use setTimeout cleanup for failed operations - only for completed/cancelled +- ❌ Don't retry with different parameters - must use original request +- ❌ Don't construct Tailwind classes dynamically - use conditional strings +- ❌ Don't poll failed operations with smart polling - they're static, use fixed interval +- ❌ Don't create new error colors - use existing red-500/10, red-400 system +- ❌ Don't skip confirmation for remove - it's a destructive action +- ❌ Don't return 404 for empty failed operations list - return 200 with empty array +- ❌ Don't modify TERMINAL_STATES constant - it's correct, just change filtering logic +- ❌ Don't create separate backend endpoint - use query parameter on existing endpoint + +--- + +## Confidence Score: 9/10 + +**Why 9/10:** +- ✅ All patterns exist in codebase and are well-documented +- ✅ Exact file paths and line numbers provided +- ✅ Research covered all necessary areas +- ✅ Implementation is straightforward extension of existing patterns +- ✅ No new libraries or dependencies required +- ✅ Backward compatible changes +- ⚠️ Minor risk: Exact Tailwind class combinations might need adjustment +- ⚠️ Minor risk: DeleteConfirmModal type might need extension + +**One-Pass Implementation Likelihood:** Very High (90%) + +An AI agent unfamiliar with Archon can implement this successfully using only the PRP and codebase access because: +- All referenced files exist and patterns are proven +- Step-by-step tasks with exact file paths and line numbers +- Complete code examples for all new components +- Clear validation gates at each level +- Comprehensive context about why the problem exists and how to fix it 
diff --git a/archon-ui-main/src/features/knowledge/hooks/useRetryKnowledgeSource.ts b/archon-ui-main/src/features/knowledge/hooks/useRetryKnowledgeSource.ts new file mode 100644 index 0000000000..b93f9a1ef4 --- /dev/null +++ b/archon-ui-main/src/features/knowledge/hooks/useRetryKnowledgeSource.ts @@ -0,0 +1,54 @@ +/** + * Retry Knowledge Source Hook + * Allows retrying failed crawl operations with original parameters + */ + +import { useMutation, useQueryClient } from "@tanstack/react-query"; +import { progressKeys } from "../../progress/hooks/useProgressQueries"; +import type { FailedOperation } from "../../progress/types"; +import { useToast } from "../../shared/hooks/useToast"; +import { knowledgeService } from "../services"; + +export function useRetryKnowledgeSource() { + const queryClient = useQueryClient(); + const { showToast } = useToast(); + + return useMutation({ + mutationFn: async (failedOp: FailedOperation) => { + if (!failedOp.original_request?.url) { + throw new Error("Cannot retry: Original request data missing"); + } + + // Retry by calling crawl endpoint with original parameters + return knowledgeService.crawlUrl({ + url: failedOp.original_request.url, + max_depth: failedOp.original_request.max_depth, + tags: failedOp.original_request.tags, + }); + }, + + onSuccess: (_data, failedOp) => { + showToast( + `Crawl restarted for ${failedOp.original_request?.url}`, + "success", + ); + + // Remove the failed operation from failed list + queryClient.removeQueries({ + queryKey: progressKeys.detail(failedOp.progressId), + exact: true, + }); + + // Refresh both failed and active operations + queryClient.invalidateQueries({ queryKey: progressKeys.failed() }); + queryClient.invalidateQueries({ queryKey: progressKeys.active() }); + }, + + onError: (error) => { + showToast( + error instanceof Error ? 
error.message : "Could not restart crawl", + "error", + ); + }, + }); +} diff --git a/archon-ui-main/src/features/knowledge/views/KnowledgeView.tsx b/archon-ui-main/src/features/knowledge/views/KnowledgeView.tsx index c9a9a3af6d..fc413eec22 100644 --- a/archon-ui-main/src/features/knowledge/views/KnowledgeView.tsx +++ b/archon-ui-main/src/features/knowledge/views/KnowledgeView.tsx @@ -6,6 +6,7 @@ import { useEffect, useMemo, useRef, useState } from "react"; import { useToast } from "@/features/shared/hooks/useToast"; import { CrawlingProgress } from "../../progress/components/CrawlingProgress"; +import { FailedOperationsSection } from "../../progress/components/FailedOperationsSection"; import type { ActiveOperation } from "../../progress/types"; import { AddKnowledgeDialog } from "../components/AddKnowledgeDialog"; import { KnowledgeHeader } from "../components/KnowledgeHeader"; @@ -149,6 +150,11 @@ export const KnowledgeView = () => { )} + {/* Failed Operations - Show persistent error states */} +
+ +
+ {/* Knowledge Items List */} >(new Set()); + const [confirmRemove, setConfirmRemove] = useState(null); + + const toggleExpanded = (progressId: string) => { + setExpandedOps((prev) => { + const next = new Set(prev); + if (next.has(progressId)) { + next.delete(progressId); + } else { + next.add(progressId); + } + return next; + }); + }; + + if (isLoading) { + return
Loading failed operations...
; + } + + if (!failedOps?.operations.length) { + return null; + } + + return ( +
+ {/* Section Header */} +
+ +

Failed Operations

+ + {failedOps.count} + +
+ + {/* Failed Operations List */} +
+ {failedOps.operations.map((op) => ( +
+ {/* Operation Header */} +
+
+
+ +

+ {op.url || op.currentUrl || "Unknown URL"} +

+
+

+ Failed {op.error_time ? new Date(op.error_time).toLocaleString() : "recently"} +

+ {/* Error Message Preview */} +

{op.error || "Unknown error"}

+
+ + {/* Action Buttons */} +
+ + + +
+
+ + {/* Expanded Error Details */} + {expandedOps.has(op.progressId) && ( +
+

Full Error Message:

+
+                  {op.error}
+                
+ + {op.logs && op.logs.length > 0 && ( + <> +

Crawl Logs:

+
+ {op.logs + .slice(-10) + .filter((log): log is { timestamp: string; message: string } => typeof log !== "string") + .map((log) => ( +
+ {new Date(log.timestamp).toLocaleTimeString()} + {" - "} + {log.message} +
+ ))} +
+ + )} +
+ )} +
+ ))} +
+ + {/* Remove Confirmation Modal */} + {confirmRemove && ( + { + removeMutation.mutate(confirmRemove.progressId); + setConfirmRemove(null); + }} + onCancel={() => setConfirmRemove(null)} + /> + )} +
+ ); +} diff --git a/archon-ui-main/src/features/progress/hooks/useProgressQueries.ts b/archon-ui-main/src/features/progress/hooks/useProgressQueries.ts index 84f1bdd3d7..dcc8c3985e 100644 --- a/archon-ui-main/src/features/progress/hooks/useProgressQueries.ts +++ b/archon-ui-main/src/features/progress/hooks/useProgressQueries.ts @@ -3,13 +3,20 @@ * Handles polling for operation progress with TanStack Query */ -import { type UseQueryResult, useQueries, useQuery, useQueryClient } from "@tanstack/react-query"; +import { + type UseMutationResult, + type UseQueryResult, + useMutation, + useQueries, + useQuery, + useQueryClient, +} from "@tanstack/react-query"; import { useEffect, useMemo, useRef } from "react"; import { DISABLED_QUERY_KEY, STALE_TIMES } from "../../shared/config/queryPatterns"; import { useSmartPolling } from "../../shared/hooks"; import { APIServiceError } from "../../shared/types/errors"; import { progressService } from "../services"; -import type { ActiveOperationsResponse, ProgressResponse, ProgressStatus } from "../types"; +import type { ActiveOperationsResponse, FailedOperationsResponse, ProgressResponse, ProgressStatus } from "../types"; // Query keys factory export const progressKeys = { @@ -17,6 +24,7 @@ export const progressKeys = { lists: () => [...progressKeys.all, "list"] as const, detail: (id: string) => [...progressKeys.all, "detail", id] as const, active: () => [...progressKeys.all, "active"] as const, + failed: () => [...progressKeys.all, "failed"] as const, }; // Terminal states that should stop polling @@ -381,3 +389,47 @@ export function useMultipleOperations( }; }); } + +/** + * Get all failed operations + * These are operations with error/failed status that persist for 5 minutes + * IMPORTANT: These are NOT auto-removed - user must explicitly dismiss + */ +export function useFailedOperations() { + return useQuery({ + queryKey: progressKeys.failed(), + queryFn: () => progressService.listFailedOperations(), + enabled: true, + 
refetchInterval: 30000, // Poll every 30s - failed ops are mostly static + staleTime: STALE_TIMES.normal, + // CRITICAL: No auto-removal for failed operations + // User must explicitly click "Remove" button + }); +} + +/** + * Remove a failed operation from the list + * This is explicit user action, not automatic cleanup + */ +export function useRemoveFailedOperation(): UseMutationResult<{ progressId: string }, Error, string> { + const queryClient = useQueryClient(); + + return useMutation({ + mutationFn: async (progressId: string) => { + // Just remove from cache - backend will auto-cleanup after 5 minutes + // No API call needed - this is client-side dismissal + return { progressId }; + }, + + onSuccess: (_data, progressId) => { + // Remove specific operation + queryClient.removeQueries({ + queryKey: progressKeys.detail(progressId), + exact: true, + }); + + // Refresh failed operations list + queryClient.invalidateQueries({ queryKey: progressKeys.failed() }); + }, + }); +} diff --git a/archon-ui-main/src/features/progress/services/progressService.ts b/archon-ui-main/src/features/progress/services/progressService.ts index ba0e68baa5..c2bf00836b 100644 --- a/archon-ui-main/src/features/progress/services/progressService.ts +++ b/archon-ui-main/src/features/progress/services/progressService.ts @@ -4,7 +4,7 @@ */ import { callAPIWithETag } from "../../shared/api/apiClient"; -import type { ActiveOperationsResponse, ProgressResponse } from "../types"; +import type { ActiveOperationsResponse, FailedOperationsResponse, ProgressResponse } from "../types"; export const progressService = { /** @@ -21,4 +21,26 @@ export const progressService = { // IMPORTANT: Use trailing slash to avoid FastAPI redirect that breaks in Docker return callAPIWithETag("/api/progress/"); }, + + /** + * List all failed operations + * These are operations with error/failed status that persist for 5 minutes + */ + async listFailedOperations(): Promise { + // Request all operations including failed 
ones + const response = await callAPIWithETag( + "/api/progress/?include_failed=true" + ); + + // Filter to only return operations with error or failed status + const failedOperations = response.operations.filter( + (op) => op.status === "error" || op.status === "failed" + ); + + return { + operations: failedOperations, + count: failedOperations.length, + timestamp: response.timestamp, + }; + }, }; diff --git a/archon-ui-main/src/features/progress/types/progress.ts b/archon-ui-main/src/features/progress/types/progress.ts index c57426b9ca..07140ab201 100644 --- a/archon-ui-main/src/features/progress/types/progress.ts +++ b/archon-ui-main/src/features/progress/types/progress.ts @@ -156,7 +156,14 @@ export interface ProgressResponse { fileSize?: number; chunksProcessed?: number; totalChunks?: number; - logs?: string[]; + logs?: + | string[] + | Array<{ + timestamp: string; + message: string; + status?: string; + progress?: number; + }>; timestamp?: string; startedAt?: string; // ISO date string of when operation started stats?: { @@ -172,3 +179,23 @@ export interface ProgressResponse { current_operation?: string; }; } + +// Failed operation types for error visibility feature +export interface FailedOperation extends ProgressResponse { + status: "error" | "failed"; + error: string; // Error message (required for failed operations) + error_time?: string; // ISO timestamp of when error occurred + original_request?: { + // For retry functionality - original parameters + url: string; + max_depth?: number; + max_concurrent?: number; + tags?: string[]; + }; +} + +export interface FailedOperationsResponse { + operations: FailedOperation[]; + count: number; + timestamp: string; +} diff --git a/python/src/server/api_routes/progress_api.py b/python/src/server/api_routes/progress_api.py index 96ab7eb9ff..916ab0cb9f 100644 --- a/python/src/server/api_routes/progress_api.py +++ b/python/src/server/api_routes/progress_api.py @@ -3,7 +3,7 @@ from datetime import datetime from 
email.utils import formatdate -from fastapi import APIRouter, Header, HTTPException, Response +from fastapi import APIRouter, Header, HTTPException, Query, Response from fastapi import status as http_status from ..config.logfire_config import get_logger, logfire @@ -98,14 +98,17 @@ async def get_progress( @router.get("/") -async def list_active_operations(): +async def list_active_operations(include_failed: bool = Query(False, description="Include failed/error operations in response")): """ List all active operations. + Args: + include_failed: If True, include failed/error operations in response + This endpoint is useful for debugging and monitoring active operations. """ try: - logfire.info("Listing active operations") + logfire.info(f"Listing active operations | include_failed={include_failed}") # Get all active operations from ProgressTracker active_operations = [] @@ -114,8 +117,14 @@ async def list_active_operations(): # Include all non-completed statuses for op_id, operation in ProgressTracker.list_active().items(): status = operation.get("status", "unknown") - # Include all operations that aren't in terminal states - if status not in TERMINAL_STATES: + + # Include active operations OR failed operations when requested + should_include = ( + status not in TERMINAL_STATES or # Active operations + (include_failed and status in {"error", "failed"}) # Failed if requested + ) + + if should_include: operation_data = { "operation_id": op_id, "operation_type": operation.get("type", "unknown"), @@ -135,6 +144,11 @@ async def list_active_operations(): "total_pages": operation.get("total_pages"), "documents_created": operation.get("documents_created") or operation.get("chunks_stored"), "code_blocks_found": operation.get("code_blocks_found") or operation.get("code_examples_found"), + # Include error information for failed operations + "error": operation.get("error"), + "error_time": operation.get("error_time"), + # Include original request for retry functionality + 
"original_request": operation.get("original_request"), } # Only include non-None values to keep response clean active_operations.append({k: v for k, v in operation_data.items() if v is not None}) diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index 01122704d8..273a6d23f1 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -732,12 +732,22 @@ async def code_progress_callback(data: dict): error_message = f"Crawl failed: {str(e)}" # Use ProgressMapper to get proper progress value for error state error_progress = self.progress_mapper.map_progress("error", 0) + + # Store original request parameters for retry functionality + original_request = { + "url": request.get("url"), + "max_depth": request.get("max_depth"), + "max_concurrent": request.get("max_concurrent"), + "tags": request.get("tags"), + } + await self._handle_progress_update( task_id, { "status": "error", "progress": error_progress, "log": error_message, - "error": str(e) + "error": str(e), + "original_request": original_request, } ) # Mark error in progress tracker with standardized schema diff --git a/python/src/server/utils/progress/progress_tracker.py b/python/src/server/utils/progress/progress_tracker.py index 60a7936395..4d50f45689 100644 --- a/python/src/server/utils/progress/progress_tracker.py +++ b/python/src/server/utils/progress/progress_tracker.py @@ -61,7 +61,7 @@ def list_active(cls) -> dict[str, dict[str, Any]]: async def _delayed_cleanup(cls, progress_id: str, delay_seconds: int = 30): """ Remove progress state from memory after a delay. - + This gives clients time to see the final state before cleanup. 
""" await asyncio.sleep(delay_seconds) @@ -106,7 +106,7 @@ async def update(self, status: str, progress: int, log: str, **kwargs): f"DEBUG: ProgressTracker.update called | status={status} | progress={progress} | " f"current_state_progress={self.state.get('progress', 0)} | kwargs_keys={list(kwargs.keys())}" ) - + # CRITICAL: Never allow progress to go backwards current_progress = self.state.get("progress", 0) new_progress = min(100, max(0, progress)) # Ensure 0-100 @@ -129,7 +129,7 @@ async def update(self, status: str, progress: int, log: str, **kwargs): "log": log, "timestamp": datetime.now().isoformat(), }) - + # DEBUG: Log final state for document_storage if status == "document_storage" and actual_progress >= 35: safe_logfire_info( @@ -155,13 +155,15 @@ async def update(self, status: str, progress: int, log: str, **kwargs): for key, value in kwargs.items(): if key not in protected_fields: self.state[key] = value - + self._update_state() - - # Schedule cleanup for terminal states - if status in ["cancelled", "failed"]: - asyncio.create_task(self._delayed_cleanup(self.progress_id)) + + # Schedule cleanup for terminal states with extended delay for failed (user visibility) + if status == "failed": + asyncio.create_task(self._delayed_cleanup(self.progress_id, 300)) # 5 minutes for failed operations + elif status == "cancelled": + asyncio.create_task(self._delayed_cleanup(self.progress_id, 30)) # 30 seconds for cancelled async def complete(self, completion_data: dict[str, Any] | None = None): """ @@ -189,17 +191,18 @@ async def complete(self, completion_data: dict[str, Any] | None = None): safe_logfire_info( f"Progress completed | progress_id={self.progress_id} | type={self.operation_type} | duration={self.state.get('duration_formatted', 'unknown')}" ) - + # Schedule cleanup after delay to allow clients to see final state asyncio.create_task(self._delayed_cleanup(self.progress_id)) - async def error(self, error_message: str, error_details: dict[str, Any] | None = 
None): + async def error(self, error_message: str, error_details: dict[str, Any] | None = None, cleanup_delay_seconds: int = 300): """ Mark progress as failed with error information. Args: error_message: Error message error_details: Optional additional error details + cleanup_delay_seconds: Seconds before cleanup (default 300 = 5 minutes for user visibility) """ self.state.update({ "status": "error", @@ -214,9 +217,9 @@ async def error(self, error_message: str, error_details: dict[str, Any] | None = safe_logfire_error( f"Progress error | progress_id={self.progress_id} | type={self.operation_type} | error={error_message}" ) - - # Schedule cleanup after delay to allow clients to see final state - asyncio.create_task(self._delayed_cleanup(self.progress_id)) + + # Schedule cleanup after extended delay to allow users to see and act on failed operations + asyncio.create_task(self._delayed_cleanup(self.progress_id, cleanup_delay_seconds)) async def update_batch_progress( self, current_batch: int, total_batches: int, batch_size: int, message: str @@ -241,9 +244,9 @@ async def update_batch_progress( ) async def update_crawl_stats( - self, - processed_pages: int, - total_pages: int, + self, + processed_pages: int, + total_pages: int, current_url: str | None = None, pages_found: int | None = None ): @@ -269,16 +272,16 @@ async def update_crawl_stats( "total_pages": total_pages, "current_url": current_url, } - + if pages_found is not None: update_data["pages_found"] = pages_found - + await self.update(**update_data) async def update_storage_progress( - self, - chunks_stored: int, - total_chunks: int, + self, + chunks_stored: int, + total_chunks: int, operation: str = "storing", word_count: int | None = None, embeddings_created: int | None = None @@ -294,7 +297,7 @@ async def update_storage_progress( embeddings_created: Number of embeddings created """ progress_val = int((chunks_stored / max(total_chunks, 1)) * 100) - + update_data = { "status": "document_storage", "progress": 
progress_val, @@ -302,14 +305,14 @@ async def update_storage_progress( "chunks_stored": chunks_stored, "total_chunks": total_chunks, } - + if word_count is not None: update_data["word_count"] = word_count if embeddings_created is not None: update_data["embeddings_created"] = embeddings_created - + await self.update(**update_data) - + async def update_code_extraction_progress( self, completed_summaries: int, @@ -319,7 +322,7 @@ async def update_code_extraction_progress( ): """ Update code extraction progress with detailed metrics. - + Args: completed_summaries: Number of code summaries completed total_summaries: Total code summaries to generate @@ -327,11 +330,11 @@ async def update_code_extraction_progress( current_file: Current file being processed """ progress_val = int((completed_summaries / max(total_summaries, 1)) * 100) - + log = f"Extracting code: {completed_summaries}/{total_summaries} summaries" if current_file: log += f" - {current_file}" - + await self.update( status="code_extraction", progress=progress_val,