Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import { useActiveOperations } from "../progress/hooks";
import { progressKeys } from "../progress/hooks/useProgressQueries";
import type { ActiveOperation, ActiveOperationsResponse } from "../progress/types";
import { knowledgeService } from "../services";
import { getProviderErrorMessage } from "../utils/providerErrorHandler";
import type {
CrawlRequest,
CrawlStartResponse,
Expand Down Expand Up @@ -273,7 +274,7 @@ export function useCrawlUrl() {
queryClient.setQueryData(progressKeys.list(), context.previousOperations);
}

const errorMessage = error instanceof Error ? error.message : "Failed to start crawl";
const errorMessage = getProviderErrorMessage(error) || "Failed to start crawl";
showToast(errorMessage, "error");
},
});
Expand Down
1 change: 1 addition & 0 deletions archon-ui-main/src/features/knowledge/utils/index.ts
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
export * from "./knowledge-utils";
export * from "./providerErrorHandler";
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/**
 * Provider-agnostic error handler for LLM operations.
 * Supports OpenAI, Google AI, Anthropic, and other providers.
 */

/** Error enriched with provider metadata parsed from backend responses. */
export interface ProviderError extends Error {
  /** HTTP status code reported by the backend, when available. */
  statusCode?: number;
  /** Provider display name (e.g. "OpenAI"), when the backend identified one. */
  provider?: string;
  /** Backend error classification (e.g. "authentication_failed"). */
  errorType?: string;
  /** True when the backend supplied a structured provider error payload. */
  isProviderError?: boolean;
}

/** Shape of the structured error detail the backend may embed in a message. */
interface BackendErrorDetail {
  detail?: {
    error_type?: string;
    provider?: string;
    message?: string;
  };
}

/**
 * Parse backend error responses into provider-aware error objects.
 *
 * Object inputs are annotated in place (preserving identity and stack);
 * non-object throwables (strings, null, undefined) are wrapped in a fresh
 * Error so callers can always read `.message` safely.
 */
export function parseProviderError(error: unknown): ProviderError {
  // Non-object throwables cannot carry metadata; wrap them so downstream
  // property access never explodes on `undefined`.
  if (error === null || typeof error !== "object") {
    return new Error(typeof error === "string" ? error : "") as ProviderError;
  }

  const providerError = error as ProviderError;
  // Narrow to an indexable record so strict mode allows property reads.
  const record = error as Record<string, unknown>;

  // Accept either `statusCode` or `status`, whichever the transport used.
  // `??` (not `||`) so a legitimate 0 would not be skipped.
  const status = record.statusCode ?? record.status;
  if (typeof status === "number") {
    providerError.statusCode = status;
  }

  // Parse the backend's structured error payload when present.
  const message = record.message;
  if (typeof message === "string" && message.includes("detail")) {
    try {
      const parsed = JSON.parse(message) as BackendErrorDetail;
      if (parsed.detail?.error_type) {
        providerError.isProviderError = true;
        providerError.provider = parsed.detail.provider || "LLM";
        providerError.errorType = parsed.detail.error_type;
        providerError.message = parsed.detail.message || message;
      }
    } catch {
      // Not JSON — keep the message as-is.
    }
  }

  return providerError;
}

/**
 * Get a user-friendly error message for any LLM provider.
 */
export function getProviderErrorMessage(error: unknown): string {
  const parsed = parseProviderError(error);

  if (parsed.isProviderError) {
    const provider = parsed.provider || "LLM";

    switch (parsed.errorType) {
      case "authentication_failed":
        return `Please verify your ${provider} API key in Settings.`;
      case "quota_exhausted":
        return `${provider} quota exhausted. Please check your billing settings.`;
      case "rate_limit":
        return `${provider} rate limit exceeded. Please wait and try again.`;
      default:
        return `${provider} API error. Please check your configuration.`;
    }
  }

  // Handle bare HTTP status codes from non-structured errors.
  if (parsed.statusCode === 401) {
    return "Please verify your API key in Settings.";
  }

  return parsed.message || "An error occurred.";
}
78 changes: 78 additions & 0 deletions python/src/server/api_routes/knowledge_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from ..config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info
from ..services.crawler_manager import get_crawler
from ..services.crawling import CrawlingService
from ..services.credential_service import credential_service
from ..services.embeddings.provider_error_adapters import ProviderErrorFactory
from ..services.knowledge import DatabaseMetricsService, KnowledgeItemService, KnowledgeSummaryService
from ..services.search.rag_service import RAGService
from ..services.storage import DocumentStorageService
Expand Down Expand Up @@ -53,6 +55,59 @@
active_crawl_tasks: dict[str, asyncio.Task] = {}




async def _validate_provider_api_key(provider: str | None = None) -> None:
    """Validate the LLM provider API key before starting an expensive operation.

    Issues a minimal embedding request; any failure is surfaced as a 401
    HTTPException with a structured detail payload so the caller aborts
    before crawling/refreshing/uploading.

    Args:
        provider: Embedding provider name (defaults to "openai").

    Raises:
        HTTPException: 401 with ``error_type: authentication_failed`` when
            validation fails for any reason (deliberately fail-closed).
    """
    logger.info("🔑 Starting API key validation...")

    # Normalize once so every log line and error payload agrees.
    provider = provider or "openai"
    provider_name = provider.title()

    try:
        logger.info(f"🔑 Testing {provider_name} API key with minimal embedding request...")

        # Test API key with minimal embedding request - this will fail if key is invalid
        from ..services.embeddings.embedding_service import create_embedding
        test_result = await create_embedding(text="test")

        if not test_result:
            logger.error(f"❌ {provider_name} API key validation failed - no embedding returned")
            raise HTTPException(
                status_code=401,
                detail={
                    "error": f"Invalid {provider_name} API key",
                    "message": f"Please verify your {provider_name} API key in Settings.",
                    "error_type": "authentication_failed",
                    "provider": provider,
                },
            )

        logger.info(f"✅ {provider_name} API key validation successful")

    except HTTPException:
        # Re-raise our intended HTTP exceptions unchanged.
        logger.error("🚨 Re-raising HTTPException from validation")
        raise
    except Exception as e:
        # Sanitize error before logging to prevent sensitive data exposure.
        sanitized_error = ProviderErrorFactory.sanitize_provider_error(str(e), provider)
        logger.error(f"❌ Caught exception during API key validation: {sanitized_error}")

        # NOTE(review): every exception (including network errors) is mapped to
        # a 401 auth failure — deliberately fail-closed to block expensive work.
        logger.error("🚨 API key validation failed - blocking crawl operation")
        raise HTTPException(
            status_code=401,
            detail={
                "error": "Invalid API key",
                "message": f"Please verify your {provider_name} API key in Settings before starting a crawl.",
                "error_type": "authentication_failed",
                "provider": provider,
            },
        ) from None


# Request Models
class KnowledgeItemRequest(BaseModel):
url: str
Expand Down Expand Up @@ -479,6 +534,14 @@ async def get_knowledge_item_code_examples(
@router.post("/knowledge-items/{source_id}/refresh")
async def refresh_knowledge_item(source_id: str):
"""Refresh a knowledge item by re-crawling its URL with the same metadata."""

# Validate API key before starting expensive refresh operation
logger.info("🔍 About to validate API key for refresh...")
provider_config = await credential_service.get_active_provider("embedding")
provider = provider_config.get("provider", "openai")
await _validate_provider_api_key(provider)
logger.info("✅ API key validation completed successfully for refresh")

try:
safe_logfire_info(f"Starting knowledge item refresh | source_id={source_id}")

Expand Down Expand Up @@ -597,6 +660,13 @@ async def crawl_knowledge_item(request: KnowledgeItemRequest):
if not request.url.startswith(("http://", "https://")):
raise HTTPException(status_code=422, detail="URL must start with http:// or https://")

# Validate API key before starting expensive operation
logger.info("🔍 About to validate API key...")
provider_config = await credential_service.get_active_provider("embedding")
provider = provider_config.get("provider", "openai")
await _validate_provider_api_key(provider)
logger.info("✅ API key validation completed successfully")

try:
safe_logfire_info(
f"Starting knowledge item crawl | url={str(request.url)} | knowledge_type={request.knowledge_type} | tags={request.tags}"
Expand Down Expand Up @@ -750,6 +820,14 @@ async def upload_document(
knowledge_type: str = Form("technical"),
):
"""Upload and process a document with progress tracking."""

# Validate API key before starting expensive upload operation
logger.info("🔍 About to validate API key for upload...")
provider_config = await credential_service.get_active_provider("embedding")
provider = provider_config.get("provider", "openai")
await _validate_provider_api_key(provider)
logger.info("✅ API key validation completed successfully for upload")

try:
# DETAILED LOGGING: Track knowledge_type parameter flow
safe_logfire_info(
Expand Down
16 changes: 16 additions & 0 deletions python/src/server/services/embeddings/embedding_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,22 @@ def __init__(self, message: str, original_error: Exception | None = None, **kwar
self.metadata["original_error_message"] = str(original_error)


class EmbeddingAuthenticationError(EmbeddingError):
    """
    Raised when API authentication fails (invalid or expired API key).

    This is a CRITICAL error that should stop the entire process,
    since no embedding work can succeed without valid API access.
    """

    def __init__(self, message: str, api_key_prefix: str | None = None, **kwargs):
        super().__init__(message, **kwargs)
        # Keep only a short masked prefix of the key for safe debugging output;
        # keys shorter than 3 characters are not recorded at all.
        if api_key_prefix and len(api_key_prefix) >= 3:
            self.api_key_prefix = api_key_prefix[:3] + "…"
            self.metadata["api_key_prefix"] = self.api_key_prefix
        else:
            self.api_key_prefix = None


class EmbeddingValidationError(EmbeddingError):
"""
Raised when embedding validation fails (e.g., zero vector detected).
Expand Down
162 changes: 162 additions & 0 deletions python/src/server/services/embeddings/provider_error_adapters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
"""
Provider-agnostic error handling for LLM embedding services.

Supports OpenAI, Google AI, Anthropic, Ollama, and future providers
with unified error handling and sanitization patterns.
"""

import re
from abc import ABC, abstractmethod

from .embedding_exceptions import (
EmbeddingAPIError,
EmbeddingAuthenticationError,
EmbeddingQuotaExhaustedError,
EmbeddingRateLimitError,
)


class ProviderErrorAdapter(ABC):
    """Abstract base class for provider-specific error handling.

    Concrete subclasses identify their provider and scrub provider-specific
    secrets (API keys, URLs, tokens) out of raw error messages.
    """

    @abstractmethod
    def get_provider_name(self) -> str:
        """Return the lowercase canonical name of the provider."""
        ...

    @abstractmethod
    def sanitize_error_message(self, message: str) -> str:
        """Return ``message`` with provider secrets redacted, or a generic fallback."""
        ...


class OpenAIErrorAdapter(ProviderErrorAdapter):
    """Sanitizes OpenAI error messages before they are logged or shown to users."""

    def get_provider_name(self) -> str:
        """Return the canonical provider identifier."""
        return "openai"

    def sanitize_error_message(self, message: str) -> str:
        """Redact OpenAI keys, URLs, org/project/request IDs and tokens.

        Returns a generic notice for non-string, blank, or oversized input,
        and whenever the redacted text still mentions internal details.
        """
        if not isinstance(message, str) or not message.strip() or len(message) > 2000:
            return "OpenAI API encountered an error. Please verify your API key and quota."

        sanitized = message

        # Comprehensive OpenAI patterns with case-insensitive matching.
        # The key pattern is deliberately broad (`sk-` + 20 or more key chars)
        # so modern variable-length keys such as "sk-proj-..." are redacted,
        # not just legacy 48-character keys. `\b` prevents matching inside
        # unrelated words (e.g. "risk-...").
        patterns = [
            (r'\bsk-[a-zA-Z0-9_-]{20,}', '[REDACTED_KEY]'),  # OpenAI API keys (legacy + project)
            (r'https?://[^\s]*openai\.com[^\s]*', '[REDACTED_URL]'),  # OpenAI URLs
            (r'org-[a-zA-Z0-9]{20,}', '[REDACTED_ORG]'),  # Organization IDs
            (r'proj_[a-zA-Z0-9]{10,}', '[REDACTED_PROJECT]'),  # Project IDs
            (r'req_[a-zA-Z0-9]{10,}', '[REDACTED_REQUEST]'),  # Request IDs
            (r'Bearer\s+[a-zA-Z0-9._-]+', 'Bearer [REDACTED_TOKEN]'),  # Bearer tokens
        ]

        for pattern, replacement in patterns:
            sanitized = re.sub(pattern, replacement, sanitized, flags=re.IGNORECASE)

        # Fail closed: if anything hinting at infrastructure survived, drop it.
        sensitive_words = ['internal', 'server', 'endpoint']
        if any(word in sanitized.lower() for word in sensitive_words):
            return "OpenAI API encountered an error. Please verify your API key and quota."

        return sanitized


class GoogleAIErrorAdapter(ProviderErrorAdapter):
    """Sanitizes Google AI error messages so keys, tokens and project IDs never leak."""

    def get_provider_name(self) -> str:
        """Return the canonical provider identifier."""
        return "google"

    def sanitize_error_message(self, message: str) -> str:
        """Redact Google AI credentials/URLs from ``message``.

        Returns a generic notice for non-string, blank, or oversized input,
        and whenever the redacted text still mentions internal details.
        """
        fallback = "Google AI API encountered an error. Please verify your API key."

        if not isinstance(message, str):
            return fallback
        if not message.strip() or len(message) > 2000:
            return fallback

        # (pattern, replacement) pairs covering Google AI credentials and URLs.
        redactions = (
            (r'AIza[a-zA-Z0-9_-]{35}', '[REDACTED_KEY]'),                       # API keys
            (r'https?://[^\s]*googleapis\.com[^\s]*', '[REDACTED_URL]'),        # API URLs
            (r'https?://[^\s]*googleusercontent\.com[^\s]*', '[REDACTED_URL]'), # content URLs
            (r'projects/[a-zA-Z0-9_-]+', 'projects/[REDACTED_PROJECT]'),        # GCP project paths
            (r'ya29\.[a-zA-Z0-9_-]+', '[REDACTED_TOKEN]'),                      # OAuth tokens
            (r'Bearer\s+[a-zA-Z0-9._-]+', 'Bearer [REDACTED_TOKEN]'),           # Bearer tokens
        )

        cleaned = message
        for pattern, replacement in redactions:
            cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

        # Fail closed when infrastructure hints remain after redaction.
        lowered = cleaned.lower()
        for word in ('internal', 'server', 'endpoint', 'project'):
            if word in lowered:
                return fallback

        return cleaned


class AnthropicErrorAdapter(ProviderErrorAdapter):
    """Sanitizes Anthropic error messages before logging or display."""

    def get_provider_name(self) -> str:
        """Return the canonical provider identifier."""
        return "anthropic"

    def sanitize_error_message(self, message: str) -> str:
        """Redact Anthropic API keys, URLs and bearer tokens from ``message``."""
        fallback = "Anthropic API encountered an error. Please verify your API key."

        if not isinstance(message, str):
            return fallback
        if not message.strip() or len(message) > 2000:
            return fallback

        cleaned = message
        for pattern, replacement in (
            (r'sk-ant-[a-zA-Z0-9_-]{10,}', '[REDACTED_KEY]'),           # Anthropic API keys
            (r'https?://[^\s]*anthropic\.com[^\s]*', '[REDACTED_URL]'), # Anthropic URLs
            (r'Bearer\s+[a-zA-Z0-9._-]+', 'Bearer [REDACTED_TOKEN]'),   # Bearer tokens
        ):
            cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

        # Fail closed when infrastructure hints remain after redaction.
        if any(term in cleaned.lower() for term in ('internal', 'server', 'endpoint')):
            return fallback

        return cleaned


class ProviderErrorFactory:
    """Factory for provider-agnostic error handling.

    Maps a provider name to its ``ProviderErrorAdapter`` and offers helpers
    for sanitizing messages and guessing the provider from raw error text.
    """

    # Singleton adapter instances, keyed by lowercase provider name.
    _adapters = {
        "openai": OpenAIErrorAdapter(),
        "google": GoogleAIErrorAdapter(),
        "anthropic": AnthropicErrorAdapter(),
    }

    @classmethod
    def get_adapter(cls, provider: str | None) -> ProviderErrorAdapter:
        """Return the adapter for ``provider``; OpenAI when unknown or None.

        Guards against None/empty input so callers don't need to
        pre-normalize (the original raised AttributeError on None).
        """
        if not provider:
            return cls._adapters["openai"]
        return cls._adapters.get(provider.lower(), cls._adapters["openai"])

    @classmethod
    def sanitize_provider_error(cls, message: str, provider: str | None) -> str:
        """Sanitize ``message`` using the adapter registered for ``provider``."""
        adapter = cls.get_adapter(provider)
        return adapter.sanitize_error_message(message)

    @classmethod
    def detect_provider_from_error(cls, error_str: str) -> str:
        """Detect provider from error message with comprehensive pattern matching."""
        if not error_str:
            return "openai"

        error_lower = error_str.lower()

        # Anthropic first: its key prefix ("sk-ant-") must win before any
        # looser OpenAI key match could claim the string.
        if ("anthropic" in error_lower
                or "claude" in error_lower
                or re.search(r'sk-ant-[a-zA-Z0-9_-]+', error_str, re.IGNORECASE)):
            return "anthropic"
        if ("google" in error_lower
                or "googleapis" in error_lower
                or "vertex" in error_lower
                or re.search(r'AIza[a-zA-Z0-9_-]+', error_str, re.IGNORECASE)):
            return "google"
        # Explicit OpenAI markers and everything unrecognized both resolve to
        # "openai" — it is the safe default.
        return "openai"