diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 807fc85cec..c114a838d6 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -28,16 +28,25 @@ sequenceDiagram
     participant Client
     participant ProxyServer as proxy/proxy_server.py
     participant Auth as proxy/auth/user_api_key_auth.py
+    participant Redis as Redis Cache
     participant Hooks as proxy/hooks/
     participant Router as router.py
-    participant Main as main.py
+    participant Main as main.py + utils.py
     participant Handler as llms/custom_httpx/llm_http_handler.py
     participant Transform as llms/{provider}/chat/transformation.py
     participant Provider as LLM Provider API
+    participant CostCalc as cost_calculator.py
+    participant LoggingObj as litellm_logging.py
+    participant DBWriter as db/db_spend_update_writer.py
+    participant Postgres as PostgreSQL
+
+    %% Request Flow
     Client->>ProxyServer: POST /v1/chat/completions
     ProxyServer->>Auth: user_api_key_auth()
+    Auth->>Redis: Check API key cache
+    Redis-->>Auth: Key info + spend limits
     ProxyServer->>Hooks: max_budget_limiter, parallel_request_limiter
+    Hooks->>Redis: Check/increment rate limit counters
     ProxyServer->>Router: route_request()
     Router->>Main: litellm.acompletion()
     Main->>Handler: BaseLLMHTTPHandler.completion()
@@ -45,8 +54,25 @@ sequenceDiagram
     Handler->>Provider: HTTP Request
     Provider-->>Handler: Response
     Handler->>Transform: ProviderConfig.transform_response()
-    Handler-->>Hooks: async_log_success_event()
-    Handler-->>Client: ModelResponse
+    Transform-->>Handler: ModelResponse
+    Handler-->>Main: ModelResponse
+
+    %% Cost Attribution (in utils.py wrapper)
+    Main->>LoggingObj: update_response_metadata()
+    LoggingObj->>CostCalc: _response_cost_calculator()
+    CostCalc->>CostCalc: completion_cost(tokens × price)
+    CostCalc-->>LoggingObj: response_cost
+    LoggingObj-->>Main: Set response._hidden_params["response_cost"]
+    Main-->>ProxyServer: ModelResponse (with cost in _hidden_params)
+
+    %% Response Headers + Async Logging
+    ProxyServer->>ProxyServer: Extract cost from hidden_params
+    ProxyServer->>LoggingObj: async_success_handler()
+    LoggingObj->>Hooks: async_log_success_event()
+    Hooks->>DBWriter: update_database(response_cost)
+    DBWriter->>Redis: Queue spend increment
+    DBWriter->>Postgres: Batch write spend logs (async)
+    ProxyServer-->>Client: ModelResponse + x-litellm-response-cost header
 ```
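+
+The cost computed in this flow comes back to the caller as a response header. A minimal sketch of reading it with the OpenAI Python SDK (the base URL, API key, and model below are placeholders):
+
+```python
+import openai
+
+# Any OpenAI-compatible client works; the proxy URL and key here are examples.
+client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")
+
+# with_raw_response exposes the HTTP headers alongside the parsed body.
+raw = client.chat.completions.with_raw_response.create(
+    model="gpt-4o",
+    messages=[{"role": "user", "content": "Hello"}],
+)
+print(raw.headers.get("x-litellm-response-cost"))  # set by the proxy
+completion = raw.parse()  # the usual ChatCompletion object
+```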
(in-memory + Redis)"] + Postgres["PostgreSQL
+### Infrastructure Components
+
+The AI Gateway uses external infrastructure for persistence and caching:
+
+```mermaid
+graph LR
+    subgraph "AI Gateway (proxy/)"
+        Proxy["proxy_server.py"]
+        Auth["auth/user_api_key_auth.py"]
+        DBWriter["db/db_spend_update_writer.py<br/>DBSpendUpdateWriter"]
+        InternalCache["utils.py<br/>InternalUsageCache"]
+        CostCallback["hooks/proxy_track_cost_callback.py<br/>_ProxyDBLogger"]
+        Scheduler["APScheduler<br/>ProxyStartupEvent"]
+    end
+
+    subgraph "SDK (litellm/)"
+        Router["router.py<br/>Router.cache (DualCache)"]
+        LLMCache["caching/caching_handler.py<br/>LLMCachingHandler"]
+        CacheClass["caching/caching.py<br/>Cache"]
+    end
+
+    subgraph "Redis (caching/redis_cache.py)"
+        RateLimit["Rate Limit Counters"]
+        SpendQueue["Spend Increment Queue"]
+        KeyCache["API Key Cache"]
+        TPM_RPM["TPM/RPM Tracking"]
+        Cooldowns["Deployment Cooldowns"]
+        LLMResponseCache["LLM Response Cache"]
+    end
+
+    subgraph "PostgreSQL (proxy/schema.prisma)"
+        Keys["LiteLLM_VerificationToken"]
+        Teams["LiteLLM_TeamTable"]
+        SpendLogs["LiteLLM_SpendLogs"]
+        Users["LiteLLM_UserTable"]
+    end
+
+    Auth --> InternalCache
+    InternalCache --> KeyCache
+    InternalCache -.->|cache miss| Keys
+    InternalCache --> RateLimit
+    Router --> TPM_RPM
+    Router --> Cooldowns
+    LLMCache --> CacheClass
+    CacheClass --> LLMResponseCache
+    CostCallback --> DBWriter
+    DBWriter --> SpendQueue
+    DBWriter --> SpendLogs
+    Scheduler --> SpendLogs
+    Scheduler --> Keys
+```
+
+| Component | Purpose | Key Files/Classes |
+|-----------|---------|-------------------|
+| **Redis** | Rate limiting, API key caching, TPM/RPM tracking, cooldowns, LLM response caching, spend queuing | `caching/redis_cache.py` (`RedisCache`), `caching/dual_cache.py` (`DualCache`) |
+| **PostgreSQL** | API keys, teams, users, spend logs | `proxy/utils.py` (`PrismaClient`), `proxy/schema.prisma` |
+| **InternalUsageCache** | Proxy-level cache for rate limits + API keys (in-memory + Redis) | `proxy/utils.py` (`InternalUsageCache`) |
+| **Router.cache** | TPM/RPM tracking, deployment cooldowns, client caching (in-memory + Redis) | `router.py` (`Router.cache: DualCache`) |
+| **LLMCachingHandler** | SDK-level LLM response/embedding caching | `caching/caching_handler.py` (`LLMCachingHandler`), `caching/caching.py` (`Cache`) |
+| **DBSpendUpdateWriter** | Batches spend updates to reduce DB writes | `proxy/db/db_spend_update_writer.py` (`DBSpendUpdateWriter`) |
+| **Cost Tracking** | Calculates and logs response costs | `proxy/hooks/proxy_track_cost_callback.py` (`_ProxyDBLogger`) |
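+
+`InternalUsageCache` and `Router.cache` are both two-layer `DualCache` instances: reads try process memory first, then Redis. A sketch of the pattern (the module paths and keyword arguments follow the files cited above, but treat them as illustrative):
+
+```python
+import asyncio
+
+from litellm.caching.dual_cache import DualCache
+from litellm.caching.in_memory_cache import InMemoryCache
+from litellm.caching.redis_cache import RedisCache
+
+
+async def main() -> None:
+    # Writes go to both layers; reads try memory first, then Redis, so
+    # multiple proxy replicas converge on the same values.
+    cache = DualCache(
+        in_memory_cache=InMemoryCache(),
+        redis_cache=RedisCache(host="localhost", port=6379),
+    )
+    await cache.async_set_cache("key_hash:abc", {"spend": 0.42}, ttl=60)
+    print(await cache.async_get_cache("key_hash:abc"))
+
+
+asyncio.run(main())
+```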
DBSpendUpdateWriter"] + InternalCache["utils.py
InternalUsageCache"] + CostCallback["hooks/proxy_track_cost_callback.py
_ProxyDBLogger"] + Scheduler["APScheduler
ProxyStartupEvent"] + end + + subgraph "SDK (litellm/)" + Router["router.py
Router.cache (DualCache)"] + LLMCache["caching/caching_handler.py
LLMCachingHandler"] + CacheClass["caching/caching.py
Cache"] + end + + subgraph "Redis (caching/redis_cache.py)" + RateLimit["Rate Limit Counters"] + SpendQueue["Spend Increment Queue"] + KeyCache["API Key Cache"] + TPM_RPM["TPM/RPM Tracking"] + Cooldowns["Deployment Cooldowns"] + LLMResponseCache["LLM Response Cache"] + end + + subgraph "PostgreSQL (proxy/schema.prisma)" + Keys["LiteLLM_VerificationToken"] + Teams["LiteLLM_TeamTable"] + SpendLogs["LiteLLM_SpendLogs"] + Users["LiteLLM_UserTable"] + end + + Auth --> InternalCache + InternalCache --> KeyCache + InternalCache -.->|cache miss| Keys + InternalCache --> RateLimit + Router --> TPM_RPM + Router --> Cooldowns + LLMCache --> CacheClass + CacheClass --> LLMResponseCache + CostCallback --> DBWriter + DBWriter --> SpendQueue + DBWriter --> SpendLogs + Scheduler --> SpendLogs + Scheduler --> Keys +``` + +| Component | Purpose | Key Files/Classes | +|-----------|---------|-------------------| +| **Redis** | Rate limiting, API key caching, TPM/RPM tracking, cooldowns, LLM response caching, spend queuing | `caching/redis_cache.py` (`RedisCache`), `caching/dual_cache.py` (`DualCache`) | +| **PostgreSQL** | API keys, teams, users, spend logs | `proxy/utils.py` (`PrismaClient`), `proxy/schema.prisma` | +| **InternalUsageCache** | Proxy-level cache for rate limits + API keys (in-memory + Redis) | `proxy/utils.py` (`InternalUsageCache`) | +| **Router.cache** | TPM/RPM tracking, deployment cooldowns, client caching (in-memory + Redis) | `router.py` (`Router.cache: DualCache`) | +| **LLMCachingHandler** | SDK-level LLM response/embedding caching | `caching/caching_handler.py` (`LLMCachingHandler`), `caching/caching.py` (`Cache`) | +| **DBSpendUpdateWriter** | Batches spend updates to reduce DB writes | `proxy/db/db_spend_update_writer.py` (`DBSpendUpdateWriter`) | +| **Cost Tracking** | Calculates and logs response costs | `proxy/hooks/proxy_track_cost_callback.py` (`_ProxyDBLogger`) | + +**Background Jobs** (APScheduler, initialized in `proxy/proxy_server.py` → `ProxyStartupEvent.initialize_scheduled_background_jobs()`): + +| Job | Interval | Purpose | Key Files | +|-----|----------|---------|-----------| +| `update_spend` | 60s | Batch write spend logs to PostgreSQL | `proxy/db/db_spend_update_writer.py` | +| `reset_budget` | 10-12min | Reset budgets for keys/users/teams | `proxy/management_helpers/budget_reset_job.py` | +| `add_deployment` | 10s | Sync new model deployments from DB | `proxy/proxy_server.py` (`ProxyConfig`) | +| `cleanup_old_spend_logs` | cron/interval | Delete old spend logs | `proxy/management_helpers/spend_log_cleanup.py` | +| `check_batch_cost` | 30min | Calculate costs for batch jobs | `proxy/management_helpers/check_batch_cost_job.py` | +| `check_responses_cost` | 30min | Calculate costs for responses API | `proxy/management_helpers/check_responses_cost_job.py` | +| `process_rotations` | 1hr | Auto-rotate API keys | `proxy/management_helpers/key_rotation_manager.py` | +| `_run_background_health_check` | continuous | Health check model deployments | `proxy/proxy_server.py` | +| `send_weekly_spend_report` | weekly | Slack spend alerts | `proxy/utils.py` (`SlackAlerting`) | +| `send_monthly_spend_report` | monthly | Slack spend alerts | `proxy/utils.py` (`SlackAlerting`) | + +**Cost Attribution Flow:** +1. LLM response returns to `utils.py` wrapper after `litellm.acompletion()` completes +2. `update_response_metadata()` (`llm_response_utils/response_metadata.py`) is called +3. 
`logging_obj._response_cost_calculator()` (`litellm_logging.py`) calculates cost via `litellm.completion_cost()` (`cost_calculator.py`) +4. Cost is stored in `response._hidden_params["response_cost"]` +5. `proxy/common_request_processing.py` extracts cost from `hidden_params` and adds to response headers (`x-litellm-response-cost`) +6. `logging_obj.async_success_handler()` triggers callbacks including `_ProxyDBLogger.async_log_success_event()` +7. `DBSpendUpdateWriter.update_database()` queues spend increments to Redis +8. Background job `update_spend` flushes queued spend to PostgreSQL every 60s + --- ## 2. SDK Request Flow diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 4abbddb0d5..470d598a25 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -10201,48 +10201,6 @@ "mode": "completion", "output_cost_per_token": 5e-07 }, - "deepseek-v3-2-251201": { - "input_cost_per_token": 0.0, - "litellm_provider": "volcengine", - "max_input_tokens": 98304, - "max_output_tokens": 32768, - "max_tokens": 32768, - "mode": "chat", - "output_cost_per_token": 0.0, - "supports_assistant_prefill": true, - "supports_function_calling": true, - "supports_prompt_caching": true, - "supports_reasoning": true, - "supports_tool_choice": true - }, - "glm-4-7-251222": { - "input_cost_per_token": 0.0, - "litellm_provider": "volcengine", - "max_input_tokens": 204800, - "max_output_tokens": 131072, - "max_tokens": 131072, - "mode": "chat", - "output_cost_per_token": 0.0, - "supports_assistant_prefill": true, - "supports_function_calling": true, - "supports_prompt_caching": true, - "supports_reasoning": true, - "supports_tool_choice": true - }, - "kimi-k2-thinking-251104": { - "input_cost_per_token": 0.0, - "litellm_provider": "volcengine", - "max_input_tokens": 229376, - "max_output_tokens": 32768, - "max_tokens": 32768, - "mode": "chat", - "output_cost_per_token": 0.0, - "supports_assistant_prefill": true, - "supports_function_calling": true, - "supports_prompt_caching": true, - "supports_reasoning": true, - "supports_tool_choice": true - }, "doubao-embedding": { "input_cost_per_token": 0.0, "litellm_provider": "volcengine",