From 29e41d780854808f928c57f773ad4bfe19d3570b Mon Sep 17 00:00:00 2001 From: Ishaan Jaffer Date: Fri, 16 Jan 2026 14:29:12 -0800 Subject: [PATCH 1/5] fixes 1 --- ARCHITECTURE.md | 81 ++++++++++++++++++- ...odel_prices_and_context_window_backup.json | 42 ---------- 2 files changed, 79 insertions(+), 44 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 807fc85cec..5692e924e6 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -28,16 +28,24 @@ sequenceDiagram participant Client participant ProxyServer as proxy/proxy_server.py participant Auth as proxy/auth/user_api_key_auth.py + participant Redis as Redis Cache participant Hooks as proxy/hooks/ participant Router as router.py participant Main as main.py participant Handler as llms/custom_httpx/llm_http_handler.py participant Transform as llms/{provider}/chat/transformation.py participant Provider as LLM Provider API + participant CostCalc as litellm.completion_cost() + participant DBWriter as db/db_spend_update_writer.py + participant Postgres as PostgreSQL + %% Request Flow Client->>ProxyServer: POST /v1/chat/completions ProxyServer->>Auth: user_api_key_auth() + Auth->>Redis: Check API key cache + Redis-->>Auth: Key info + spend limits ProxyServer->>Hooks: max_budget_limiter, parallel_request_limiter + Hooks->>Redis: Check/increment rate limit counters ProxyServer->>Router: route_request() Router->>Main: litellm.acompletion() Main->>Handler: BaseLLMHTTPHandler.completion() @@ -45,8 +53,16 @@ sequenceDiagram Handler->>Provider: HTTP Request Provider-->>Handler: Response Handler->>Transform: ProviderConfig.transform_response() - Handler-->>Hooks: async_log_success_event() - Handler-->>Client: ModelResponse + + %% Response Flow with Cost Attribution + Handler->>CostCalc: Calculate response cost (tokens × price) + CostCalc-->>Handler: response_cost + Handler->>Hooks: async_log_success_event() + Hooks->>DBWriter: update_database(response_cost) + DBWriter->>Redis: Queue spend increment + 
DBWriter->>Postgres: Batch write spend logs (async) + Hooks->>Redis: update_cache(token, response_cost) + Handler-->>Client: ModelResponse + x-litellm-response-cost header ``` ### Proxy Components @@ -75,12 +91,20 @@ graph TD Main["main.py"] end + subgraph "Infrastructure" + Redis["Redis
<br/>(rate limits, caching, spend queue)"] + Postgres["PostgreSQL<br/>
(keys, teams, spend logs)"] + end + Client --> Endpoint Endpoint --> Auth + Auth --> Redis Auth --> PreCall PreCall --> RouteRequest RouteRequest --> Router Router --> Main + Main --> Redis + Main --> Postgres Main --> Client ``` @@ -119,6 +143,59 @@ graph TD To add a new proxy hook, implement `CustomLogger` and register in `PROXY_HOOKS`. +### Infrastructure Components + +The AI Gateway uses external infrastructure for persistence and caching: + +```mermaid +graph LR + subgraph "AI Gateway" + Proxy["proxy/proxy_server.py"] + DBWriter["proxy/db/db_spend_update_writer.py
<br/>DBSpendUpdateWriter"] + Cache["proxy/utils.py<br/>
InternalUsageCache"] + CostCallback["proxy/hooks/proxy_track_cost_callback.py<br/>
_ProxyDBLogger"] + end + + subgraph "Redis (caching/redis_cache.py)" + RateLimit["Rate Limit Counters"] + SpendQueue["Spend Increment Queue"] + KeyCache["API Key Cache (DualCache)"] + ResponseCache["LLM Response Cache"] + end + + subgraph "PostgreSQL (proxy/schema.prisma)" + Keys["LiteLLM_VerificationToken"] + Teams["LiteLLM_TeamTable"] + SpendLogs["LiteLLM_SpendLogs"] + Users["LiteLLM_UserTable"] + end + + Proxy --> Cache + Cache --> RateLimit + Cache --> KeyCache + Cache --> ResponseCache + CostCallback --> DBWriter + DBWriter --> SpendQueue + DBWriter --> SpendLogs + Proxy --> Keys + Proxy --> Teams +``` + +| Component | Purpose | Key Files/Classes | +|-----------|---------|-------------------| +| **Redis** | Rate limiting, caching, spend queuing | `caching/redis_cache.py` (`RedisCache`), `caching/dual_cache.py` (`DualCache`) | +| **PostgreSQL** | API keys, teams, users, spend logs | `proxy/utils.py` (`PrismaClient`), `proxy/schema.prisma` | +| **InternalUsageCache** | In-memory + Redis cache abstraction | `proxy/utils.py` (`InternalUsageCache`) | +| **DBSpendUpdateWriter** | Batches spend updates to reduce DB writes | `proxy/db/db_spend_update_writer.py` (`DBSpendUpdateWriter`) | +| **Cost Tracking** | Calculates and logs response costs | `proxy/hooks/proxy_track_cost_callback.py` (`_ProxyDBLogger`) | + +**Cost Attribution Flow:** +1. `litellm.completion_cost()` (`cost_calculator.py`) calculates cost from token usage × model pricing +2. Cost is added to response headers (`x-litellm-response-cost`) via `proxy/common_request_processing.py` +3. `_ProxyDBLogger.async_log_success_event()` triggers spend tracking +4. `DBSpendUpdateWriter.update_database()` queues spend increments +5. `update_cache()` in `proxy/proxy_server.py` updates Redis for real-time budget enforcement + --- ## 2. 
SDK Request Flow diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 4abbddb0d5..470d598a25 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -10201,48 +10201,6 @@ "mode": "completion", "output_cost_per_token": 5e-07 }, - "deepseek-v3-2-251201": { - "input_cost_per_token": 0.0, - "litellm_provider": "volcengine", - "max_input_tokens": 98304, - "max_output_tokens": 32768, - "max_tokens": 32768, - "mode": "chat", - "output_cost_per_token": 0.0, - "supports_assistant_prefill": true, - "supports_function_calling": true, - "supports_prompt_caching": true, - "supports_reasoning": true, - "supports_tool_choice": true - }, - "glm-4-7-251222": { - "input_cost_per_token": 0.0, - "litellm_provider": "volcengine", - "max_input_tokens": 204800, - "max_output_tokens": 131072, - "max_tokens": 131072, - "mode": "chat", - "output_cost_per_token": 0.0, - "supports_assistant_prefill": true, - "supports_function_calling": true, - "supports_prompt_caching": true, - "supports_reasoning": true, - "supports_tool_choice": true - }, - "kimi-k2-thinking-251104": { - "input_cost_per_token": 0.0, - "litellm_provider": "volcengine", - "max_input_tokens": 229376, - "max_output_tokens": 32768, - "max_tokens": 32768, - "mode": "chat", - "output_cost_per_token": 0.0, - "supports_assistant_prefill": true, - "supports_function_calling": true, - "supports_prompt_caching": true, - "supports_reasoning": true, - "supports_tool_choice": true - }, "doubao-embedding": { "input_cost_per_token": 0.0, "litellm_provider": "volcengine", From 53376b71f021413eefce0e6122893bbc83e2d441 Mon Sep 17 00:00:00 2001 From: Ishaan Jaffer Date: Fri, 16 Jan 2026 14:30:21 -0800 Subject: [PATCH 2/5] docs fix --- ARCHITECTURE.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 5692e924e6..cf8ba7df2f 100644 --- 
a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -154,6 +154,7 @@ graph LR DBWriter["proxy/db/db_spend_update_writer.py
<br/>DBSpendUpdateWriter"] Cache["proxy/utils.py<br/>
InternalUsageCache"] CostCallback["proxy/hooks/proxy_track_cost_callback.py<br/>
_ProxyDBLogger"] + Scheduler["APScheduler<br/>
ProxyStartupEvent.initialize_scheduled_background_jobs()"] end subgraph "Redis (caching/redis_cache.py)" @@ -179,6 +180,8 @@ graph LR DBWriter --> SpendLogs Proxy --> Keys Proxy --> Teams + Scheduler --> SpendLogs + Scheduler --> Keys ``` | Component | Purpose | Key Files/Classes | @@ -189,12 +192,28 @@ graph LR | **DBSpendUpdateWriter** | Batches spend updates to reduce DB writes | `proxy/db/db_spend_update_writer.py` (`DBSpendUpdateWriter`) | | **Cost Tracking** | Calculates and logs response costs | `proxy/hooks/proxy_track_cost_callback.py` (`_ProxyDBLogger`) | +**Background Jobs** (APScheduler, initialized in `proxy/proxy_server.py` → `ProxyStartupEvent.initialize_scheduled_background_jobs()`): + +| Job | Interval | Purpose | Key Files | +|-----|----------|---------|-----------| +| `update_spend` | 60s | Batch write spend logs to PostgreSQL | `proxy/db/db_spend_update_writer.py` | +| `reset_budget` | 10-12min | Reset budgets for keys/users/teams | `proxy/management_helpers/budget_reset_job.py` | +| `add_deployment` | 10s | Sync new model deployments from DB | `proxy/proxy_server.py` (`ProxyConfig`) | +| `cleanup_old_spend_logs` | cron/interval | Delete old spend logs | `proxy/management_helpers/spend_log_cleanup.py` | +| `check_batch_cost` | 30min | Calculate costs for batch jobs | `proxy/management_helpers/check_batch_cost_job.py` | +| `check_responses_cost` | 30min | Calculate costs for responses API | `proxy/management_helpers/check_responses_cost_job.py` | +| `process_rotations` | 1hr | Auto-rotate API keys | `proxy/management_helpers/key_rotation_manager.py` | +| `_run_background_health_check` | continuous | Health check model deployments | `proxy/proxy_server.py` | +| `send_weekly_spend_report` | weekly | Slack spend alerts | `proxy/utils.py` (`SlackAlerting`) | +| `send_monthly_spend_report` | monthly | Slack spend alerts | `proxy/utils.py` (`SlackAlerting`) | + **Cost Attribution Flow:** 1. 
`litellm.completion_cost()` (`cost_calculator.py`) calculates cost from token usage × model pricing 2. Cost is added to response headers (`x-litellm-response-cost`) via `proxy/common_request_processing.py` 3. `_ProxyDBLogger.async_log_success_event()` triggers spend tracking 4. `DBSpendUpdateWriter.update_database()` queues spend increments 5. `update_cache()` in `proxy/proxy_server.py` updates Redis for real-time budget enforcement +6. Background job `update_spend` flushes queued spend to PostgreSQL every 60s --- From c8279a70a51284b9f371837d27d71146abd43861 Mon Sep 17 00:00:00 2001 From: Ishaan Jaffer Date: Fri, 16 Jan 2026 14:37:10 -0800 Subject: [PATCH 3/5] docs fix --- ARCHITECTURE.md | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index cf8ba7df2f..512a7b9655 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -92,19 +92,18 @@ graph TD end subgraph "Infrastructure" - Redis["Redis
<br/>(rate limits, caching, spend queue)"] + DualCache["DualCache<br/>
(in-memory + Redis)"] Postgres["PostgreSQL<br/>
(keys, teams, spend logs)"] end Client --> Endpoint Endpoint --> Auth - Auth --> Redis + Auth --> DualCache + DualCache -.->|cache miss| Postgres Auth --> PreCall PreCall --> RouteRequest RouteRequest --> Router Router --> Main - Main --> Redis - Main --> Postgres Main --> Client ``` @@ -149,19 +148,25 @@ The AI Gateway uses external infrastructure for persistence and caching: ```mermaid graph LR - subgraph "AI Gateway" - Proxy["proxy/proxy_server.py"] - DBWriter["proxy/db/db_spend_update_writer.py
<br/>DBSpendUpdateWriter"] - Cache["proxy/utils.py<br/>
InternalUsageCache"] - CostCallback["proxy/hooks/proxy_track_cost_callback.py<br/>
_ProxyDBLogger"] - Scheduler["APScheduler<br/>
ProxyStartupEvent.initialize_scheduled_background_jobs()"] + subgraph "AI Gateway (proxy/)" + Proxy["proxy_server.py"] + Auth["auth/user_api_key_auth.py"] + DBWriter["db/db_spend_update_writer.py
<br/>DBSpendUpdateWriter"] + InternalCache["utils.py<br/>
InternalUsageCache"] + CostCallback["hooks/proxy_track_cost_callback.py<br/>
_ProxyDBLogger"] + Scheduler["APScheduler<br/>
ProxyStartupEvent"] + end + + subgraph "SDK (litellm/)" + LLMCache["caching/caching_handler.py<br/>
LLMCachingHandler"] + CacheClass["caching/caching.py<br/>
Cache"] end subgraph "Redis (caching/redis_cache.py)" RateLimit["Rate Limit Counters"] SpendQueue["Spend Increment Queue"] - KeyCache["API Key Cache (DualCache)"] - ResponseCache["LLM Response Cache"] + KeyCache["API Key Cache"] + LLMResponseCache["LLM Response Cache"] end subgraph "PostgreSQL (proxy/schema.prisma)" @@ -171,24 +176,25 @@ graph LR Users["LiteLLM_UserTable"] end - Proxy --> Cache - Cache --> RateLimit - Cache --> KeyCache - Cache --> ResponseCache + Auth --> InternalCache + InternalCache --> KeyCache + InternalCache -.->|cache miss| Keys + InternalCache --> RateLimit + LLMCache --> CacheClass + CacheClass --> LLMResponseCache CostCallback --> DBWriter DBWriter --> SpendQueue DBWriter --> SpendLogs - Proxy --> Keys - Proxy --> Teams Scheduler --> SpendLogs Scheduler --> Keys ``` | Component | Purpose | Key Files/Classes | |-----------|---------|-------------------| -| **Redis** | Rate limiting, caching, spend queuing | `caching/redis_cache.py` (`RedisCache`), `caching/dual_cache.py` (`DualCache`) | +| **Redis** | Rate limiting, API key caching, LLM response caching, spend queuing | `caching/redis_cache.py` (`RedisCache`), `caching/dual_cache.py` (`DualCache`) | | **PostgreSQL** | API keys, teams, users, spend logs | `proxy/utils.py` (`PrismaClient`), `proxy/schema.prisma` | -| **InternalUsageCache** | In-memory + Redis cache abstraction | `proxy/utils.py` (`InternalUsageCache`) | +| **InternalUsageCache** | Proxy-level cache for rate limits + API keys (in-memory + Redis) | `proxy/utils.py` (`InternalUsageCache`) | +| **LLMCachingHandler** | SDK-level LLM response/embedding caching | `caching/caching_handler.py` (`LLMCachingHandler`), `caching/caching.py` (`Cache`) | | **DBSpendUpdateWriter** | Batches spend updates to reduce DB writes | `proxy/db/db_spend_update_writer.py` (`DBSpendUpdateWriter`) | | **Cost Tracking** | Calculates and logs response costs | `proxy/hooks/proxy_track_cost_callback.py` (`_ProxyDBLogger`) | From 
0864dc9dd7e4e666dbfdb8a449b6cbf618cd5884 Mon Sep 17 00:00:00 2001 From: Ishaan Jaffer Date: Fri, 16 Jan 2026 14:41:20 -0800 Subject: [PATCH 4/5] docs fix --- ARCHITECTURE.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 512a7b9655..c84677511b 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -103,6 +103,7 @@ graph TD Auth --> PreCall PreCall --> RouteRequest RouteRequest --> Router + Router --> DualCache Router --> Main Main --> Client ``` @@ -158,6 +159,7 @@ graph LR end subgraph "SDK (litellm/)" + Router["router.py
<br/>Router.cache (DualCache)"] LLMCache["caching/caching_handler.py<br/>
LLMCachingHandler"] CacheClass["caching/caching.py<br/>
Cache"] end @@ -166,6 +168,8 @@ graph LR RateLimit["Rate Limit Counters"] SpendQueue["Spend Increment Queue"] KeyCache["API Key Cache"] + TPM_RPM["TPM/RPM Tracking"] + Cooldowns["Deployment Cooldowns"] LLMResponseCache["LLM Response Cache"] end @@ -180,6 +184,8 @@ graph LR InternalCache --> KeyCache InternalCache -.->|cache miss| Keys InternalCache --> RateLimit + Router --> TPM_RPM + Router --> Cooldowns LLMCache --> CacheClass CacheClass --> LLMResponseCache CostCallback --> DBWriter @@ -191,9 +197,10 @@ graph LR | Component | Purpose | Key Files/Classes | |-----------|---------|-------------------| -| **Redis** | Rate limiting, API key caching, LLM response caching, spend queuing | `caching/redis_cache.py` (`RedisCache`), `caching/dual_cache.py` (`DualCache`) | +| **Redis** | Rate limiting, API key caching, TPM/RPM tracking, cooldowns, LLM response caching, spend queuing | `caching/redis_cache.py` (`RedisCache`), `caching/dual_cache.py` (`DualCache`) | | **PostgreSQL** | API keys, teams, users, spend logs | `proxy/utils.py` (`PrismaClient`), `proxy/schema.prisma` | | **InternalUsageCache** | Proxy-level cache for rate limits + API keys (in-memory + Redis) | `proxy/utils.py` (`InternalUsageCache`) | +| **Router.cache** | TPM/RPM tracking, deployment cooldowns, client caching (in-memory + Redis) | `router.py` (`Router.cache: DualCache`) | | **LLMCachingHandler** | SDK-level LLM response/embedding caching | `caching/caching_handler.py` (`LLMCachingHandler`), `caching/caching.py` (`Cache`) | | **DBSpendUpdateWriter** | Batches spend updates to reduce DB writes | `proxy/db/db_spend_update_writer.py` (`DBSpendUpdateWriter`) | | **Cost Tracking** | Calculates and logs response costs | `proxy/hooks/proxy_track_cost_callback.py` (`_ProxyDBLogger`) | From cfa21cebd6c940d008eaacb7d97b488438f33ca0 Mon Sep 17 00:00:00 2001 From: Ishaan Jaffer Date: Fri, 16 Jan 2026 14:50:34 -0800 Subject: [PATCH 5/5] docs fix --- ARCHITECTURE.md | 40 ++++++++++++++++++++++++++-------------- 
1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index c84677511b..c114a838d6 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -31,11 +31,12 @@ sequenceDiagram participant Redis as Redis Cache participant Hooks as proxy/hooks/ participant Router as router.py - participant Main as main.py + participant Main as main.py + utils.py participant Handler as llms/custom_httpx/llm_http_handler.py participant Transform as llms/{provider}/chat/transformation.py participant Provider as LLM Provider API - participant CostCalc as litellm.completion_cost() + participant CostCalc as cost_calculator.py + participant LoggingObj as litellm_logging.py participant DBWriter as db/db_spend_update_writer.py participant Postgres as PostgreSQL @@ -53,16 +54,25 @@ sequenceDiagram Handler->>Provider: HTTP Request Provider-->>Handler: Response Handler->>Transform: ProviderConfig.transform_response() + Transform-->>Handler: ModelResponse + Handler-->>Main: ModelResponse - %% Response Flow with Cost Attribution - Handler->>CostCalc: Calculate response cost (tokens × price) - CostCalc-->>Handler: response_cost - Handler->>Hooks: async_log_success_event() + %% Cost Attribution (in utils.py wrapper) + Main->>LoggingObj: update_response_metadata() + LoggingObj->>CostCalc: _response_cost_calculator() + CostCalc->>CostCalc: completion_cost(tokens × price) + CostCalc-->>LoggingObj: response_cost + LoggingObj-->>Main: Set response._hidden_params["response_cost"] + Main-->>ProxyServer: ModelResponse (with cost in _hidden_params) + + %% Response Headers + Async Logging + ProxyServer->>ProxyServer: Extract cost from hidden_params + ProxyServer->>LoggingObj: async_success_handler() + LoggingObj->>Hooks: async_log_success_event() Hooks->>DBWriter: update_database(response_cost) DBWriter->>Redis: Queue spend increment DBWriter->>Postgres: Batch write spend logs (async) - Hooks->>Redis: update_cache(token, response_cost) - Handler-->>Client: ModelResponse + 
x-litellm-response-cost header + ProxyServer-->>Client: ModelResponse + x-litellm-response-cost header ``` ### Proxy Components @@ -221,12 +231,14 @@ graph LR | `send_monthly_spend_report` | monthly | Slack spend alerts | `proxy/utils.py` (`SlackAlerting`) | **Cost Attribution Flow:** -1. `litellm.completion_cost()` (`cost_calculator.py`) calculates cost from token usage × model pricing -2. Cost is added to response headers (`x-litellm-response-cost`) via `proxy/common_request_processing.py` -3. `_ProxyDBLogger.async_log_success_event()` triggers spend tracking -4. `DBSpendUpdateWriter.update_database()` queues spend increments -5. `update_cache()` in `proxy/proxy_server.py` updates Redis for real-time budget enforcement -6. Background job `update_spend` flushes queued spend to PostgreSQL every 60s +1. LLM response returns to `utils.py` wrapper after `litellm.acompletion()` completes +2. `update_response_metadata()` (`llm_response_utils/response_metadata.py`) is called +3. `logging_obj._response_cost_calculator()` (`litellm_logging.py`) calculates cost via `litellm.completion_cost()` (`cost_calculator.py`) +4. Cost is stored in `response._hidden_params["response_cost"]` +5. `proxy/common_request_processing.py` extracts cost from `hidden_params` and adds to response headers (`x-litellm-response-cost`) +6. `logging_obj.async_success_handler()` triggers callbacks including `_ProxyDBLogger.async_log_success_event()` +7. `DBSpendUpdateWriter.update_database()` queues spend increments to Redis +8. Background job `update_spend` flushes queued spend to PostgreSQL every 60s ---