25 changes: 25 additions & 0 deletions docs/my-website/docs/proxy/prometheus.md
@@ -113,6 +113,31 @@ litellm_settings:
```


## Pod Health Metrics

Use this metric to measure per-pod queue depth and diagnose latency that occurs **before** LiteLLM starts processing a request.

| Metric Name | Type | Description |
|---|---|---|
| `litellm_in_flight_requests` | Gauge | Number of HTTP requests currently in-flight on this uvicorn worker. Tracks the pod's queue depth in real time. With multiple workers, values are summed across all live workers (`livesum`). |

### When to use this

LiteLLM measures latency from when its handler starts. If a request waits in uvicorn's event loop before the handler runs, that wait is invisible to LiteLLM's own logs. `litellm_in_flight_requests` shows how loaded the pod was at any point in time.

```
high in_flight_requests + high ALB TargetResponseTime → pod overloaded, scale out
low in_flight_requests  + high ALB TargetResponseTime → delay is pre-ASGI (event loop blocking)
```
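The two rules above can also be expressed as a small helper for dashboards or alert scripts. This is an illustrative sketch, not part of LiteLLM; the threshold values are assumptions you should tune to your own traffic:

```python
def diagnose(in_flight: int, alb_response_time_s: float,
             in_flight_high: int = 40, latency_high_s: float = 5.0) -> str:
    """Classify a latency spike from pod queue depth plus ALB latency.

    Thresholds are hypothetical examples, not LiteLLM defaults.
    """
    overloaded = in_flight >= in_flight_high
    slow = alb_response_time_s >= latency_high_s
    if overloaded and slow:
        return "pod overloaded: scale out"
    if slow:
        return "delay is pre-ASGI: check for event loop blocking"
    return "healthy"
```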

You can also check the current value directly without Prometheus:

```bash
curl http://localhost:4000/health/backlog \
-H "Authorization: Bearer sk-..."
# {"in_flight_requests": 47}
```

## Proxy Level Tracking Metrics

Use this to track overall LiteLLM Proxy usage.
34 changes: 33 additions & 1 deletion docs/my-website/docs/troubleshoot/latency_overhead.md
@@ -2,9 +2,41 @@

Use this guide when you see unexpected latency overhead between LiteLLM proxy and the LLM provider.

## The Invisible Latency Gap

LiteLLM measures latency from when its handler starts. If a request waits in uvicorn's event loop **before** the handler runs, that wait is invisible to LiteLLM's own logs.

```
T=0 Request arrives at load balancer
        [queue wait – LiteLLM never logs this]
T=10  LiteLLM handler starts → timer begins
T=20 Response sent

LiteLLM logs: 10s User experiences: 20s
```

To measure the pre-handler wait, poll `/health/backlog` on each pod:

```bash
curl http://localhost:4000/health/backlog \
-H "Authorization: Bearer sk-..."
# {"in_flight_requests": 47}
```

Or scrape the `litellm_in_flight_requests` Prometheus gauge at `/metrics`.

| `in_flight_requests` | ALB `TargetResponseTime` | Diagnosis |
|---|---|---|
| High | High | Pod overloaded → scale out |
| Low | High | Delay is pre-ASGI – check for sync blocking code or event loop saturation |
| High | Normal | Pod is busy but healthy, no queue buildup |

If you're on **AWS ALB**, correlate `litellm_in_flight_requests` spikes with ALB's `TargetResponseTime` CloudWatch metric. The gap between what ALB reports and what LiteLLM logs is the invisible wait.
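To line pod samples up with CloudWatch timestamps, you can poll `/health/backlog` on an interval and record `(timestamp, in_flight)` pairs. A minimal sketch, assuming a proxy at `http://localhost:4000` and a placeholder API key:

```python
import json
import time
import urllib.request


def parse_backlog(body: bytes) -> int:
    """Extract the in-flight count from a /health/backlog response body."""
    return json.loads(body)["in_flight_requests"]


def read_in_flight(base_url: str, api_key: str) -> int:
    """Fetch the current in-flight count from one pod."""
    req = urllib.request.Request(
        f"{base_url}/health/backlog",
        headers={"Authorization": f"Bearer {api_key}"},
    )
    with urllib.request.urlopen(req, timeout=5) as resp:
        return parse_backlog(resp.read())


def sample_backlog(base_url: str, api_key: str,
                   interval_s: float = 5.0, n: int = 12):
    """Collect (unix_timestamp, in_flight) pairs to correlate with ALB metrics."""
    samples = []
    for _ in range(n):
        samples.append((time.time(), read_in_flight(base_url, api_key)))
        time.sleep(interval_s)
    return samples
```

In a multi-pod deployment you would run this against each pod individually, or scrape the Prometheus gauge instead, which aggregates across workers via `livesum`.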

## Quick Checklist

1. **Collect the `x-litellm-overhead-duration-ms` response header** – this tells you LiteLLM's total overhead on every request. Start here.
1. **Check `in_flight_requests` on each pod** via `/health/backlog` or the `litellm_in_flight_requests` Prometheus gauge – this tells you if requests are queuing before LiteLLM starts processing. Start here for unexplained latency.
2. **Collect the `x-litellm-overhead-duration-ms` response header** – this tells you LiteLLM's total overhead on every request.
2. **Is DEBUG logging enabled?** This is the #1 cause of latency with large payloads.
3. **Are you sending large base64 payloads?** (images, PDFs) β€” see [Large Payload Overhead](#large-payload-overhead).
4. **Enable detailed timing headers** to pinpoint where time is spent.
76 changes: 72 additions & 4 deletions litellm/model_prices_and_context_window_backup.json
@@ -19210,6 +19210,39 @@
"supports_tool_choice": true,
"supports_vision": false
},
"gpt-audio-1.5": {
"input_cost_per_audio_token": 3.2e-05,
"input_cost_per_token": 2.5e-06,
"litellm_provider": "openai",
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"max_tokens": 16384,
"mode": "chat",
"output_cost_per_audio_token": 6.4e-05,
"output_cost_per_token": 1e-05,
"supported_endpoints": [
"/v1/chat/completions"
],
"supported_modalities": [
"text",
"audio"
],
"supported_output_modalities": [
"text",
"audio"
],
"supports_audio_input": true,
"supports_audio_output": true,
"supports_function_calling": true,
"supports_native_streaming": true,
"supports_parallel_function_calling": true,
"supports_prompt_caching": false,
"supports_reasoning": false,
"supports_response_schema": false,
"supports_system_messages": true,
"supports_tool_choice": true,
"supports_vision": false
},
"gpt-audio-2025-08-28": {
"input_cost_per_audio_token": 3.2e-05,
"input_cost_per_token": 2.5e-06,
@@ -20927,6 +20960,38 @@
"supports_system_messages": true,
"supports_tool_choice": true
},
"gpt-realtime-1.5": {
"cache_creation_input_audio_token_cost": 4e-07,
"cache_read_input_token_cost": 4e-07,
"input_cost_per_audio_token": 3.2e-05,
"input_cost_per_image": 5e-06,
"input_cost_per_token": 4e-06,
"litellm_provider": "openai",
"max_input_tokens": 32000,
"max_output_tokens": 4096,
"max_tokens": 4096,
"mode": "chat",
"output_cost_per_audio_token": 6.4e-05,
"output_cost_per_token": 1.6e-05,
"supported_endpoints": [
"/v1/realtime"
],
"supported_modalities": [
"text",
"image",
"audio"
],
"supported_output_modalities": [
"text",
"audio"
],
"supports_audio_input": true,
"supports_audio_output": true,
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_system_messages": true,
"supports_tool_choice": true
},
"gpt-realtime-mini": {
"cache_creation_input_audio_token_cost": 3e-07,
"cache_read_input_audio_token_cost": 3e-07,
@@ -26650,8 +26715,8 @@
"mode": "chat",
"output_cost_per_token": 0.0,
"source": "https://platform.publicai.co/docs",
"supports_function_calling": true,
"supports_tool_choice": true
"supports_function_calling": false,
"supports_tool_choice": false
},
"publicai/swiss-ai/apertus-70b-instruct": {
"input_cost_per_token": 0.0,
@@ -26662,8 +26727,8 @@
"mode": "chat",
"output_cost_per_token": 0.0,
"source": "https://platform.publicai.co/docs",
"supports_function_calling": true,
"supports_tool_choice": true
"supports_function_calling": false,
"supports_tool_choice": false
},
"publicai/aisingapore/Gemma-SEA-LION-v4-27B-IT": {
"input_cost_per_token": 0.0,
@@ -32991,6 +33056,7 @@
"supports_web_search": true
},
"xai/grok-2-vision-1212": {
"deprecation_date": "2026-02-28",
"input_cost_per_image": 2e-06,
"input_cost_per_token": 2e-06,
"litellm_provider": "xai",
@@ -33095,6 +33161,7 @@
},
"xai/grok-3-mini": {
"cache_read_input_token_cost": 7.5e-08,
"deprecation_date": "2026-02-28",
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
@@ -33111,6 +33178,7 @@
},
"xai/grok-3-mini-beta": {
"cache_read_input_token_cost": 7.5e-08,
"deprecation_date": "2026-02-28",
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
20 changes: 20 additions & 0 deletions litellm/proxy/health_endpoints/_health_endpoints.py
@@ -33,6 +33,9 @@
    perform_health_check,
    run_with_timeout,
)
from litellm.proxy.middleware.in_flight_requests_middleware import (
    get_in_flight_requests,
)
from litellm.secret_managers.main import get_secret

#### Health ENDPOINTS ####
@@ -1297,6 +1300,23 @@ async def health_readiness():
        raise HTTPException(status_code=503, detail=f"Service Unhealthy ({str(e)})")


@router.get(
    "/health/backlog",
    tags=["health"],
    dependencies=[Depends(user_api_key_auth)],
)
async def health_backlog():
    """
    Returns the number of HTTP requests currently in-flight on this uvicorn worker.

    Use this to measure per-pod queue depth. A high value means the worker is
    processing many concurrent requests; requests arriving now will have to wait
    for the event loop to get to them, adding latency before LiteLLM even starts
    its own timer.
    """
    return {"in_flight_requests": get_in_flight_requests()}


@router.get(
    "/health/liveliness",  # Historical LiteLLM name; doesn't match k8s terminology but kept for backwards compatibility
    tags=["health"],
81 changes: 81 additions & 0 deletions litellm/proxy/middleware/in_flight_requests_middleware.py
@@ -0,0 +1,81 @@
"""
Tracks the number of HTTP requests currently in-flight on this uvicorn worker.

Used by /health/backlog to expose per-pod queue depth, and emitted as the
Prometheus gauge `litellm_in_flight_requests`.
"""

import os
from typing import Optional

from starlette.types import ASGIApp, Receive, Scope, Send


class InFlightRequestsMiddleware:
"""
ASGI middleware that increments a counter when a request arrives and
decrements it when the response is sent (or an error occurs).

The counter is class-level and therefore scoped to a single uvicorn worker
process β€” exactly the per-pod granularity we want.

Also updates the `litellm_in_flight_requests` Prometheus gauge if
prometheus_client is installed. The gauge is lazily initialised on the
first request so that PROMETHEUS_MULTIPROC_DIR is already set by the time
we register the metric. Initialisation is attempted only once β€” if
prometheus_client is absent the class remembers and never retries.
"""

_in_flight: int = 0
_gauge: Optional[object] = None
_gauge_init_attempted: bool = False

def __init__(self, app: ASGIApp) -> None:
self.app = app

async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
if scope["type"] != "http":
await self.app(scope, receive, send)
return

InFlightRequestsMiddleware._in_flight += 1
gauge = InFlightRequestsMiddleware._get_gauge()
if gauge is not None:
gauge.inc() # type: ignore[union-attr]
try:
await self.app(scope, receive, send)
finally:
InFlightRequestsMiddleware._in_flight -= 1
if gauge is not None:
gauge.dec() # type: ignore[union-attr]

@staticmethod
def get_count() -> int:
"""Return the number of HTTP requests currently in-flight."""
return InFlightRequestsMiddleware._in_flight

@staticmethod
def _get_gauge() -> Optional[object]:
if InFlightRequestsMiddleware._gauge_init_attempted:
return InFlightRequestsMiddleware._gauge
InFlightRequestsMiddleware._gauge_init_attempted = True
try:
from prometheus_client import Gauge

kwargs = {}
if "PROMETHEUS_MULTIPROC_DIR" in os.environ:
# livesum aggregates across all worker processes in the scrape response
kwargs["multiprocess_mode"] = "livesum"
InFlightRequestsMiddleware._gauge = Gauge(
"litellm_in_flight_requests",
"Number of HTTP requests currently in-flight on this uvicorn worker",
**kwargs,
)
except Exception:
InFlightRequestsMiddleware._gauge = None
return InFlightRequestsMiddleware._gauge


def get_in_flight_requests() -> int:
"""Module-level convenience wrapper used by the /health/backlog endpoint."""
return InFlightRequestsMiddleware.get_count()
4 changes: 4 additions & 0 deletions litellm/proxy/proxy_server.py
@@ -424,6 +424,9 @@ def generate_feedback_box():
    router as user_agent_analytics_router,
)
from litellm.proxy.management_helpers.audit_logs import create_audit_log_for_update
from litellm.proxy.middleware.in_flight_requests_middleware import (
    InFlightRequestsMiddleware,
)
from litellm.proxy.middleware.prometheus_auth_middleware import PrometheusAuthMiddleware
from litellm.proxy.ocr_endpoints.endpoints import router as ocr_router
from litellm.proxy.openai_evals_endpoints.endpoints import router as evals_router
@@ -1404,6 +1407,7 @@ def _restructure_ui_html_files(ui_root: str) -> None:
)

app.add_middleware(PrometheusAuthMiddleware)
app.add_middleware(InFlightRequestsMiddleware)


def mount_swagger_ui():
1 change: 1 addition & 0 deletions litellm/types/integrations/prometheus.py
@@ -237,6 +237,7 @@ class UserAPIKeyLabelNames(Enum):
"litellm_remaining_api_key_tokens_for_model",
"litellm_llm_api_failed_requests_metric",
"litellm_callback_logging_failures_metric",
"litellm_in_flight_requests",
]

