Merged

43 commits
a1c939b
fix(image_generation): propagate extra_headers to OpenAI image genera…
ZeroClover Feb 24, 2026
a5b4dfe
test(image_generation): add tests for extra_headers propagation
ZeroClover Feb 24, 2026
4fa6742
Add Prometheus child_exit cleanup for gunicorn workers
ryan-crabbe Feb 28, 2026
e445a32
Merge pull request #22324 from BerriAI/litellm_prometheus_child_exit_…
ryan-crabbe Feb 28, 2026
af6fe18
docs: update AssemblyAI docs with Universal-3 Pro, Speech Understandi…
dylan-duan-aai Feb 28, 2026
cb4cfa1
fix(mcp): update test mocks to use renamed filter_server_ids_by_ip_wi…
jquinter Feb 28, 2026
9a48c8e
fix(test): update realtime guardrail test assertions for voice violat…
jquinter Feb 28, 2026
aa899b5
fix(bedrock): restore parallel_tool_calls mapping in map_openai_params
jquinter Feb 28, 2026
273994b
fix(test): update Azure pass-through test to mock litellm.completion
jquinter Feb 28, 2026
15fcd90
feat: add in_flight_requests metric to /health/backlog + prometheus (…
ishaan-jaff Feb 28, 2026
d292ed7
fix(db): add missing migration for LiteLLM_ClaudeCodePluginTable
jquinter Feb 28, 2026
2ac5365
fix: update stale docstring to match guardrail voicing behavior
jquinter Feb 28, 2026
9b20a05
Merge pull request #22332 from BerriAI/fix/realtime-guardrail-test-as…
jquinter Feb 28, 2026
332dc32
Merge pull request #22334 from BerriAI/fix/anthropic-passthrough-azur…
jquinter Feb 28, 2026
b192ba7
Merge pull request #22335 from BerriAI/fix/claude-code-plugin-migration
jquinter Feb 28, 2026
9278a31
Merge pull request #22333 from BerriAI/fix/bedrock-parallel-tool-call…
jquinter Feb 28, 2026
fb72979
fix(caching): store background task references in LLMClientCache._rem…
shivaaang Feb 26, 2026
8ce358e
[Feat] Agent RBAC Permission Fix - Ensure Internal Users cannot creat…
ishaan-jaff Feb 28, 2026
8b50703
Merge pull request #22327 from BerriAI/fix/mcp-test-mock-filter-metho…
jquinter Feb 28, 2026
594600d
fix: Add PROXY_ADMIN role to system user for key rotation (#21896)
milan-berri Feb 28, 2026
3e60ca3
fix: populate user_id and user_info for admin users in /user/info (#2…
milan-berri Feb 28, 2026
98bc247
Merge pull request #22143 from shivaaang/fix/llm-client-cache-unawait…
jquinter Feb 28, 2026
2d649f2
Merge pull request #22026 from ZeroClover/fix/img-extra-headers
jquinter Feb 28, 2026
d49abf8
[Fix] Pass MCP auth headers from request into tool fetch for /v1/resp…
shivamrawat1 Feb 28, 2026
fcabf9b
fix(bedrock): filter internal json_tool_call when mixed with real tools
jquinter Feb 13, 2026
b983352
refactor: extract duplicated JSON unwrapping into helper method
jquinter Feb 15, 2026
fdf4c9d
fix: use correct class name AmazonConverseConfig in helper method calls
jquinter Feb 15, 2026
37ec9f8
fix: shorten guardrail benchmark result filenames for Windows long pa…
demoray Feb 24, 2026
bcf9acf
Update litellm/proxy/guardrails/guardrail_hooks/litellm_content_filte…
demoray Feb 24, 2026
9db4ab1
Merge pull request #22039 from demoray/bcaswell/fix-long-path-filenames
jquinter Feb 28, 2026
bffce84
Remove Apache 2 license from SKILL.md (#22322)
rasmi Feb 28, 2026
eea083f
fix(mcp): default available_on_public_internet to true (#22331)
ishaan-jaff Feb 28, 2026
8ab5428
Merge pull request #21107 from BerriAI/fix/bedrock-filter-json-tool-c…
jquinter Feb 28, 2026
ee703ce
fix(jwt): OIDC discovery URLs, roles array handling, dot-notation err…
ishaan-jaff Feb 28, 2026
9f24537
perf: streaming latency improvements — 4 targeted hot-path fixes (#22…
ishaan-jaff Feb 28, 2026
c4a0174
test: add comprehensive Vitest coverage for CostTrackingSettings
yuneng-jiang Feb 28, 2026
85590e4
Merge pull request #22354 from BerriAI/litellm_cost_tracking_vitest
yuneng-jiang Feb 28, 2026
6d8b5b7
[Feature] Key list endpoint: Add project_id and access_group_id filters
yuneng-jiang Feb 28, 2026
061703a
[Feature] UI - Projects: Add Project Details page with Edit modal
yuneng-jiang Feb 28, 2026
c4b21fa
Merge pull request #22356 from BerriAI/litellm_key_list_filters
yuneng-jiang Feb 28, 2026
50dc7b5
Update ui/litellm-dashboard/src/components/Projects/ProjectModals/Cre…
yuneng-jiang Feb 28, 2026
a4b7a93
Merge pull request #22360 from BerriAI/litellm_ui_project_info
yuneng-jiang Feb 28, 2026
994998d
fix(azure): forward realtime_protocol from config and relax api_versi…
giulio-leone Feb 28, 2026
179 changes: 144 additions & 35 deletions docs/my-website/docs/pass_through/assembly_ai.md
@@ -1,85 +1,194 @@
# AssemblyAI

Pass-through endpoints for AssemblyAI - call AssemblyAI endpoints, in native format (no translation).

| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ✅ | works across all integrations |
| Logging | ✅ | works across all integrations |


Supports **ALL** AssemblyAI Endpoints

[**See All AssemblyAI Endpoints**](https://www.assemblyai.com/docs/api-reference)


<iframe width="840" height="500" src="https://www.loom.com/embed/aac3f4d74592448992254bfa79b9f62d?sid=267cd0ab-d92b-42fa-b97a-9f385ef8930c" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
## Supported Routes

| AssemblyAI Service | LiteLLM Route | AssemblyAI Base URL |
|-------------------|---------------|---------------------|
| Speech-to-Text (US) | `/assemblyai/*` | `api.assemblyai.com` |
| Speech-to-Text (EU) | `/eu.assemblyai/*` | `eu.api.assemblyai.com` |

## Quick Start

Let's call the AssemblyAI [`/v2/transcripts` endpoint](https://www.assemblyai.com/docs/api-reference/transcripts)

1. Add AssemblyAI API Key to your environment

```bash
export ASSEMBLYAI_API_KEY=""
```

2. Start LiteLLM Proxy

```bash
litellm

# RUNNING on http://0.0.0.0:4000
```

3. Test it!

Let's call the AssemblyAI [`/v2/transcripts` endpoint](https://www.assemblyai.com/docs/api-reference/transcripts). Includes commented-out [Speech Understanding](https://www.assemblyai.com/docs/speech-understanding) features you can toggle on.

```python
import assemblyai as aai

aai.settings.base_url = "http://0.0.0.0:4000/assemblyai" # <your-proxy-base-url>/assemblyai
aai.settings.api_key = "Bearer sk-1234" # Bearer <your-virtual-key>

# Use a publicly-accessible URL
audio_file = "https://assembly.ai/wildfires.mp3"

# Or use a local file:
# audio_file = "./example.mp3"

config = aai.TranscriptionConfig(
    speech_models=["universal-3-pro", "universal-2"],
    language_detection=True,
    speaker_labels=True,
    # Speech understanding features
    # sentiment_analysis=True,
    # entity_detection=True,
    # auto_chapters=True,
    # summarization=True,
    # summary_type=aai.SummarizationType.bullets,
    # redact_pii=True,
    # content_safety=True,
)

transcript = aai.Transcriber().transcribe(audio_file, config=config)

if transcript.status == aai.TranscriptStatus.error:
    raise RuntimeError(f"Transcription failed: {transcript.error}")

print(f"\nFull Transcript:\n\n{transcript.text}")

# Optionally print speaker diarization results
# for utterance in transcript.utterances:
#     print(f"Speaker {utterance.speaker}: {utterance.text}")
```

4. [Prompting with Universal-3 Pro](https://www.assemblyai.com/docs/speech-to-text/prompting) (optional)

```python
import assemblyai as aai

aai.settings.base_url = "http://0.0.0.0:4000/assemblyai" # <your-proxy-base-url>/assemblyai
aai.settings.api_key = "Bearer sk-1234" # Bearer <your-virtual-key>

audio_file = "https://assemblyaiassets.com/audios/verbatim.mp3"

config = aai.TranscriptionConfig(
    speech_models=["universal-3-pro", "universal-2"],
    language_detection=True,
    prompt="Produce a transcript suitable for conversational analysis. Every disfluency is meaningful data. Include: fillers (um, uh, er, ah, hmm, mhm, like, you know, I mean), repetitions (I I, the the), restarts (I was- I went), stutters (th-that, b-but, no-not), and informal speech (gonna, wanna, gotta)",
)

transcript = aai.Transcriber().transcribe(audio_file, config)

print(transcript.text)
```

## Calling AssemblyAI EU endpoints

If you want to send your request to the AssemblyAI EU endpoint, set `aai.settings.base_url` to `<your-proxy-base-url>/eu.assemblyai`:

```python
import assemblyai as aai

aai.settings.base_url = "http://0.0.0.0:4000/eu.assemblyai" # <your-proxy-base-url>/eu.assemblyai
aai.settings.api_key = "Bearer sk-1234" # Bearer <your-virtual-key>

# Use a publicly-accessible URL
audio_file = "https://assembly.ai/wildfires.mp3"

# Or use a local file:
# audio_file = "./path/to/file.mp3"

transcriber = aai.Transcriber()
transcript = transcriber.transcribe(audio_file)
print(transcript)
print(transcript.id)
```

## LLM Gateway

Use AssemblyAI's [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway) as an OpenAI-compatible provider — a unified API for Claude, GPT, and Gemini models with full LiteLLM logging, guardrails, and cost tracking support.

[**See Available Models**](https://www.assemblyai.com/docs/llm-gateway#available-models)

### Usage

#### LiteLLM Python SDK

```python
import litellm
import os

os.environ["ASSEMBLYAI_API_KEY"] = "your-assemblyai-api-key"

response = litellm.completion(
    model="assemblyai/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "What is the capital of France?"}]
)

print(response.choices[0].message.content)
```

#### LiteLLM Proxy

1. Config

```yaml
model_list:
  - model_name: assemblyai/*
    litellm_params:
      model: assemblyai/*
      api_key: os.environ/ASSEMBLYAI_API_KEY
```

2. Start proxy

```bash
litellm --config config.yaml

# RUNNING on http://0.0.0.0:4000
```

3. Test it!

```python
import requests

headers = {
    "authorization": "Bearer sk-1234" # Bearer <your-virtual-key>
}

response = requests.post(
    "http://0.0.0.0:4000/v1/chat/completions",
    headers=headers,
    json={
        "model": "assemblyai/claude-sonnet-4-5-20250929",
        "messages": [
            {"role": "user", "content": "What is the capital of France?"}
        ],
        "max_tokens": 1000
    }
)

result = response.json()
print(result["choices"][0]["message"]["content"])
```
25 changes: 25 additions & 0 deletions docs/my-website/docs/proxy/prometheus.md
@@ -113,6 +113,31 @@ litellm_settings:
```


## Pod Health Metrics

Use these to measure per-pod queue depth and diagnose latency that occurs **before** LiteLLM starts processing a request.

| Metric Name | Type | Description |
|---|---|---|
| `litellm_in_flight_requests` | Gauge | Number of HTTP requests currently in-flight on this uvicorn worker. Tracks the pod's queue depth in real time. With multiple workers, values are summed across all live workers (`livesum`). |

### When to use this

LiteLLM measures latency from when its handler starts. If a request waits in uvicorn's event loop before the handler runs, that wait is invisible to LiteLLM's own logs. `litellm_in_flight_requests` shows how loaded the pod was at any point in time.

```
high in_flight_requests + high ALB TargetResponseTime → pod overloaded, scale out
low in_flight_requests + high ALB TargetResponseTime → delay is pre-ASGI (event loop blocking)
```

You can also check the current value directly without Prometheus:

```bash
curl http://localhost:4000/health/backlog \
-H "Authorization: Bearer sk-..."
# {"in_flight_requests": 47}
```
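
If you want to sanity-check the gauge without a Prometheus server, a minimal sketch like the one below can scrape `/metrics` directly and sum the samples. This is an illustrative example, not part of LiteLLM: it assumes the proxy exposes `/metrics` on the same port and that no extra auth header is needed for that endpoint; adjust for your deployment.

```python
import requests

# Hypothetical local proxy address; point this at your own pod.
METRICS_URL = "http://localhost:4000/metrics"


def get_in_flight_requests(url: str = METRICS_URL) -> float:
    """Scrape the Prometheus text endpoint and sum litellm_in_flight_requests samples."""
    body = requests.get(url, timeout=5).text
    total = 0.0
    for line in body.splitlines():
        # Sample lines look like: litellm_in_flight_requests{...} 3.0
        # HELP/TYPE comment lines start with '#' and are skipped by this check.
        if line.startswith("litellm_in_flight_requests"):
            total += float(line.rsplit(" ", 1)[-1])
    return total


if __name__ == "__main__":
    print(f"in-flight requests: {get_in_flight_requests()}")
```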

## Proxy Level Tracking Metrics

Use this to track overall LiteLLM Proxy usage.
34 changes: 33 additions & 1 deletion docs/my-website/docs/troubleshoot/latency_overhead.md
@@ -2,9 +2,41 @@

Use this guide when you see unexpected latency overhead between LiteLLM proxy and the LLM provider.

## The Invisible Latency Gap

LiteLLM measures latency from when its handler starts. If a request waits in uvicorn's event loop **before** the handler runs, that wait is invisible to LiteLLM's own logs.

```
T=0   Request arrives at load balancer
        [queue wait — LiteLLM never logs this]
T=10  LiteLLM handler starts → timer begins
T=20  Response sent

LiteLLM logs: 10s        User experiences: 20s
```

To measure the pre-handler wait, poll `/health/backlog` on each pod:

```bash
curl http://localhost:4000/health/backlog \
-H "Authorization: Bearer sk-..."
# {"in_flight_requests": 47}
```

Or scrape the `litellm_in_flight_requests` Prometheus gauge at `/metrics`.

| `in_flight_requests` | ALB `TargetResponseTime` | Diagnosis |
|---|---|---|
| High | High | Pod overloaded → scale out |
| Low | High | Delay is pre-ASGI — check for sync blocking code or event loop saturation |
| High | Normal | Pod is busy but healthy, no queue buildup |

If you're on **AWS ALB**, correlate `litellm_in_flight_requests` spikes with ALB's `TargetResponseTime` CloudWatch metric. The gap between what ALB reports and what LiteLLM logs is the invisible wait.
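
If you want to watch queue depth over time rather than spot-check it, a small polling loop works. The sketch below is an illustrative example, not part of LiteLLM; the pod addresses and key are placeholders, and it assumes each pod exposes `/health/backlog` as shown above.

```python
import time

import requests

# Hypothetical pod addresses and virtual key; substitute your own.
PODS = ["http://pod-1:4000", "http://pod-2:4000"]
API_KEY = "sk-..."


def poll_backlog(interval_seconds: float = 5.0) -> None:
    """Print in-flight request counts per pod so spikes line up with ALB latency graphs."""
    headers = {"Authorization": f"Bearer {API_KEY}"}
    while True:
        for pod in PODS:
            try:
                resp = requests.get(f"{pod}/health/backlog", headers=headers, timeout=2)
                in_flight = resp.json().get("in_flight_requests")
                print(f"{time.strftime('%H:%M:%S')} {pod} in_flight={in_flight}")
            except requests.RequestException as err:
                print(f"{time.strftime('%H:%M:%S')} {pod} unreachable: {err}")
        time.sleep(interval_seconds)


if __name__ == "__main__":
    poll_backlog()
```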

## Quick Checklist

1. **Check `in_flight_requests` on each pod** via `/health/backlog` or the `litellm_in_flight_requests` Prometheus gauge — this tells you if requests are queuing before LiteLLM starts processing. Start here for unexplained latency.
2. **Collect the `x-litellm-overhead-duration-ms` response header** — this tells you LiteLLM's total overhead on every request.
3. **Is DEBUG logging enabled?** This is the #1 cause of latency with large payloads.
4. **Are you sending large base64 payloads?** (images, PDFs) — see [Large Payload Overhead](#large-payload-overhead).
5. **Enable detailed timing headers** to pinpoint where time is spent.
@@ -0,0 +1,18 @@
-- CreateTable
CREATE TABLE "LiteLLM_ClaudeCodePluginTable" (
"id" TEXT NOT NULL,
"name" TEXT NOT NULL,
"version" TEXT,
"description" TEXT,
"manifest_json" TEXT,
"files_json" TEXT DEFAULT '{}',
"enabled" BOOLEAN NOT NULL DEFAULT true,
"created_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
"updated_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
"created_by" TEXT,

CONSTRAINT "LiteLLM_ClaudeCodePluginTable_pkey" PRIMARY KEY ("id")
);

-- CreateIndex
CREATE UNIQUE INDEX "LiteLLM_ClaudeCodePluginTable_name_key" ON "LiteLLM_ClaudeCodePluginTable"("name");
2 changes: 1 addition & 1 deletion litellm-proxy-extras/litellm_proxy_extras/schema.prisma
@@ -300,7 +300,7 @@ model LiteLLM_MCPServerTable {
  token_url String?
  registration_url String?
  allow_all_keys Boolean @default(false)
  available_on_public_internet Boolean @default(true)
}

// Generate Tokens for Proxy
26 changes: 26 additions & 0 deletions litellm/caching/llm_caching_handler.py
@@ -3,11 +3,37 @@
"""

import asyncio
from typing import Set

from .in_memory_cache import InMemoryCache


class LLMClientCache(InMemoryCache):
    # Background tasks must be stored to prevent garbage collection, which would
    # trigger "coroutine was never awaited" warnings. See:
    # https://docs.python.org/3/library/asyncio-task.html#creating-tasks
    # Intentionally shared across all instances as a global task registry.
    _background_tasks: Set[asyncio.Task] = set()

    def _remove_key(self, key: str) -> None:
        """Close async clients before evicting them to prevent connection pool leaks."""
        value = self.cache_dict.get(key)
        super()._remove_key(key)
        if value is not None:
            close_fn = getattr(value, "aclose", None) or getattr(value, "close", None)
            if close_fn and asyncio.iscoroutinefunction(close_fn):
                try:
                    task = asyncio.get_running_loop().create_task(close_fn())
                    self._background_tasks.add(task)
                    task.add_done_callback(self._background_tasks.discard)
                except RuntimeError:
                    pass
            elif close_fn and callable(close_fn):
                try:
                    close_fn()
                except Exception:
                    pass

    def update_cache_key_with_event_loop(self, key):
        """
        Add the event loop to the cache key, to prevent event loop closed errors.
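
The pattern in `_remove_key` above (keeping a strong reference to fire-and-forget tasks until they finish) is general asyncio hygiene rather than anything cache-specific. A minimal standalone sketch of the same idea, with hypothetical names:

```python
import asyncio
from typing import Set

# Module-level registry: the event loop holds only a weak reference to tasks,
# so without a strong reference a pending task can be garbage-collected early.
_background_tasks: Set[asyncio.Task] = set()


async def _close_client(name: str) -> None:
    """Stand-in for an async client's aclose(); sleeps to simulate teardown work."""
    await asyncio.sleep(0.1)
    print(f"closed {name}")


def schedule_close(name: str) -> None:
    """Fire-and-forget close, keeping the task alive until it completes."""
    task = asyncio.get_running_loop().create_task(_close_client(name))
    _background_tasks.add(task)
    # Drop the reference once the task is done so the set does not grow forever.
    task.add_done_callback(_background_tasks.discard)


async def main() -> None:
    schedule_close("client-a")
    schedule_close("client-b")
    # Give the background tasks time to finish before the loop exits.
    await asyncio.sleep(0.5)


if __name__ == "__main__":
    asyncio.run(main())
```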
4 changes: 2 additions & 2 deletions litellm/constants.py
@@ -193,9 +193,9 @@

# Aiohttp connection pooling - prevents memory leaks from unbounded connection growth
# Set to 0 for unlimited (not recommended for production)
AIOHTTP_CONNECTOR_LIMIT = int(os.getenv("AIOHTTP_CONNECTOR_LIMIT", 1000))
AIOHTTP_CONNECTOR_LIMIT_PER_HOST = int(
    os.getenv("AIOHTTP_CONNECTOR_LIMIT_PER_HOST", 500)
)
AIOHTTP_KEEPALIVE_TIMEOUT = int(os.getenv("AIOHTTP_KEEPALIVE_TIMEOUT", 120))
AIOHTTP_TTL_DNS_CACHE = int(os.getenv("AIOHTTP_TTL_DNS_CACHE", 300))
1 change: 1 addition & 0 deletions litellm/images/main.py
@@ -483,6 +483,7 @@ def image_generation( # noqa: PLR0915
            organization=organization,
            aimg_generation=aimg_generation,
            client=client,
            headers=headers,
        )
    elif custom_llm_provider == "bedrock":
        if model is None:
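
The one-line `headers=headers` change above forwards the caller's headers into the OpenAI image-generation path (see the `fix(image_generation): propagate extra_headers` commit). A hedged usage sketch; the model name and header value are placeholders, and whether custom headers are needed depends on your gateway setup:

```python
import litellm

# Sketch only: illustrative header; after this fix it is forwarded to the
# OpenAI image-generation request rather than silently dropped.
response = litellm.image_generation(
    model="dall-e-3",
    prompt="A watercolor painting of a lighthouse at dusk",
    extra_headers={"X-Correlation-Id": "req-12345"},
)

print(response.data[0].url)
```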