Merged
17 commits
4fa6742
Add Prometheus child_exit cleanup for gunicorn workers
ryan-crabbe Feb 28, 2026
e445a32
Merge pull request #22324 from BerriAI/litellm_prometheus_child_exit_…
ryan-crabbe Feb 28, 2026
af6fe18
docs: update AssemblyAI docs with Universal-3 Pro, Speech Understandi…
dylan-duan-aai Feb 28, 2026
cb4cfa1
fix(mcp): update test mocks to use renamed filter_server_ids_by_ip_wi…
jquinter Feb 28, 2026
9a48c8e
fix(test): update realtime guardrail test assertions for voice violat…
jquinter Feb 28, 2026
aa899b5
fix(bedrock): restore parallel_tool_calls mapping in map_openai_params
jquinter Feb 28, 2026
273994b
fix(test): update Azure pass-through test to mock litellm.completion
jquinter Feb 28, 2026
15fcd90
feat: add in_flight_requests metric to /health/backlog + prometheus (…
ishaan-jaff Feb 28, 2026
d292ed7
fix(db): add missing migration for LiteLLM_ClaudeCodePluginTable
jquinter Feb 28, 2026
2ac5365
fix: update stale docstring to match guardrail voicing behavior
jquinter Feb 28, 2026
9b20a05
Merge pull request #22332 from BerriAI/fix/realtime-guardrail-test-as…
jquinter Feb 28, 2026
332dc32
Merge pull request #22334 from BerriAI/fix/anthropic-passthrough-azur…
jquinter Feb 28, 2026
b192ba7
Merge pull request #22335 from BerriAI/fix/claude-code-plugin-migration
jquinter Feb 28, 2026
9278a31
Merge pull request #22333 from BerriAI/fix/bedrock-parallel-tool-call…
jquinter Feb 28, 2026
8ce358e
[Feat] Agent RBAC Permission Fix - Ensure Internal Users cannot creat…
ishaan-jaff Feb 28, 2026
8b50703
Merge pull request #22327 from BerriAI/fix/mcp-test-mock-filter-metho…
jquinter Feb 28, 2026
b57a908
fix: exclude gpt-5.2-chat from temperature passthrough (#21911)
giulio-leone Feb 28, 2026
179 changes: 144 additions & 35 deletions docs/my-website/docs/pass_through/assembly_ai.md
@@ -1,85 +1,194 @@
# AssemblyAI

Pass-through endpoints for AssemblyAI - call AssemblyAI endpoints, in native format (no translation).

| Feature | Supported | Notes |
|-------|-------|-------|
| Cost Tracking | ✅ | works across all integrations |
| Logging | ✅ | works across all integrations |

Supports **ALL** AssemblyAI Endpoints

[**See All AssemblyAI Endpoints**](https://www.assemblyai.com/docs/api-reference)


<iframe width="840" height="500" src="https://www.loom.com/embed/aac3f4d74592448992254bfa79b9f62d?sid=267cd0ab-d92b-42fa-b97a-9f385ef8930c" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>

## Supported Routes

| AssemblyAI Service | LiteLLM Route | AssemblyAI Base URL |
|-------------------|---------------|---------------------|
| Speech-to-Text (US) | `/assemblyai/*` | `api.assemblyai.com` |
| Speech-to-Text (EU) | `/eu.assemblyai/*` | `eu.api.assemblyai.com` |

## Quick Start

Let's call the AssemblyAI [`/v2/transcripts` endpoint](https://www.assemblyai.com/docs/api-reference/transcripts)

1. Add AssemblyAI API Key to your environment

```bash
export ASSEMBLYAI_API_KEY=""
```

2. Start LiteLLM Proxy

```bash
litellm

# RUNNING on http://0.0.0.0:4000
```

3. Test it!

Let's call the AssemblyAI [`/v2/transcripts` endpoint](https://www.assemblyai.com/docs/api-reference/transcripts). Includes commented-out [Speech Understanding](https://www.assemblyai.com/docs/speech-understanding) features you can toggle on.

```python
import assemblyai as aai

aai.settings.base_url = "http://0.0.0.0:4000/assemblyai"  # <your-proxy-base-url>/assemblyai
aai.settings.api_key = "Bearer sk-1234"  # Bearer <your-virtual-key>

# Use a publicly-accessible URL
audio_file = "https://assembly.ai/wildfires.mp3"

# Or use a local file:
# audio_file = "./example.mp3"

config = aai.TranscriptionConfig(
    speech_models=["universal-3-pro", "universal-2"],
    language_detection=True,
    speaker_labels=True,
    # Speech understanding features
    # sentiment_analysis=True,
    # entity_detection=True,
    # auto_chapters=True,
    # summarization=True,
    # summary_type=aai.SummarizationType.bullets,
    # redact_pii=True,
    # content_safety=True,
)

transcript = aai.Transcriber().transcribe(audio_file, config=config)

if transcript.status == aai.TranscriptStatus.error:
    raise RuntimeError(f"Transcription failed: {transcript.error}")

print(f"\nFull Transcript:\n\n{transcript.text}")

# Optionally print speaker diarization results
# for utterance in transcript.utterances:
#     print(f"Speaker {utterance.speaker}: {utterance.text}")
```

4. [Prompting with Universal-3 Pro](https://www.assemblyai.com/docs/speech-to-text/prompting) (optional)

```python
import assemblyai as aai

aai.settings.base_url = "http://0.0.0.0:4000/assemblyai"  # <your-proxy-base-url>/assemblyai
aai.settings.api_key = "Bearer sk-1234"  # Bearer <your-virtual-key>

audio_file = "https://assemblyaiassets.com/audios/verbatim.mp3"

config = aai.TranscriptionConfig(
    speech_models=["universal-3-pro", "universal-2"],
    language_detection=True,
    prompt="Produce a transcript suitable for conversational analysis. Every disfluency is meaningful data. Include: fillers (um, uh, er, ah, hmm, mhm, like, you know, I mean), repetitions (I I, the the), restarts (I was- I went), stutters (th-that, b-but, no-not), and informal speech (gonna, wanna, gotta)",
)

transcript = aai.Transcriber().transcribe(audio_file, config)

print(transcript.text)
```

## Calling AssemblyAI EU endpoints

If you want to send your request to the AssemblyAI EU endpoint, set the SDK's base URL to `<your-proxy-base-url>/eu.assemblyai`:

```python
import assemblyai as aai

aai.settings.base_url = "http://0.0.0.0:4000/eu.assemblyai"  # <your-proxy-base-url>/eu.assemblyai
aai.settings.api_key = "Bearer sk-1234"  # Bearer <your-virtual-key>

# Use a publicly-accessible URL
audio_file = "https://assembly.ai/wildfires.mp3"

# Or use a local file:
# audio_file = "./path/to/file.mp3"

transcriber = aai.Transcriber()
transcript = transcriber.transcribe(audio_file)
print(transcript)
print(transcript.id)
```

## LLM Gateway

Use AssemblyAI's [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway) as an OpenAI-compatible provider — a unified API for Claude, GPT, and Gemini models with full LiteLLM logging, guardrails, and cost tracking support.

[**See Available Models**](https://www.assemblyai.com/docs/llm-gateway#available-models)

### Usage

#### LiteLLM Python SDK

```python
import litellm
import os

os.environ["ASSEMBLYAI_API_KEY"] = "your-assemblyai-api-key"

response = litellm.completion(
    model="assemblyai/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "What is the capital of France?"}]
)

print(response.choices[0].message.content)
```

#### LiteLLM Proxy

1. Config

```yaml
model_list:
  - model_name: assemblyai/*
    litellm_params:
      model: assemblyai/*
      api_key: os.environ/ASSEMBLYAI_API_KEY
```

2. Start proxy

```bash
litellm --config config.yaml

# RUNNING on http://0.0.0.0:4000
```

3. Test it!

```python
import requests

headers = {
    "authorization": "Bearer sk-1234"  # Bearer <your-virtual-key>
}

response = requests.post(
    "http://0.0.0.0:4000/v1/chat/completions",
    headers=headers,
    json={
        "model": "assemblyai/claude-sonnet-4-5-20250929",
        "messages": [
            {"role": "user", "content": "What is the capital of France?"}
        ],
        "max_tokens": 1000
    }
)

result = response.json()
print(result["choices"][0]["message"]["content"])
```
25 changes: 25 additions & 0 deletions docs/my-website/docs/proxy/prometheus.md
@@ -113,6 +113,31 @@ litellm_settings:
```


## Pod Health Metrics

Use these to measure per-pod queue depth and diagnose latency that occurs **before** LiteLLM starts processing a request.

| Metric Name | Type | Description |
|---|---|---|
| `litellm_in_flight_requests` | Gauge | Number of HTTP requests currently in-flight on this uvicorn worker. Tracks the pod's queue depth in real time. With multiple workers, values are summed across all live workers (`livesum`). |

### When to use this

LiteLLM measures latency from when its handler starts. If a request waits in uvicorn's event loop before the handler runs, that wait is invisible to LiteLLM's own logs. `litellm_in_flight_requests` shows how loaded the pod was at any point in time.

```
high in_flight_requests + high ALB TargetResponseTime → pod overloaded, scale out
low in_flight_requests + high ALB TargetResponseTime → delay is pre-ASGI (event loop blocking)
```

You can also check the current value directly without Prometheus:

```bash
curl http://localhost:4000/health/backlog \
-H "Authorization: Bearer sk-..."
# {"in_flight_requests": 47}
```
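The high/low heuristic above can also be expressed as a small helper. This is a sketch: the `diagnose` function and its thresholds are illustrative assumptions, not part of LiteLLM.

```python
# Sketch: classify a pod from the in-flight gauge plus load-balancer latency.
# The function name and thresholds are illustrative, not LiteLLM APIs.

def diagnose(in_flight_requests: int, target_response_time_s: float,
             queue_high: int = 30, latency_high_s: float = 5.0) -> str:
    """Queue depth high + latency high => scale out; latency high alone
    => the delay happens before LiteLLM's handler ever runs."""
    backlog_high = in_flight_requests >= queue_high
    slow = target_response_time_s >= latency_high_s
    if backlog_high and slow:
        return "pod overloaded, scale out"
    if slow:
        return "delay is pre-ASGI (event loop blocking)"
    return "healthy"

print(diagnose(47, 12.0))  # pod overloaded, scale out
print(diagnose(2, 12.0))   # delay is pre-ASGI (event loop blocking)
```

Tune the thresholds to your own traffic; the point is the shape of the decision, not the specific cutoffs.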

## Proxy Level Tracking Metrics

Use this to track overall LiteLLM Proxy usage.
34 changes: 33 additions & 1 deletion docs/my-website/docs/troubleshoot/latency_overhead.md
@@ -2,9 +2,41 @@

Use this guide when you see unexpected latency overhead between LiteLLM proxy and the LLM provider.

## The Invisible Latency Gap

LiteLLM measures latency from when its handler starts. If a request waits in uvicorn's event loop **before** the handler runs, that wait is invisible to LiteLLM's own logs.

```
T=0 Request arrives at load balancer
[queue wait — LiteLLM never logs this]
T=10 LiteLLM handler starts → timer begins
T=20 Response sent

LiteLLM logs: 10s User experiences: 20s
```

To measure the pre-handler wait, poll `/health/backlog` on each pod:

```bash
curl http://localhost:4000/health/backlog \
-H "Authorization: Bearer sk-..."
# {"in_flight_requests": 47}
```

Or scrape the `litellm_in_flight_requests` Prometheus gauge at `/metrics`.

| `in_flight_requests` | ALB `TargetResponseTime` | Diagnosis |
|---|---|---|
| High | High | Pod overloaded → scale out |
| Low | High | Delay is pre-ASGI — check for sync blocking code or event loop saturation |
| High | Normal | Pod is busy but healthy, no queue buildup |

If you're on **AWS ALB**, correlate `litellm_in_flight_requests` spikes with ALB's `TargetResponseTime` CloudWatch metric. The gap between what ALB reports and what LiteLLM logs is the invisible wait.
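The timeline above reduces to simple arithmetic; a sketch (the function and field names are illustrative, not actual LiteLLM log keys or CloudWatch fields):

```python
# Sketch: the "invisible" wait is the end-to-end latency the load balancer
# observed minus the duration LiteLLM itself logged. Names are illustrative.

def invisible_wait_s(alb_target_response_time_s: float,
                     litellm_logged_s: float) -> float:
    """Pre-handler queue time that never appears in LiteLLM's own logs."""
    return max(0.0, alb_target_response_time_s - litellm_logged_s)

# From the timeline above: the user experienced 20s, LiteLLM logged 10s
print(invisible_wait_s(20.0, 10.0))  # 10.0
```

A consistently large value across requests, combined with a high `in_flight_requests` gauge, points at the first row of the table: scale out.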

## Quick Checklist

1. **Check `in_flight_requests` on each pod** via `/health/backlog` or the `litellm_in_flight_requests` Prometheus gauge — this tells you if requests are queuing before LiteLLM starts processing. Start here for unexplained latency.
2. **Collect the `x-litellm-overhead-duration-ms` response header** — this tells you LiteLLM's total overhead on every request.
3. **Is DEBUG logging enabled?** This is the #1 cause of latency with large payloads.
4. **Are you sending large base64 payloads?** (images, PDFs) — see [Large Payload Overhead](#large-payload-overhead).
5. **Enable detailed timing headers** to pinpoint where time is spent.
@@ -0,0 +1,18 @@
-- CreateTable
CREATE TABLE "LiteLLM_ClaudeCodePluginTable" (
    "id" TEXT NOT NULL,
    "name" TEXT NOT NULL,
    "version" TEXT,
    "description" TEXT,
    "manifest_json" TEXT,
    "files_json" TEXT DEFAULT '{}',
    "enabled" BOOLEAN NOT NULL DEFAULT true,
    "created_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
    "updated_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
    "created_by" TEXT,

    CONSTRAINT "LiteLLM_ClaudeCodePluginTable_pkey" PRIMARY KEY ("id")
);

-- CreateIndex
CREATE UNIQUE INDEX "LiteLLM_ClaudeCodePluginTable_name_key" ON "LiteLLM_ClaudeCodePluginTable"("name");
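As a quick local sanity check, the DDL can be executed against an in-memory SQLite database. The real migration targets PostgreSQL (via Prisma); SQLite merely tolerates these type names, so this is a smoke test of the statement's shape, not an equivalence check.

```python
import sqlite3

# Smoke-test the migration DDL in memory. SQLite accepts the Postgres-style
# type names used here (TEXT, BOOLEAN, TIMESTAMP(3)), so the statement can be
# exercised locally even though the real migration runs on PostgreSQL.
DDL = '''
CREATE TABLE "LiteLLM_ClaudeCodePluginTable" (
    "id" TEXT NOT NULL,
    "name" TEXT NOT NULL,
    "version" TEXT,
    "description" TEXT,
    "manifest_json" TEXT,
    "files_json" TEXT DEFAULT '{}',
    "enabled" BOOLEAN NOT NULL DEFAULT true,
    "created_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
    "updated_at" TIMESTAMP(3) DEFAULT CURRENT_TIMESTAMP,
    "created_by" TEXT,
    CONSTRAINT "LiteLLM_ClaudeCodePluginTable_pkey" PRIMARY KEY ("id")
);
CREATE UNIQUE INDEX "LiteLLM_ClaudeCodePluginTable_name_key"
    ON "LiteLLM_ClaudeCodePluginTable"("name");
'''

conn = sqlite3.connect(":memory:")
conn.executescript(DDL)
conn.execute(
    'INSERT INTO "LiteLLM_ClaudeCodePluginTable" ("id", "name") VALUES (?, ?)',
    ("plugin-1", "my-plugin"),
)
row = conn.execute(
    'SELECT "enabled", "files_json" FROM "LiteLLM_ClaudeCodePluginTable"'
).fetchone()
print(row)  # defaults applied: (1, '{}')
```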
8 changes: 8 additions & 0 deletions litellm/llms/bedrock/chat/converse_transformation.py
@@ -511,6 +511,7 @@ def get_supported_openai_params(self, model: str) -> List[str]:
        "response_format",
        "requestMetadata",
        "service_tier",
        "parallel_tool_calls",
    ]

if (
@@ -913,6 +914,13 @@ def map_openai_params(
            )
            if _tool_choice_value is not None:
                optional_params["tool_choice"] = _tool_choice_value
            if param == "parallel_tool_calls":
                disable_parallel = not value
                optional_params["_parallel_tool_use_config"] = {
                    "tool_choice": {
                        "disable_parallel_tool_use": disable_parallel
                    }
                }
            if param == "thinking":
                optional_params["thinking"] = value
            elif param == "reasoning_effort" and isinstance(value, str):
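The branch added in this diff inverts OpenAI's `parallel_tool_calls` flag into Bedrock's `disable_parallel_tool_use` setting. A standalone sketch of just that mapping (the helper name is illustrative; in the diff the logic runs inline inside `map_openai_params`):

```python
# Sketch of the inverted mapping restored by this diff: OpenAI's
# parallel_tool_calls=False becomes Bedrock's disable_parallel_tool_use=True.
# The helper name is illustrative, not a real LiteLLM function.

def map_parallel_tool_calls(value: bool) -> dict:
    disable_parallel = not value
    return {
        "_parallel_tool_use_config": {
            "tool_choice": {"disable_parallel_tool_use": disable_parallel}
        }
    }

print(map_parallel_tool_calls(False))
# {'_parallel_tool_use_config': {'tool_choice': {'disable_parallel_tool_use': True}}}
```

Note the polarity flip: the OpenAI flag says what to *enable*, the Bedrock config says what to *disable*, which is exactly why the mapping is easy to drop by accident.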
9 changes: 7 additions & 2 deletions litellm/llms/openai/chat/gpt_5_transformation.py
@@ -40,11 +40,16 @@ def is_model_gpt_5_1_model(cls, model: str) -> bool:
        gpt-5.1/5.2 support temperature when reasoning_effort="none",
        unlike base gpt-5 which only supports temperature=1. Excludes
        pro variants which keep stricter knobs and gpt-5.2-chat variants
        which only support temperature=1.
        """
        model_name = model.split("/")[-1]
        is_gpt_5_1 = model_name.startswith("gpt-5.1")
        is_gpt_5_2 = (
            model_name.startswith("gpt-5.2")
            and "pro" not in model_name
            and not model_name.startswith("gpt-5.2-chat")
        )
        return is_gpt_5_1 or is_gpt_5_2

@classmethod
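The corrected check is self-contained and easy to exercise. This sketch mirrors the logic from the diff as a plain function (not the actual classmethod) so the prefix rules can be seen in isolation:

```python
# Standalone sketch of the corrected version check from the diff:
# gpt-5.1 and gpt-5.2 support temperature with reasoning_effort="none",
# but "pro" and "gpt-5.2-chat" variants only allow temperature=1.

def supports_flexible_temperature(model: str) -> bool:
    model_name = model.split("/")[-1]  # strip any provider prefix
    is_gpt_5_1 = model_name.startswith("gpt-5.1")
    is_gpt_5_2 = (
        model_name.startswith("gpt-5.2")
        and "pro" not in model_name
        and not model_name.startswith("gpt-5.2-chat")
    )
    return is_gpt_5_1 or is_gpt_5_2

print(supports_flexible_temperature("openai/gpt-5.2"))       # True
print(supports_flexible_temperature("openai/gpt-5.2-chat"))  # False
```

Because the check is prefix-based, `gpt-5.2-chat` has to be excluded explicitly: it would otherwise match the `gpt-5.2` prefix and wrongly pass temperature through.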
4 changes: 4 additions & 0 deletions litellm/llms/openai_like/providers.json
@@ -90,5 +90,9 @@
    "headers": {
      "api-subscription-key": "{api_key}"
    }
  },
  "assemblyai": {
    "base_url": "https://llm-gateway.assemblyai.com/v1",
    "api_key_env": "ASSEMBLYAI_API_KEY"
  }
}