BerriAI · Sameerlite · Jan 29, 2026 · Jan 28, 2026 · Jan 28, 2026 · Jan 28, 2026
diff --git a/...extras/litellm_proxy_extras/migrations/20251119131227_add_prompt_versioning/migration.sql b/...extras/litellm_proxy_extras/migrations/20251119131227_add_prompt_versioning/migration.sql
@@ -1,12 +1,12 @@
 -- DropIndex
-DROP INDEX "LiteLLM_PromptTable_prompt_id_key";
+DROP INDEX IF EXISTS "LiteLLM_PromptTable_prompt_id_key";
 
 -- AlterTable
-ALTER TABLE "LiteLLM_PromptTable" ADD COLUMN     "version" INTEGER NOT NULL DEFAULT 1;
+ALTER TABLE "LiteLLM_PromptTable"
+ADD COLUMN "version" INTEGER NOT NULL DEFAULT 1;
 
 -- CreateIndex
-CREATE INDEX "LiteLLM_PromptTable_prompt_id_idx" ON "LiteLLM_PromptTable"("prompt_id");
+CREATE INDEX "LiteLLM_PromptTable_prompt_id_idx" ON "LiteLLM_PromptTable" ("prompt_id");
 
 -- CreateIndex
-CREATE UNIQUE INDEX "LiteLLM_PromptTable_prompt_id_version_key" ON "LiteLLM_PromptTable"("prompt_id", "version");
-
+CREATE UNIQUE INDEX "LiteLLM_PromptTable_prompt_id_version_key" ON "LiteLLM_PromptTable" ("prompt_id", "version");
diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
@@ -290,10 +290,19 @@ def _map_tool_choice(
         elif tool_choice == "none":
             _tool_choice = AnthropicMessagesToolChoice(type="none")
         elif isinstance(tool_choice, dict):
-            _tool_name = tool_choice.get("function", {}).get("name")
-            _tool_choice = AnthropicMessagesToolChoice(type="tool")
-            if _tool_name is not None:
-                _tool_choice["name"] = _tool_name
+            if "type" in tool_choice and "function" not in tool_choice:
+                tool_type = tool_choice.get("type")
+                if tool_type == "auto":
+                    _tool_choice = AnthropicMessagesToolChoice(type="auto")
+                elif tool_type == "required" or tool_type == "any":
+                    _tool_choice = AnthropicMessagesToolChoice(type="any")
+                elif tool_type == "none":
+                    _tool_choice = AnthropicMessagesToolChoice(type="none")
+            else:
+                _tool_name = tool_choice.get("function", {}).get("name")
+                if _tool_name is not None:
+                    _tool_choice = AnthropicMessagesToolChoice(type="tool")
+                    _tool_choice["name"] = _tool_name
 
         if parallel_tool_use is not None:
             # Anthropic uses 'disable_parallel_tool_use' flag to determine if parallel tool use is allowed

diff --git a/litellm/llms/azure/chat/gpt_5_transformation.py b/litellm/llms/azure/chat/gpt_5_transformation.py
@@ -22,7 +22,8 @@ def is_model_gpt_5_model(cls, model: str) -> bool:
         Accepts both explicit gpt-5 model names and the ``gpt5_series/`` prefix
         used for manual routing.
         """
-        return "gpt-5" in model or "gpt5_series" in model
+        # gpt-5-chat* is a chat model and shouldn't go through GPT-5 reasoning restrictions.
+        return ("gpt-5" in model and "gpt-5-chat" not in model) or "gpt5_series" in model
 
     def get_supported_openai_params(self, model: str) -> List[str]:
         """Get supported parameters for Azure OpenAI GPT-5 models.
@@ -37,6 +38,11 @@ def get_supported_openai_params(self, model: str) -> List[str]:
         """
         params = OpenAIGPT5Config.get_supported_openai_params(self, model=model)
 
+        # Azure supports tool_choice for GPT-5 deployments, but the base GPT-5 config
+        # can drop it when the deployment name isn't in the OpenAI model registry.
+        if "tool_choice" not in params:
+            params.append("tool_choice")
+
         # Only gpt-5.2 has been verified to support logprobs on Azure
         if self.is_model_gpt_5_2_model(model):
             azure_supported_params = ["logprobs", "top_logprobs"]

diff --git a/litellm/llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py b/litellm/llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py
@@ -53,13 +53,26 @@ def map_openai_params(
         model: str,
         drop_params: bool,
     ) -> dict:
-        return AnthropicConfig.map_openai_params(
+        # Force tool-based structured outputs for Bedrock Invoke
+        # (similar to VertexAI fix in #19201)
+        # Bedrock Invoke doesn't support output_format parameter
+        original_model = model
+        if "response_format" in non_default_params:
+            # Use a model name that forces tool-based approach
+            model = "claude-3-sonnet-20240229"
+
+        optional_params = AnthropicConfig.map_openai_params(
             self,
             non_default_params,
             optional_params,
             model,
             drop_params,
         )
+
+        # NOTE: rebinding the local `model` below has no effect on the caller; only the mapping call above saw the substituted name
+        model = original_model
+
+        return optional_params
 
 
     def transform_request(
@@ -90,6 +103,8 @@ def transform_request(
 
         _anthropic_request.pop("model", None)
         _anthropic_request.pop("stream", None)
+        # Bedrock Invoke doesn't support output_format parameter
+        _anthropic_request.pop("output_format", None)
         if "anthropic_version" not in _anthropic_request:
             _anthropic_request["anthropic_version"] = self.anthropic_version
 
@@ -117,6 +132,26 @@ def transform_request(
             if "opus-4" in model.lower() or "opus_4" in model.lower():
                 beta_set.add("tool-search-tool-2025-10-19")
 
+        # Filter out beta headers that Bedrock Invoke doesn't support
+        # AWS Bedrock only supports a specific whitelist of beta flags
+        # Reference: https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-messages-request-response.html
+        BEDROCK_SUPPORTED_BETAS = {
+            "computer-use-2024-10-22",  # Legacy computer use
+            "computer-use-2025-01-24",  # Current computer use (Claude 3.7 Sonnet)
+            "token-efficient-tools-2025-02-19",  # Tool use (Claude 3.7+ and Claude 4+)
+            "interleaved-thinking-2025-05-14",  # Interleaved thinking (Claude 4+)
+            "output-128k-2025-02-19",  # 128K output tokens (Claude 3.7 Sonnet)
+            "dev-full-thinking-2025-05-14",  # Developer mode for raw thinking (Claude 4+)
+            "context-1m-2025-08-07",  # 1 million tokens (Claude Sonnet 4)
+            "context-management-2025-06-27",  # Context management (Claude Sonnet/Haiku 4.5)
+            "effort-2025-11-24",  # Effort parameter (Claude Opus 4.5)
+            "tool-search-tool-2025-10-19",  # Tool search (Claude Opus 4.5)
+            "tool-examples-2025-10-29",  # Tool use examples (Claude Opus 4.5)
+        }
+
+        # Only keep beta headers that Bedrock supports
+        beta_set = {beta for beta in beta_set if beta in BEDROCK_SUPPORTED_BETAS}
+
         if beta_set:
             _anthropic_request["anthropic_beta"] = list(beta_set)
 

diff --git a/litellm/llms/openai/chat/gpt_5_transformation.py b/litellm/llms/openai/chat/gpt_5_transformation.py
@@ -19,7 +19,9 @@ class OpenAIGPT5Config(OpenAIGPTConfig):
 
     @classmethod
     def is_model_gpt_5_model(cls, model: str) -> bool:
-        return "gpt-5" in model
+        # gpt-5-chat* behaves like a regular chat model (supports temperature, etc.)
+        # Don't route it through GPT-5 reasoning-specific parameter restrictions.
+        return "gpt-5" in model and "gpt-5-chat" not in model
 
     @classmethod
     def is_model_gpt_5_codex_model(cls, model: str) -> bool:

diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
@@ -1657,7 +1657,17 @@ def _calculate_usage(  # noqa: PLR0915
         ## This is necessary because promptTokensDetails includes both cached and non-cached tokens
         ## See: https://github.com/BerriAI/litellm/issues/18750
         if cached_text_tokens is not None and prompt_text_tokens is not None:
+            # Explicit caching: subtract cached tokens per modality from cacheTokensDetails
             prompt_text_tokens = prompt_text_tokens - cached_text_tokens
+        elif (
+            cached_tokens is not None
+            and prompt_text_tokens is not None
+            and cached_text_tokens is None
+        ):
+            # Implicit caching: only cachedContentTokenCount is provided (no cacheTokensDetails)
+            # Subtract from text tokens since implicit caching is primarily for text content
+            # See: https://github.com/BerriAI/litellm/issues/16341
+            prompt_text_tokens = prompt_text_tokens - cached_tokens
         if cached_audio_tokens is not None and prompt_audio_tokens is not None:
             prompt_audio_tokens = prompt_audio_tokens - cached_audio_tokens
         if cached_image_tokens is not None and prompt_image_tokens is not None:

diff --git a/litellm/main.py b/litellm/main.py
@@ -7280,8 +7280,11 @@ def _get_encoding():
 def __getattr__(name: str) -> Any:
     """Lazy import handler for main module"""
     if name == "encoding":
-        # Lazy load encoding to avoid heavy tiktoken import at module load time
-        _encoding = tiktoken.get_encoding("cl100k_base")
+        # Use _get_default_encoding which properly sets TIKTOKEN_CACHE_DIR
+        # before loading tiktoken, ensuring the local cache is used
+        # instead of downloading from the internet
+        from litellm._lazy_imports import _get_default_encoding
+        _encoding = _get_default_encoding()
         # Cache it in the module's __dict__ for subsequent accesses
         import sys
 

diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
@@ -3130,7 +3130,7 @@
         "supports_reasoning": true,
         "supports_response_schema": true,
         "supports_system_messages": true,
-        "supports_tool_choice": false,
+        "supports_tool_choice": true,
         "supports_vision": true
     },
     "azure/gpt-5-chat-latest": {
@@ -3162,7 +3162,7 @@
         "supports_reasoning": true,
         "supports_response_schema": true,
         "supports_system_messages": true,
-        "supports_tool_choice": false,
+        "supports_tool_choice": true,
         "supports_vision": true
     },
     "azure/gpt-5-codex": {

diff --git a/litellm/proxy/common_request_processing.py b/litellm/proxy/common_request_processing.py
@@ -650,11 +650,15 @@ async def base_process_llm_request(
         )
 
         tasks = []
+        # Start the moderation check (during_call_hook) as early as possible
+        # This gives it a head start to mask/validate input while the proxy handles routing
         tasks.append(
-            proxy_logging_obj.during_call_hook(
-                data=self.data,
-                user_api_key_dict=user_api_key_dict,
-                call_type=route_type,  # type: ignore
+            asyncio.create_task(
+                proxy_logging_obj.during_call_hook(
+                    data=self.data,
+                    user_api_key_dict=user_api_key_dict,
+                    call_type=route_type,  # type: ignore
+                )
             )
         )