diff --git a/litellm-proxy-extras/litellm_proxy_extras/migrations/20251119131227_add_prompt_versioning/migration.sql b/litellm-proxy-extras/litellm_proxy_extras/migrations/20251119131227_add_prompt_versioning/migration.sql
index a9d9528bd24..43eb2401422 100644
--- a/litellm-proxy-extras/litellm_proxy_extras/migrations/20251119131227_add_prompt_versioning/migration.sql
+++ b/litellm-proxy-extras/litellm_proxy_extras/migrations/20251119131227_add_prompt_versioning/migration.sql
@@ -1,12 +1,12 @@
 -- DropIndex
-DROP INDEX "LiteLLM_PromptTable_prompt_id_key";
+DROP INDEX IF EXISTS "LiteLLM_PromptTable_prompt_id_key";
 
 -- AlterTable
-ALTER TABLE "LiteLLM_PromptTable" ADD COLUMN     "version" INTEGER NOT NULL DEFAULT 1;
+ALTER TABLE "LiteLLM_PromptTable"
+ADD COLUMN "version" INTEGER NOT NULL DEFAULT 1;
 
 -- CreateIndex
-CREATE INDEX "LiteLLM_PromptTable_prompt_id_idx" ON "LiteLLM_PromptTable"("prompt_id");
+CREATE INDEX "LiteLLM_PromptTable_prompt_id_idx" ON "LiteLLM_PromptTable" ("prompt_id");
 
 -- CreateIndex
-CREATE UNIQUE INDEX "LiteLLM_PromptTable_prompt_id_version_key" ON "LiteLLM_PromptTable"("prompt_id", "version");
-
+CREATE UNIQUE INDEX "LiteLLM_PromptTable_prompt_id_version_key" ON "LiteLLM_PromptTable" ("prompt_id", "version");
\ No newline at end of file
diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
index 5ba901f9729..1b61b533275 100644
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@@ -290,10 +290,19 @@ def _map_tool_choice(
         elif tool_choice == "none":
             _tool_choice = AnthropicMessagesToolChoice(type="none")
         elif isinstance(tool_choice, dict):
-            _tool_name = tool_choice.get("function", {}).get("name")
-            _tool_choice = AnthropicMessagesToolChoice(type="tool")
-            if _tool_name is not None:
-                _tool_choice["name"] = _tool_name
+            if "type" in tool_choice and "function" not in tool_choice:
+                tool_type = tool_choice.get("type")
+                if tool_type == "auto":
+                    _tool_choice = AnthropicMessagesToolChoice(type="auto")
+                elif tool_type == "required" or tool_type == "any":
+                    _tool_choice = AnthropicMessagesToolChoice(type="any")
+                elif tool_type == "none":
+                    _tool_choice = AnthropicMessagesToolChoice(type="none")
+            else:
+                _tool_name = tool_choice.get("function", {}).get("name")
+                if _tool_name is not None:
+                    _tool_choice = AnthropicMessagesToolChoice(type="tool")
+                    _tool_choice["name"] = _tool_name
 
         if parallel_tool_use is not None:
             # Anthropic uses 'disable_parallel_tool_use' flag to determine if parallel tool use is allowed
diff --git a/litellm/llms/azure/chat/gpt_5_transformation.py b/litellm/llms/azure/chat/gpt_5_transformation.py
index 506b7fdfe5e..eeb55911ecf 100644
--- a/litellm/llms/azure/chat/gpt_5_transformation.py
+++ b/litellm/llms/azure/chat/gpt_5_transformation.py
@@ -22,7 +22,8 @@ def is_model_gpt_5_model(cls, model: str) -> bool:
         Accepts both explicit gpt-5 model names and the ``gpt5_series/`` prefix
         used for manual routing.
         """
-        return "gpt-5" in model or "gpt5_series" in model
+        # gpt-5-chat* is a chat model and shouldn't go through GPT-5 reasoning restrictions.
+        return ("gpt-5" in model and "gpt-5-chat" not in model) or "gpt5_series" in model
 
     def get_supported_openai_params(self, model: str) -> List[str]:
         """Get supported parameters for Azure OpenAI GPT-5 models.
@@ -37,6 +38,11 @@ def get_supported_openai_params(self, model: str) -> List[str]:
         """
         params = OpenAIGPT5Config.get_supported_openai_params(self, model=model)
 
+        # Azure supports tool_choice for GPT-5 deployments, but the base GPT-5 config
+        # can drop it when the deployment name isn't in the OpenAI model registry.
+        if "tool_choice" not in params:
+            params.append("tool_choice")
+
         # Only gpt-5.2 has been verified to support logprobs on Azure
         if self.is_model_gpt_5_2_model(model):
             azure_supported_params = ["logprobs", "top_logprobs"]
diff --git a/litellm/llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py b/litellm/llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py
index 53e08229799..c936b2cd23c 100644
--- a/litellm/llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py
+++ b/litellm/llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py
@@ -53,13 +53,26 @@ def map_openai_params(
         model: str,
         drop_params: bool,
     ) -> dict:
-        return AnthropicConfig.map_openai_params(
+        # Force tool-based structured outputs for Bedrock Invoke
+        # (similar to VertexAI fix in #19201)
+        # Bedrock Invoke doesn't support output_format parameter
+        original_model = model
+        if "response_format" in non_default_params:
+            # Use a model name that forces tool-based approach
+            model = "claude-3-sonnet-20240229"
+        
+        optional_params = AnthropicConfig.map_openai_params(
             self,
             non_default_params,
             optional_params,
             model,
             drop_params,
         )
+        
+        # Restore original model name
+        model = original_model
+        
+        return optional_params
 
 
     def transform_request(
@@ -90,6 +103,8 @@ def transform_request(
 
         _anthropic_request.pop("model", None)
         _anthropic_request.pop("stream", None)
+        # Bedrock Invoke doesn't support output_format parameter
+        _anthropic_request.pop("output_format", None)
         if "anthropic_version" not in _anthropic_request:
             _anthropic_request["anthropic_version"] = self.anthropic_version
 
@@ -117,6 +132,26 @@ def transform_request(
             if "opus-4" in model.lower() or "opus_4" in model.lower():
                 beta_set.add("tool-search-tool-2025-10-19")
 
+        # Filter out beta headers that Bedrock Invoke doesn't support
+        # AWS Bedrock only supports a specific whitelist of beta flags
+        # Reference: https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-messages-request-response.html
+        BEDROCK_SUPPORTED_BETAS = {
+            "computer-use-2024-10-22",  # Legacy computer use
+            "computer-use-2025-01-24",  # Current computer use (Claude 3.7 Sonnet)
+            "token-efficient-tools-2025-02-19",  # Tool use (Claude 3.7+ and Claude 4+)
+            "interleaved-thinking-2025-05-14",  # Interleaved thinking (Claude 4+)
+            "output-128k-2025-02-19",  # 128K output tokens (Claude 3.7 Sonnet)
+            "dev-full-thinking-2025-05-14",  # Developer mode for raw thinking (Claude 4+)
+            "context-1m-2025-08-07",  # 1 million tokens (Claude Sonnet 4)
+            "context-management-2025-06-27",  # Context management (Claude Sonnet/Haiku 4.5)
+            "effort-2025-11-24",  # Effort parameter (Claude Opus 4.5)
+            "tool-search-tool-2025-10-19",  # Tool search (Claude Opus 4.5)
+            "tool-examples-2025-10-29",  # Tool use examples (Claude Opus 4.5)
+        }
+        
+        # Only keep beta headers that Bedrock supports
+        beta_set = {beta for beta in beta_set if beta in BEDROCK_SUPPORTED_BETAS}
+
         if beta_set:
             _anthropic_request["anthropic_beta"] = list(beta_set)
 
diff --git a/litellm/llms/openai/chat/gpt_5_transformation.py b/litellm/llms/openai/chat/gpt_5_transformation.py
index 3fffa335fdc..05c003c8b7a 100644
--- a/litellm/llms/openai/chat/gpt_5_transformation.py
+++ b/litellm/llms/openai/chat/gpt_5_transformation.py
@@ -19,7 +19,9 @@ class OpenAIGPT5Config(OpenAIGPTConfig):
 
     @classmethod
     def is_model_gpt_5_model(cls, model: str) -> bool:
-        return "gpt-5" in model
+        # gpt-5-chat* behaves like a regular chat model (supports temperature, etc.)
+        # Don't route it through GPT-5 reasoning-specific parameter restrictions.
+        return "gpt-5" in model and "gpt-5-chat" not in model
 
     @classmethod
     def is_model_gpt_5_codex_model(cls, model: str) -> bool:
diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
index b78ac8f9e98..a9ac21bb56f 100644
--- a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
+++ b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
@@ -1657,7 +1657,17 @@ def _calculate_usage(  # noqa: PLR0915
         ## This is necessary because promptTokensDetails includes both cached and non-cached tokens
         ## See: https://github.com/BerriAI/litellm/issues/18750
         if cached_text_tokens is not None and prompt_text_tokens is not None:
+            # Explicit caching: subtract cached tokens per modality from cacheTokensDetails
             prompt_text_tokens = prompt_text_tokens - cached_text_tokens
+        elif (
+            cached_tokens is not None
+            and prompt_text_tokens is not None
+            and cached_text_tokens is None
+        ):
+            # Implicit caching: only cachedContentTokenCount is provided (no cacheTokensDetails)
+            # Subtract from text tokens since implicit caching is primarily for text content
+            # See: https://github.com/BerriAI/litellm/issues/16341
+            prompt_text_tokens = prompt_text_tokens - cached_tokens
         if cached_audio_tokens is not None and prompt_audio_tokens is not None:
             prompt_audio_tokens = prompt_audio_tokens - cached_audio_tokens
         if cached_image_tokens is not None and prompt_image_tokens is not None:
diff --git a/litellm/main.py b/litellm/main.py
index 23922e3c8bf..99bf224c5b7 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -7280,8 +7280,11 @@ def _get_encoding():
 def __getattr__(name: str) -> Any:
     """Lazy import handler for main module"""
     if name == "encoding":
-        # Lazy load encoding to avoid heavy tiktoken import at module load time
-        _encoding = tiktoken.get_encoding("cl100k_base")
+        # Use _get_default_encoding which properly sets TIKTOKEN_CACHE_DIR
+        # before loading tiktoken, ensuring the local cache is used
+        # instead of downloading from the internet
+        from litellm._lazy_imports import _get_default_encoding
+        _encoding = _get_default_encoding()
         # Cache it in the module's __dict__ for subsequent accesses
         import sys
 
diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index d556b746626..fad5f243fff 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -3130,7 +3130,7 @@
         "supports_reasoning": true,
         "supports_response_schema": true,
         "supports_system_messages": true,
-        "supports_tool_choice": false,
+        "supports_tool_choice": true,
         "supports_vision": true
     },
     "azure/gpt-5-chat-latest": {
@@ -3162,7 +3162,7 @@
         "supports_reasoning": true,
         "supports_response_schema": true,
         "supports_system_messages": true,
-        "supports_tool_choice": false,
+        "supports_tool_choice": true,
         "supports_vision": true
     },
     "azure/gpt-5-codex": {
diff --git a/litellm/proxy/common_request_processing.py b/litellm/proxy/common_request_processing.py
index 0d3e61b75c7..51f3e6482a4 100644
--- a/litellm/proxy/common_request_processing.py
+++ b/litellm/proxy/common_request_processing.py
@@ -650,11 +650,15 @@ async def base_process_llm_request(
         )
 
         tasks = []
+        # Start the moderation check (during_call_hook) as early as possible
+        # This gives it a head start to mask/validate input while the proxy handles routing
         tasks.append(
-            proxy_logging_obj.during_call_hook(
-                data=self.data,
-                user_api_key_dict=user_api_key_dict,
-                call_type=route_type,  # type: ignore
+            asyncio.create_task(
+                proxy_logging_obj.during_call_hook(
+                    data=self.data,
+                    user_api_key_dict=user_api_key_dict,
+                    call_type=route_type,  # type: ignore
+                )
             )
         )
 
diff --git a/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/content_filter.py b/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/content_filter.py
index c9bd0135a05..083a407e9cf 100644
--- a/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/content_filter.py
+++ b/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/content_filter.py
@@ -198,6 +198,15 @@ def __init__(
         for pattern_config in normalized_patterns:
             self._add_pattern(pattern_config)
 
+        # Warn if using during_call with MASK action (unstable)
+        if self.event_hook == GuardrailEventHooks.during_call and any(
+            p["action"] == ContentFilterAction.MASK for p in self.compiled_patterns
+        ):
+            verbose_proxy_logger.warning(
+                f"ContentFilterGuardrail '{self.guardrail_name}': 'during_call' mode with 'MASK' action is unstable due to race conditions. "
+                "Use 'pre_call' mode for reliable request masking."
+            )
+
         # Load blocked words - always initialize as dict
         self.blocked_words: Dict[str, Tuple[ContentFilterAction, Optional[str]]] = {}
         for word in normalized_blocked_words:
@@ -905,11 +914,15 @@ async def _process_images(
                 elif isinstance(e.detail, str):
                     e.detail = e.detail + " (Image description): " + description
                 else:
-                    e.detail = "Content blocked: Image description detected" + description
+                    e.detail = (
+                        "Content blocked: Image description detected" + description
+                    )
                 raise e
 
     def _count_masked_entities(
-        self, detections: List[ContentFilterDetection], masked_entity_count: Dict[str, int]
+        self,
+        detections: List[ContentFilterDetection],
+        masked_entity_count: Dict[str, int],
     ) -> None:
         """
         Count masked entities by type from detections.
@@ -964,9 +977,11 @@ def _log_guardrail_information(
             dict(detection) for detection in detections
         ]
         if status != "success":
-            guardrail_json_response = exception_str if exception_str else [
-                dict(detection) for detection in detections
-            ]
+            guardrail_json_response = (
+                exception_str
+                if exception_str
+                else [dict(detection) for detection in detections]
+            )
 
         self.add_standard_logging_guardrail_information_to_request_data(
             guardrail_provider=self.guardrail_provider,
@@ -1066,99 +1081,84 @@ async def async_post_call_streaming_iterator_hook(
         Process streaming response chunks and check for blocked content.
 
         For BLOCK action: Raises HTTPException immediately when blocked content is detected.
-        For MASK action: Content passes through (masking streaming responses is not supported).
+        For MASK action: Content is buffered to handle patterns split across chunks.
         """
+        accumulated_full_text = ""
+        yielded_masked_text_len = 0
+        buffer_size = 50  # Increased buffer to catch patterns split across many chunks
 
-        # Accumulate content as we iterate through chunks
-        accumulated_content = ""
+        verbose_proxy_logger.info(
+            f"ContentFilterGuardrail: Starting robust streaming masking for model {request_data.get('model')}"
+        )
 
         async for item in response:
-            # Accumulate content from this chunk before checking
             if isinstance(item, ModelResponseStream) and item.choices:
+                delta_content = ""
+                is_final = False
                 for choice in item.choices:
                     if hasattr(choice, "delta") and choice.delta:
                         content = getattr(choice.delta, "content", None)
                         if content and isinstance(content, str):
-                            accumulated_content += content
-
-                # Check accumulated content for blocked patterns/keywords after processing all choices
-                # Only check for BLOCK actions, not MASK (masking streaming is not supported)
-                if accumulated_content:
-                    try:
-                        # Check patterns
-                        pattern_match = self._check_patterns(accumulated_content)
-                        if pattern_match:
-                            matched_text, pattern_name, action = pattern_match
-                            if action == ContentFilterAction.BLOCK:
-                                error_msg = (
-                                    f"Content blocked: {pattern_name} pattern detected"
-                                )
-                                verbose_proxy_logger.warning(error_msg)
-                                raise HTTPException(
-                                    status_code=403,
-                                    detail={
-                                        "error": error_msg,
-                                        "pattern": pattern_name,
-                                    },
-                                )
-
-                        # Check blocked words
-                        blocked_word_match = self._check_blocked_words(
-                            accumulated_content
-                        )
-                        if blocked_word_match:
-                            keyword, action, description = blocked_word_match
-                            if action == ContentFilterAction.BLOCK:
-                                error_msg = (
-                                    f"Content blocked: keyword '{keyword}' detected"
-                                )
-                                if description:
-                                    error_msg += f" ({description})"
-                                verbose_proxy_logger.warning(error_msg)
-                                raise HTTPException(
-                                    status_code=403,
-                                    detail={
-                                        "error": error_msg,
-                                        "keyword": keyword,
-                                        "description": description,
-                                    },
-                                )
-
-                        # Check category keywords
-                        all_exceptions = []
-                        for category in self.loaded_categories.values():
-                            all_exceptions.extend(category.exceptions)
-                        category_match = self._check_category_keywords(
-                            accumulated_content, all_exceptions
-                        )
-                        if category_match:
-                            keyword, category_name, severity, action = category_match
-                            if action == ContentFilterAction.BLOCK:
-                                error_msg = (
-                                    f"Content blocked: {category_name} category keyword '{keyword}' detected "
-                                    f"(severity: {severity})"
-                                )
-                                verbose_proxy_logger.warning(error_msg)
-                                raise HTTPException(
-                                    status_code=403,
-                                    detail={
-                                        "error": error_msg,
-                                        "category": category_name,
-                                        "keyword": keyword,
-                                        "severity": severity,
-                                    },
-                                )
-                    except HTTPException:
-                        # Re-raise HTTPException (blocked content detected)
-                        raise
-                    except Exception as e:
-                        # Log other exceptions but don't block the stream
-                        verbose_proxy_logger.warning(
-                            f"Error checking content filter in streaming: {e}"
-                        )
+                            delta_content += content
+                    if getattr(choice, "finish_reason", None):
+                        is_final = True
+
+                accumulated_full_text += delta_content
+
+                # Check for blocking or apply masking
+                # Add a space at the end if it's the final chunk to trigger word boundaries (\b)
+                text_to_check = accumulated_full_text
+                if is_final:
+                    text_to_check += " "
+
+                try:
+                    masked_text = self._filter_single_text(text_to_check)
+                    if is_final and masked_text.endswith(" "):
+                        masked_text = masked_text[:-1]
+                except HTTPException:
+                    raise
+                except Exception as e:
+                    verbose_proxy_logger.error(
+                        f"ContentFilterGuardrail: Error in masking: {e}"
+                    )
+                    masked_text = text_to_check  # Fallback to current text
+
+                # Determine how much can be safely yielded
+                if is_final:
+                    safe_to_yield_len = len(masked_text)
+                else:
+                    safe_to_yield_len = max(0, len(masked_text) - buffer_size)
+
+                if safe_to_yield_len > yielded_masked_text_len:
+                    new_masked_content = masked_text[
+                        yielded_masked_text_len:safe_to_yield_len
+                    ]
+                    # Modify the chunk to contain only the new masked content
+                    if (
+                        item.choices
+                        and hasattr(item.choices[0], "delta")
+                        and item.choices[0].delta
+                    ):
+                        item.choices[0].delta.content = new_masked_content
+                        yielded_masked_text_len = safe_to_yield_len
+                        yield item
+                else:
+                    # Hold content by yielding empty content chunk (keeps metadata/structure)
+                    if (
+                        item.choices
+                        and hasattr(item.choices[0], "delta")
+                        and item.choices[0].delta
+                    ):
+                        item.choices[0].delta.content = ""
+                    yield item
+            else:
+                # Not a ModelResponseStream or no choices - yield as is
+                yield item
 
-            # Yield the chunk (only if no exception was raised above)
-            yield item
+        # Any remaining content (should have been handled by is_final, but just in case)
+        if yielded_masked_text_len < len(accumulated_full_text):
+            # We already reached the end of the generator
+            pass
 
     @staticmethod
     def get_config_model():
diff --git a/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/patterns.json b/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/patterns.json
index f2427b5b920..1eff7804b42 100644
--- a/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/patterns.json
+++ b/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/patterns.json
@@ -108,7 +108,7 @@
     {
       "name": "ipv6",
       "display_name": "IP Address (IPv6)",
-      "pattern": "\\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b",
+      "pattern": "(?<![0-9a-fA-F:])(?:(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|:(?::[0-9a-fA-F]{1,4}){1,7}|::|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6})(?![0-9a-fA-F:])",
       "category": "Network Patterns",
       "description": "Detects IPv6 addresses"
     },
@@ -122,9 +122,9 @@
     {
       "name": "passport_us",
       "display_name": "Passport (US)",
-      "pattern": "\\b[0-9]{9}\\b",
+      "pattern": "\\b([A-Z][0-9]{8}|[0-9]{9})\\b",
       "category": "PII Patterns",
-      "description": "US passport numbers (9 digits)"
+      "description": "US passport numbers (9 digits or alphanumeric letter + 8 digits)"
     },
     {
       "name": "passport_uk",
@@ -157,9 +157,9 @@
     {
       "name": "passport_canada",
       "display_name": "Passport (Canada)",
-      "pattern": "\\b[A-Z]{2}[0-9]{6}\\b",
+      "pattern": "\\b([A-Z]{2}[0-9]{6}|[A-Z][0-9]{6}[A-Z]{2})\\b",
       "category": "PII Patterns",
-      "description": "Canadian passport numbers (2 letters + 6 digits)"
+      "description": "Canadian passport numbers (old: 2 letters + 6 digits; new: 1 letter + 6 digits + 2 letters)"
     },
     {
       "name": "passport_india",
@@ -369,4 +369,4 @@
       "description": "Detects Brazilian RG identity card numbers (common pattern for SP, RJ, MG states)"
     }
   ]
-}
+}
\ No newline at end of file
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 4fa58e9d244..00b13c9789a 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -5201,7 +5201,20 @@ async def _setup_prisma_client(
                 except Exception as e:
                     raise e
 
-                await prisma_client.connect()
+                try:
+                    await prisma_client.connect()
+                except Exception as e:
+                    if "P3018" in str(e) or "P3009" in str(e):
+                        verbose_proxy_logger.debug(
+                            "CRITICAL: DATABASE MIGRATION FAILED"
+                        )
+                        verbose_proxy_logger.debug(
+                            "Your database is in a 'dirty' state."
+                        )
+                        verbose_proxy_logger.debug(
+                            "FIX: Run 'prisma migrate resolve --applied <migration_name>'"
+                        )
+                    raise e
 
                 ## Start RDS IAM token refresh background task if enabled ##
                 # This proactively refreshes IAM tokens before they expire,
diff --git a/litellm/proxy/schema.prisma b/litellm/proxy/schema.prisma
index ca60b9e1bec..b118400b620 100644
--- a/litellm/proxy/schema.prisma
+++ b/litellm/proxy/schema.prisma
@@ -5,6 +5,7 @@ datasource client {
 
 generator client {
   provider = "prisma-client-py"
+  binaryTargets = ["native", "debian-openssl-1.1.x", "debian-openssl-3.0.x", "linux-musl", "linux-musl-openssl-3.0.x"]
 }
 
 // Budget / Rate Limits for an org
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index 13f42a2f71d..8922ed032e2 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -1901,7 +1901,18 @@ async def async_post_call_streaming_iterator_hook(
                 ) or _callback.should_run_guardrail(
                     data=request_data, event_type=GuardrailEventHooks.post_call
                 ):
-                    if "apply_guardrail" in type(callback).__dict__:
+                    if (
+                        "async_post_call_streaming_iterator_hook"
+                        in type(callback).__dict__
+                    ):
+                        current_response = (
+                            _callback.async_post_call_streaming_iterator_hook(
+                                user_api_key_dict=user_api_key_dict,
+                                response=current_response,
+                                request_data=request_data,
+                            )
+                        )
+                    elif "apply_guardrail" in type(callback).__dict__:
                         request_data["guardrail_to_apply"] = callback
                         current_response = (
                             unified_guardrail.async_post_call_streaming_iterator_hook(
diff --git a/litellm/router.py b/litellm/router.py
index 09d71b6b497..a3c3afa9326 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -8729,11 +8729,6 @@ def get_allowed_fails_from_policy(self, exception: Exception):
         if allowed_fails_policy is None:
             return None
 
-        if (
-            isinstance(exception, litellm.BadRequestError)
-            and allowed_fails_policy.BadRequestErrorAllowedFails is not None
-        ):
-            return allowed_fails_policy.BadRequestErrorAllowedFails
         if (
             isinstance(exception, litellm.AuthenticationError)
             and allowed_fails_policy.AuthenticationErrorAllowedFails is not None
@@ -8754,6 +8749,11 @@ def get_allowed_fails_from_policy(self, exception: Exception):
             and allowed_fails_policy.ContentPolicyViolationErrorAllowedFails is not None
         ):
             return allowed_fails_policy.ContentPolicyViolationErrorAllowedFails
+        if (
+            isinstance(exception, litellm.BadRequestError)
+            and allowed_fails_policy.BadRequestErrorAllowedFails is not None
+        ):
+            return allowed_fails_policy.BadRequestErrorAllowedFails
 
     def _initialize_alerting(self):
         from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
diff --git a/litellm/router_utils/get_retry_from_policy.py b/litellm/router_utils/get_retry_from_policy.py
index 48df43ef818..ec326ebb50d 100644
--- a/litellm/router_utils/get_retry_from_policy.py
+++ b/litellm/router_utils/get_retry_from_policy.py
@@ -43,11 +43,6 @@ def get_num_retries_from_retry_policy(
     if isinstance(retry_policy, dict):
         retry_policy = RetryPolicy(**retry_policy)
 
-    if (
-        isinstance(exception, BadRequestError)
-        and retry_policy.BadRequestErrorRetries is not None
-    ):
-        return retry_policy.BadRequestErrorRetries
     if (
         isinstance(exception, AuthenticationError)
         and retry_policy.AuthenticationErrorRetries is not None
@@ -65,6 +60,11 @@ def get_num_retries_from_retry_policy(
         and retry_policy.ContentPolicyViolationErrorRetries is not None
     ):
         return retry_policy.ContentPolicyViolationErrorRetries
+    if (
+        isinstance(exception, BadRequestError)
+        and retry_policy.BadRequestErrorRetries is not None
+    ):
+        return retry_policy.BadRequestErrorRetries
 
 
 def reset_retry_policy() -> RetryPolicy:
diff --git a/litellm/types/guardrails.py b/litellm/types/guardrails.py
index eea0a26b332..ca22049720e 100644
--- a/litellm/types/guardrails.py
+++ b/litellm/types/guardrails.py
@@ -20,6 +20,9 @@
 from litellm.types.proxy.guardrails.guardrail_hooks.qualifire import (
     QualifireGuardrailConfigModel,
 )
+from litellm.types.proxy.guardrails.guardrail_hooks.litellm_content_filter import (
+    ContentFilterCategoryConfig,
+)
 
 """
 Pydantic object defining how to set guardrails on litellm proxy
@@ -547,9 +550,27 @@ class ContentFilterConfigModel(BaseModel):
     blocked_words_file: Optional[str] = Field(
         default=None, description="Path to YAML file containing blocked_words list"
     )
+    categories: Optional[List[ContentFilterCategoryConfig]] = Field(
+        default=None,
+        description="List of prebuilt categories to enable (harmful_*, bias_*)",
+    )
+    severity_threshold: Optional[str] = Field(
+        default=None,
+        description="Minimum severity to block (high, medium, low)",
+    )
+    pattern_redaction_format: Optional[str] = Field(
+        default=None,
+        description="Format string for pattern redaction (use {pattern_name} placeholder)",
+    )
+    keyword_redaction_tag: Optional[str] = Field(
+        default=None,
+        description="Tag to use for keyword redaction",
+    )
 
 
-class BaseLitellmParams(BaseModel):  # works for new and patch update guardrails
+class BaseLitellmParams(
+    ContentFilterConfigModel
+):  # works for new and patch update guardrails
     api_key: Optional[str] = Field(
         default=None, description="API key for the guardrail service"
     )
@@ -630,7 +651,6 @@ class BaseLitellmParams(BaseModel):  # works for new and patch update guardrails
         description="Whether to fail the request if Model Armor encounters an error",
     )
 
-    # Generic Guardrail API params
     additional_provider_specific_params: Optional[Dict[str, Any]] = Field(
         default=None,
         description="Additional provider-specific parameters for generic guardrail APIs",
@@ -657,7 +677,6 @@ class LitellmParams(
     ToolPermissionGuardrailConfigModel,
     ZscalerAIGuardConfigModel,
     JavelinGuardrailConfigModel,
-    ContentFilterConfigModel,
     BaseLitellmParams,
     EnkryptAIGuardrailConfigs,
     IBMGuardrailsBaseConfigModel,
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index d556b746626..fad5f243fff 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -3130,7 +3130,7 @@
         "supports_reasoning": true,
         "supports_response_schema": true,
         "supports_system_messages": true,
-        "supports_tool_choice": false,
+        "supports_tool_choice": true,
         "supports_vision": true
     },
     "azure/gpt-5-chat-latest": {
@@ -3162,7 +3162,7 @@
         "supports_reasoning": true,
         "supports_response_schema": true,
         "supports_system_messages": true,
-        "supports_tool_choice": false,
+        "supports_tool_choice": true,
         "supports_vision": true
     },
     "azure/gpt-5-codex": {
diff --git a/tests/local_testing/test_auth_utils.py b/tests/local_testing/test_auth_utils.py
index 72f799a6cf0..d36f96b1a39 100644
--- a/tests/local_testing/test_auth_utils.py
+++ b/tests/local_testing/test_auth_utils.py
@@ -356,6 +356,25 @@ def test_get_internal_user_header_from_mapping_no_internal_returns_none():
             "/openai/deployments/my-deployment/chat/completions",
             "my-deployment"
         ),
+        # Custom model_name with slashes (e.g., gcp/google/gemini-2.5-flash)
+        # This is the NVIDIA P0 bug fix - regex should capture full model name including slashes
+        (
+            {},
+            "/vertex_ai/v1/projects/my-project/locations/us-central1/publishers/google/models/gcp/google/gemini-2.5-flash:generateContent",
+            "gcp/google/gemini-2.5-flash"
+        ),
+        # Another custom model_name with slashes
+        (
+            {},
+            "/vertex_ai/v1/projects/my-project/locations/global/publishers/google/models/gcp/google/gemini-3-flash-preview:generateContent",
+            "gcp/google/gemini-3-flash-preview"
+        ),
+        # Model name with single slash
+        (
+            {},
+            "/vertex_ai/v1/projects/my-project/locations/us-central1/publishers/google/models/custom/model:generateContent",
+            "custom/model"
+        ),
     ],
 )
 def test_get_model_from_request_vertex_ai_passthrough(request_data, route, expected_model):
diff --git a/tests/local_testing/test_completion_with_retries.py b/tests/local_testing/test_completion_with_retries.py
index 6eb3ad460e6..585e1ee2618 100644
--- a/tests/local_testing/test_completion_with_retries.py
+++ b/tests/local_testing/test_completion_with_retries.py
@@ -60,6 +60,7 @@ async def test_completion_with_retry_policy(sync_mode):
 
     retry_number = 1
     retry_policy = RetryPolicy(
+        BadRequestErrorRetries=10,
         ContentPolicyViolationErrorRetries=retry_number,  # run 3 retries for ContentPolicyViolationErrors
         AuthenticationErrorRetries=0,  # run 0 retries for AuthenticationErrorRetries
     )
diff --git a/tests/test_litellm/llms/anthropic/chat/test_anthropic_chat_transformation.py b/tests/test_litellm/llms/anthropic/chat/test_anthropic_chat_transformation.py
index bd229dedfaa..eee0b267fad 100644
--- a/tests/test_litellm/llms/anthropic/chat/test_anthropic_chat_transformation.py
+++ b/tests/test_litellm/llms/anthropic/chat/test_anthropic_chat_transformation.py
@@ -548,6 +548,59 @@ def test_map_tool_choice_dict_type_function_with_name():
     assert result["name"] == "my_tool"
 
 
+def test_map_tool_choice_dict_type_auto():
+    """
+    Test that dict {"type": "auto"} maps to Anthropic type='auto'.
+    This handles Cursor's format for tool_choice.
+    """
+    config = AnthropicConfig()
+    result = config._map_tool_choice(
+        tool_choice={"type": "auto"},
+        parallel_tool_use=None,
+    )
+    assert result is not None
+    assert result["type"] == "auto"
+
+
+def test_map_tool_choice_dict_type_required():
+    """
+    Test that dict {"type": "required"} maps to Anthropic type='any'.
+    """
+    config = AnthropicConfig()
+    result = config._map_tool_choice(
+        tool_choice={"type": "required"},
+        parallel_tool_use=None,
+    )
+    assert result is not None
+    assert result["type"] == "any"
+
+
+def test_map_tool_choice_dict_type_none():
+    """
+    Test that dict {"type": "none"} maps to Anthropic type='none'.
+    """
+    config = AnthropicConfig()
+    result = config._map_tool_choice(
+        tool_choice={"type": "none"},
+        parallel_tool_use=None,
+    )
+    assert result is not None
+    assert result["type"] == "none"
+
+
+def test_map_tool_choice_dict_type_function_without_name():
+    """
+    Test that dict {"type": "function"} without name is handled gracefully.
+    Should return None since there's no valid tool name.
+    """
+    config = AnthropicConfig()
+    result = config._map_tool_choice(
+        tool_choice={"type": "function"},
+        parallel_tool_use=None,
+    )
+    assert result is None
+
+
 def test_transform_response_with_prefix_prompt():
     import httpx
 
diff --git a/tests/test_litellm/llms/azure/chat/test_azure_gpt5_transformation.py b/tests/test_litellm/llms/azure/chat/test_azure_gpt5_transformation.py
index 199a16d8590..25f3d1364f6 100644
--- a/tests/test_litellm/llms/azure/chat/test_azure_gpt5_transformation.py
+++ b/tests/test_litellm/llms/azure/chat/test_azure_gpt5_transformation.py
@@ -16,6 +16,17 @@ def test_azure_gpt5_supports_reasoning_effort(config: AzureOpenAIGPT5Config):
     )
 
 
+def test_azure_gpt5_allows_tool_choice_for_deployment_names():
+    supported_params = litellm.get_supported_openai_params(
+        model="gpt-5-chat-2025-08-07", custom_llm_provider="azure"
+    )
+    assert supported_params is not None
+    assert "tool_choice" in supported_params
+    # gpt-5-chat* should not be treated as a GPT-5 reasoning model
+    assert "reasoning_effort" not in supported_params
+    assert "temperature" in supported_params
+
+
 def test_azure_gpt5_maps_max_tokens(config: AzureOpenAIGPT5Config):
     params = config.map_openai_params(
         non_default_params={"max_tokens": 5},
diff --git a/tests/test_litellm/llms/bedrock/chat/invoke_transformations/test_bedrock_chat_invoke_transformations_anthropic_claude3_transformation.py b/tests/test_litellm/llms/bedrock/chat/invoke_transformations/test_bedrock_chat_invoke_transformations_anthropic_claude3_transformation.py
index 98b392a353d..5c1b4cbd38e 100644
--- a/tests/test_litellm/llms/bedrock/chat/invoke_transformations/test_bedrock_chat_invoke_transformations_anthropic_claude3_transformation.py
+++ b/tests/test_litellm/llms/bedrock/chat/invoke_transformations/test_bedrock_chat_invoke_transformations_anthropic_claude3_transformation.py
@@ -464,3 +464,108 @@ def test_opus_4_5_model_detection():
     for model in non_opus_4_5_models:
         assert not config._is_claude_opus_4_5(model), \
             f"Should not detect {model} as Opus 4.5"
+
+
+def test_structured_outputs_beta_header_filtered_for_bedrock_invoke():
+    """
+    Test that unsupported beta headers are filtered out for Bedrock Invoke API.
+    
+    Bedrock Invoke API only supports a specific whitelist of beta flags and returns
+    "invalid beta flag" error for others (e.g., structured-outputs, mcp-servers).
+    This test ensures unsupported headers are filtered while keeping supported ones.
+    
+    Fixes: https://github.com/BerriAI/litellm/issues/16726
+    """
+    config = AmazonAnthropicClaudeConfig()
+    
+    messages = [{"role": "user", "content": "test"}]
+    
+    # Test 1: structured-outputs beta header (unsupported)
+    headers = {"anthropic-beta": "structured-outputs-2025-11-13"}
+    
+    result = config.transform_request(
+        model="anthropic.claude-4-0-sonnet-20250514-v1:0",
+        messages=messages,
+        optional_params={},
+        litellm_params={},
+        headers=headers,
+    )
+    
+    # Verify structured-outputs beta is filtered out
+    anthropic_beta = result.get("anthropic_beta", [])
+    assert not any("structured-outputs" in beta for beta in anthropic_beta), \
+        f"structured-outputs beta should be filtered, got: {anthropic_beta}"
+    
+    # Test 2: mcp-servers beta header (unsupported - the main issue from #16726)
+    headers = {"anthropic-beta": "mcp-servers-2025-12-04"}
+    
+    result = config.transform_request(
+        model="anthropic.claude-4-0-sonnet-20250514-v1:0",
+        messages=messages,
+        optional_params={},
+        litellm_params={},
+        headers=headers,
+    )
+    
+    # Verify mcp-servers beta is filtered out
+    anthropic_beta = result.get("anthropic_beta", [])
+    assert not any("mcp-servers" in beta for beta in anthropic_beta), \
+        f"mcp-servers beta should be filtered, got: {anthropic_beta}"
+    
+    # Test 3: Mix of supported and unsupported beta headers
+    headers = {"anthropic-beta": "computer-use-2024-10-22,mcp-servers-2025-12-04,structured-outputs-2025-11-13"}
+    
+    result = config.transform_request(
+        model="anthropic.claude-4-0-sonnet-20250514-v1:0",
+        messages=messages,
+        optional_params={},
+        litellm_params={},
+        headers=headers,
+    )
+    
+    # Verify only supported betas are kept
+    anthropic_beta = result.get("anthropic_beta", [])
+    assert not any("structured-outputs" in beta for beta in anthropic_beta), \
+        f"structured-outputs beta should be filtered, got: {anthropic_beta}"
+    assert not any("mcp-servers" in beta for beta in anthropic_beta), \
+        f"mcp-servers beta should be filtered, got: {anthropic_beta}"
+    assert any("computer-use" in beta for beta in anthropic_beta), \
+        f"computer-use beta should be kept, got: {anthropic_beta}"
+
+
+def test_output_format_removed_from_bedrock_invoke_request():
+    """
+    Test that output_format parameter is removed from Bedrock Invoke requests.
+    
+    Bedrock Invoke API doesn't support the output_format parameter (only supported
+    in Anthropic Messages API). This test ensures it's removed to prevent errors.
+    """
+    config = AmazonAnthropicClaudeConfig()
+    
+    messages = [{"role": "user", "content": "test"}]
+    
+    # Create a request with output_format via map_openai_params
+    non_default_params = {
+        "response_format": {"type": "json_object"}
+    }
+    optional_params = {}
+    
+    # This should trigger tool-based structured outputs
+    optional_params = config.map_openai_params(
+        non_default_params=non_default_params,
+        optional_params=optional_params,
+        model="anthropic.claude-4-0-sonnet-20250514-v1:0",
+        drop_params=False,
+    )
+    
+    result = config.transform_request(
+        model="anthropic.claude-4-0-sonnet-20250514-v1:0",
+        messages=messages,
+        optional_params=optional_params,
+        litellm_params={},
+        headers={},
+    )
+    
+    # Verify output_format is not in the request
+    assert "output_format" not in result, \
+        f"output_format should be removed for Bedrock Invoke, got keys: {result.keys()}"
diff --git a/tests/test_litellm/llms/openai/test_gpt5_transformation.py b/tests/test_litellm/llms/openai/test_gpt5_transformation.py
index fd25d302d07..386f264a4dd 100644
--- a/tests/test_litellm/llms/openai/test_gpt5_transformation.py
+++ b/tests/test_litellm/llms/openai/test_gpt5_transformation.py
@@ -20,6 +20,23 @@ def test_gpt5_supports_reasoning_effort(config: OpenAIConfig):
     assert "reasoning_effort" in config.get_supported_openai_params(model="gpt-5-mini")
 
 
+def test_gpt5_chat_does_not_support_reasoning_effort(config: OpenAIConfig):
+    assert (
+        "reasoning_effort"
+        not in config.get_supported_openai_params(model="gpt-5-chat-latest")
+    )
+
+
+def test_gpt5_chat_supports_temperature(config: OpenAIConfig):
+    params = config.map_openai_params(
+        non_default_params={"temperature": 0.3},
+        optional_params={},
+        model="gpt-5-chat-latest",
+        drop_params=False,
+    )
+    assert params["temperature"] == 0.3
+
+
 def test_gpt5_maps_max_tokens(config: OpenAIConfig):
     params = config.map_openai_params(
         non_default_params={"max_tokens": 10},
diff --git a/tests/test_litellm/proxy/guardrails/guardrail_hooks/content_filter/test_content_filter.py b/tests/test_litellm/proxy/guardrails/guardrail_hooks/content_filter/test_content_filter.py
index 265bc530dc2..be8c84a554f 100644
--- a/tests/test_litellm/proxy/guardrails/guardrail_hooks/content_filter/test_content_filter.py
+++ b/tests/test_litellm/proxy/guardrails/guardrail_hooks/content_filter/test_content_filter.py
@@ -4,7 +4,7 @@
 
 import os
 import sys
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import MagicMock
 
 import pytest
 
@@ -385,20 +385,12 @@ def test_api_key_patterns(self):
         assert result is not None
         assert result[1] == "aws_access_key"
 
-    @pytest.mark.skip(
-        reason="Masking in streaming responses is no longer supported after unified_guardrail.py changes. Only blocking/rejecting is supported for responses."
-    )
     @pytest.mark.asyncio
     async def test_streaming_hook_mask(self):
         """
-        Test streaming hook with MASK action
-
-        Note: After changes to unified_guardrail.py, masking responses to users
-        is no longer supported. This test is skipped as the feature is deprecated.
-        Only BLOCK actions (test_streaming_hook_block) are supported for streaming responses.
+        Test streaming hook with MASK action.
+        This now works with the 50-char sliding window buffer.
         """
-        from unittest.mock import AsyncMock
-
         from litellm.types.utils import Delta, ModelResponseStream, StreamingChoices
 
         patterns = [
@@ -415,51 +407,54 @@ async def test_streaming_hook_mask(self):
             event_hook=GuardrailEventHooks.during_call,
         )
 
-        # Create mock streaming chunks
+        # Create mock streaming chunks that split an email
         async def mock_stream():
-            # Chunk 1: contains email
-            chunk1 = ModelResponseStream(
+            # Chunk 1: starts email
+            yield ModelResponseStream(
                 id="chunk1",
                 choices=[
                     StreamingChoices(
-                        delta=Delta(content="Contact me at test@example.com"), index=0
+                        delta=Delta(content="Contact me at test@ex"), index=0
                     )
                 ],
                 model="gpt-4",
             )
-            yield chunk1
-
-            # Chunk 2: normal content
-            chunk2 = ModelResponseStream(
+            # Chunk 2: ends email
+            yield ModelResponseStream(
                 id="chunk2",
                 choices=[
-                    StreamingChoices(delta=Delta(content=" for more info"), index=0)
+                    StreamingChoices(
+                        delta=Delta(content="ample.com for info"),
+                        index=0,
+                        finish_reason="stop",
+                    )
                 ],
                 model="gpt-4",
             )
-            yield chunk2
 
         user_api_key_dict = MagicMock()
         request_data = {}
 
-        # Process streaming response - no masking expected
-        result_chunks = []
+        # Process streaming response - masking IS expected now
+        full_content = ""
         async for chunk in guardrail.async_post_call_streaming_iterator_hook(
             user_api_key_dict=user_api_key_dict,
             response=mock_stream(),
             request_data=request_data,
         ):
-            result_chunks.append(chunk)
+            if chunk.choices[0].delta.content:
+                full_content += chunk.choices[0].delta.content
 
-        # Chunks should pass through unchanged since masking is no longer supported
-        assert len(result_chunks) == 2
+        # The email should be redacted even though it was split
+        assert "test@example.com" not in full_content
+        assert "[EMAIL_REDACTED]" in full_content
+        assert "Contact me at [EMAIL_REDACTED] for info" in full_content
 
     @pytest.mark.asyncio
     async def test_streaming_hook_block(self):
         """
         Test streaming hook with BLOCK action
         """
-        from unittest.mock import AsyncMock
 
         from litellm.types.utils import Delta, ModelResponseStream, StreamingChoices
 
@@ -715,7 +710,10 @@ async def test_apply_guardrail_masks_all_regex_pattern_matches(self):
         assert result is not None
         assert len(result) == 1
         # All matches should be redacted
-        assert result[0] == "[CUSTOM_KEY_REDACTED] [CUSTOM_KEY_REDACTED] [CUSTOM_KEY_REDACTED]"
+        assert (
+            result[0]
+            == "[CUSTOM_KEY_REDACTED] [CUSTOM_KEY_REDACTED] [CUSTOM_KEY_REDACTED]"
+        )
         assert "Key1" not in result[0]
         assert "Key2" not in result[0]
 
@@ -797,7 +795,7 @@ async def test_apply_guardrail_logs_guardrail_information(self):
 
         # Apply guardrail with content that triggers detections
         # Email will be masked, blocked word will be masked
-        result = await guardrail.apply_guardrail(
+        await guardrail.apply_guardrail(
             inputs={"texts": ["Contact me at test@example.com for confidential info"]},
             request_data=request_data,
             input_type="request",
@@ -807,7 +805,9 @@ async def test_apply_guardrail_logs_guardrail_information(self):
         assert "metadata" in request_data
         assert "standard_logging_guardrail_information" in request_data["metadata"]
 
-        guardrail_info_list = request_data["metadata"]["standard_logging_guardrail_information"]
+        guardrail_info_list = request_data["metadata"][
+            "standard_logging_guardrail_information"
+        ]
         assert isinstance(guardrail_info_list, list)
         assert len(guardrail_info_list) == 1
 
@@ -820,8 +820,8 @@ async def test_apply_guardrail_logs_guardrail_information(self):
         assert "start_time" in guardrail_info
         assert "end_time" in guardrail_info
         assert "duration" in guardrail_info
-        assert guardrail_info["duration"] > 0
-        assert guardrail_info["start_time"] < guardrail_info["end_time"]
+        assert guardrail_info["duration"] >= 0
+        assert guardrail_info["start_time"] <= guardrail_info["end_time"]
 
         # Verify detections are logged
         assert "guardrail_response" in guardrail_info
@@ -839,15 +839,21 @@ async def test_apply_guardrail_logs_guardrail_information(self):
             assert "action" in detection
             assert detection["action"] == "MASK"
             # Verify sensitive content (matched_text) is NOT included
-            assert "matched_text" not in detection, "Sensitive content should not be logged"
+            assert (
+                "matched_text" not in detection
+            ), "Sensitive content should not be logged"
 
         # Verify blocked word detection structure
-        blocked_word_detections = [d for d in detections if d.get("type") == "blocked_word"]
+        blocked_word_detections = [
+            d for d in detections if d.get("type") == "blocked_word"
+        ]
         assert len(blocked_word_detections) > 0
         for detection in blocked_word_detections:
             assert detection["type"] == "blocked_word"
             assert "keyword" in detection
-            assert detection["keyword"] == "confidential"  # Config keyword, not user content
+            assert (
+                detection["keyword"] == "confidential"
+            )  # Config keyword, not user content
             assert "action" in detection
             assert detection["action"] == "MASK"
             assert "description" in detection
@@ -896,7 +902,9 @@ async def test_apply_guardrail_logs_blocked_status(self):
         assert "metadata" in request_data
         assert "standard_logging_guardrail_information" in request_data["metadata"]
 
-        guardrail_info_list = request_data["metadata"]["standard_logging_guardrail_information"]
+        guardrail_info_list = request_data["metadata"][
+            "standard_logging_guardrail_information"
+        ]
         assert len(guardrail_info_list) == 1
 
         guardrail_info = guardrail_info_list[0]
@@ -909,4 +917,6 @@ async def test_apply_guardrail_logs_blocked_status(self):
             # If detections are logged, verify they don't contain sensitive content
             for detection in detections:
                 if detection.get("type") == "pattern":
-                    assert "matched_text" not in detection, "Sensitive content should not be logged"
+                    assert (
+                        "matched_text" not in detection
+                    ), "Sensitive content should not be logged"
diff --git a/tests/test_litellm/test_cost_calculator.py b/tests/test_litellm/test_cost_calculator.py
index e277baf0b0c..74f5cf9bdd7 100644
--- a/tests/test_litellm/test_cost_calculator.py
+++ b/tests/test_litellm/test_cost_calculator.py
@@ -1695,60 +1695,84 @@ def test_gemini_without_cache_tokens_details():
     print("✅ Gemini without cacheTokensDetails works correctly")
 
 
-def test_generic_provider_cached_token_cost():
+def test_gemini_implicit_caching_cost_calculation():
     """
-    Test that the generic cost calculator correctly handles cached tokens
-    for providers like z.ai/deepseek that are not explicitly handled.
+    Test for Issue #16341: Gemini implicit cached tokens not counted in spend log
+
+    When Gemini uses implicit caching, it returns cachedContentTokenCount but NOT
+    cacheTokensDetails. In this case, we should subtract cachedContentTokenCount
+    from text_tokens to correctly calculate costs.
+
+    See: https://github.com/BerriAI/litellm/issues/16341
     """
-    from litellm.cost_calculator import completion_cost
-    from litellm.types.utils import ModelResponse, PromptTokensDetailsWrapper, Usage
-
-    # Setup model cost for a generic provider
-    # We use a name that will bypass complex provider mapping logic
-    model_name = "custom-cached-model"
-    litellm.model_cost[model_name] = {
-        "input_cost_per_token": 0.0000006,
-        "output_cost_per_token": 0.0000006,
-        "cache_read_input_token_cost": 0.0000001,
-        "litellm_provider": "openai",
+    from litellm import completion_cost
+    from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
+        VertexGeminiConfig,
+    )
+    from litellm.types.utils import Choices, Message, ModelResponse
+
+    # Simulate Gemini response with implicit caching (cachedContentTokenCount only)
+    completion_response = {
+        "usageMetadata": {
+            "promptTokenCount": 10000,
+            "candidatesTokenCount": 5,
+            "totalTokenCount": 10005,
+            "cachedContentTokenCount": 8000,  # Implicit caching - no cacheTokensDetails
+            "promptTokensDetails": [{"modality": "TEXT", "tokenCount": 10000}],
+            "candidatesTokensDetails": [{"modality": "TEXT", "tokenCount": 5}],
+        }
     }
 
-    # Case 1: Standard nested cached tokens (prompt_tokens_details.cached_tokens)
-    usage = Usage(
-        prompt_tokens=10000,
-        completion_tokens=0,
-        prompt_tokens_details=PromptTokensDetailsWrapper(cached_tokens=9000),
+    usage = VertexGeminiConfig._calculate_usage(completion_response)
+
+    # Verify parsing
+    assert (
+        usage.cache_read_input_tokens == 8000
+    ), f"cache_read_input_tokens should be 8000, got {usage.cache_read_input_tokens}"
+    assert (
+        usage.prompt_tokens_details.cached_tokens == 8000
+    ), f"cached_tokens should be 8000, got {usage.prompt_tokens_details.cached_tokens}"
+
+    # CRITICAL: text_tokens should be (10000 - 8000) = 2000, NOT 10000
+    # This is the fix for issue #16341
+    assert (
+        usage.prompt_tokens_details.text_tokens == 2000
+    ), f"text_tokens should be 2000 (10000 - 8000), got {usage.prompt_tokens_details.text_tokens}"
+
+    # Verify cost calculation uses cached token pricing
+    response = ModelResponse(
+        id="mock-id",
+        model="gemini-2.0-flash",
+        choices=[
+            Choices(
+                index=0,
+                message=Message(role="assistant", content="Hello!"),
+                finish_reason="stop",
+            )
+        ],
+        usage=usage,
     )
-    response = ModelResponse(usage=usage, model=model_name)
 
     cost = completion_cost(
         completion_response=response,
-        model=model_name,
-        custom_llm_provider="openai",  # Explicitly set provider to trigger generic path
+        model="gemini-2.0-flash",
+        custom_llm_provider="gemini",
     )
 
-    # Expected: (1000 * 0.0000006) + (9000 * 0.0000001) = 0.0006 + 0.0009 = 0.0015
-    expected_cost = 0.0015
-    assert (
-        abs(cost - expected_cost) < 1e-9
-    ), f"Nested cache cost failed. Got {cost}, expected {expected_cost}"
+    # Get model pricing for verification
+    import litellm
 
-    # Case 2: Top-level cached tokens (cache_read_input_tokens)
-    usage_top = Usage(
-        prompt_tokens=10000,
-        completion_tokens=0,
-        cache_read_input_tokens=9000,
-    )
-    response_top = ModelResponse(usage=usage_top, model=model_name)
+    model_info = litellm.get_model_info("gemini/gemini-2.0-flash")
+    input_cost = model_info.get("input_cost_per_token", 0)
+    cache_read_cost = model_info.get("cache_read_input_token_cost", input_cost)
+    output_cost = model_info.get("output_cost_per_token", 0)
 
-    cost_top = completion_cost(
-        completion_response=response_top,
-        model=model_name,
-        custom_llm_provider="openai",
-    )
+    # Expected cost: (2000 * input) + (8000 * cache_read) + (5 * output)
+    expected_cost = (2000 * input_cost) + (8000 * cache_read_cost) + (5 * output_cost)
 
-    assert (
-        abs(cost_top - expected_cost) < 1e-9
-    ), f"Top-level cache cost failed. Got {cost_top}, expected {expected_cost}"
+    assert abs(cost - expected_cost) < 1e-9, (
+        f"Cost calculation is wrong. Got ${cost:.6f}, expected ${expected_cost:.6f}. "
+        f"Cached tokens may not be using reduced pricing."
+    )
 
-    print("✅ Generic provider cached token cost verified")
+    print("✅ Issue #16341 fix verified: Gemini implicit caching cost calculated correctly")
diff --git a/tests/test_litellm/test_eager_tiktoken_load.py b/tests/test_litellm/test_eager_tiktoken_load.py
index 1264c68b99e..33dd57fad8d 100644
--- a/tests/test_litellm/test_eager_tiktoken_load.py
+++ b/tests/test_litellm/test_eager_tiktoken_load.py
@@ -78,6 +78,35 @@ def test_lazy_loading_default():
     assert len(tokens) > 0, "Encoding should work"
 
 
+def test_tiktoken_cache_dir_set_on_lazy_load():
+    """Test that TIKTOKEN_CACHE_DIR is set when encoding is lazy loaded.
+
+    This ensures the local tiktoken cache is used instead of downloading
+    from the internet. Regression test for issue #19768.
+    """
+    # Remove environment variables to ensure clean state
+    if "LITELLM_DISABLE_LAZY_LOADING" in os.environ:
+        del os.environ["LITELLM_DISABLE_LAZY_LOADING"]
+    if "TIKTOKEN_CACHE_DIR" in os.environ:
+        del os.environ["TIKTOKEN_CACHE_DIR"]
+
+    # Clear any cached modules
+    modules_to_clear = [k for k in sys.modules.keys() if k.startswith("litellm")]
+    for module in modules_to_clear:
+        del sys.modules[module]
+
+    # Import litellm fresh
+    import litellm
+
+    # Access encoding (triggers lazy load)
+    _ = litellm.encoding
+
+    # Verify TIKTOKEN_CACHE_DIR is now set and points to local tokenizers
+    assert "TIKTOKEN_CACHE_DIR" in os.environ, "TIKTOKEN_CACHE_DIR should be set after lazy loading encoding"
+    cache_dir = os.environ["TIKTOKEN_CACHE_DIR"]
+    assert "tokenizers" in cache_dir, f"TIKTOKEN_CACHE_DIR should point to tokenizers directory, got: {cache_dir}"
+
+
 @pytest.fixture(autouse=True)
 def cleanup_env():
     """Clean up environment variable after each test"""