diff --git a/litellm-proxy-extras/litellm_proxy_extras/migrations/20251119131227_add_prompt_versioning/migration.sql b/litellm-proxy-extras/litellm_proxy_extras/migrations/20251119131227_add_prompt_versioning/migration.sql index a9d9528bd24..43eb2401422 100644 --- a/litellm-proxy-extras/litellm_proxy_extras/migrations/20251119131227_add_prompt_versioning/migration.sql +++ b/litellm-proxy-extras/litellm_proxy_extras/migrations/20251119131227_add_prompt_versioning/migration.sql @@ -1,12 +1,12 @@ -- DropIndex -DROP INDEX "LiteLLM_PromptTable_prompt_id_key"; +DROP INDEX IF EXISTS "LiteLLM_PromptTable_prompt_id_key"; -- AlterTable -ALTER TABLE "LiteLLM_PromptTable" ADD COLUMN "version" INTEGER NOT NULL DEFAULT 1; +ALTER TABLE "LiteLLM_PromptTable" +ADD COLUMN "version" INTEGER NOT NULL DEFAULT 1; -- CreateIndex -CREATE INDEX "LiteLLM_PromptTable_prompt_id_idx" ON "LiteLLM_PromptTable"("prompt_id"); +CREATE INDEX "LiteLLM_PromptTable_prompt_id_idx" ON "LiteLLM_PromptTable" ("prompt_id"); -- CreateIndex -CREATE UNIQUE INDEX "LiteLLM_PromptTable_prompt_id_version_key" ON "LiteLLM_PromptTable"("prompt_id", "version"); - +CREATE UNIQUE INDEX "LiteLLM_PromptTable_prompt_id_version_key" ON "LiteLLM_PromptTable" ("prompt_id", "version"); \ No newline at end of file diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py index 5ba901f9729..1b61b533275 100644 --- a/litellm/llms/anthropic/chat/transformation.py +++ b/litellm/llms/anthropic/chat/transformation.py @@ -290,10 +290,19 @@ def _map_tool_choice( elif tool_choice == "none": _tool_choice = AnthropicMessagesToolChoice(type="none") elif isinstance(tool_choice, dict): - _tool_name = tool_choice.get("function", {}).get("name") - _tool_choice = AnthropicMessagesToolChoice(type="tool") - if _tool_name is not None: - _tool_choice["name"] = _tool_name + if "type" in tool_choice and "function" not in tool_choice: + tool_type = tool_choice.get("type") + if tool_type == "auto": + _tool_choice = AnthropicMessagesToolChoice(type="auto") + elif tool_type == "required" or tool_type == "any": + _tool_choice = AnthropicMessagesToolChoice(type="any") + elif tool_type == "none": + _tool_choice = AnthropicMessagesToolChoice(type="none") + else: + _tool_name = tool_choice.get("function", {}).get("name") + if _tool_name is not None: + _tool_choice = AnthropicMessagesToolChoice(type="tool") + _tool_choice["name"] = _tool_name if parallel_tool_use is not None: # Anthropic uses 'disable_parallel_tool_use' flag to determine if parallel tool use is allowed diff --git a/litellm/llms/azure/chat/gpt_5_transformation.py b/litellm/llms/azure/chat/gpt_5_transformation.py index 506b7fdfe5e..eeb55911ecf 100644 --- a/litellm/llms/azure/chat/gpt_5_transformation.py +++ b/litellm/llms/azure/chat/gpt_5_transformation.py @@ -22,7 +22,8 @@ def is_model_gpt_5_model(cls, model: str) -> bool: Accepts both explicit gpt-5 model names and the ``gpt5_series/`` prefix used for manual routing. """ - return "gpt-5" in model or "gpt5_series" in model + # gpt-5-chat* is a chat model and shouldn't go through GPT-5 reasoning restrictions. + return ("gpt-5" in model and "gpt-5-chat" not in model) or "gpt5_series" in model def get_supported_openai_params(self, model: str) -> List[str]: """Get supported parameters for Azure OpenAI GPT-5 models. @@ -37,6 +38,11 @@ def get_supported_openai_params(self, model: str) -> List[str]: """ params = OpenAIGPT5Config.get_supported_openai_params(self, model=model) + # Azure supports tool_choice for GPT-5 deployments, but the base GPT-5 config + # can drop it when the deployment name isn't in the OpenAI model registry. + if "tool_choice" not in params: + params.append("tool_choice") + # Only gpt-5.2 has been verified to support logprobs on Azure if self.is_model_gpt_5_2_model(model): azure_supported_params = ["logprobs", "top_logprobs"] diff --git a/litellm/llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py b/litellm/llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py index 53e08229799..c936b2cd23c 100644 --- a/litellm/llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py +++ b/litellm/llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py @@ -53,13 +53,26 @@ def map_openai_params( model: str, drop_params: bool, ) -> dict: - return AnthropicConfig.map_openai_params( + # Force tool-based structured outputs for Bedrock Invoke + # (similar to VertexAI fix in #19201) + # Bedrock Invoke doesn't support output_format parameter + original_model = model + if "response_format" in non_default_params: + # Use a model name that forces tool-based approach + model = "claude-3-sonnet-20240229" + + optional_params = AnthropicConfig.map_openai_params( self, non_default_params, optional_params, model, drop_params, ) + + # Restore original model name + model = original_model + + return optional_params def transform_request( @@ -90,6 +103,8 @@ def transform_request( _anthropic_request.pop("model", None) _anthropic_request.pop("stream", None) + # Bedrock Invoke doesn't support output_format parameter + _anthropic_request.pop("output_format", None) if "anthropic_version" not in _anthropic_request: _anthropic_request["anthropic_version"] = self.anthropic_version @@ -117,6 +132,26 @@ def transform_request( if "opus-4" in model.lower() or "opus_4" in model.lower(): beta_set.add("tool-search-tool-2025-10-19") + # Filter out beta headers that Bedrock Invoke doesn't support + # AWS Bedrock only supports a specific whitelist of beta flags + # Reference: https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-messages-request-response.html + BEDROCK_SUPPORTED_BETAS = { + "computer-use-2024-10-22", # Legacy computer use + "computer-use-2025-01-24", # Current computer use (Claude 3.7 Sonnet) + "token-efficient-tools-2025-02-19", # Tool use (Claude 3.7+ and Claude 4+) + "interleaved-thinking-2025-05-14", # Interleaved thinking (Claude 4+) + "output-128k-2025-02-19", # 128K output tokens (Claude 3.7 Sonnet) + "dev-full-thinking-2025-05-14", # Developer mode for raw thinking (Claude 4+) + "context-1m-2025-08-07", # 1 million tokens (Claude Sonnet 4) + "context-management-2025-06-27", # Context management (Claude Sonnet/Haiku 4.5) + "effort-2025-11-24", # Effort parameter (Claude Opus 4.5) + "tool-search-tool-2025-10-19", # Tool search (Claude Opus 4.5) + "tool-examples-2025-10-29", # Tool use examples (Claude Opus 4.5) + } + + # Only keep beta headers that Bedrock supports + beta_set = {beta for beta in beta_set if beta in BEDROCK_SUPPORTED_BETAS} + if beta_set: _anthropic_request["anthropic_beta"] = list(beta_set) diff --git a/litellm/llms/openai/chat/gpt_5_transformation.py b/litellm/llms/openai/chat/gpt_5_transformation.py index 3fffa335fdc..05c003c8b7a 100644 --- a/litellm/llms/openai/chat/gpt_5_transformation.py +++ b/litellm/llms/openai/chat/gpt_5_transformation.py @@ -19,7 +19,9 @@ class OpenAIGPT5Config(OpenAIGPTConfig): @classmethod def is_model_gpt_5_model(cls, model: str) -> bool: - return "gpt-5" in model + # gpt-5-chat* behaves like a regular chat model (supports temperature, etc.) + # Don't route it through GPT-5 reasoning-specific parameter restrictions. + return "gpt-5" in model and "gpt-5-chat" not in model @classmethod def is_model_gpt_5_codex_model(cls, model: str) -> bool: diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py index b78ac8f9e98..a9ac21bb56f 100644 --- a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py +++ b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py @@ -1657,7 +1657,17 @@ def _calculate_usage( # noqa: PLR0915 ## This is necessary because promptTokensDetails includes both cached and non-cached tokens ## See: https://github.com/BerriAI/litellm/issues/18750 if cached_text_tokens is not None and prompt_text_tokens is not None: + # Explicit caching: subtract cached tokens per modality from cacheTokensDetails prompt_text_tokens = prompt_text_tokens - cached_text_tokens + elif ( + cached_tokens is not None + and prompt_text_tokens is not None + and cached_text_tokens is None + ): + # Implicit caching: only cachedContentTokenCount is provided (no cacheTokensDetails) + # Subtract from text tokens since implicit caching is primarily for text content + # See: https://github.com/BerriAI/litellm/issues/16341 + prompt_text_tokens = prompt_text_tokens - cached_tokens if cached_audio_tokens is not None and prompt_audio_tokens is not None: prompt_audio_tokens = prompt_audio_tokens - cached_audio_tokens if cached_image_tokens is not None and prompt_image_tokens is not None: diff --git a/litellm/main.py b/litellm/main.py index 23922e3c8bf..99bf224c5b7 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -7280,8 +7280,11 @@ def _get_encoding(): def __getattr__(name: str) -> Any: """Lazy import handler for main module""" if name == "encoding": - # Lazy load encoding to avoid heavy tiktoken import at module load time - _encoding = tiktoken.get_encoding("cl100k_base") + # Use _get_default_encoding which properly sets TIKTOKEN_CACHE_DIR + # before loading tiktoken, ensuring the local cache is used + # instead of downloading from the internet + from litellm._lazy_imports import _get_default_encoding + _encoding = _get_default_encoding() # Cache it in the module's __dict__ for subsequent accesses import sys diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index d556b746626..fad5f243fff 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -3130,7 +3130,7 @@ "supports_reasoning": true, "supports_response_schema": true, "supports_system_messages": true, - "supports_tool_choice": false, + "supports_tool_choice": true, "supports_vision": true }, "azure/gpt-5-chat-latest": { @@ -3162,7 +3162,7 @@ "supports_reasoning": true, "supports_response_schema": true, "supports_system_messages": true, - "supports_tool_choice": false, + "supports_tool_choice": true, "supports_vision": true }, "azure/gpt-5-codex": { diff --git a/litellm/proxy/common_request_processing.py b/litellm/proxy/common_request_processing.py index 0d3e61b75c7..51f3e6482a4 100644 --- a/litellm/proxy/common_request_processing.py +++ b/litellm/proxy/common_request_processing.py @@ -650,11 +650,15 @@ async def base_process_llm_request( ) tasks = [] + # Start the moderation check (during_call_hook) as early as possible + # This gives it a head start to mask/validate input while the proxy handles routing tasks.append( - proxy_logging_obj.during_call_hook( - data=self.data, - user_api_key_dict=user_api_key_dict, - call_type=route_type, # type: ignore + asyncio.create_task( + proxy_logging_obj.during_call_hook( + data=self.data, + user_api_key_dict=user_api_key_dict, + call_type=route_type, # type: ignore + ) ) ) diff --git a/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/content_filter.py b/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/content_filter.py index c9bd0135a05..083a407e9cf 100644 --- a/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/content_filter.py +++ b/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/content_filter.py @@ -198,6 +198,15 @@ def __init__( for pattern_config in normalized_patterns: self._add_pattern(pattern_config) + # Warn if using during_call with MASK action (unstable) + if self.event_hook == GuardrailEventHooks.during_call and any( + p["action"] == ContentFilterAction.MASK for p in self.compiled_patterns + ): + verbose_proxy_logger.warning( + f"ContentFilterGuardrail '{self.guardrail_name}': 'during_call' mode with 'MASK' action is unstable due to race conditions. " + "Use 'pre_call' mode for reliable request masking." + ) + # Load blocked words - always initialize as dict self.blocked_words: Dict[str, Tuple[ContentFilterAction, Optional[str]]] = {} for word in normalized_blocked_words: @@ -905,11 +914,15 @@ async def _process_images( elif isinstance(e.detail, str): e.detail = e.detail + " (Image description): " + description else: - e.detail = "Content blocked: Image description detected" + description + e.detail = ( + "Content blocked: Image description detected" + description + ) raise e def _count_masked_entities( - self, detections: List[ContentFilterDetection], masked_entity_count: Dict[str, int] + self, + detections: List[ContentFilterDetection], + masked_entity_count: Dict[str, int], ) -> None: """ Count masked entities by type from detections. @@ -964,9 +977,11 @@ def _log_guardrail_information( dict(detection) for detection in detections ] if status != "success": - guardrail_json_response = exception_str if exception_str else [ - dict(detection) for detection in detections - ] + guardrail_json_response = ( + exception_str + if exception_str + else [dict(detection) for detection in detections] + ) self.add_standard_logging_guardrail_information_to_request_data( guardrail_provider=self.guardrail_provider, @@ -1066,99 +1081,84 @@ async def async_post_call_streaming_iterator_hook( Process streaming response chunks and check for blocked content. For BLOCK action: Raises HTTPException immediately when blocked content is detected. - For MASK action: Content passes through (masking streaming responses is not supported). + For MASK action: Content is buffered to handle patterns split across chunks. """ + accumulated_full_text = "" + yielded_masked_text_len = 0 + buffer_size = 50 # Increased buffer to catch patterns split across many chunks - # Accumulate content as we iterate through chunks - accumulated_content = "" + verbose_proxy_logger.info( + f"ContentFilterGuardrail: Starting robust streaming masking for model {request_data.get('model')}" + ) async for item in response: - # Accumulate content from this chunk before checking if isinstance(item, ModelResponseStream) and item.choices: + delta_content = "" + is_final = False for choice in item.choices: if hasattr(choice, "delta") and choice.delta: content = getattr(choice.delta, "content", None) if content and isinstance(content, str): - accumulated_content += content - - # Check accumulated content for blocked patterns/keywords after processing all choices - # Only check for BLOCK actions, not MASK (masking streaming is not supported) - if accumulated_content: - try: - # Check patterns - pattern_match = self._check_patterns(accumulated_content) - if pattern_match: - matched_text, pattern_name, action = pattern_match - if action == ContentFilterAction.BLOCK: - error_msg = ( - f"Content blocked: {pattern_name} pattern detected" - ) - verbose_proxy_logger.warning(error_msg) - raise HTTPException( - status_code=403, - detail={ - "error": error_msg, - "pattern": pattern_name, - }, - ) - - # Check blocked words - blocked_word_match = self._check_blocked_words( - accumulated_content - ) - if blocked_word_match: - keyword, action, description = blocked_word_match - if action == ContentFilterAction.BLOCK: - error_msg = ( - f"Content blocked: keyword '{keyword}' detected" - ) - if description: - error_msg += f" ({description})" - verbose_proxy_logger.warning(error_msg) - raise HTTPException( - status_code=403, - detail={ - "error": error_msg, - "keyword": keyword, - "description": description, - }, - ) - - # Check category keywords - all_exceptions = [] - for category in self.loaded_categories.values(): - all_exceptions.extend(category.exceptions) - category_match = self._check_category_keywords( - accumulated_content, all_exceptions - ) - if category_match: - keyword, category_name, severity, action = category_match - if action == ContentFilterAction.BLOCK: - error_msg = ( - f"Content blocked: {category_name} category keyword '{keyword}' detected " - f"(severity: {severity})" - ) - verbose_proxy_logger.warning(error_msg) - raise HTTPException( - status_code=403, - detail={ - "error": error_msg, - "category": category_name, - "keyword": keyword, - "severity": severity, - }, - ) - except HTTPException: - # Re-raise HTTPException (blocked content detected) - raise - except Exception as e: - # Log other exceptions but don't block the stream - verbose_proxy_logger.warning( - f"Error checking content filter in streaming: {e}" - ) + delta_content += content + if getattr(choice, "finish_reason", None): + is_final = True + + accumulated_full_text += delta_content + + # Check for blocking or apply masking + # Add a space at the end if it's the final chunk to trigger word boundaries (\b) + text_to_check = accumulated_full_text + if is_final: + text_to_check += " " + + try: + masked_text = self._filter_single_text(text_to_check) + if is_final and masked_text.endswith(" "): + masked_text = masked_text[:-1] + except HTTPException: + raise + except Exception as e: + verbose_proxy_logger.error( + f"ContentFilterGuardrail: Error in masking: {e}" + ) + masked_text = text_to_check # Fallback to current text + + # Determine how much can be safely yielded + if is_final: + safe_to_yield_len = len(masked_text) + else: + safe_to_yield_len = max(0, len(masked_text) - buffer_size) + + if safe_to_yield_len > yielded_masked_text_len: + new_masked_content = masked_text[ + yielded_masked_text_len:safe_to_yield_len + ] + # Modify the chunk to contain only the new masked content + if ( + item.choices + and hasattr(item.choices[0], "delta") + and item.choices[0].delta + ): + item.choices[0].delta.content = new_masked_content + yielded_masked_text_len = safe_to_yield_len + yield item + else: + # Hold content by yielding empty content chunk (keeps metadata/structure) + if ( + item.choices + and hasattr(item.choices[0], "delta") + and item.choices[0].delta + ): + item.choices[0].delta.content = "" + yield item + else: + # Not a ModelResponseStream or no choices - yield as is + yield item - # Yield the chunk (only if no exception was raised above) - yield item + # Any remaining content (should have been handled by is_final, but just in case) + if yielded_masked_text_len < len(accumulated_full_text): + # We already reached the end of the generator + pass @staticmethod def get_config_model(): diff --git a/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/patterns.json b/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/patterns.json index f2427b5b920..1eff7804b42 100644 --- a/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/patterns.json +++ b/litellm/proxy/guardrails/guardrail_hooks/litellm_content_filter/patterns.json @@ -108,7 +108,7 @@ { "name": "ipv6", "display_name": "IP Address (IPv6)", - "pattern": "\\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b", + "pattern": "(?'" + ) + raise e ## Start RDS IAM token refresh background task if enabled ## # This proactively refreshes IAM tokens before they expire, diff --git a/litellm/proxy/schema.prisma b/litellm/proxy/schema.prisma index ca60b9e1bec..b118400b620 100644 --- a/litellm/proxy/schema.prisma +++ b/litellm/proxy/schema.prisma @@ -5,6 +5,7 @@ datasource client { generator client { provider = "prisma-client-py" + binaryTargets = ["native", "debian-openssl-1.1.x", "debian-openssl-3.0.x", "linux-musl", "linux-musl-openssl-3.0.x"] } // Budget / Rate Limits for an org diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 13f42a2f71d..8922ed032e2 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -1901,7 +1901,18 @@ async def async_post_call_streaming_iterator_hook( ) or _callback.should_run_guardrail( data=request_data, event_type=GuardrailEventHooks.post_call ): - if "apply_guardrail" in type(callback).__dict__: + if ( + "async_post_call_streaming_iterator_hook" + in type(callback).__dict__ + ): + current_response = ( + _callback.async_post_call_streaming_iterator_hook( + user_api_key_dict=user_api_key_dict, + response=current_response, + request_data=request_data, + ) + ) + elif "apply_guardrail" in type(callback).__dict__: request_data["guardrail_to_apply"] = callback current_response = ( unified_guardrail.async_post_call_streaming_iterator_hook( diff --git a/litellm/router.py b/litellm/router.py index 09d71b6b497..a3c3afa9326 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -8729,11 +8729,6 @@ def get_allowed_fails_from_policy(self, exception: Exception): if allowed_fails_policy is None: return None - if ( - isinstance(exception, litellm.BadRequestError) - and allowed_fails_policy.BadRequestErrorAllowedFails is not None - ): - return allowed_fails_policy.BadRequestErrorAllowedFails if ( isinstance(exception, litellm.AuthenticationError) and allowed_fails_policy.AuthenticationErrorAllowedFails is not None @@ -8754,6 +8749,11 @@ def get_allowed_fails_from_policy(self, exception: Exception): and allowed_fails_policy.ContentPolicyViolationErrorAllowedFails is not None ): return allowed_fails_policy.ContentPolicyViolationErrorAllowedFails + if ( + isinstance(exception, litellm.BadRequestError) + and allowed_fails_policy.BadRequestErrorAllowedFails is not None + ): + return allowed_fails_policy.BadRequestErrorAllowedFails def _initialize_alerting(self): from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting diff --git a/litellm/router_utils/get_retry_from_policy.py b/litellm/router_utils/get_retry_from_policy.py index 48df43ef818..ec326ebb50d 100644 --- a/litellm/router_utils/get_retry_from_policy.py +++ b/litellm/router_utils/get_retry_from_policy.py @@ -43,11 +43,6 @@ def get_num_retries_from_retry_policy( if isinstance(retry_policy, dict): retry_policy = RetryPolicy(**retry_policy) - if ( - isinstance(exception, BadRequestError) - and retry_policy.BadRequestErrorRetries is not None - ): - return retry_policy.BadRequestErrorRetries if ( isinstance(exception, AuthenticationError) and retry_policy.AuthenticationErrorRetries is not None @@ -65,6 +60,11 @@ def get_num_retries_from_retry_policy( and retry_policy.ContentPolicyViolationErrorRetries is not None ): return retry_policy.ContentPolicyViolationErrorRetries + if ( + isinstance(exception, BadRequestError) + and retry_policy.BadRequestErrorRetries is not None + ): + return retry_policy.BadRequestErrorRetries def reset_retry_policy() -> RetryPolicy: diff --git a/litellm/types/guardrails.py b/litellm/types/guardrails.py index eea0a26b332..ca22049720e 100644 --- a/litellm/types/guardrails.py +++ b/litellm/types/guardrails.py @@ -20,6 +20,9 @@ from litellm.types.proxy.guardrails.guardrail_hooks.qualifire import ( QualifireGuardrailConfigModel, ) +from litellm.types.proxy.guardrails.guardrail_hooks.litellm_content_filter import ( + ContentFilterCategoryConfig, +) """ Pydantic object defining how to set guardrails on litellm proxy @@ -547,9 +550,27 @@ class ContentFilterConfigModel(BaseModel): blocked_words_file: Optional[str] = Field( default=None, description="Path to YAML file containing blocked_words list" ) + categories: Optional[List[ContentFilterCategoryConfig]] = Field( + default=None, + description="List of prebuilt categories to enable (harmful_*, bias_*)", + ) + severity_threshold: Optional[str] = Field( + default=None, + description="Minimum severity to block (high, medium, low)", + ) + pattern_redaction_format: Optional[str] = Field( + default=None, + description="Format string for pattern redaction (use {pattern_name} placeholder)", + ) + keyword_redaction_tag: Optional[str] = Field( + default=None, + description="Tag to use for keyword redaction", + ) -class BaseLitellmParams(BaseModel): # works for new and patch update guardrails +class BaseLitellmParams( + ContentFilterConfigModel +): # works for new and patch update guardrails api_key: Optional[str] = Field( default=None, description="API key for the guardrail service" ) @@ -630,7 +651,6 @@ class BaseLitellmParams(BaseModel): # works for new and patch update guardrails description="Whether to fail the request if Model Armor encounters an error", ) - # Generic Guardrail API params additional_provider_specific_params: Optional[Dict[str, Any]] = Field( default=None, description="Additional provider-specific parameters for generic guardrail APIs", @@ -657,7 +677,6 @@ class LitellmParams( ToolPermissionGuardrailConfigModel, ZscalerAIGuardConfigModel, JavelinGuardrailConfigModel, - ContentFilterConfigModel, BaseLitellmParams, EnkryptAIGuardrailConfigs, IBMGuardrailsBaseConfigModel, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index d556b746626..fad5f243fff 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -3130,7 +3130,7 @@ "supports_reasoning": true, "supports_response_schema": true, "supports_system_messages": true, - "supports_tool_choice": false, + "supports_tool_choice": true, "supports_vision": true }, "azure/gpt-5-chat-latest": { @@ -3162,7 +3162,7 @@ "supports_reasoning": true, "supports_response_schema": true, "supports_system_messages": true, - "supports_tool_choice": false, + "supports_tool_choice": true, "supports_vision": true }, "azure/gpt-5-codex": { diff --git a/tests/local_testing/test_auth_utils.py b/tests/local_testing/test_auth_utils.py index 72f799a6cf0..d36f96b1a39 100644 --- a/tests/local_testing/test_auth_utils.py +++ b/tests/local_testing/test_auth_utils.py @@ -356,6 +356,25 @@ def test_get_internal_user_header_from_mapping_no_internal_returns_none(): "/openai/deployments/my-deployment/chat/completions", "my-deployment" ), + # Custom model_name with slashes (e.g., gcp/google/gemini-2.5-flash) + # This is the NVIDIA P0 bug fix - regex should capture full model name including slashes + ( + {}, + "/vertex_ai/v1/projects/my-project/locations/us-central1/publishers/google/models/gcp/google/gemini-2.5-flash:generateContent", + "gcp/google/gemini-2.5-flash" + ), + # Another custom model_name with slashes + ( + {}, + "/vertex_ai/v1/projects/my-project/locations/global/publishers/google/models/gcp/google/gemini-3-flash-preview:generateContent", + "gcp/google/gemini-3-flash-preview" + ), + # Model name with single slash + ( + {}, + "/vertex_ai/v1/projects/my-project/locations/us-central1/publishers/google/models/custom/model:generateContent", + "custom/model" + ), ], ) def test_get_model_from_request_vertex_ai_passthrough(request_data, route, expected_model): diff --git a/tests/local_testing/test_completion_with_retries.py b/tests/local_testing/test_completion_with_retries.py index 6eb3ad460e6..585e1ee2618 100644 --- a/tests/local_testing/test_completion_with_retries.py +++ b/tests/local_testing/test_completion_with_retries.py @@ -60,6 +60,7 @@ async def test_completion_with_retry_policy(sync_mode): retry_number = 1 retry_policy = RetryPolicy( + BadRequestErrorRetries=10, ContentPolicyViolationErrorRetries=retry_number, # run 3 retries for ContentPolicyViolationErrors AuthenticationErrorRetries=0, # run 0 retries for AuthenticationErrorRetries ) diff --git a/tests/test_litellm/llms/anthropic/chat/test_anthropic_chat_transformation.py b/tests/test_litellm/llms/anthropic/chat/test_anthropic_chat_transformation.py index bd229dedfaa..eee0b267fad 100644 --- a/tests/test_litellm/llms/anthropic/chat/test_anthropic_chat_transformation.py +++ b/tests/test_litellm/llms/anthropic/chat/test_anthropic_chat_transformation.py @@ -548,6 +548,59 @@ def test_map_tool_choice_dict_type_function_with_name(): assert result["name"] == "my_tool" +def test_map_tool_choice_dict_type_auto(): + """ + Test that dict {"type": "auto"} maps to Anthropic type='auto'. + This handles Cursor's format for tool_choice. + """ + config = AnthropicConfig() + result = config._map_tool_choice( + tool_choice={"type": "auto"}, + parallel_tool_use=None, + ) + assert result is not None + assert result["type"] == "auto" + + +def test_map_tool_choice_dict_type_required(): + """ + Test that dict {"type": "required"} maps to Anthropic type='any'. + """ + config = AnthropicConfig() + result = config._map_tool_choice( + tool_choice={"type": "required"}, + parallel_tool_use=None, + ) + assert result is not None + assert result["type"] == "any" + + +def test_map_tool_choice_dict_type_none(): + """ + Test that dict {"type": "none"} maps to Anthropic type='none'. + """ + config = AnthropicConfig() + result = config._map_tool_choice( + tool_choice={"type": "none"}, + parallel_tool_use=None, + ) + assert result is not None + assert result["type"] == "none" + + +def test_map_tool_choice_dict_type_function_without_name(): + """ + Test that dict {"type": "function"} without name is handled gracefully. + Should return None since there's no valid tool name. + """ + config = AnthropicConfig() + result = config._map_tool_choice( + tool_choice={"type": "function"}, + parallel_tool_use=None, + ) + assert result is None + + def test_transform_response_with_prefix_prompt(): import httpx diff --git a/tests/test_litellm/llms/azure/chat/test_azure_gpt5_transformation.py b/tests/test_litellm/llms/azure/chat/test_azure_gpt5_transformation.py index 199a16d8590..25f3d1364f6 100644 --- a/tests/test_litellm/llms/azure/chat/test_azure_gpt5_transformation.py +++ b/tests/test_litellm/llms/azure/chat/test_azure_gpt5_transformation.py @@ -16,6 +16,17 @@ def test_azure_gpt5_supports_reasoning_effort(config: AzureOpenAIGPT5Config): ) +def test_azure_gpt5_allows_tool_choice_for_deployment_names(): + supported_params = litellm.get_supported_openai_params( + model="gpt-5-chat-2025-08-07", custom_llm_provider="azure" + ) + assert supported_params is not None + assert "tool_choice" in supported_params + # gpt-5-chat* should not be treated as a GPT-5 reasoning model + assert "reasoning_effort" not in supported_params + assert "temperature" in supported_params + + def test_azure_gpt5_maps_max_tokens(config: AzureOpenAIGPT5Config): params = config.map_openai_params( non_default_params={"max_tokens": 5}, diff --git a/tests/test_litellm/llms/bedrock/chat/invoke_transformations/test_bedrock_chat_invoke_transformations_anthropic_claude3_transformation.py b/tests/test_litellm/llms/bedrock/chat/invoke_transformations/test_bedrock_chat_invoke_transformations_anthropic_claude3_transformation.py index 98b392a353d..5c1b4cbd38e 100644 --- a/tests/test_litellm/llms/bedrock/chat/invoke_transformations/test_bedrock_chat_invoke_transformations_anthropic_claude3_transformation.py +++ b/tests/test_litellm/llms/bedrock/chat/invoke_transformations/test_bedrock_chat_invoke_transformations_anthropic_claude3_transformation.py @@ -464,3 +464,108 @@ def test_opus_4_5_model_detection(): for model in non_opus_4_5_models: assert not config._is_claude_opus_4_5(model), \ f"Should not detect {model} as Opus 4.5" + + +def test_structured_outputs_beta_header_filtered_for_bedrock_invoke(): + """ + Test that unsupported beta headers are filtered out for Bedrock Invoke API. + + Bedrock Invoke API only supports a specific whitelist of beta flags and returns + "invalid beta flag" error for others (e.g., structured-outputs, mcp-servers). + This test ensures unsupported headers are filtered while keeping supported ones. + + Fixes: https://github.com/BerriAI/litellm/issues/16726 + """ + config = AmazonAnthropicClaudeConfig() + + messages = [{"role": "user", "content": "test"}] + + # Test 1: structured-outputs beta header (unsupported) + headers = {"anthropic-beta": "structured-outputs-2025-11-13"} + + result = config.transform_request( + model="anthropic.claude-4-0-sonnet-20250514-v1:0", + messages=messages, + optional_params={}, + litellm_params={}, + headers=headers, + ) + + # Verify structured-outputs beta is filtered out + anthropic_beta = result.get("anthropic_beta", []) + assert not any("structured-outputs" in beta for beta in anthropic_beta), \ + f"structured-outputs beta should be filtered, got: {anthropic_beta}" + + # Test 2: mcp-servers beta header (unsupported - the main issue from #16726) + headers = {"anthropic-beta": "mcp-servers-2025-12-04"} + + result = config.transform_request( + model="anthropic.claude-4-0-sonnet-20250514-v1:0", + messages=messages, + optional_params={}, + litellm_params={}, + headers=headers, + ) + + # Verify mcp-servers beta is filtered out + anthropic_beta = result.get("anthropic_beta", []) + assert not any("mcp-servers" in beta for beta in anthropic_beta), \ + f"mcp-servers beta should be filtered, got: {anthropic_beta}" + + # Test 3: Mix of supported and unsupported beta headers + headers = {"anthropic-beta": "computer-use-2024-10-22,mcp-servers-2025-12-04,structured-outputs-2025-11-13"} + + result = config.transform_request( + model="anthropic.claude-4-0-sonnet-20250514-v1:0", + messages=messages, + optional_params={}, + litellm_params={}, + headers=headers, + ) + + # Verify only supported betas are kept + anthropic_beta = result.get("anthropic_beta", []) + assert not any("structured-outputs" in beta for beta in anthropic_beta), \ + f"structured-outputs beta should be filtered, got: {anthropic_beta}" + assert not any("mcp-servers" in beta for beta in anthropic_beta), \ + f"mcp-servers beta should be filtered, got: {anthropic_beta}" + assert any("computer-use" in beta for beta in anthropic_beta), \ + f"computer-use beta should be kept, got: {anthropic_beta}" + + +def test_output_format_removed_from_bedrock_invoke_request(): + """ + Test that output_format parameter is removed from Bedrock Invoke requests. + + Bedrock Invoke API doesn't support the output_format parameter (only supported + in Anthropic Messages API). This test ensures it's removed to prevent errors. + """ + config = AmazonAnthropicClaudeConfig() + + messages = [{"role": "user", "content": "test"}] + + # Create a request with output_format via map_openai_params + non_default_params = { + "response_format": {"type": "json_object"} + } + optional_params = {} + + # This should trigger tool-based structured outputs + optional_params = config.map_openai_params( + non_default_params=non_default_params, + optional_params=optional_params, + model="anthropic.claude-4-0-sonnet-20250514-v1:0", + drop_params=False, + ) + + result = config.transform_request( + model="anthropic.claude-4-0-sonnet-20250514-v1:0", + messages=messages, + optional_params=optional_params, + litellm_params={}, + headers={}, + ) + + # Verify output_format is not in the request + assert "output_format" not in result, \ + f"output_format should be removed for Bedrock Invoke, got keys: {result.keys()}" diff --git a/tests/test_litellm/llms/openai/test_gpt5_transformation.py b/tests/test_litellm/llms/openai/test_gpt5_transformation.py index fd25d302d07..386f264a4dd 100644 --- a/tests/test_litellm/llms/openai/test_gpt5_transformation.py +++ b/tests/test_litellm/llms/openai/test_gpt5_transformation.py @@ -20,6 +20,23 @@ def test_gpt5_supports_reasoning_effort(config: OpenAIConfig): assert "reasoning_effort" in config.get_supported_openai_params(model="gpt-5-mini") +def test_gpt5_chat_does_not_support_reasoning_effort(config: OpenAIConfig): + assert ( + "reasoning_effort" + not in config.get_supported_openai_params(model="gpt-5-chat-latest") + ) + + +def test_gpt5_chat_supports_temperature(config: OpenAIConfig): + params = config.map_openai_params( + non_default_params={"temperature": 0.3}, + optional_params={}, + model="gpt-5-chat-latest", + drop_params=False, + ) + assert params["temperature"] == 0.3 + + def test_gpt5_maps_max_tokens(config: OpenAIConfig): params = config.map_openai_params( non_default_params={"max_tokens": 10}, diff --git a/tests/test_litellm/proxy/guardrails/guardrail_hooks/content_filter/test_content_filter.py b/tests/test_litellm/proxy/guardrails/guardrail_hooks/content_filter/test_content_filter.py index 265bc530dc2..be8c84a554f 100644 --- a/tests/test_litellm/proxy/guardrails/guardrail_hooks/content_filter/test_content_filter.py +++ b/tests/test_litellm/proxy/guardrails/guardrail_hooks/content_filter/test_content_filter.py @@ -4,7 +4,7 @@ import os import sys -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import MagicMock import pytest @@ -385,20 +385,12 @@ def test_api_key_patterns(self): assert result is not None assert result[1] == "aws_access_key" - @pytest.mark.skip( - reason="Masking in streaming responses is no longer supported after unified_guardrail.py changes. Only blocking/rejecting is supported for responses." - ) @pytest.mark.asyncio async def test_streaming_hook_mask(self): """ - Test streaming hook with MASK action - - Note: After changes to unified_guardrail.py, masking responses to users - is no longer supported. This test is skipped as the feature is deprecated. - Only BLOCK actions (test_streaming_hook_block) are supported for streaming responses. + Test streaming hook with MASK action. + This now works with the 50-char sliding window buffer. """ - from unittest.mock import AsyncMock - from litellm.types.utils import Delta, ModelResponseStream, StreamingChoices patterns = [ @@ -415,51 +407,54 @@ async def test_streaming_hook_mask(self): event_hook=GuardrailEventHooks.during_call, ) - # Create mock streaming chunks + # Create mock streaming chunks that split an email async def mock_stream(): - # Chunk 1: contains email - chunk1 = ModelResponseStream( + # Chunk 1: starts email + yield ModelResponseStream( id="chunk1", choices=[ StreamingChoices( - delta=Delta(content="Contact me at test@example.com"), index=0 + delta=Delta(content="Contact me at test@ex"), index=0 ) ], model="gpt-4", ) - yield chunk1 - - # Chunk 2: normal content - chunk2 = ModelResponseStream( + # Chunk 2: ends email + yield ModelResponseStream( id="chunk2", choices=[ - StreamingChoices(delta=Delta(content=" for more info"), index=0) + StreamingChoices( + delta=Delta(content="ample.com for info"), + index=0, + finish_reason="stop", + ) ], model="gpt-4", ) - yield chunk2 user_api_key_dict = MagicMock() request_data = {} - # Process streaming response - no masking expected - result_chunks = [] + # Process streaming response - masking IS expected now + full_content = "" async for chunk in guardrail.async_post_call_streaming_iterator_hook( user_api_key_dict=user_api_key_dict, response=mock_stream(), request_data=request_data, ): - result_chunks.append(chunk) + if chunk.choices[0].delta.content: + full_content += chunk.choices[0].delta.content - # Chunks should pass through unchanged since masking is no longer supported - assert len(result_chunks) == 2 + # The email should be redacted even though it was split + assert "test@example.com" not in full_content + assert "[EMAIL_REDACTED]" in full_content + assert "Contact me at [EMAIL_REDACTED] for info" in full_content @pytest.mark.asyncio async def test_streaming_hook_block(self): """ Test streaming hook with BLOCK action """ - from unittest.mock import AsyncMock from litellm.types.utils import Delta, ModelResponseStream, StreamingChoices @@ -715,7 +710,10 @@ async def test_apply_guardrail_masks_all_regex_pattern_matches(self): assert result is not None assert len(result) == 1 # All matches should be redacted - assert result[0] == "[CUSTOM_KEY_REDACTED] [CUSTOM_KEY_REDACTED] [CUSTOM_KEY_REDACTED]" + assert ( + result[0] + == "[CUSTOM_KEY_REDACTED] [CUSTOM_KEY_REDACTED] [CUSTOM_KEY_REDACTED]" + ) assert "Key1" not in result[0] assert "Key2" not in result[0] @@ -797,7 +795,7 @@ async def test_apply_guardrail_logs_guardrail_information(self): # Apply guardrail with content that triggers detections # Email will be masked, blocked word will be masked - result = await guardrail.apply_guardrail( + await guardrail.apply_guardrail( inputs={"texts": ["Contact me at test@example.com for confidential info"]}, request_data=request_data, input_type="request", @@ -807,7 +805,9 @@ async def test_apply_guardrail_logs_guardrail_information(self): assert "metadata" in request_data assert "standard_logging_guardrail_information" in request_data["metadata"] - guardrail_info_list = request_data["metadata"]["standard_logging_guardrail_information"] + guardrail_info_list = request_data["metadata"][ + "standard_logging_guardrail_information" + ] assert isinstance(guardrail_info_list, list) assert len(guardrail_info_list) == 1 @@ -820,8 +820,8 @@ async def test_apply_guardrail_logs_guardrail_information(self): assert "start_time" in guardrail_info assert "end_time" in guardrail_info assert "duration" in guardrail_info - assert guardrail_info["duration"] > 0 - assert guardrail_info["start_time"] < guardrail_info["end_time"] + assert guardrail_info["duration"] >= 0 + assert guardrail_info["start_time"] <= guardrail_info["end_time"] # Verify detections are logged assert "guardrail_response" in guardrail_info @@ -839,15 +839,21 @@ async def test_apply_guardrail_logs_guardrail_information(self): assert "action" in detection assert detection["action"] == "MASK" # Verify sensitive content (matched_text) is NOT included - assert "matched_text" not in detection, "Sensitive content should not be logged" + assert ( + "matched_text" not in detection + ), "Sensitive content should not be logged" # Verify blocked word detection structure - blocked_word_detections = [d for d in detections if d.get("type") == "blocked_word"] + blocked_word_detections = [ + d for d in detections if d.get("type") == "blocked_word" + ] assert len(blocked_word_detections) > 0 for detection in blocked_word_detections: assert detection["type"] == "blocked_word" assert "keyword" in detection - assert detection["keyword"] == "confidential" # Config keyword, not user content + assert ( + detection["keyword"] == "confidential" + ) # Config keyword, not user content assert "action" in detection assert detection["action"] == "MASK" assert "description" in detection @@ -896,7 +902,9 @@ async def test_apply_guardrail_logs_blocked_status(self): assert "metadata" in request_data assert "standard_logging_guardrail_information" in request_data["metadata"] - guardrail_info_list = request_data["metadata"]["standard_logging_guardrail_information"] + guardrail_info_list = request_data["metadata"][ + "standard_logging_guardrail_information" + ] assert len(guardrail_info_list) == 1 guardrail_info = guardrail_info_list[0] @@ -909,4 +917,6 @@ async def test_apply_guardrail_logs_blocked_status(self): # If detections are logged, verify they don't contain sensitive content for detection in detections: if detection.get("type") == "pattern": - assert "matched_text" not in detection, "Sensitive content should not be logged" + assert ( + "matched_text" not in detection + ), "Sensitive content should not be logged" diff --git a/tests/test_litellm/test_cost_calculator.py b/tests/test_litellm/test_cost_calculator.py index e277baf0b0c..74f5cf9bdd7 100644 --- a/tests/test_litellm/test_cost_calculator.py +++ b/tests/test_litellm/test_cost_calculator.py @@ -1695,60 +1695,84 @@ def test_gemini_without_cache_tokens_details(): print("✅ Gemini without cacheTokensDetails works correctly") -def test_generic_provider_cached_token_cost(): +def test_gemini_implicit_caching_cost_calculation(): """ - Test that the generic cost calculator correctly handles cached tokens - for providers like z.ai/deepseek that are not explicitly handled. + Test for Issue #16341: Gemini implicit cached tokens not counted in spend log + + When Gemini uses implicit caching, it returns cachedContentTokenCount but NOT + cacheTokensDetails. In this case, we should subtract cachedContentTokenCount + from text_tokens to correctly calculate costs. + + See: https://github.com/BerriAI/litellm/issues/16341 """ - from litellm.cost_calculator import completion_cost - from litellm.types.utils import ModelResponse, PromptTokensDetailsWrapper, Usage - - # Setup model cost for a generic provider - # We use a name that will bypass complex provider mapping logic - model_name = "custom-cached-model" - litellm.model_cost[model_name] = { - "input_cost_per_token": 0.0000006, - "output_cost_per_token": 0.0000006, - "cache_read_input_token_cost": 0.0000001, - "litellm_provider": "openai", + from litellm import completion_cost + from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import ( + VertexGeminiConfig, + ) + from litellm.types.utils import Choices, Message, ModelResponse + + # Simulate Gemini response with implicit caching (cachedContentTokenCount only) + completion_response = { + "usageMetadata": { + "promptTokenCount": 10000, + "candidatesTokenCount": 5, + "totalTokenCount": 10005, + "cachedContentTokenCount": 8000, # Implicit caching - no cacheTokensDetails + "promptTokensDetails": [{"modality": "TEXT", "tokenCount": 10000}], + "candidatesTokensDetails": [{"modality": "TEXT", "tokenCount": 5}], + } } - # Case 1: Standard nested cached tokens (prompt_tokens_details.cached_tokens) - usage = Usage( - prompt_tokens=10000, - completion_tokens=0, - prompt_tokens_details=PromptTokensDetailsWrapper(cached_tokens=9000), + usage = VertexGeminiConfig._calculate_usage(completion_response) + + # Verify parsing + assert ( + usage.cache_read_input_tokens == 8000 + ), f"cache_read_input_tokens should be 8000, got {usage.cache_read_input_tokens}" + assert ( + usage.prompt_tokens_details.cached_tokens == 8000 + ), f"cached_tokens should be 8000, got {usage.prompt_tokens_details.cached_tokens}" + + # CRITICAL: text_tokens should be (10000 - 8000) = 2000, NOT 10000 + # This is the fix for issue #16341 + assert ( + usage.prompt_tokens_details.text_tokens == 2000 + ), f"text_tokens should be 2000 (10000 - 8000), got {usage.prompt_tokens_details.text_tokens}" + + # Verify cost calculation uses cached token pricing + response = ModelResponse( + id="mock-id", + model="gemini-2.0-flash", + choices=[ + Choices( + index=0, + message=Message(role="assistant", content="Hello!"), + finish_reason="stop", + ) + ], + usage=usage, ) - response = ModelResponse(usage=usage, model=model_name) cost = completion_cost( completion_response=response, - model=model_name, - custom_llm_provider="openai", # Explicitly set provider to trigger generic path + model="gemini-2.0-flash", + custom_llm_provider="gemini", ) - # Expected: (1000 * 0.0000006) + (9000 * 0.0000001) = 0.0006 + 0.0009 = 0.0015 - expected_cost = 0.0015 - assert ( - abs(cost - expected_cost) < 1e-9 - ), f"Nested cache cost failed. Got {cost}, expected {expected_cost}" + # Get model pricing for verification + import litellm - # Case 2: Top-level cached tokens (cache_read_input_tokens) - usage_top = Usage( - prompt_tokens=10000, - completion_tokens=0, - cache_read_input_tokens=9000, - ) - response_top = ModelResponse(usage=usage_top, model=model_name) + model_info = litellm.get_model_info("gemini/gemini-2.0-flash") + input_cost = model_info.get("input_cost_per_token", 0) + cache_read_cost = model_info.get("cache_read_input_token_cost", input_cost) + output_cost = model_info.get("output_cost_per_token", 0) - cost_top = completion_cost( - completion_response=response_top, - model=model_name, - custom_llm_provider="openai", - ) + # Expected cost: (2000 * input) + (8000 * cache_read) + (5 * output) + expected_cost = (2000 * input_cost) + (8000 * cache_read_cost) + (5 * output_cost) - assert ( - abs(cost_top - expected_cost) < 1e-9 - ), f"Top-level cache cost failed. Got {cost_top}, expected {expected_cost}" + assert abs(cost - expected_cost) < 1e-9, ( + f"Cost calculation is wrong. Got ${cost:.6f}, expected ${expected_cost:.6f}. " + f"Cached tokens may not be using reduced pricing." + ) - print("✅ Generic provider cached token cost verified") + print("✅ Issue #16341 fix verified: Gemini implicit caching cost calculated correctly") diff --git a/tests/test_litellm/test_eager_tiktoken_load.py b/tests/test_litellm/test_eager_tiktoken_load.py index 1264c68b99e..33dd57fad8d 100644 --- a/tests/test_litellm/test_eager_tiktoken_load.py +++ b/tests/test_litellm/test_eager_tiktoken_load.py @@ -78,6 +78,35 @@ def test_lazy_loading_default(): assert len(tokens) > 0, "Encoding should work" +def test_tiktoken_cache_dir_set_on_lazy_load(): + """Test that TIKTOKEN_CACHE_DIR is set when encoding is lazy loaded. + + This ensures the local tiktoken cache is used instead of downloading + from the internet. Regression test for issue #19768. + """ + # Remove environment variables to ensure clean state + if "LITELLM_DISABLE_LAZY_LOADING" in os.environ: + del os.environ["LITELLM_DISABLE_LAZY_LOADING"] + if "TIKTOKEN_CACHE_DIR" in os.environ: + del os.environ["TIKTOKEN_CACHE_DIR"] + + # Clear any cached modules + modules_to_clear = [k for k in sys.modules.keys() if k.startswith("litellm")] + for module in modules_to_clear: + del sys.modules[module] + + # Import litellm fresh + import litellm + + # Access encoding (triggers lazy load) + _ = litellm.encoding + + # Verify TIKTOKEN_CACHE_DIR is now set and points to local tokenizers + assert "TIKTOKEN_CACHE_DIR" in os.environ, "TIKTOKEN_CACHE_DIR should be set after lazy loading encoding" + cache_dir = os.environ["TIKTOKEN_CACHE_DIR"] + assert "tokenizers" in cache_dir, f"TIKTOKEN_CACHE_DIR should point to tokenizers directory, got: {cache_dir}" + + @pytest.fixture(autouse=True) def cleanup_env(): """Clean up environment variable after each test"""