diff --git a/docs/my-website/docs/providers/azure_ai/azure_model_router.md b/docs/my-website/docs/providers/azure_ai/azure_model_router.md index 16bc1afb70e..9b308b709c7 100644 --- a/docs/my-website/docs/providers/azure_ai/azure_model_router.md +++ b/docs/my-website/docs/providers/azure_ai/azure_model_router.md @@ -2,6 +2,32 @@ Azure Model Router is a feature in Azure AI Foundry that automatically routes your requests to the best available model based on your requirements. This allows you to use a single endpoint that intelligently selects the optimal model for each request. +## Quick Start + +**Model pattern**: `azure_ai/model_router/<deployment-name>` + +```python +import litellm + +response = litellm.completion( + model="azure_ai/model_router/model-router", # Replace with your deployment name + messages=[{"role": "user", "content": "Hello!"}], + api_base="https://your-endpoint.cognitiveservices.azure.com/openai/v1/", + api_key="your-api-key", +) +``` + +**Proxy config** (`config.yaml`): + +```yaml +model_list: + - model_name: model-router + litellm_params: + model: azure_ai/model_router/model-router + api_base: https://your-endpoint.cognitiveservices.azure.com/openai/deployments/model-router/chat/completions?api-version=2025-01-01-preview + api_key: your-api-key +``` + ## Key Features - **Automatic Model Selection**: Azure Model Router dynamically selects the best model for your request @@ -229,19 +255,51 @@ Cost is tracked based on the actual model used (e.g., `gpt-4.1-nano`), plus a fl ## Cost Tracking -LiteLLM automatically handles cost tracking for Azure Model Router by: +LiteLLM automatically handles cost tracking for Azure Model Router. Understanding how this works helps you interpret spend and debug billing. 
+ +### How LiteLLM Calculates Cost + +When you use Azure Model Router, LiteLLM computes **two cost components**: + +| Component | Description | When Applied | +|-----------|-------------|--------------| +| **Model Cost** | Token-based cost for the actual model that handled the request (e.g., `gpt-5-nano`, `gpt-4.1-nano`) | Always, when Azure returns the model in the response | +| **Router Flat Cost** | $0.14 per million input tokens (Azure AI Foundry infrastructure fee) | When the **request** was made via a model router endpoint | + +### Cost Calculation Flow + +1. **Request model detection**: LiteLLM records the model you requested (e.g., `azure_ai/model_router/model-router`). If it contains `model_router` or `model-router`, the request is treated as a router request. + +2. **Response model extraction**: Azure returns the actual model used in the response (e.g., `gpt-5-nano-2025-08-07`). LiteLLM uses this for the model cost lookup. + +3. **Model cost**: LiteLLM looks up the response model in its pricing table and computes cost from prompt tokens and completion tokens. + +4. **Router flat cost**: Because the original request was to a model router, LiteLLM adds the flat cost ($0.14 per M input tokens) on top of the model cost. -1. **Detecting the actual model**: When Azure Model Router routes your request to a specific model (e.g., `gpt-4.1-nano-2025-04-14`), LiteLLM extracts this from the response -2. **Calculating accurate costs**: Costs are calculated based on: - - The actual model used (e.g., `gpt-4.1-nano` token costs) - - Plus a flat infrastructure cost of **$0.14 per million input tokens** for using the Model Router -3. **Streaming support**: Cost tracking works correctly for both streaming and non-streaming requests +5. 
**Total cost**: `Total = Model Cost + Router Flat Cost` + +### Configuration Requirements + +For cost tracking to work correctly: + +- **Use the full pattern**: `azure_ai/model_router/<deployment-name>` (e.g., `azure_ai/model_router/model-router`) +- **Proxy config**: When using the LiteLLM proxy, set `model` in `litellm_params` to the full pattern so the request model is correctly identified as a router + +```yaml +# proxy_server_config.yaml +model_list: + - model_name: model-router + litellm_params: + model: azure_ai/model_router/model-router # Required for router cost detection + api_base: https://your-endpoint.cognitiveservices.azure.com/openai/deployments/model-router/chat/completions?api-version=2025-01-01-preview + api_key: your-api-key +``` ### Cost Breakdown When you use Azure Model Router, the total cost includes: -- **Model Cost**: Based on the actual model that handled your request (e.g., `gpt-4.1-nano`) +- **Model Cost**: Based on the actual model that handled your request (e.g., `gpt-5-nano`, `gpt-4.1-nano`) - **Router Flat Cost**: $0.14 per million input tokens (Azure AI Foundry infrastructure fee) ### Example Response with Cost diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 6354bf44943..75d45af86e6 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -272,6 +272,8 @@ def cost_per_token( # noqa: PLR0915 ### SERVICE TIER ### service_tier: Optional[str] = None, # for OpenAI service tier pricing response: Optional[Any] = None, + ### REQUEST MODEL ### + request_model: Optional[str] = None, # original request model for router detection ) -> Tuple[float, float]: # type: ignore """ Calculates the cost per token for a given model, prompt tokens, and completion tokens. 
@@ -520,7 +522,7 @@ def cost_per_token( # noqa: PLR0915 return dashscope_cost_per_token(model=model, usage=usage_block) elif custom_llm_provider == "azure_ai": return azure_ai_cost_per_token( - model=model, usage=usage_block, response_time_ms=response_time_ms + model=model, usage=usage_block, response_time_ms=response_time_ms, request_model=request_model ) else: model_info = _cached_get_model_info_helper( @@ -1457,6 +1459,11 @@ def completion_cost( # noqa: PLR0915 text=completion_string ) + # Get the original request model for router detection + request_model_for_cost = None + if litellm_logging_obj is not None: + request_model_for_cost = litellm_logging_obj.model + ( prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar, @@ -1479,6 +1486,7 @@ def completion_cost( # noqa: PLR0915 rerank_billed_units=rerank_billed_units, service_tier=service_tier, response=completion_response, + request_model=request_model_for_cost, ) # Get additional costs from provider (e.g., routing fees, infrastructure costs) diff --git a/litellm/llms/azure_ai/cost_calculator.py b/litellm/llms/azure_ai/cost_calculator.py index 999f94da182..6fb29962677 100644 --- a/litellm/llms/azure_ai/cost_calculator.py +++ b/litellm/llms/azure_ai/cost_calculator.py @@ -61,7 +61,10 @@ def calculate_azure_model_router_flat_cost(model: str, prompt_tokens: int) -> fl def cost_per_token( - model: str, usage: Usage, response_time_ms: Optional[float] = 0.0 + model: str, + usage: Usage, + response_time_ms: Optional[float] = 0.0, + request_model: Optional[str] = None, ) -> Tuple[float, float]: """ Calculate the cost per token for Azure AI models. 
@@ -71,9 +74,10 @@ def cost_per_token( - Plus the cost of the actual model used (handled by generic_cost_per_token) Args: - model: str, the model name without provider prefix + model: str, the model name without provider prefix (from response) usage: LiteLLM Usage block response_time_ms: Optional response time in milliseconds + request_model: Optional[str], the original request model name (to detect router usage) Returns: Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd @@ -84,7 +88,13 @@ def cost_per_token( """ prompt_cost = 0.0 completion_cost = 0.0 - + + # Determine if this was a model router request + # Check both the response model and the request model + is_router_request = _is_azure_model_router(model) or ( + request_model is not None and _is_azure_model_router(request_model) + ) + # Calculate base cost using generic cost calculator # This may raise an exception if the model is not in the cost map try: @@ -103,19 +113,21 @@ def cost_per_token( verbose_logger.debug( f"Azure AI Model Router: model '{model}' not in cost map, calculating routing flat cost only. 
Error: {e}" ) - + # Add flat cost for Azure Model Router # The flat cost is defined in model_prices_and_context_window.json for azure_ai/model_router - if _is_azure_model_router(model): - router_flat_cost = calculate_azure_model_router_flat_cost(model, usage.prompt_tokens) - + if is_router_request: + # Use the request model for flat cost calculation if available, otherwise use response model + router_model_for_calc = request_model if request_model else model + router_flat_cost = calculate_azure_model_router_flat_cost(router_model_for_calc, usage.prompt_tokens) + if router_flat_cost > 0: verbose_logger.debug( f"Azure AI Model Router flat cost: ${router_flat_cost:.6f} " f"({usage.prompt_tokens} tokens × ${router_flat_cost / usage.prompt_tokens:.9f}/token)" ) - + # Add flat cost to prompt cost prompt_cost += router_flat_cost - + return prompt_cost, completion_cost diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 1237d963b12..93fb78d76da 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -20632,6 +20632,7 @@ "supports_tool_choice": true, "supports_service_tier": true, "supports_vision": true, + "supports_web_search": true, "supports_none_reasoning_effort": true, "supports_xhigh_reasoning_effort": false }, @@ -20670,6 +20671,7 @@ "supports_tool_choice": true, "supports_service_tier": true, "supports_vision": true, + "supports_web_search": true, "supports_none_reasoning_effort": true, "supports_xhigh_reasoning_effort": false }, @@ -20707,6 +20709,7 @@ "supports_system_messages": true, "supports_tool_choice": false, "supports_vision": true, + "supports_web_search": true, "supports_none_reasoning_effort": true, "supports_xhigh_reasoning_effort": false }, @@ -20746,6 +20749,7 @@ "supports_tool_choice": true, "supports_service_tier": true, "supports_vision": true, + "supports_web_search": true, 
"supports_none_reasoning_effort": true, "supports_xhigh_reasoning_effort": true }, @@ -20785,6 +20789,7 @@ "supports_tool_choice": true, "supports_service_tier": true, "supports_vision": true, + "supports_web_search": true, "supports_none_reasoning_effort": true, "supports_xhigh_reasoning_effort": true }, @@ -20821,6 +20826,7 @@ "supports_system_messages": true, "supports_tool_choice": true, "supports_vision": true, + "supports_web_search": true, "supports_none_reasoning_effort": false, "supports_xhigh_reasoning_effort": false }, @@ -20857,6 +20863,7 @@ "supports_system_messages": true, "supports_tool_choice": true, "supports_vision": true, + "supports_web_search": true, "supports_none_reasoning_effort": false, "supports_xhigh_reasoning_effort": false }, diff --git a/tests/test_litellm/llms/azure_ai/test_azure_ai_cost_calculator.py b/tests/test_litellm/llms/azure_ai/test_azure_ai_cost_calculator.py index 30bbd753204..ec1d4e4b3ca 100644 --- a/tests/test_litellm/llms/azure_ai/test_azure_ai_cost_calculator.py +++ b/tests/test_litellm/llms/azure_ai/test_azure_ai_cost_calculator.py @@ -196,6 +196,44 @@ def test_model_router_with_cached_tokens(self): ) print(f"Total prompt cost: ${prompt_cost:.6f}") + def test_router_flat_cost_when_response_has_actual_model(self): + """ + Test that router flat cost is added when request was via router but response + contains the actual model (e.g., gpt-5-nano). + + This is the key fix: Azure returns the actual model in the response, but we + must still add the router flat cost because the request was made via model router. 
+ """ + usage = Usage( + prompt_tokens=10000, + completion_tokens=5000, + total_tokens=15000, + ) + + # Response model is the actual model Azure used (not a router name) + response_model = "gpt-5-nano-2025-08-07" + # Request model is the router - user called azure_ai/model_router/model-router + request_model = "azure_ai/model_router/model-router" + + prompt_cost, completion_cost = cost_per_token( + model=response_model, + usage=usage, + request_model=request_model, + ) + + # Expected: model cost (from gpt-5-nano) + router flat cost + expected_flat_cost = ( + usage.prompt_tokens * AZURE_MODEL_ROUTER_FLAT_COST_PER_M_INPUT_TOKENS / 1_000_000 + ) + assert expected_flat_cost == pytest.approx(0.0014, rel=1e-9) + + # Total cost should be model cost + flat cost + total_cost = prompt_cost + completion_cost + assert total_cost >= expected_flat_cost + + # Prompt cost should include both model prompt cost and router flat cost + assert prompt_cost >= expected_flat_cost + class TestAzureModelRouterCostBreakdown: """Test that Azure Model Router flat cost is tracked in cost breakdown."""