diff --git a/litellm/litellm_core_utils/llm_cost_calc/utils.py b/litellm/litellm_core_utils/llm_cost_calc/utils.py index 785976ed319..642cbbb7922 100644 --- a/litellm/litellm_core_utils/llm_cost_calc/utils.py +++ b/litellm/litellm_core_utils/llm_cost_calc/utils.py @@ -566,14 +566,28 @@ def generic_cost_per_token( # noqa: PLR0915 if usage.prompt_tokens_details: prompt_tokens_details = _parse_prompt_tokens_details(usage) - ## EDGE CASE - text tokens not set inside PromptTokensDetails - - if prompt_tokens_details["text_tokens"] == 0: + ## EDGE CASE - text tokens not set or includes cached tokens (double-counting) + ## Some providers (like xAI) report text_tokens = prompt_tokens (including cached) + ## We detect this when: text_tokens + cached_tokens + other > prompt_tokens + ## Ref: https://github.com/BerriAI/litellm/issues/19680, #14874, #14875 + + cache_hit = prompt_tokens_details["cache_hit_tokens"] + text_tokens = prompt_tokens_details["text_tokens"] + audio_tokens = prompt_tokens_details["audio_tokens"] + cache_creation = prompt_tokens_details["cache_creation_tokens"] + image_tokens = prompt_tokens_details["image_tokens"] + + # Check for double-counting: sum of details > prompt_tokens means overlap + total_details = text_tokens + cache_hit + audio_tokens + cache_creation + image_tokens + has_double_counting = cache_hit > 0 and total_details > usage.prompt_tokens + + if text_tokens == 0 or has_double_counting: text_tokens = ( usage.prompt_tokens - - prompt_tokens_details["cache_hit_tokens"] - - prompt_tokens_details["audio_tokens"] - - prompt_tokens_details["cache_creation_tokens"] + - cache_hit + - audio_tokens + - cache_creation + - image_tokens ) prompt_tokens_details["text_tokens"] = text_tokens diff --git a/litellm/llms/azure/cost_calculation.py b/litellm/llms/azure/cost_calculation.py index 96c58d95ff2..5b411095ea1 100644 --- a/litellm/llms/azure/cost_calculation.py +++ b/litellm/llms/azure/cost_calculation.py @@ -1,11 +1,12 @@ """ Helper util for handling azure openai-specific cost calculation -- e.g.: prompt caching +- e.g.: prompt caching, audio tokens """ from typing import Optional, Tuple from litellm._logging import verbose_logger +from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_token from litellm.types.utils import Usage from litellm.utils import get_model_info @@ -18,34 +19,15 @@ def cost_per_token( Input: - model: str, the model name without provider prefix - - usage: LiteLLM Usage block, containing anthropic caching information + - usage: LiteLLM Usage block, containing caching and audio token information Returns: Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd """ ## GET MODEL INFO model_info = get_model_info(model=model, custom_llm_provider="azure") - cached_tokens: Optional[int] = None - ## CALCULATE INPUT COST - non_cached_text_tokens = usage.prompt_tokens - if usage.prompt_tokens_details and usage.prompt_tokens_details.cached_tokens: - cached_tokens = usage.prompt_tokens_details.cached_tokens - non_cached_text_tokens = non_cached_text_tokens - cached_tokens - prompt_cost: float = non_cached_text_tokens * model_info["input_cost_per_token"] - - ## CALCULATE OUTPUT COST - completion_cost: float = ( - usage["completion_tokens"] * model_info["output_cost_per_token"] - ) - - ## Prompt Caching cost calculation - if model_info.get("cache_read_input_token_cost") is not None and cached_tokens: - # Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens - prompt_cost += cached_tokens * ( - model_info.get("cache_read_input_token_cost", 0) or 0 - ) - ## Speech / Audio cost calculation + ## Speech / Audio cost calculation (cost per second for TTS models) if ( "output_cost_per_second" in model_info and model_info["output_cost_per_second"] is not None @@ -55,7 +37,14 @@ def cost_per_token( f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}" ) ## COST PER SECOND ## - prompt_cost = 0 + prompt_cost = 0.0 completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000 - - return prompt_cost, completion_cost + return prompt_cost, completion_cost + + ## Use generic cost calculator for all other cases + ## This properly handles: text tokens, audio tokens, cached tokens, reasoning tokens, etc. + return generic_cost_per_token( + model=model, + usage=usage, + custom_llm_provider="azure", + ) diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index a74b80e7373..313c1b71f18 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -30006,6 +30006,7 @@ "supports_web_search": true }, "xai/grok-3": { + "cache_read_input_token_cost": 7.5e-07, "input_cost_per_token": 3e-06, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30020,6 +30021,7 @@ "supports_web_search": true }, "xai/grok-3-beta": { + "cache_read_input_token_cost": 7.5e-07, "input_cost_per_token": 3e-06, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30034,6 +30036,7 @@ "supports_web_search": true }, "xai/grok-3-fast-beta": { + "cache_read_input_token_cost": 1.25e-06, "input_cost_per_token": 5e-06, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30048,6 +30051,7 @@ "supports_web_search": true }, "xai/grok-3-fast-latest": { + "cache_read_input_token_cost": 1.25e-06, "input_cost_per_token": 5e-06, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30062,6 +30066,7 @@ "supports_web_search": true }, "xai/grok-3-latest": { + "cache_read_input_token_cost": 7.5e-07, "input_cost_per_token": 3e-06, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30076,6 +30081,7 @@ "supports_web_search": true }, "xai/grok-3-mini": { + "cache_read_input_token_cost": 7.5e-08, "input_cost_per_token": 3e-07, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30091,6 +30097,7 @@ "supports_web_search": true }, "xai/grok-3-mini-beta": { + "cache_read_input_token_cost": 7.5e-08, "input_cost_per_token": 3e-07, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30106,6 +30113,7 @@ "supports_web_search": true }, "xai/grok-3-mini-fast": { + "cache_read_input_token_cost": 1.5e-07, "input_cost_per_token": 6e-07, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30121,6 +30129,7 @@ "supports_web_search": true }, "xai/grok-3-mini-fast-beta": { + "cache_read_input_token_cost": 1.5e-07, "input_cost_per_token": 6e-07, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30136,6 +30145,7 @@ "supports_web_search": true }, "xai/grok-3-mini-fast-latest": { + "cache_read_input_token_cost": 1.5e-07, "input_cost_per_token": 6e-07, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30151,6 +30161,7 @@ "supports_web_search": true }, "xai/grok-3-mini-latest": { + "cache_read_input_token_cost": 7.5e-08, "input_cost_per_token": 3e-07, "litellm_provider": "xai", "max_input_tokens": 131072, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index a74b80e7373..313c1b71f18 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -30006,6 +30006,7 @@ "supports_web_search": true }, "xai/grok-3": { + "cache_read_input_token_cost": 7.5e-07, "input_cost_per_token": 3e-06, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30020,6 +30021,7 @@ "supports_web_search": true }, "xai/grok-3-beta": { + "cache_read_input_token_cost": 7.5e-07, "input_cost_per_token": 3e-06, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30034,6 +30036,7 @@ "supports_web_search": true }, "xai/grok-3-fast-beta": { + "cache_read_input_token_cost": 1.25e-06, "input_cost_per_token": 5e-06, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30048,6 +30051,7 @@ "supports_web_search": true }, "xai/grok-3-fast-latest": { + "cache_read_input_token_cost": 1.25e-06, "input_cost_per_token": 5e-06, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30062,6 +30066,7 @@ "supports_web_search": true }, "xai/grok-3-latest": { + "cache_read_input_token_cost": 7.5e-07, "input_cost_per_token": 3e-06, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30076,6 +30081,7 @@ "supports_web_search": true }, "xai/grok-3-mini": { + "cache_read_input_token_cost": 7.5e-08, "input_cost_per_token": 3e-07, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30091,6 +30097,7 @@ "supports_web_search": true }, "xai/grok-3-mini-beta": { + "cache_read_input_token_cost": 7.5e-08, "input_cost_per_token": 3e-07, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30106,6 +30113,7 @@ "supports_web_search": true }, "xai/grok-3-mini-fast": { + "cache_read_input_token_cost": 1.5e-07, "input_cost_per_token": 6e-07, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30121,6 +30129,7 @@ "supports_web_search": true }, "xai/grok-3-mini-fast-beta": { + "cache_read_input_token_cost": 1.5e-07, "input_cost_per_token": 6e-07, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30136,6 +30145,7 @@ "supports_web_search": true }, "xai/grok-3-mini-fast-latest": { + "cache_read_input_token_cost": 1.5e-07, "input_cost_per_token": 6e-07, "litellm_provider": "xai", "max_input_tokens": 131072, @@ -30151,6 +30161,7 @@ "supports_web_search": true }, "xai/grok-3-mini-latest": { + "cache_read_input_token_cost": 7.5e-08, "input_cost_per_token": 3e-07, "litellm_provider": "xai", "max_input_tokens": 131072, diff --git a/tests/test_litellm/test_cost_calculator.py b/tests/test_litellm/test_cost_calculator.py index 4d6599fc1b5..55a56c9d7e9 100644 --- a/tests/test_litellm/test_cost_calculator.py +++ b/tests/test_litellm/test_cost_calculator.py @@ -355,6 +355,90 @@ def test_azure_realtime_cost_calculator(): assert cost > 0 +def test_azure_audio_output_cost_calculation(): + """ + Test that Azure audio models correctly calculate costs for audio output tokens. + + Reproduces issue: https://github.com/BerriAI/litellm/issues/19764 + Audio tokens should be charged at output_cost_per_audio_token rate, + not at the text token rate (output_cost_per_token). + """ + from litellm.types.utils import ( + Choices, + CompletionTokensDetailsWrapper, + Message, + ) + + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + # Scenario from issue #19764: + # Input: 17 text tokens, 0 audio tokens + # Output: 110 text tokens, 482 audio tokens + usage_object = Usage( + prompt_tokens=17, + completion_tokens=592, # 110 text + 482 audio + total_tokens=609, + prompt_tokens_details=PromptTokensDetailsWrapper( + audio_tokens=0, + cached_tokens=0, + text_tokens=17, + image_tokens=0, + ), + completion_tokens_details=CompletionTokensDetailsWrapper( + audio_tokens=482, + reasoning_tokens=0, + text_tokens=110, + ), + ) + + completion = ModelResponse( + id="test-azure-audio-cost", + choices=[ + Choices( + finish_reason="stop", + index=0, + message=Message( + content="Test response", + role="assistant", + ), + ) + ], + created=1729282652, + model="azure/gpt-audio-2025-08-28", + object="chat.completion", + usage=usage_object, + ) + + cost = completion_cost(completion, model="azure/gpt-audio-2025-08-28") + + model_info = litellm.get_model_info("azure/gpt-audio-2025-08-28") + + # Calculate expected cost + expected_input_cost = ( + model_info["input_cost_per_token"] * 17 # text tokens + ) + expected_output_cost = ( + model_info["output_cost_per_token"] * 110 # text tokens + + model_info["output_cost_per_audio_token"] * 482 # audio tokens + ) + expected_total_cost = expected_input_cost + expected_output_cost + + # The bug was: all output tokens charged at text rate + wrong_output_cost = model_info["output_cost_per_token"] * 592 + wrong_total_cost = expected_input_cost + wrong_output_cost + + # Verify audio tokens are NOT charged at text rate (the bug) + assert abs(cost - wrong_total_cost) > 0.001, ( + "Bug: Audio tokens are being charged at text token rate" + ) + + # Verify cost matches + assert abs(cost - expected_total_cost) < 0.0000001, ( + f"Expected cost {expected_total_cost}, got {cost}" + ) + + def test_default_image_cost_calculator(monkeypatch): from litellm.cost_calculator import default_image_cost_calculator