Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions litellm/litellm_core_utils/llm_cost_calc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,14 +566,28 @@ def generic_cost_per_token( # noqa: PLR0915
if usage.prompt_tokens_details:
prompt_tokens_details = _parse_prompt_tokens_details(usage)

## EDGE CASE - text tokens not set inside PromptTokensDetails

if prompt_tokens_details["text_tokens"] == 0:
## EDGE CASE - text tokens not set or includes cached tokens (double-counting)
## Some providers (like xAI) report text_tokens = prompt_tokens (including cached)
## We detect this when: text_tokens + cached_tokens + other > prompt_tokens
## Ref: https://github.com/BerriAI/litellm/issues/19680, #14874, #14875

cache_hit = prompt_tokens_details["cache_hit_tokens"]
text_tokens = prompt_tokens_details["text_tokens"]
audio_tokens = prompt_tokens_details["audio_tokens"]
cache_creation = prompt_tokens_details["cache_creation_tokens"]
image_tokens = prompt_tokens_details["image_tokens"]

# Check for double-counting: sum of details > prompt_tokens means overlap
total_details = text_tokens + cache_hit + audio_tokens + cache_creation + image_tokens
has_double_counting = cache_hit > 0 and total_details > usage.prompt_tokens

if text_tokens == 0 or has_double_counting:
text_tokens = (
usage.prompt_tokens
- prompt_tokens_details["cache_hit_tokens"]
- prompt_tokens_details["audio_tokens"]
- prompt_tokens_details["cache_creation_tokens"]
- cache_hit
- audio_tokens
- cache_creation
- image_tokens
)
prompt_tokens_details["text_tokens"] = text_tokens

Expand Down
39 changes: 14 additions & 25 deletions litellm/llms/azure/cost_calculation.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""
Helper util for handling azure openai-specific cost calculation
- e.g.: prompt caching
- e.g.: prompt caching, audio tokens
"""

from typing import Optional, Tuple

from litellm._logging import verbose_logger
from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_token
from litellm.types.utils import Usage
from litellm.utils import get_model_info

Expand All @@ -18,34 +19,15 @@ def cost_per_token(

Input:
- model: str, the model name without provider prefix
- usage: LiteLLM Usage block, containing anthropic caching information
- usage: LiteLLM Usage block, containing caching and audio token information

Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
"""
## GET MODEL INFO
model_info = get_model_info(model=model, custom_llm_provider="azure")
cached_tokens: Optional[int] = None
## CALCULATE INPUT COST
non_cached_text_tokens = usage.prompt_tokens
if usage.prompt_tokens_details and usage.prompt_tokens_details.cached_tokens:
cached_tokens = usage.prompt_tokens_details.cached_tokens
non_cached_text_tokens = non_cached_text_tokens - cached_tokens
prompt_cost: float = non_cached_text_tokens * model_info["input_cost_per_token"]

## CALCULATE OUTPUT COST
completion_cost: float = (
usage["completion_tokens"] * model_info["output_cost_per_token"]
)

## Prompt Caching cost calculation
if model_info.get("cache_read_input_token_cost") is not None and cached_tokens:
# Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens
prompt_cost += cached_tokens * (
model_info.get("cache_read_input_token_cost", 0) or 0
)

## Speech / Audio cost calculation
## Speech / Audio cost calculation (cost per second for TTS models)
if (
"output_cost_per_second" in model_info
and model_info["output_cost_per_second"] is not None
Expand All @@ -55,7 +37,14 @@ def cost_per_token(
f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_cost = 0
prompt_cost = 0.0
completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000

return prompt_cost, completion_cost
return prompt_cost, completion_cost

## Use generic cost calculator for all other cases
## This properly handles: text tokens, audio tokens, cached tokens, reasoning tokens, etc.
return generic_cost_per_token(
model=model,
usage=usage,
custom_llm_provider="azure",
)
11 changes: 11 additions & 0 deletions litellm/model_prices_and_context_window_backup.json
Original file line number Diff line number Diff line change
Expand Up @@ -30006,6 +30006,7 @@
"supports_web_search": true
},
"xai/grok-3": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30020,6 +30021,7 @@
"supports_web_search": true
},
"xai/grok-3-beta": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30034,6 +30036,7 @@
"supports_web_search": true
},
"xai/grok-3-fast-beta": {
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_token": 5e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30048,6 +30051,7 @@
"supports_web_search": true
},
"xai/grok-3-fast-latest": {
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_token": 5e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30062,6 +30066,7 @@
"supports_web_search": true
},
"xai/grok-3-latest": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30076,6 +30081,7 @@
"supports_web_search": true
},
"xai/grok-3-mini": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30091,6 +30097,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-beta": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30106,6 +30113,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30121,6 +30129,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast-beta": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30136,6 +30145,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast-latest": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30151,6 +30161,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-latest": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand Down
11 changes: 11 additions & 0 deletions model_prices_and_context_window.json
Original file line number Diff line number Diff line change
Expand Up @@ -30006,6 +30006,7 @@
"supports_web_search": true
},
"xai/grok-3": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30020,6 +30021,7 @@
"supports_web_search": true
},
"xai/grok-3-beta": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30034,6 +30036,7 @@
"supports_web_search": true
},
"xai/grok-3-fast-beta": {
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_token": 5e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30048,6 +30051,7 @@
"supports_web_search": true
},
"xai/grok-3-fast-latest": {
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_token": 5e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30062,6 +30066,7 @@
"supports_web_search": true
},
"xai/grok-3-latest": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30076,6 +30081,7 @@
"supports_web_search": true
},
"xai/grok-3-mini": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30091,6 +30097,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-beta": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30106,6 +30113,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30121,6 +30129,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast-beta": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30136,6 +30145,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast-latest": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30151,6 +30161,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-latest": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand Down
84 changes: 84 additions & 0 deletions tests/test_litellm/test_cost_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,90 @@ def test_azure_realtime_cost_calculator():
assert cost > 0


def test_azure_audio_output_cost_calculation():
    """
    Verify Azure audio models price audio output tokens at the audio rate.

    Regression test for https://github.com/BerriAI/litellm/issues/19764:
    audio completion tokens must be billed at output_cost_per_audio_token,
    not at the text-token rate (output_cost_per_token).
    """
    from litellm.types.utils import (
        Choices,
        CompletionTokensDetailsWrapper,
        Message,
    )

    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")

    # Token counts taken from issue #19764:
    # prompt: 17 text tokens; completion: 110 text + 482 audio tokens.
    text_in = 17
    text_out = 110
    audio_out = 482

    usage_object = Usage(
        prompt_tokens=text_in,
        completion_tokens=text_out + audio_out,
        total_tokens=text_in + text_out + audio_out,
        prompt_tokens_details=PromptTokensDetailsWrapper(
            audio_tokens=0,
            cached_tokens=0,
            text_tokens=text_in,
            image_tokens=0,
        ),
        completion_tokens_details=CompletionTokensDetailsWrapper(
            audio_tokens=audio_out,
            reasoning_tokens=0,
            text_tokens=text_out,
        ),
    )

    completion = ModelResponse(
        id="test-azure-audio-cost",
        choices=[
            Choices(
                finish_reason="stop",
                index=0,
                message=Message(content="Test response", role="assistant"),
            )
        ],
        created=1729282652,
        model="azure/gpt-audio-2025-08-28",
        object="chat.completion",
        usage=usage_object,
    )

    cost = completion_cost(completion, model="azure/gpt-audio-2025-08-28")

    model_info = litellm.get_model_info("azure/gpt-audio-2025-08-28")

    # Correct pricing: each modality billed at its own per-token rate.
    expected_total_cost = (
        model_info["input_cost_per_token"] * text_in
        + model_info["output_cost_per_token"] * text_out
        + model_info["output_cost_per_audio_token"] * audio_out
    )

    # Buggy behaviour charged every completion token at the text rate.
    wrong_total_cost = (
        model_info["input_cost_per_token"] * text_in
        + model_info["output_cost_per_token"] * (text_out + audio_out)
    )

    # The computed cost must diverge from the buggy flat-rate value...
    assert abs(cost - wrong_total_cost) > 0.001, (
        "Bug: Audio tokens are being charged at text token rate"
    )

    # ...and match the per-modality pricing to within float tolerance.
    assert abs(cost - expected_total_cost) < 0.0000001, (
        f"Expected cost {expected_total_cost}, got {cost}"
    )


def test_default_image_cost_calculator(monkeypatch):
from litellm.cost_calculator import default_image_cost_calculator

Expand Down
Loading