Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions litellm/litellm_core_utils/llm_cost_calc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,14 +566,28 @@ def generic_cost_per_token( # noqa: PLR0915
if usage.prompt_tokens_details:
prompt_tokens_details = _parse_prompt_tokens_details(usage)

## EDGE CASE - text tokens not set inside PromptTokensDetails

if prompt_tokens_details["text_tokens"] == 0:
## EDGE CASE - text tokens not set or includes cached tokens (double-counting)
## Some providers (like xAI) report text_tokens = prompt_tokens (including cached)
## We detect this when: text_tokens + cached_tokens + other > prompt_tokens
## Ref: https://github.com/BerriAI/litellm/issues/19680, #14874, #14875

cache_hit = prompt_tokens_details["cache_hit_tokens"]
text_tokens = prompt_tokens_details["text_tokens"]
audio_tokens = prompt_tokens_details["audio_tokens"]
cache_creation = prompt_tokens_details["cache_creation_tokens"]
image_tokens = prompt_tokens_details["image_tokens"]

# Check for double-counting: sum of details > prompt_tokens means overlap
total_details = text_tokens + cache_hit + audio_tokens + cache_creation + image_tokens
has_double_counting = cache_hit > 0 and total_details > usage.prompt_tokens

if text_tokens == 0 or has_double_counting:
text_tokens = (
usage.prompt_tokens
- prompt_tokens_details["cache_hit_tokens"]
- prompt_tokens_details["audio_tokens"]
- prompt_tokens_details["cache_creation_tokens"]
- cache_hit
- audio_tokens
- cache_creation
- image_tokens
)
prompt_tokens_details["text_tokens"] = text_tokens

Expand Down
39 changes: 14 additions & 25 deletions litellm/llms/azure/cost_calculation.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""
Helper util for handling azure openai-specific cost calculation
- e.g.: prompt caching
- e.g.: prompt caching, audio tokens
"""

from typing import Optional, Tuple

from litellm._logging import verbose_logger
from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_token
from litellm.types.utils import Usage
from litellm.utils import get_model_info

Expand All @@ -18,34 +19,15 @@ def cost_per_token(

Input:
- model: str, the model name without provider prefix
- usage: LiteLLM Usage block, containing anthropic caching information
- usage: LiteLLM Usage block, containing caching and audio token information

Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
"""
## GET MODEL INFO
model_info = get_model_info(model=model, custom_llm_provider="azure")
cached_tokens: Optional[int] = None
## CALCULATE INPUT COST
non_cached_text_tokens = usage.prompt_tokens
if usage.prompt_tokens_details and usage.prompt_tokens_details.cached_tokens:
cached_tokens = usage.prompt_tokens_details.cached_tokens
non_cached_text_tokens = non_cached_text_tokens - cached_tokens
prompt_cost: float = non_cached_text_tokens * model_info["input_cost_per_token"]

## CALCULATE OUTPUT COST
completion_cost: float = (
usage["completion_tokens"] * model_info["output_cost_per_token"]
)

## Prompt Caching cost calculation
if model_info.get("cache_read_input_token_cost") is not None and cached_tokens:
# Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens
prompt_cost += cached_tokens * (
model_info.get("cache_read_input_token_cost", 0) or 0
)

## Speech / Audio cost calculation
## Speech / Audio cost calculation (cost per second for TTS models)
if (
"output_cost_per_second" in model_info
and model_info["output_cost_per_second"] is not None
Expand All @@ -55,7 +37,14 @@ def cost_per_token(
f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_cost = 0
prompt_cost = 0.0
completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000

return prompt_cost, completion_cost
return prompt_cost, completion_cost

## Use generic cost calculator for all other cases
## This properly handles: text tokens, audio tokens, cached tokens, reasoning tokens, etc.
return generic_cost_per_token(
model=model,
usage=usage,
custom_llm_provider="azure",
)
11 changes: 11 additions & 0 deletions litellm/model_prices_and_context_window_backup.json
Original file line number Diff line number Diff line change
Expand Up @@ -30006,6 +30006,7 @@
"supports_web_search": true
},
"xai/grok-3": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30020,6 +30021,7 @@
"supports_web_search": true
},
"xai/grok-3-beta": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30034,6 +30036,7 @@
"supports_web_search": true
},
"xai/grok-3-fast-beta": {
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_token": 5e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30048,6 +30051,7 @@
"supports_web_search": true
},
"xai/grok-3-fast-latest": {
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_token": 5e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30062,6 +30066,7 @@
"supports_web_search": true
},
"xai/grok-3-latest": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30076,6 +30081,7 @@
"supports_web_search": true
},
"xai/grok-3-mini": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30091,6 +30097,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-beta": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30106,6 +30113,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30121,6 +30129,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast-beta": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30136,6 +30145,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast-latest": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30151,6 +30161,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-latest": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand Down
11 changes: 11 additions & 0 deletions model_prices_and_context_window.json
Original file line number Diff line number Diff line change
Expand Up @@ -30006,6 +30006,7 @@
"supports_web_search": true
},
"xai/grok-3": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30020,6 +30021,7 @@
"supports_web_search": true
},
"xai/grok-3-beta": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30034,6 +30036,7 @@
"supports_web_search": true
},
"xai/grok-3-fast-beta": {
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_token": 5e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30048,6 +30051,7 @@
"supports_web_search": true
},
"xai/grok-3-fast-latest": {
"cache_read_input_token_cost": 1.25e-06,
"input_cost_per_token": 5e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30062,6 +30066,7 @@
"supports_web_search": true
},
"xai/grok-3-latest": {
"cache_read_input_token_cost": 7.5e-07,
"input_cost_per_token": 3e-06,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30076,6 +30081,7 @@
"supports_web_search": true
},
"xai/grok-3-mini": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30091,6 +30097,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-beta": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30106,6 +30113,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30121,6 +30129,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast-beta": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30136,6 +30145,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-fast-latest": {
"cache_read_input_token_cost": 1.5e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand All @@ -30151,6 +30161,7 @@
"supports_web_search": true
},
"xai/grok-3-mini-latest": {
"cache_read_input_token_cost": 7.5e-08,
"input_cost_per_token": 3e-07,
"litellm_provider": "xai",
"max_input_tokens": 131072,
Expand Down
84 changes: 84 additions & 0 deletions tests/test_litellm/test_cost_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,90 @@ def test_azure_realtime_cost_calculator():
assert cost > 0


def test_azure_audio_output_cost_calculation():
    """
    Verify Azure audio models price audio output tokens at the audio rate.

    Regression test for https://github.com/BerriAI/litellm/issues/19764:
    audio completion tokens must be billed at output_cost_per_audio_token,
    not at the text-token rate (output_cost_per_token).
    """
    from litellm.types.utils import (
        Choices,
        CompletionTokensDetailsWrapper,
        Message,
    )

    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")

    # Token counts taken from issue #19764:
    # prompt: 17 text tokens; completion: 110 text + 482 audio tokens.
    text_in = 17
    text_out = 110
    audio_out = 482

    usage_object = Usage(
        prompt_tokens=text_in,
        completion_tokens=text_out + audio_out,
        total_tokens=text_in + text_out + audio_out,
        prompt_tokens_details=PromptTokensDetailsWrapper(
            audio_tokens=0,
            cached_tokens=0,
            text_tokens=text_in,
            image_tokens=0,
        ),
        completion_tokens_details=CompletionTokensDetailsWrapper(
            audio_tokens=audio_out,
            reasoning_tokens=0,
            text_tokens=text_out,
        ),
    )

    completion = ModelResponse(
        id="test-azure-audio-cost",
        choices=[
            Choices(
                finish_reason="stop",
                index=0,
                message=Message(content="Test response", role="assistant"),
            )
        ],
        created=1729282652,
        model="azure/gpt-audio-2025-08-28",
        object="chat.completion",
        usage=usage_object,
    )

    cost = completion_cost(completion, model="azure/gpt-audio-2025-08-28")

    model_info = litellm.get_model_info("azure/gpt-audio-2025-08-28")

    # Correct pricing: each modality billed at its own per-token rate.
    expected_total_cost = (
        model_info["input_cost_per_token"] * text_in
        + model_info["output_cost_per_token"] * text_out
        + model_info["output_cost_per_audio_token"] * audio_out
    )

    # Buggy behaviour charged every completion token at the text rate.
    wrong_total_cost = (
        model_info["input_cost_per_token"] * text_in
        + model_info["output_cost_per_token"] * (text_out + audio_out)
    )

    # The computed cost must diverge from the buggy flat-rate value...
    assert abs(cost - wrong_total_cost) > 0.001, (
        "Bug: Audio tokens are being charged at text token rate"
    )

    # ...and match the per-modality pricing to within float tolerance.
    assert abs(cost - expected_total_cost) < 0.0000001, (
        f"Expected cost {expected_total_cost}, got {cost}"
    )


def test_default_image_cost_calculator(monkeypatch):
from litellm.cost_calculator import default_image_cost_calculator

Expand Down
Loading