Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 14 additions & 25 deletions litellm/llms/azure/cost_calculation.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""
Helper util for handling azure openai-specific cost calculation
- e.g.: prompt caching
- e.g.: prompt caching, audio tokens
"""

from typing import Optional, Tuple

from litellm._logging import verbose_logger
from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_token
from litellm.types.utils import Usage
from litellm.utils import get_model_info

Expand All @@ -18,34 +19,15 @@ def cost_per_token(

Input:
- model: str, the model name without provider prefix
- usage: LiteLLM Usage block, containing anthropic caching information
- usage: LiteLLM Usage block, containing caching and audio token information

Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
"""
## GET MODEL INFO
model_info = get_model_info(model=model, custom_llm_provider="azure")
cached_tokens: Optional[int] = None
## CALCULATE INPUT COST
non_cached_text_tokens = usage.prompt_tokens
if usage.prompt_tokens_details and usage.prompt_tokens_details.cached_tokens:
cached_tokens = usage.prompt_tokens_details.cached_tokens
non_cached_text_tokens = non_cached_text_tokens - cached_tokens
prompt_cost: float = non_cached_text_tokens * model_info["input_cost_per_token"]

## CALCULATE OUTPUT COST
completion_cost: float = (
usage["completion_tokens"] * model_info["output_cost_per_token"]
)

## Prompt Caching cost calculation
if model_info.get("cache_read_input_token_cost") is not None and cached_tokens:
# Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens
prompt_cost += cached_tokens * (
model_info.get("cache_read_input_token_cost", 0) or 0
)

## Speech / Audio cost calculation
## Speech / Audio cost calculation (cost per second for TTS models)
if (
"output_cost_per_second" in model_info
and model_info["output_cost_per_second"] is not None
Expand All @@ -55,7 +37,14 @@ def cost_per_token(
f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_cost = 0
prompt_cost = 0.0
completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000

return prompt_cost, completion_cost
return prompt_cost, completion_cost

## Use generic cost calculator for all other cases
## This properly handles: text tokens, audio tokens, cached tokens, reasoning tokens, etc.
return generic_cost_per_token(
model=model,
usage=usage,
custom_llm_provider="azure",
)
84 changes: 84 additions & 0 deletions tests/test_litellm/test_cost_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,90 @@ def test_azure_realtime_cost_calculator():
assert cost > 0


def test_azure_audio_output_cost_calculation():
    """
    Verify Azure audio models bill output audio tokens at the audio rate.

    Regression test for https://github.com/BerriAI/litellm/issues/19764:
    completion audio tokens must be priced with output_cost_per_audio_token,
    not lumped in with text tokens at output_cost_per_token.
    """
    from litellm.types.utils import (
        Choices,
        CompletionTokensDetailsWrapper,
        Message,
    )

    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
    litellm.model_cost = litellm.get_model_cost_map(url="")

    # Token counts reproduced from issue #19764:
    #   prompt: 17 text tokens, no audio, no cache hits
    #   completion: 110 text tokens + 482 audio tokens = 592 total
    text_out = 110
    audio_out = 482
    usage_object = Usage(
        prompt_tokens=17,
        completion_tokens=text_out + audio_out,
        total_tokens=609,
        prompt_tokens_details=PromptTokensDetailsWrapper(
            audio_tokens=0,
            cached_tokens=0,
            text_tokens=17,
            image_tokens=0,
        ),
        completion_tokens_details=CompletionTokensDetailsWrapper(
            audio_tokens=audio_out,
            reasoning_tokens=0,
            text_tokens=text_out,
        ),
    )

    response = ModelResponse(
        id="test-azure-audio-cost",
        choices=[
            Choices(
                finish_reason="stop",
                index=0,
                message=Message(
                    content="Test response",
                    role="assistant",
                ),
            )
        ],
        created=1729282652,
        model="azure/gpt-audio-2025-08-28",
        object="chat.completion",
        usage=usage_object,
    )

    cost = completion_cost(response, model="azure/gpt-audio-2025-08-28")

    model_info = litellm.get_model_info("azure/gpt-audio-2025-08-28")

    # Correct pricing: text and audio output tokens each at their own rate.
    expected_input_cost = model_info["input_cost_per_token"] * 17
    expected_output_cost = (
        model_info["output_cost_per_token"] * text_out
        + model_info["output_cost_per_audio_token"] * audio_out
    )
    expected_total_cost = expected_input_cost + expected_output_cost

    # Buggy pricing (pre-fix): every completion token billed at the text rate.
    wrong_output_cost = model_info["output_cost_per_token"] * (text_out + audio_out)
    wrong_total_cost = expected_input_cost + wrong_output_cost

    # The computed cost must move away from the all-text pricing...
    assert abs(cost - wrong_total_cost) > 0.001, (
        "Bug: Audio tokens are being charged at text token rate"
    )

    # ...and match the mixed text/audio pricing exactly.
    assert abs(cost - expected_total_cost) < 0.0000001, (
        f"Expected cost {expected_total_cost}, got {cost}"
    )


def test_default_image_cost_calculator(monkeypatch):
from litellm.cost_calculator import default_image_cost_calculator

Expand Down
Loading