Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 48 additions & 1 deletion litellm/cost_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ def cost_per_token( # noqa: PLR0915
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
usage=usage_block,
service_tier=service_tier,
)
elif custom_llm_provider == "anthropic":
return anthropic_cost_per_token(model=model, usage=usage_block)
Expand All @@ -500,7 +501,9 @@ def cost_per_token( # noqa: PLR0915
model=model, usage=usage_block, response_time_ms=response_time_ms
)
elif custom_llm_provider == "gemini":
return gemini_cost_per_token(model=model, usage=usage_block)
return gemini_cost_per_token(
model=model, usage=usage_block, service_tier=service_tier
)
elif custom_llm_provider == "deepseek":
return deepseek_cost_per_token(model=model, usage=usage_block)
elif custom_llm_provider == "perplexity":
Expand Down Expand Up @@ -704,6 +707,36 @@ def _get_response_model(completion_response: Any) -> Optional[str]:
return None


# Lookup table from Gemini / Vertex AI ``usageMetadata.trafficType`` values
# (upper-cased) to the LiteLLM ``service_tier`` string used to pick
# tier-suffixed cost keys. A value of ``None`` means "standard pricing".
_GEMINI_TRAFFIC_TYPE_TO_SERVICE_TIER: dict = {
    # ON_DEMAND_PRIORITY maps to "priority" — selects input_cost_per_token_priority, etc.
    "ON_DEMAND_PRIORITY": "priority",
    # FLEX / BATCH maps to "flex" — selects input_cost_per_token_flex, etc.
    "FLEX": "flex",
    "BATCH": "flex",
    # ON_DEMAND is standard pricing — no service_tier suffix applied
    "ON_DEMAND": None,
}


def _map_traffic_type_to_service_tier(traffic_type: Optional[str]) -> Optional[str]:
    """
    Map a Gemini usageMetadata.trafficType value to a LiteLLM service_tier string.

    This allows the same `_priority` / `_flex` cost-key suffix logic used for
    OpenAI/Azure to work for Gemini and Vertex AI models.

    trafficType values seen in practice
    ------------------------------------
    ON_DEMAND          -> standard pricing   (service_tier = None)
    ON_DEMAND_PRIORITY -> priority pricing   (service_tier = "priority")
    FLEX / BATCH       -> batch/flex pricing (service_tier = "flex")

    Args:
        traffic_type: Raw trafficType value. Matching is case-insensitive and
            ignores surrounding whitespace.

    Returns:
        "priority", "flex", or None. None is returned for standard pricing,
        for unrecognised values, and for missing or non-string input, so
        that an unexpected value never breaks cost calculation.
    """
    # The value comes from loosely-typed provider metadata; anything that is
    # not a string (None included) falls back to standard pricing instead of
    # raising AttributeError on `.upper()`.
    if not isinstance(traffic_type, str):
        return None
    normalized = traffic_type.strip().upper()
    # Unknown traffic types deliberately resolve to None (standard pricing).
    return _GEMINI_TRAFFIC_TYPE_TO_SERVICE_TIER.get(normalized)
Comment on lines +710 to +737
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gemini-specific mapping placed outside llms/

The _GEMINI_TRAFFIC_TYPE_TO_SERVICE_TIER dictionary and _map_traffic_type_to_service_tier function are Gemini/Vertex AI-specific concepts (ON_DEMAND, ON_DEMAND_PRIORITY, FLEX, BATCH are Gemini traffic types). Per repository conventions, provider-specific code should live inside the llms/ directory (e.g., litellm/llms/gemini/cost_calculator.py or litellm/llms/vertex_ai/cost_calculator.py).

Consider moving this mapping to litellm/llms/gemini/cost_calculator.py or a shared Gemini/Vertex utility, and importing it here.

Context Used: Rule from dashboard - What: Avoid writing provider-specific code outside of the llms/ directory.

Why: This practice ensur... (source)

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

Comment on lines +710 to +737
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gemini-specific mapping outside llms/ directory

_GEMINI_TRAFFIC_TYPE_TO_SERVICE_TIER and _map_traffic_type_to_service_tier encode Gemini/Vertex AI-specific concepts (ON_DEMAND, ON_DEMAND_PRIORITY, FLEX, BATCH are Gemini traffic types). Per repository conventions, provider-specific code should live inside the llms/ directory (e.g., litellm/llms/gemini/cost_calculator.py or a shared Gemini/Vertex utility) and be imported here.

This keeps cost_calculator.py provider-agnostic and aligns with the existing pattern where provider-specific cost logic lives under litellm/llms/{provider}/.

Context Used: Rule from dashboard - What: Avoid writing provider-specific code outside of the llms/ directory.

Why: This practice ensur... (source)

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!



def _get_usage_object(
completion_response: Any,
) -> Optional[Usage]:
Expand Down Expand Up @@ -1145,6 +1178,20 @@ def completion_cost( # noqa: PLR0915
"custom_llm_provider", custom_llm_provider or None
)
region_name = hidden_params.get("region_name", region_name)

# For Gemini/Vertex AI responses, trafficType is stored in
# provider_specific_fields. Map it to the service_tier used
# by the cost key lookup (_priority / _flex suffixes) so that
# ON_DEMAND_PRIORITY requests are billed at priority prices.
if service_tier is None:
provider_specific = (
hidden_params.get("provider_specific_fields") or {}
)
raw_traffic_type = provider_specific.get("traffic_type")
if raw_traffic_type:
service_tier = _map_traffic_type_to_service_tier(
raw_traffic_type
)
else:
if model is None:
raise ValueError(
Expand Down
39 changes: 35 additions & 4 deletions litellm/litellm_core_utils/llm_cost_calc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,14 @@ def _get_token_base_cost(
## CHECK IF ABOVE THRESHOLD
# Optimization: collect threshold keys first to avoid sorting all model_info keys.
# Most models don't have threshold pricing, so we can return early.
# Exclude service_tier-specific variants (e.g. input_cost_per_token_above_200k_tokens_priority)
# so that the threshold detection loop only processes standard keys. The
# service_tier-specific above-threshold key is resolved later via _get_service_tier_cost_key.
threshold_keys = [
k for k in model_info if k.startswith("input_cost_per_token_above_")
k
for k in model_info
if k.startswith("input_cost_per_token_above_")
and not any(k.endswith(f"_{st.value}") for st in ServiceTier)
]
if not threshold_keys:
return (
Expand All @@ -224,14 +230,34 @@ def _get_token_base_cost(
1000 if "k" in threshold_str else 1
)
if usage.prompt_tokens > threshold:
# Prefer a service_tier-specific above-threshold key when available,
# e.g. input_cost_per_token_above_200k_tokens_priority for Gemini
# ON_DEMAND_PRIORITY. Falls back to the standard key automatically
# via _get_cost_per_unit's service_tier fallback logic.
tiered_input_key = (
_get_service_tier_cost_key(
f"input_cost_per_token_above_{threshold_str}_tokens",
service_tier,
)
if service_tier
else key
)
prompt_base_cost = cast(
float, _get_cost_per_unit(model_info, key, prompt_base_cost)
float, _get_cost_per_unit(model_info, tiered_input_key, prompt_base_cost)
)
tiered_output_key = (
_get_service_tier_cost_key(
f"output_cost_per_token_above_{threshold_str}_tokens",
service_tier,
)
if service_tier
else f"output_cost_per_token_above_{threshold_str}_tokens"
)
completion_base_cost = cast(
float,
_get_cost_per_unit(
model_info,
f"output_cost_per_token_above_{threshold_str}_tokens",
tiered_output_key,
completion_base_cost,
),
)
Expand Down Expand Up @@ -517,6 +543,7 @@ def _calculate_input_cost(
cache_read_cost: float,
cache_creation_cost: float,
cache_creation_cost_above_1hr: float,
service_tier: Optional[str] = None,
) -> float:
"""
Calculates the input cost for a given model, prompt tokens, and completion tokens.
Expand All @@ -528,8 +555,11 @@ def _calculate_input_cost(

### AUDIO COST
if prompt_tokens_details["audio_tokens"]:
audio_cost_key = _get_service_tier_cost_key(
"input_cost_per_audio_token", service_tier
)
prompt_cost += calculate_cost_component(
model_info, "input_cost_per_audio_token", prompt_tokens_details["audio_tokens"]
model_info, audio_cost_key, prompt_tokens_details["audio_tokens"]
)

### IMAGE TOKEN COST
Expand Down Expand Up @@ -659,6 +689,7 @@ def generic_cost_per_token( # noqa: PLR0915
cache_read_cost=cache_read_cost,
cache_creation_cost=cache_creation_cost,
cache_creation_cost_above_1hr=cache_creation_cost_above_1hr,
service_tier=service_tier,
)

## CALCULATE OUTPUT COST
Expand Down
8 changes: 5 additions & 3 deletions litellm/llms/gemini/cost_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
Handles the context caching for Gemini API.
"""

from typing import TYPE_CHECKING, Tuple
from typing import TYPE_CHECKING, Optional, Tuple

if TYPE_CHECKING:
from litellm.types.utils import ModelInfo, Usage


def cost_per_token(model: str, usage: "Usage") -> Tuple[float, float]:
def cost_per_token(
model: str, usage: "Usage", service_tier: Optional[str] = None
) -> Tuple[float, float]:
"""
Calculates the cost per token for a given model, prompt tokens, and completion tokens.

Expand All @@ -19,7 +21,7 @@ def cost_per_token(model: str, usage: "Usage") -> Tuple[float, float]:
from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_token

return generic_cost_per_token(
model=model, usage=usage, custom_llm_provider="gemini"
model=model, usage=usage, custom_llm_provider="gemini", service_tier=service_tier
)


Expand Down
4 changes: 4 additions & 0 deletions litellm/llms/vertex_ai/cost_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ def cost_per_token(
model: str,
custom_llm_provider: str,
usage: Usage,
service_tier: Optional[str] = None,
) -> Tuple[float, float]:
"""
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
Expand All @@ -233,6 +234,8 @@ def cost_per_token(
- custom_llm_provider: str, either "vertex_ai-*" or "gemini"
- prompt_tokens: float, the number of input tokens
- completion_tokens: float, the number of output tokens
- service_tier: optional tier derived from Gemini trafficType
("priority" for ON_DEMAND_PRIORITY, "flex" for FLEX/batch).

Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
Expand Down Expand Up @@ -266,4 +269,5 @@ def cost_per_token(
model=model,
custom_llm_provider=custom_llm_provider,
usage=usage,
service_tier=service_tier,
)
Loading
Loading