From 20023bdcfbe88d5de46ba0eac18f9613291c752a Mon Sep 17 00:00:00 2001 From: Sameer Kankute Date: Mon, 23 Feb 2026 13:15:47 +0530 Subject: [PATCH 1/4] Add support for Priority PayGo for vertex ai and gemini --- litellm/cost_calculator.py | 49 ++++++++++++++++++- .../litellm_core_utils/llm_cost_calc/utils.py | 39 +++++++++++++-- litellm/llms/gemini/cost_calculator.py | 8 +-- litellm/llms/vertex_ai/cost_calculator.py | 4 ++ 4 files changed, 92 insertions(+), 8 deletions(-) diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 74c1afb0ccb..ad6eb6b4f32 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -480,6 +480,7 @@ def cost_per_token( # noqa: PLR0915 model=model_without_prefix, custom_llm_provider=custom_llm_provider, usage=usage_block, + service_tier=service_tier, ) elif custom_llm_provider == "anthropic": return anthropic_cost_per_token(model=model, usage=usage_block) @@ -500,7 +501,9 @@ def cost_per_token( # noqa: PLR0915 model=model, usage=usage_block, response_time_ms=response_time_ms ) elif custom_llm_provider == "gemini": - return gemini_cost_per_token(model=model, usage=usage_block) + return gemini_cost_per_token( + model=model, usage=usage_block, service_tier=service_tier + ) elif custom_llm_provider == "deepseek": return deepseek_cost_per_token(model=model, usage=usage_block) elif custom_llm_provider == "perplexity": @@ -704,6 +707,36 @@ def _get_response_model(completion_response: Any) -> Optional[str]: return None +_GEMINI_TRAFFIC_TYPE_TO_SERVICE_TIER: dict = { + # ON_DEMAND_PRIORITY maps to "priority" — selects input_cost_per_token_priority, etc. + "ON_DEMAND_PRIORITY": "priority", + # FLEX / BATCH maps to "flex" — selects input_cost_per_token_flex, etc. 
+ "FLEX": "flex", + "BATCH": "flex", + # ON_DEMAND is standard pricing — no service_tier suffix applied + "ON_DEMAND": None, +} + + +def _map_traffic_type_to_service_tier(traffic_type: Optional[str]) -> Optional[str]: + """ + Map a Gemini usageMetadata.trafficType value to a LiteLLM service_tier string. + + This allows the same `_priority` / `_flex` cost-key suffix logic used for + OpenAI/Azure to work for Gemini and Vertex AI models. + + trafficType values seen in practice + ------------------------------------ + ON_DEMAND -> standard pricing (service_tier = None) + ON_DEMAND_PRIORITY -> priority pricing (service_tier = "priority") + FLEX / BATCH -> batch/flex pricing (service_tier = "flex") + """ + if traffic_type is None: + return None + service_tier = _GEMINI_TRAFFIC_TYPE_TO_SERVICE_TIER.get(traffic_type.upper()) + return service_tier + + def _get_usage_object( completion_response: Any, ) -> Optional[Usage]: @@ -1145,6 +1178,20 @@ def completion_cost( # noqa: PLR0915 "custom_llm_provider", custom_llm_provider or None ) region_name = hidden_params.get("region_name", region_name) + + # For Gemini/Vertex AI responses, trafficType is stored in + # provider_specific_fields. Map it to the service_tier used + # by the cost key lookup (_priority / _flex suffixes) so that + # ON_DEMAND_PRIORITY requests are billed at priority prices. 
+ if service_tier is None: + provider_specific = ( + hidden_params.get("provider_specific_fields") or {} + ) + raw_traffic_type = provider_specific.get("traffic_type") + if raw_traffic_type: + service_tier = _map_traffic_type_to_service_tier( + raw_traffic_type + ) else: if model is None: raise ValueError( diff --git a/litellm/litellm_core_utils/llm_cost_calc/utils.py b/litellm/litellm_core_utils/llm_cost_calc/utils.py index 7c41e1bbe67..a9fd0f4ea8a 100644 --- a/litellm/litellm_core_utils/llm_cost_calc/utils.py +++ b/litellm/litellm_core_utils/llm_cost_calc/utils.py @@ -200,8 +200,14 @@ def _get_token_base_cost( ## CHECK IF ABOVE THRESHOLD # Optimization: collect threshold keys first to avoid sorting all model_info keys. # Most models don't have threshold pricing, so we can return early. + # Exclude service_tier-specific variants (e.g. input_cost_per_token_above_200k_tokens_priority) + # so that the threshold detection loop only processes standard keys. The + # service_tier-specific above-threshold key is resolved later via _get_service_tier_cost_key. threshold_keys = [ - k for k in model_info if k.startswith("input_cost_per_token_above_") + k + for k in model_info + if k.startswith("input_cost_per_token_above_") + and not any(k.endswith(f"_{st.value}") for st in ServiceTier) ] if not threshold_keys: return ( @@ -224,14 +230,34 @@ def _get_token_base_cost( 1000 if "k" in threshold_str else 1 ) if usage.prompt_tokens > threshold: + # Prefer a service_tier-specific above-threshold key when available, + # e.g. input_cost_per_token_above_200k_tokens_priority for Gemini + # ON_DEMAND_PRIORITY. Falls back to the standard key automatically + # via _get_cost_per_unit's service_tier fallback logic.
+ tiered_input_key = ( + _get_service_tier_cost_key( + f"input_cost_per_token_above_{threshold_str}_tokens", + service_tier, + ) + if service_tier + else key + ) prompt_base_cost = cast( - float, _get_cost_per_unit(model_info, key, prompt_base_cost) + float, _get_cost_per_unit(model_info, tiered_input_key, prompt_base_cost) + ) + tiered_output_key = ( + _get_service_tier_cost_key( + f"output_cost_per_token_above_{threshold_str}_tokens", + service_tier, + ) + if service_tier + else f"output_cost_per_token_above_{threshold_str}_tokens" ) completion_base_cost = cast( float, _get_cost_per_unit( model_info, - f"output_cost_per_token_above_{threshold_str}_tokens", + tiered_output_key, completion_base_cost, ), ) @@ -517,6 +543,7 @@ def _calculate_input_cost( cache_read_cost: float, cache_creation_cost: float, cache_creation_cost_above_1hr: float, + service_tier: Optional[str] = None, ) -> float: """ Calculates the input cost for a given model, prompt tokens, and completion tokens. @@ -528,8 +555,11 @@ def _calculate_input_cost( ### AUDIO COST if prompt_tokens_details["audio_tokens"]: + audio_cost_key = _get_service_tier_cost_key( + "input_cost_per_audio_token", service_tier + ) prompt_cost += calculate_cost_component( - model_info, "input_cost_per_audio_token", prompt_tokens_details["audio_tokens"] + model_info, audio_cost_key, prompt_tokens_details["audio_tokens"] ) ### IMAGE TOKEN COST @@ -659,6 +689,7 @@ def generic_cost_per_token( # noqa: PLR0915 cache_read_cost=cache_read_cost, cache_creation_cost=cache_creation_cost, cache_creation_cost_above_1hr=cache_creation_cost_above_1hr, + service_tier=service_tier, ) ## CALCULATE OUTPUT COST diff --git a/litellm/llms/gemini/cost_calculator.py b/litellm/llms/gemini/cost_calculator.py index 471421b4870..79242fe01d1 100644 --- a/litellm/llms/gemini/cost_calculator.py +++ b/litellm/llms/gemini/cost_calculator.py @@ -4,13 +4,15 @@ Handles the context caching for Gemini API. 
""" -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Optional, Tuple if TYPE_CHECKING: from litellm.types.utils import ModelInfo, Usage -def cost_per_token(model: str, usage: "Usage") -> Tuple[float, float]: +def cost_per_token( + model: str, usage: "Usage", service_tier: Optional[str] = None +) -> Tuple[float, float]: """ Calculates the cost per token for a given model, prompt tokens, and completion tokens. @@ -19,7 +21,7 @@ def cost_per_token(model: str, usage: "Usage") -> Tuple[float, float]: from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_token return generic_cost_per_token( - model=model, usage=usage, custom_llm_provider="gemini" + model=model, usage=usage, custom_llm_provider="gemini", service_tier=service_tier ) diff --git a/litellm/llms/vertex_ai/cost_calculator.py b/litellm/llms/vertex_ai/cost_calculator.py index e98dc75915d..e7ac453e949 100644 --- a/litellm/llms/vertex_ai/cost_calculator.py +++ b/litellm/llms/vertex_ai/cost_calculator.py @@ -224,6 +224,7 @@ def cost_per_token( model: str, custom_llm_provider: str, usage: Usage, + service_tier: Optional[str] = None, ) -> Tuple[float, float]: """ Calculates the cost per token for a given model, prompt tokens, and completion tokens. @@ -233,6 +234,8 @@ def cost_per_token( - custom_llm_provider: str, either "vertex_ai-*" or "gemini" - prompt_tokens: float, the number of input tokens - completion_tokens: float, the number of output tokens + - service_tier: optional tier derived from Gemini trafficType + ("priority" for ON_DEMAND_PRIORITY, "flex" for FLEX/batch). 
Returns: Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd @@ -266,4 +269,5 @@ def cost_per_token( model=model, custom_llm_provider=custom_llm_provider, usage=usage, + service_tier=service_tier, ) From 4d8f5097fb0626e788a63e4dfa6f007b3f02aa5b Mon Sep 17 00:00:00 2001 From: Sameer Kankute Date: Mon, 23 Feb 2026 13:16:27 +0530 Subject: [PATCH 2/4] Add model pricing --- ...odel_prices_and_context_window_backup.json | 99 ++++++++++++++++--- model_prices_and_context_window.json | 99 ++++++++++++++++--- 2 files changed, 174 insertions(+), 24 deletions(-) diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index e54eaf89d72..cf78a69f61f 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -14768,7 +14768,14 @@ "supports_video_input": true, "supports_vision": true, "supports_web_search": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini-3.1-pro-preview": { "cache_read_input_token_cost": 2e-07, @@ -14819,7 +14826,14 @@ "supports_vision": true, "supports_web_search": true, "supports_url_context": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + 
"supports_service_tier": true }, "gemini-3.1-pro-preview-customtools": { "cache_read_input_token_cost": 2e-07, @@ -14919,7 +14933,14 @@ "supports_video_input": true, "supports_vision": true, "supports_web_search": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "vertex_ai/gemini-3-flash-preview": { "cache_read_input_token_cost": 5e-08, @@ -14963,7 +14984,12 @@ "supports_video_input": true, "supports_vision": true, "supports_web_search": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 9e-07, + "input_cost_per_audio_token_priority": 1.8e-06, + "output_cost_per_token_priority": 5.4e-06, + "cache_read_input_token_cost_priority": 9e-08, + "supports_service_tier": true }, "vertex_ai/gemini-3.1-pro-preview": { "cache_read_input_token_cost": 2e-07, @@ -15014,7 +15040,14 @@ "supports_vision": true, "supports_web_search": true, "supports_url_context": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "vertex_ai/gemini-3.1-pro-preview-customtools": { "cache_read_input_token_cost": 2e-07, @@ -15065,7 +15098,14 @@ "supports_vision": true, "supports_web_search": true, "supports_url_context": true, - "supports_native_streaming": 
true + "supports_native_streaming": true, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini-2.5-pro-exp-03-25": { "cache_read_input_token_cost": 1.25e-07, @@ -16860,6 +16900,8 @@ "cache_read_input_token_cost_above_200k_tokens": 2.5e-07, "input_cost_per_token": 1.25e-06, "input_cost_per_token_above_200k_tokens": 2.5e-06, + "input_cost_per_token_priority": 1.25e-06, + "input_cost_per_token_above_200k_tokens_priority": 2.5e-06, "litellm_provider": "gemini", "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, @@ -16873,8 +16915,11 @@ "mode": "chat", "output_cost_per_token": 1e-05, "output_cost_per_token_above_200k_tokens": 1.5e-05, + "output_cost_per_token_priority": 1e-05, + "output_cost_per_token_above_200k_tokens_priority": 1.5e-05, "rpm": 2000, "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing", + "supports_service_tier": true, "supported_endpoints": [ "/v1/chat/completions", "/v1/completions" @@ -16979,7 +17024,14 @@ "supports_video_input": true, "supports_vision": true, "supports_web_search": true, - "tpm": 800000 + "tpm": 800000, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini/gemini-3-flash-preview": { "cache_read_input_token_cost": 5e-08, @@ -17027,7 +17079,12 @@ "supports_vision": true, "supports_web_search": true, "supports_native_streaming": true, - "tpm": 800000 + "tpm": 800000, + 
"input_cost_per_token_priority": 9e-07, + "input_cost_per_audio_token_priority": 1.8e-06, + "output_cost_per_token_priority": 5.4e-06, + "cache_read_input_token_cost_priority": 9e-08, + "supports_service_tier": true }, "gemini/gemini-3.1-pro-preview": { "cache_read_input_token_cost": 2e-07, @@ -17078,7 +17135,13 @@ "supports_web_search": true, "supports_url_context": true, "supports_native_streaming": true, - "tpm": 800000 + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini/gemini-3.1-pro-preview-customtools": { "cache_read_input_token_cost": 2e-07, @@ -17129,7 +17192,14 @@ "supports_web_search": true, "supports_url_context": true, "supports_native_streaming": true, - "tpm": 800000 + "tpm": 800000, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini-3-flash-preview": { "cache_read_input_token_cost": 5e-08, @@ -17175,7 +17245,12 @@ "supports_url_context": true, "supports_vision": true, "supports_web_search": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 9e-07, + "input_cost_per_audio_token_priority": 1.8e-06, + "output_cost_per_token_priority": 5.4e-06, + "cache_read_input_token_cost_priority": 9e-08, + "supports_service_tier": true }, "gemini/gemini-2.5-pro-exp-03-25": { "cache_read_input_token_cost": 0.0, @@ -37749,4 +37824,4 @@ "notes": "DuckDuckGo 
Instant Answer API is free and does not require an API key." } } -} +} \ No newline at end of file diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index e54eaf89d72..cf78a69f61f 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -14768,7 +14768,14 @@ "supports_video_input": true, "supports_vision": true, "supports_web_search": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini-3.1-pro-preview": { "cache_read_input_token_cost": 2e-07, @@ -14819,7 +14826,14 @@ "supports_vision": true, "supports_web_search": true, "supports_url_context": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini-3.1-pro-preview-customtools": { "cache_read_input_token_cost": 2e-07, @@ -14919,7 +14933,14 @@ "supports_video_input": true, "supports_vision": true, "supports_web_search": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + 
"cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "vertex_ai/gemini-3-flash-preview": { "cache_read_input_token_cost": 5e-08, @@ -14963,7 +14984,12 @@ "supports_video_input": true, "supports_vision": true, "supports_web_search": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 9e-07, + "input_cost_per_audio_token_priority": 1.8e-06, + "output_cost_per_token_priority": 5.4e-06, + "cache_read_input_token_cost_priority": 9e-08, + "supports_service_tier": true }, "vertex_ai/gemini-3.1-pro-preview": { "cache_read_input_token_cost": 2e-07, @@ -15014,7 +15040,14 @@ "supports_vision": true, "supports_web_search": true, "supports_url_context": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "vertex_ai/gemini-3.1-pro-preview-customtools": { "cache_read_input_token_cost": 2e-07, @@ -15065,7 +15098,14 @@ "supports_vision": true, "supports_web_search": true, "supports_url_context": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini-2.5-pro-exp-03-25": { "cache_read_input_token_cost": 1.25e-07, @@ -16860,6 +16900,8 @@ 
"cache_read_input_token_cost_above_200k_tokens": 2.5e-07, "input_cost_per_token": 1.25e-06, "input_cost_per_token_above_200k_tokens": 2.5e-06, + "input_cost_per_token_priority": 1.25e-06, + "input_cost_per_token_above_200k_tokens_priority": 2.5e-06, "litellm_provider": "gemini", "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, @@ -16873,8 +16915,11 @@ "mode": "chat", "output_cost_per_token": 1e-05, "output_cost_per_token_above_200k_tokens": 1.5e-05, + "output_cost_per_token_priority": 1e-05, + "output_cost_per_token_above_200k_tokens_priority": 1.5e-05, "rpm": 2000, "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing", + "supports_service_tier": true, "supported_endpoints": [ "/v1/chat/completions", "/v1/completions" @@ -16979,7 +17024,14 @@ "supports_video_input": true, "supports_vision": true, "supports_web_search": true, - "tpm": 800000 + "tpm": 800000, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini/gemini-3-flash-preview": { "cache_read_input_token_cost": 5e-08, @@ -17027,7 +17079,12 @@ "supports_vision": true, "supports_web_search": true, "supports_native_streaming": true, - "tpm": 800000 + "tpm": 800000, + "input_cost_per_token_priority": 9e-07, + "input_cost_per_audio_token_priority": 1.8e-06, + "output_cost_per_token_priority": 5.4e-06, + "cache_read_input_token_cost_priority": 9e-08, + "supports_service_tier": true }, "gemini/gemini-3.1-pro-preview": { "cache_read_input_token_cost": 2e-07, @@ -17078,7 +17135,13 @@ "supports_web_search": true, "supports_url_context": true, "supports_native_streaming": true, - "tpm": 800000 + "input_cost_per_token_priority": 3.6e-06, + 
"input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini/gemini-3.1-pro-preview-customtools": { "cache_read_input_token_cost": 2e-07, @@ -17129,7 +17192,14 @@ "supports_web_search": true, "supports_url_context": true, "supports_native_streaming": true, - "tpm": 800000 + "tpm": 800000, + "input_cost_per_token_priority": 3.6e-06, + "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, + "output_cost_per_token_priority": 2.16e-05, + "output_cost_per_token_above_200k_tokens_priority": 3.24e-05, + "cache_read_input_token_cost_priority": 3.6e-07, + "cache_read_input_token_cost_above_200k_tokens_priority": 7.2e-07, + "supports_service_tier": true }, "gemini-3-flash-preview": { "cache_read_input_token_cost": 5e-08, @@ -17175,7 +17245,12 @@ "supports_url_context": true, "supports_vision": true, "supports_web_search": true, - "supports_native_streaming": true + "supports_native_streaming": true, + "input_cost_per_token_priority": 9e-07, + "input_cost_per_audio_token_priority": 1.8e-06, + "output_cost_per_token_priority": 5.4e-06, + "cache_read_input_token_cost_priority": 9e-08, + "supports_service_tier": true }, "gemini/gemini-2.5-pro-exp-03-25": { "cache_read_input_token_cost": 0.0, @@ -37749,4 +37824,4 @@ "notes": "DuckDuckGo Instant Answer API is free and does not require an API key." 
} } -} +} \ No newline at end of file From 164cde9bf61b962c382f1c9a91a044211da834e6 Mon Sep 17 00:00:00 2001 From: Sameer Kankute Date: Mon, 23 Feb 2026 17:23:12 +0530 Subject: [PATCH 3/4] Readd tpm limit --- litellm/model_prices_and_context_window_backup.json | 1 + model_prices_and_context_window.json | 1 + 2 files changed, 2 insertions(+) diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index cf78a69f61f..cff3c94419b 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -17135,6 +17135,7 @@ "supports_web_search": true, "supports_url_context": true, "supports_native_streaming": true, + "tpm": 800000, "input_cost_per_token_priority": 3.6e-06, "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, "output_cost_per_token_priority": 2.16e-05, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index cf78a69f61f..cff3c94419b 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -17135,6 +17135,7 @@ "supports_web_search": true, "supports_url_context": true, "supports_native_streaming": true, + "tpm": 800000, "input_cost_per_token_priority": 3.6e-06, "input_cost_per_token_above_200k_tokens_priority": 7.2e-06, "output_cost_per_token_priority": 2.16e-05, From 2f8d36be1b3069e126ff52bad89888763f63b80f Mon Sep 17 00:00:00 2001 From: Sameer Kankute Date: Mon, 23 Feb 2026 18:56:12 +0530 Subject: [PATCH 4/4] Fix test_aaamodel_prices_and_context_window_json_is_valid --- tests/test_litellm/test_utils.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/test_litellm/test_utils.py b/tests/test_litellm/test_utils.py index 3ae45882780..14a003c2aad 100644 --- a/tests/test_litellm/test_utils.py +++ b/tests/test_litellm/test_utils.py @@ -13,12 +13,12 @@ import litellm from litellm.proxy.utils import is_valid_api_key from 
litellm.types.utils import ( + CallTypes, Delta, LlmProviders, ModelResponseStream, StreamingChoices, ) -from litellm.types.utils import CallTypes from litellm.utils import ( ProviderConfigManager, TextCompletionStreamWrapper, @@ -606,10 +606,14 @@ def test_aaamodel_prices_and_context_window_json_is_valid(): "input_cost_per_token_above_200k_tokens": {"type": "number"}, "cache_read_input_token_cost_flex": {"type": "number"}, "cache_read_input_token_cost_priority": {"type": "number"}, + "cache_read_input_token_cost_above_200k_tokens_priority": {"type": "number"}, "input_cost_per_token_flex": {"type": "number"}, "input_cost_per_token_priority": {"type": "number"}, + "input_cost_per_token_above_200k_tokens_priority": {"type": "number"}, + "input_cost_per_audio_token_priority": {"type": "number"}, "output_cost_per_token_flex": {"type": "number"}, "output_cost_per_token_priority": {"type": "number"}, + "output_cost_per_token_above_200k_tokens_priority": {"type": "number"}, "input_cost_per_pixel": {"type": "number"}, "input_cost_per_query": {"type": "number"}, "input_cost_per_request": {"type": "number"}, @@ -644,6 +648,7 @@ def test_aaamodel_prices_and_context_window_json_is_valid(): "max_video_length": {"type": "number"}, "max_videos_per_prompt": {"type": "number"}, "metadata": {"type": "object"}, + "provider_specific_entry": {"type": "object"}, "mode": { "type": "string", "enum": [ @@ -802,7 +807,7 @@ def test_aaamodel_prices_and_context_window_json_is_valid(): }, } - prod_json = "./model_prices_and_context_window.json" + prod_json = "litellm/model_prices_and_context_window.json" # prod_json = "../../model_prices_and_context_window.json" with open(prod_json, "r") as model_prices_file: actual_json = json.load(model_prices_file) @@ -2337,7 +2342,7 @@ def test_register_model_with_scientific_notation(): Test that the register_model function can handle scientific notation in the model name. 
""" import uuid - + # Use a truly unique model name with uuid to avoid conflicts when tests run in parallel test_model_name = f"test-scientific-notation-model-{uuid.uuid4().hex[:12]}" @@ -2981,8 +2986,8 @@ async def test_budget_alerts_soft_budget_with_alert_emails_bypasses_alerting_non via metadata.soft_budget_alerting_emails to work even when global alerting is disabled. """ from litellm.caching.caching import DualCache - from litellm.proxy.utils import ProxyLogging from litellm.proxy._types import CallInfo, Litellm_EntityType + from litellm.proxy.utils import ProxyLogging proxy_logging = ProxyLogging(user_api_key_cache=DualCache()) proxy_logging.alerting = None # Global alerting is disabled @@ -3018,8 +3023,8 @@ async def test_budget_alerts_soft_budget_without_alert_emails_respects_alerting_ and do not send emails when alerting is None. """ from litellm.caching.caching import DualCache - from litellm.proxy.utils import ProxyLogging from litellm.proxy._types import CallInfo, Litellm_EntityType + from litellm.proxy.utils import ProxyLogging proxy_logging = ProxyLogging(user_api_key_cache=DualCache()) proxy_logging.alerting = None @@ -3050,8 +3055,8 @@ async def test_budget_alerts_soft_budget_with_empty_alert_emails_respects_alerti Test that soft_budget alerts with empty alert_emails list still respect alerting=None. """ from litellm.caching.caching import DualCache - from litellm.proxy.utils import ProxyLogging from litellm.proxy._types import CallInfo, Litellm_EntityType + from litellm.proxy.utils import ProxyLogging proxy_logging = ProxyLogging(user_api_key_cache=DualCache()) proxy_logging.alerting = None