From 002f89c95af34353c7ae27f987b014ca25b48492 Mon Sep 17 00:00:00 2001 From: Chesars Date: Mon, 26 Jan 2026 12:01:59 -0300 Subject: [PATCH] fix(gemini): subtract implicit cached tokens from text_tokens for correct cost calculation When Gemini uses implicit caching, it returns cachedContentTokenCount but NOT cacheTokensDetails. Previously, text_tokens was not adjusted in this case, causing costs to be calculated as if all tokens were non-cached. This fix subtracts cachedContentTokenCount from text_tokens when no cacheTokensDetails is present (implicit caching), ensuring correct cost calculation with the reduced cache_read pricing. --- .../vertex_and_google_ai_studio_gemini.py | 10 +++ tests/test_litellm/test_cost_calculator.py | 83 +++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py index 2d2e07e74db..62742763889 100644 --- a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py +++ b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py @@ -1657,7 +1657,17 @@ def _calculate_usage( # noqa: PLR0915 ## This is necessary because promptTokensDetails includes both cached and non-cached tokens ## See: https://github.com/BerriAI/litellm/issues/18750 if cached_text_tokens is not None and prompt_text_tokens is not None: + # Explicit caching: subtract cached tokens per modality from cacheTokensDetails prompt_text_tokens = prompt_text_tokens - cached_text_tokens + elif ( + cached_tokens is not None + and prompt_text_tokens is not None + and cached_text_tokens is None + ): + # Implicit caching: only cachedContentTokenCount is provided (no cacheTokensDetails) + # Subtract from text tokens since implicit caching is primarily for text content + # See: https://github.com/BerriAI/litellm/issues/16341 + prompt_text_tokens = prompt_text_tokens - cached_tokens if cached_audio_tokens is not None and prompt_audio_tokens is not None: prompt_audio_tokens = prompt_audio_tokens - cached_audio_tokens if cached_image_tokens is not None and prompt_image_tokens is not None: diff --git a/tests/test_litellm/test_cost_calculator.py b/tests/test_litellm/test_cost_calculator.py index 4d6599fc1b5..54ed79ae3c8 100644 --- a/tests/test_litellm/test_cost_calculator.py +++ b/tests/test_litellm/test_cost_calculator.py @@ -1532,3 +1532,86 @@ def test_gemini_without_cache_tokens_details(): assert usage.prompt_tokens_details.text_tokens >= 0 print("✅ Gemini without cacheTokensDetails works correctly") + + +def test_gemini_implicit_caching_cost_calculation(): + """ + Test for Issue #16341: Gemini implicit cached tokens not counted in spend log + + When Gemini uses implicit caching, it returns cachedContentTokenCount but NOT + cacheTokensDetails. In this case, we should subtract cachedContentTokenCount + from text_tokens to correctly calculate costs. + + See: https://github.com/BerriAI/litellm/issues/16341 + """ + from litellm import completion_cost + from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import ( + VertexGeminiConfig, + ) + from litellm.types.utils import Choices, Message, ModelResponse + + # Simulate Gemini response with implicit caching (cachedContentTokenCount only) + completion_response = { + "usageMetadata": { + "promptTokenCount": 10000, + "candidatesTokenCount": 5, + "totalTokenCount": 10005, + "cachedContentTokenCount": 8000, # Implicit caching - no cacheTokensDetails + "promptTokensDetails": [{"modality": "TEXT", "tokenCount": 10000}], + "candidatesTokensDetails": [{"modality": "TEXT", "tokenCount": 5}], + } + } + + usage = VertexGeminiConfig._calculate_usage(completion_response) + + # Verify parsing + assert ( + usage.cache_read_input_tokens == 8000 + ), f"cache_read_input_tokens should be 8000, got {usage.cache_read_input_tokens}" + assert ( + usage.prompt_tokens_details.cached_tokens == 8000 + ), f"cached_tokens should be 8000, got {usage.prompt_tokens_details.cached_tokens}" + + # CRITICAL: text_tokens should be (10000 - 8000) = 2000, NOT 10000 + # This is the fix for issue #16341 + assert ( + usage.prompt_tokens_details.text_tokens == 2000 + ), f"text_tokens should be 2000 (10000 - 8000), got {usage.prompt_tokens_details.text_tokens}" + + # Verify cost calculation uses cached token pricing + response = ModelResponse( + id="mock-id", + model="gemini-2.0-flash", + choices=[ + Choices( + index=0, + message=Message(role="assistant", content="Hello!"), + finish_reason="stop", + ) + ], + usage=usage, + ) + + cost = completion_cost( + completion_response=response, + model="gemini-2.0-flash", + custom_llm_provider="gemini", + ) + + # Get model pricing for verification + import litellm + + model_info = litellm.get_model_info("gemini/gemini-2.0-flash") + input_cost = model_info.get("input_cost_per_token", 0) + cache_read_cost = model_info.get("cache_read_input_token_cost", input_cost) + output_cost = model_info.get("output_cost_per_token", 0) + + # Expected cost: (2000 * input) + (8000 * cache_read) + (5 * output) + expected_cost = (2000 * input_cost) + (8000 * cache_read_cost) + (5 * output_cost) + + assert abs(cost - expected_cost) < 1e-9, ( + f"Cost calculation is wrong. Got ${cost:.6f}, expected ${expected_cost:.6f}. " + f"Cached tokens may not be using reduced pricing." + ) + + print("✅ Issue #16341 fix verified: Gemini implicit caching cost calculated correctly")