From 002f89c95af34353c7ae27f987b014ca25b48492 Mon Sep 17 00:00:00 2001
From: Chesars <cesarponce19544@gmail.com>
Date: Mon, 26 Jan 2026 12:01:59 -0300
Subject: [PATCH] fix(gemini): subtract implicit cached tokens from text_tokens
 for correct cost calculation

When Gemini uses implicit caching, it returns cachedContentTokenCount but
NOT cacheTokensDetails. Previously, text_tokens was not adjusted in this case,
causing costs to be calculated as if all tokens were non-cached.

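This fix subtracts cachedContentTokenCount from text_tokens when no
cacheTokensDetails is present (implicit caching), ensuring correct cost
calculation with the reduced cache_read pricing. Without
cacheTokensDetails there is no per-modality breakdown of the cached
tokens, so the whole cached count is attributed to text tokens.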
---
 .../vertex_and_google_ai_studio_gemini.py     | 10 +++
 tests/test_litellm/test_cost_calculator.py    | 83 +++++++++++++++++++
 2 files changed, 93 insertions(+)

diff --git a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
index 2d2e07e74db..62742763889 100644
--- a/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
+++ b/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py
@@ -1657,7 +1657,17 @@ def _calculate_usage(  # noqa: PLR0915
         ## This is necessary because promptTokensDetails includes both cached and non-cached tokens
         ## See: https://github.com/BerriAI/litellm/issues/18750
         if cached_text_tokens is not None and prompt_text_tokens is not None:
+            # Explicit caching: subtract cached tokens per modality from cacheTokensDetails
             prompt_text_tokens = prompt_text_tokens - cached_text_tokens
+        elif (
+            cached_tokens is not None
+            and prompt_text_tokens is not None
+            and cached_text_tokens is None
+        ):
+            # Implicit caching: only cachedContentTokenCount is provided (no cacheTokensDetails)
+            # Subtract from text tokens since implicit caching is primarily for text content
+            # See: https://github.com/BerriAI/litellm/issues/16341
+            prompt_text_tokens = prompt_text_tokens - cached_tokens
         if cached_audio_tokens is not None and prompt_audio_tokens is not None:
             prompt_audio_tokens = prompt_audio_tokens - cached_audio_tokens
         if cached_image_tokens is not None and prompt_image_tokens is not None:
diff --git a/tests/test_litellm/test_cost_calculator.py b/tests/test_litellm/test_cost_calculator.py
index 4d6599fc1b5..54ed79ae3c8 100644
--- a/tests/test_litellm/test_cost_calculator.py
+++ b/tests/test_litellm/test_cost_calculator.py
@@ -1532,3 +1532,86 @@ def test_gemini_without_cache_tokens_details():
     assert usage.prompt_tokens_details.text_tokens >= 0
 
     print("✅ Gemini without cacheTokensDetails works correctly")
+
+
+def test_gemini_implicit_caching_cost_calculation():
+    """
+    Regression test for Issue #16341: Gemini implicit cached tokens not counted in spend log
+
+    When Gemini uses implicit caching, it returns cachedContentTokenCount but NOT
+    cacheTokensDetails. In this case, cachedContentTokenCount must be subtracted
+    from text_tokens so the cached share is billed at the cache-read rate.
+
+    See: https://github.com/BerriAI/litellm/issues/16341
+    """
+    import litellm
+    from litellm import completion_cost
+    from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
+        VertexGeminiConfig,
+    )
+    from litellm.types.utils import Choices, Message, ModelResponse
+
+    # Simulate Gemini response with implicit caching (cachedContentTokenCount only)
+    completion_response = {
+        "usageMetadata": {
+            "promptTokenCount": 10000,
+            "candidatesTokenCount": 5,
+            "totalTokenCount": 10005,
+            "cachedContentTokenCount": 8000,  # Implicit caching - no cacheTokensDetails
+            "promptTokensDetails": [{"modality": "TEXT", "tokenCount": 10000}],
+            "candidatesTokensDetails": [{"modality": "TEXT", "tokenCount": 5}],
+        }
+    }
+
+    usage = VertexGeminiConfig._calculate_usage(completion_response)
+
+    # The cached count must surface both as cache_read_input_tokens and as cached_tokens
+    assert (
+        usage.cache_read_input_tokens == 8000
+    ), f"cache_read_input_tokens should be 8000, got {usage.cache_read_input_tokens}"
+    assert (
+        usage.prompt_tokens_details.cached_tokens == 8000
+    ), f"cached_tokens should be 8000, got {usage.prompt_tokens_details.cached_tokens}"
+
+    # CRITICAL: text_tokens should be (10000 - 8000) = 2000, NOT 10000
+    # This is the fix for issue #16341
+    assert (
+        usage.prompt_tokens_details.text_tokens == 2000
+    ), f"text_tokens should be 2000 (10000 - 8000), got {usage.prompt_tokens_details.text_tokens}"
+
+    # Wrap the usage in a minimal response so completion_cost can price it
+    response = ModelResponse(
+        id="mock-id",
+        model="gemini-2.0-flash",
+        choices=[
+            Choices(
+                index=0,
+                message=Message(role="assistant", content="Hello!"),
+                finish_reason="stop",
+            )
+        ],
+        usage=usage,
+    )
+
+    cost = completion_cost(
+        completion_response=response,
+        model="gemini-2.0-flash",
+        custom_llm_provider="gemini",
+    )
+
+    # Model pricing. A key present with value None bypasses a .get() default, so fall back explicitly.
+    model_info = litellm.get_model_info("gemini/gemini-2.0-flash")
+    input_cost = model_info.get("input_cost_per_token") or 0
+    cached_price = model_info.get("cache_read_input_token_cost")
+    cache_read_cost = input_cost if cached_price is None else cached_price
+    output_cost = model_info.get("output_cost_per_token") or 0
+
+    # Expected cost: (2000 * input) + (8000 * cache_read) + (5 * output)
+    expected_cost = (2000 * input_cost) + (8000 * cache_read_cost) + (5 * output_cost)
+
+    assert abs(cost - expected_cost) < 1e-9, (
+        f"Cost calculation is wrong. Got ${cost:.6f}, expected ${expected_cost:.6f}. "
+        f"Cached tokens may not be using reduced pricing."
+    )
+
+    print("✅ Issue #16341 fix verified: Gemini implicit caching cost calculated correctly")