Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions litellm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@
disable_token_counter: bool = False
disable_add_transform_inline_image_block: bool = False
disable_add_user_agent_to_request_tags: bool = False
disable_anthropic_gemini_context_caching_transform: bool = False
extra_spend_tag_headers: Optional[List[str]] = None
in_memory_llm_clients_cache: "LLMClientCache"
safe_memory_mode: bool = False
Expand Down
6 changes: 6 additions & 0 deletions litellm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7408,7 +7408,13 @@ def is_cached_message(message: AllMessageValues) -> bool:
Used for anthropic/gemini context caching.
Follows the anthropic format {"cache_control": {"type": "ephemeral"}}
Can be disabled globally by setting litellm.disable_anthropic_gemini_context_caching_transform = True
"""
# Check if context caching is disabled globally
if litellm.disable_anthropic_gemini_context_caching_transform is True:
return False

if "content" not in message:
return False

Expand Down
109 changes: 109 additions & 0 deletions tests/local_testing/test_amazing_vertex_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2571,6 +2571,115 @@ async def test_gemini_context_caching_anthropic_format(sync_mode):
# )


@pytest.mark.parametrize(
    "sync_mode",
    [True, False],
)
@pytest.mark.asyncio
async def test_gemini_context_caching_disabled_flag(sync_mode):
    """
    Test that disable_anthropic_gemini_context_caching_transform flag properly disables context caching.

    When the flag is set to True, messages carrying
    ``{"cache_control": {"type": "ephemeral"}}`` must NOT trigger a separate
    cachedContents creation request — exactly one completion call should be
    made, and its URL must not reference ``cachedContents``.

    Parametrized over sync (``litellm.completion``) and async
    (``litellm.acompletion``) code paths.
    """
    from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler

    litellm.set_verbose = True

    # Save the module-level flag so we can restore it in ``finally`` and avoid
    # leaking global state into other tests.
    original_flag_value = litellm.disable_anthropic_gemini_context_caching_transform

    try:
        # Enable the disable flag
        litellm.disable_anthropic_gemini_context_caching_transform = True

        gemini_context_caching_messages = [
            # System Message with cache_control
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement {}".format(
                            uuid.uuid4()
                        )
                        * 4000,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            # User message with cache_control
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                    }
                ],
            },
        ]

        if sync_mode:
            client = HTTPHandler(concurrent_limit=1)
        else:
            client = AsyncHTTPHandler(concurrent_limit=1)

        with patch.object(client, "post", side_effect=mock_gemini_request) as mock_client:
            try:
                if sync_mode:
                    litellm.completion(
                        model="gemini/gemini-2.5-flash-lite-001",
                        messages=gemini_context_caching_messages,
                        temperature=0.2,
                        max_tokens=10,
                        client=client,
                    )
                else:
                    await litellm.acompletion(
                        model="gemini/gemini-2.5-flash-lite-001",
                        messages=gemini_context_caching_messages,
                        temperature=0.2,
                        max_tokens=10,
                        client=client,
                    )

            except Exception as e:
                # The mocked HTTP response may not survive full response
                # parsing; this test only inspects the outbound request(s), so
                # tolerate post-request errors but log them with context.
                print(f"Ignoring exception raised after mocked request: {e}")

            # When caching is disabled, should only make 1 call (no separate cache creation call)
            assert mock_client.call_count == 1, f"Expected 1 call when caching is disabled, got {mock_client.call_count}"

            first_call_args = mock_client.call_args_list[0].kwargs
            first_call_positional_args = mock_client.call_args_list[0].args

            print(f"first_call_args with caching disabled: {first_call_args}")
            print(f"first_call_positional_args with caching disabled: {first_call_positional_args}")

            # The URL may be passed either as a kwarg or the first positional
            # arg of ``post``; check whichever is present.
            url = first_call_args.get("url", first_call_positional_args[0] if first_call_positional_args else "")
            assert "cachedContents" not in url, "cachedContents should not be in URL when caching is disabled"

    finally:
        # Restore original flag value
        litellm.disable_anthropic_gemini_context_caching_transform = original_flag_value



@pytest.mark.asyncio
async def test_partner_models_httpx_ai21():
litellm.set_verbose = True
Expand Down
Loading