diff --git a/litellm/__init__.py b/litellm/__init__.py index f5db57f76fd..4c9a7789295 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -256,6 +256,7 @@ disable_token_counter: bool = False disable_add_transform_inline_image_block: bool = False disable_add_user_agent_to_request_tags: bool = False +disable_anthropic_gemini_context_caching_transform: bool = False extra_spend_tag_headers: Optional[List[str]] = None in_memory_llm_clients_cache: "LLMClientCache" safe_memory_mode: bool = False diff --git a/litellm/utils.py b/litellm/utils.py index d7fb4855a48..153bced18a6 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -7408,7 +7408,13 @@ def is_cached_message(message: AllMessageValues) -> bool: Used for anthropic/gemini context caching. Follows the anthropic format {"cache_control": {"type": "ephemeral"}} + + Can be disabled globally by setting litellm.disable_anthropic_gemini_context_caching_transform = True """ + # Check if context caching is disabled globally + if litellm.disable_anthropic_gemini_context_caching_transform is True: + return False + if "content" not in message: return False diff --git a/tests/local_testing/test_amazing_vertex_completion.py b/tests/local_testing/test_amazing_vertex_completion.py index 866e2a85060..e1c24ea46fe 100644 --- a/tests/local_testing/test_amazing_vertex_completion.py +++ b/tests/local_testing/test_amazing_vertex_completion.py @@ -2571,6 +2571,115 @@ async def test_gemini_context_caching_anthropic_format(sync_mode): # ) +@pytest.mark.parametrize( + "sync_mode", + [True, False], +) +@pytest.mark.asyncio +async def test_gemini_context_caching_disabled_flag(sync_mode): + """ + Test that disable_anthropic_gemini_context_caching_transform flag properly disables context caching. + + When the flag is set to True, messages with cache_control should not trigger caching API calls. 
+ """ + from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler + + litellm.set_verbose = True + + # Store original value to restore later + original_flag_value = litellm.disable_anthropic_gemini_context_caching_transform + + try: + # Enable the disable flag + litellm.disable_anthropic_gemini_context_caching_transform = True + + gemini_context_caching_messages = [ + # System Message with cache_control + { + "role": "system", + "content": [ + { + "type": "text", + "text": "Here is the full text of a complex legal agreement {}".format( + uuid.uuid4() + ) + * 4000, + "cache_control": {"type": "ephemeral"}, + } + ], + }, + # User message with cache_control + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the key terms and conditions in this agreement?", + "cache_control": {"type": "ephemeral"}, + } + ], + }, + { + "role": "assistant", + "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo", + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the key terms and conditions in this agreement?", + } + ], + }, + ] + + if sync_mode: + client = HTTPHandler(concurrent_limit=1) + else: + client = AsyncHTTPHandler(concurrent_limit=1) + + with patch.object(client, "post", side_effect=mock_gemini_request) as mock_client: + try: + if sync_mode: + response = litellm.completion( + model="gemini/gemini-2.5-flash-lite-001", + messages=gemini_context_caching_messages, + temperature=0.2, + max_tokens=10, + client=client, + ) + else: + response = await litellm.acompletion( + model="gemini/gemini-2.5-flash-lite-001", + messages=gemini_context_caching_messages, + temperature=0.2, + max_tokens=10, + client=client, + ) + + except Exception as e: + print(e) + + # When caching is disabled, should only make 1 call (no separate cache creation call) + assert mock_client.call_count == 1, f"Expected 1 call when caching is disabled, got {mock_client.call_count}" + first_call_args = mock_client.call_args_list[0].kwargs + first_call_positional_args = mock_client.call_args_list[0].args + + print(f"first_call_args with caching disabled: {first_call_args}") + print(f"first_call_positional_args with caching disabled: {first_call_positional_args}") + + # Assert that cachedContents is NOT in the URL when caching is disabled + url = first_call_args.get("url", first_call_positional_args[0] if first_call_positional_args else "") + assert "cachedContents" not in url, "cachedContents should not be in URL when caching is disabled" + + finally: + # Restore original flag value + litellm.disable_anthropic_gemini_context_caching_transform = original_flag_value + + + @pytest.mark.asyncio async def test_partner_models_httpx_ai21(): litellm.set_verbose = True