Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions litellm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@
disable_token_counter: bool = False
disable_add_transform_inline_image_block: bool = False
disable_add_user_agent_to_request_tags: bool = False
disable_anthropic_gemini_context_caching_transform: bool = False
extra_spend_tag_headers: Optional[List[str]] = None
in_memory_llm_clients_cache: "LLMClientCache"
safe_memory_mode: bool = False
Expand Down
6 changes: 6 additions & 0 deletions litellm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7408,7 +7408,13 @@ def is_cached_message(message: AllMessageValues) -> bool:
Used for anthropic/gemini context caching.
Follows the anthropic format {"cache_control": {"type": "ephemeral"}}
Can be disabled globally by setting litellm.disable_anthropic_gemini_context_caching_transform = True
"""
# Check if context caching is disabled globally
if litellm.disable_anthropic_gemini_context_caching_transform is True:
return False

if "content" not in message:
return False

Expand Down
109 changes: 109 additions & 0 deletions tests/local_testing/test_amazing_vertex_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2571,6 +2571,115 @@ async def test_gemini_context_caching_anthropic_format(sync_mode):
# )


@pytest.mark.parametrize(
    "sync_mode",
    [True, False],
)
@pytest.mark.asyncio
async def test_gemini_context_caching_disabled_flag(sync_mode):
    """
    Test that disable_anthropic_gemini_context_caching_transform flag properly disables context caching.

    When the flag is set to True, messages carrying
    ``{"cache_control": {"type": "ephemeral"}}`` must NOT trigger a separate
    cachedContents creation request — exactly one completion call should be
    made, and its URL must not reference ``cachedContents``.

    Parametrized over sync (``litellm.completion``) and async
    (``litellm.acompletion``) code paths.
    """
    from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler

    litellm.set_verbose = True

    # Save the module-level flag so we can restore it in ``finally`` and avoid
    # leaking global state into other tests.
    original_flag_value = litellm.disable_anthropic_gemini_context_caching_transform

    try:
        # Enable the disable flag
        litellm.disable_anthropic_gemini_context_caching_transform = True

        gemini_context_caching_messages = [
            # System Message with cache_control
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the full text of a complex legal agreement {}".format(
                            uuid.uuid4()
                        )
                        * 4000,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            # User message with cache_control
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                    }
                ],
            },
        ]

        if sync_mode:
            client = HTTPHandler(concurrent_limit=1)
        else:
            client = AsyncHTTPHandler(concurrent_limit=1)

        with patch.object(client, "post", side_effect=mock_gemini_request) as mock_client:
            try:
                if sync_mode:
                    litellm.completion(
                        model="gemini/gemini-2.5-flash-lite-001",
                        messages=gemini_context_caching_messages,
                        temperature=0.2,
                        max_tokens=10,
                        client=client,
                    )
                else:
                    await litellm.acompletion(
                        model="gemini/gemini-2.5-flash-lite-001",
                        messages=gemini_context_caching_messages,
                        temperature=0.2,
                        max_tokens=10,
                        client=client,
                    )

            except Exception as e:
                # The mocked HTTP response may not survive full response
                # parsing; this test only inspects the outbound request(s), so
                # tolerate post-request errors but log them with context.
                print(f"Ignoring exception raised after mocked request: {e}")

            # When caching is disabled, should only make 1 call (no separate cache creation call)
            assert mock_client.call_count == 1, f"Expected 1 call when caching is disabled, got {mock_client.call_count}"

            first_call_args = mock_client.call_args_list[0].kwargs
            first_call_positional_args = mock_client.call_args_list[0].args

            print(f"first_call_args with caching disabled: {first_call_args}")
            print(f"first_call_positional_args with caching disabled: {first_call_positional_args}")

            # The URL may be passed either as a kwarg or the first positional
            # arg of ``post``; check whichever is present.
            url = first_call_args.get("url", first_call_positional_args[0] if first_call_positional_args else "")
            assert "cachedContents" not in url, "cachedContents should not be in URL when caching is disabled"

    finally:
        # Restore original flag value
        litellm.disable_anthropic_gemini_context_caching_transform = original_flag_value



@pytest.mark.asyncio
async def test_partner_models_httpx_ai21():
litellm.set_verbose = True
Expand Down
Loading