Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ When opening issues or pull requests, follow these templates:

Using helpers like `supports_reasoning` (which read from `model_prices_and_context_window.json` / `get_model_info`) allows future model updates to "just work" without code changes.

9. **Never close HTTP/SDK clients on cache eviction**: Do not add `close()`, `aclose()`, or `create_task(close_fn())` inside `LLMClientCache._remove_key()` or any cache eviction path. Evicted clients may still be held by in-flight requests; closing them causes `RuntimeError: Cannot send a request, as the client has been closed.` in production after the cache TTL (1 hour) expires. Connection cleanup is handled at shutdown by `close_litellm_async_clients()`. See PR #22247 for the full incident history.

## HELPFUL RESOURCES

- Main documentation: https://docs.litellm.ai/
Expand Down
3 changes: 3 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ LiteLLM is a unified interface for 100+ LLM providers with two main components:
- Optional features enabled via environment variables
- Separate licensing and authentication for enterprise features

### HTTP Client Cache Safety
- **Never close HTTP/SDK clients on cache eviction.** `LLMClientCache._remove_key()` must not call `close()`/`aclose()` on evicted clients — they may still be used by in-flight requests. Doing so causes `RuntimeError: Cannot send a request, as the client has been closed.` after the 1-hour TTL expires. Cleanup happens at shutdown via `close_litellm_async_clients()`.

### Troubleshooting: DB schema out of sync after proxy restart
`litellm-proxy-extras` runs `prisma migrate deploy` on startup using **its own** bundled migration files, which may lag behind schema changes in the current worktree. Symptoms: `Unknown column`, `Invalid prisma invocation`, or missing data on new fields.

Expand Down
33 changes: 9 additions & 24 deletions litellm/caching/llm_caching_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,21 @@
"""

import asyncio
from typing import Set

from .in_memory_cache import InMemoryCache


class LLMClientCache(InMemoryCache):
# Background tasks must be stored to prevent garbage collection, which would
# trigger "coroutine was never awaited" warnings. See:
# https://docs.python.org/3/library/asyncio-task.html#creating-tasks
# Intentionally shared across all instances as a global task registry.
_background_tasks: Set[asyncio.Task] = set()
"""Cache for LLM HTTP clients (OpenAI, Azure, httpx, etc.).

def _remove_key(self, key: str) -> None:
"""Close async clients before evicting them to prevent connection pool leaks."""
value = self.cache_dict.get(key)
super()._remove_key(key)
if value is not None:
close_fn = getattr(value, "aclose", None) or getattr(value, "close", None)
if close_fn and asyncio.iscoroutinefunction(close_fn):
try:
task = asyncio.get_running_loop().create_task(close_fn())
self._background_tasks.add(task)
task.add_done_callback(self._background_tasks.discard)
except RuntimeError:
pass
elif close_fn and callable(close_fn):
try:
close_fn()
except Exception:
pass
IMPORTANT: This cache intentionally does NOT close clients on eviction.
Evicted clients may still be in use by in-flight requests. Closing them
eagerly causes ``RuntimeError: Cannot send a request, as the client has
been closed.`` errors in production after the TTL (1 hour) expires.

Clients that are no longer referenced will be garbage-collected normally.
For explicit shutdown cleanup, use ``close_litellm_async_clients()``.
"""

def update_cache_key_with_event_loop(self, key):
"""
Expand Down
116 changes: 66 additions & 50 deletions tests/test_litellm/caching/test_llm_caching_handler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
"""
Tests for LLMClientCache.

The cache intentionally does NOT close clients on eviction because evicted
clients may still be referenced by in-flight requests. Closing them eagerly
causes ``RuntimeError: Cannot send a request, as the client has been closed.``

See: https://github.com/BerriAI/litellm/pull/22247
"""

import asyncio
import os
import sys
Expand Down Expand Up @@ -33,56 +43,32 @@ def close(self):


@pytest.mark.asyncio
async def test_remove_key_no_unawaited_coroutine_warning():
async def test_remove_key_does_not_close_async_client():
"""
Test that evicting an async client from LLMClientCache does not produce
'coroutine was never awaited' warnings.
Evicting an async client from LLMClientCache must NOT close it because
an in-flight request may still hold a reference to the client.

Regression test for https://github.com/BerriAI/litellm/issues/22128
Regression test for production 'client has been closed' crashes.
"""
cache = LLMClientCache(max_size_in_memory=2)

mock_client = MockAsyncClient()
cache.cache_dict["test-key"] = mock_client
cache.ttl_dict["test-key"] = 0 # expired

with warnings.catch_warnings(record=True) as caught_warnings:
warnings.simplefilter("always")
cache._remove_key("test-key")
# Let the event loop process the close task
await asyncio.sleep(0.1)

coroutine_warnings = [
w for w in caught_warnings if "coroutine" in str(w.message).lower()
]
assert (
len(coroutine_warnings) == 0
), f"Got unawaited coroutine warnings: {coroutine_warnings}"


@pytest.mark.asyncio
async def test_remove_key_closes_async_client():
"""
Test that evicting an async client from the cache properly closes it.
"""
cache = LLMClientCache(max_size_in_memory=2)

mock_client = MockAsyncClient()
cache.cache_dict["test-key"] = mock_client
cache.ttl_dict["test-key"] = 0

cache._remove_key("test-key")
# Let the event loop process the close task
# Give the event loop a chance to run any background tasks
await asyncio.sleep(0.1)

assert mock_client.closed is True
    # Client must NOT be closed — it may still be in use
assert mock_client.closed is False
assert "test-key" not in cache.cache_dict
assert "test-key" not in cache.ttl_dict


def test_remove_key_closes_sync_client():
def test_remove_key_does_not_close_sync_client():
"""
Test that evicting a sync client from the cache properly closes it.
Evicting a sync client from the cache must NOT close it.
"""
cache = LLMClientCache(max_size_in_memory=2)

Expand All @@ -92,15 +78,15 @@ def test_remove_key_closes_sync_client():

cache._remove_key("test-key")

assert mock_client.closed is True
assert mock_client.closed is False
assert "test-key" not in cache.cache_dict


@pytest.mark.asyncio
async def test_eviction_closes_async_clients():
async def test_eviction_does_not_close_async_clients():
"""
Test that cache eviction (when cache is full) properly closes async clients
without producing warnings.
When the cache is full and an entry is evicted, the evicted async client
must remain open and must not produce 'coroutine was never awaited' warnings.
"""
cache = LLMClientCache(max_size_in_memory=2, default_ttl=1)

Expand All @@ -123,36 +109,66 @@ async def test_eviction_closes_async_clients():
len(coroutine_warnings) == 0
), f"Got unawaited coroutine warnings: {coroutine_warnings}"

# Evicted clients must NOT be closed
for client in clients:
assert client.closed is False

def test_remove_key_no_event_loop():

@pytest.mark.asyncio
async def test_eviction_no_unawaited_coroutine_warning():
"""
Test that _remove_key doesn't raise when there's no running event loop
(falls through to the RuntimeError except branch).
Evicting an async client from LLMClientCache must not produce
'coroutine was never awaited' warnings.

Regression test for https://github.com/BerriAI/litellm/issues/22128
"""
cache = LLMClientCache(max_size_in_memory=2)

mock_client = MockAsyncClient()
cache.cache_dict["test-key"] = mock_client
cache.ttl_dict["test-key"] = 0
cache.ttl_dict["test-key"] = 0 # expired

# Should not raise even though there's no running event loop
cache._remove_key("test-key")
assert "test-key" not in cache.cache_dict
with warnings.catch_warnings(record=True) as caught_warnings:
warnings.simplefilter("always")
cache._remove_key("test-key")
await asyncio.sleep(0.1)

coroutine_warnings = [
w for w in caught_warnings if "coroutine" in str(w.message).lower()
]
assert (
len(coroutine_warnings) == 0
), f"Got unawaited coroutine warnings: {coroutine_warnings}"


@pytest.mark.asyncio
async def test_background_tasks_cleaned_up_after_completion():
def test_remove_key_no_event_loop():
"""
Test that completed close tasks are removed from the _background_tasks set.
_remove_key works correctly even when there's no running event loop.
"""
cache = LLMClientCache(max_size_in_memory=2)

mock_client = MockAsyncClient()
cache.cache_dict["test-key"] = mock_client
cache.ttl_dict["test-key"] = 0

# Should not raise even though there's no running event loop
cache._remove_key("test-key")
# Let the task complete
await asyncio.sleep(0.1)
assert "test-key" not in cache.cache_dict


def test_remove_key_removes_plain_values():
"""
_remove_key correctly removes non-client values (strings, dicts, etc.).
"""
cache = LLMClientCache(max_size_in_memory=5)

cache.cache_dict["str-key"] = "hello"
cache.ttl_dict["str-key"] = 0
cache.cache_dict["dict-key"] = {"foo": "bar"}
cache.ttl_dict["dict-key"] = 0

cache._remove_key("str-key")
cache._remove_key("dict-key")

assert len(cache._background_tasks) == 0
assert "str-key" not in cache.cache_dict
assert "dict-key" not in cache.cache_dict
82 changes: 81 additions & 1 deletion tests/test_litellm/caching/test_llm_client_cache_e2e.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
"""e2e tests: httpx clients obtained via get_async_httpx_client must remain
usable after LLMClientCache evicts their cache entry."""
usable after LLMClientCache evicts their cache entry.

These tests exist to prevent a recurring production crash:
RuntimeError: Cannot send a request, as the client has been closed.

The bug occurs when LLMClientCache._remove_key() eagerly closes evicted
clients that are still referenced by in-flight requests. Every test here
sleeps after eviction to let the event loop drain any background close
tasks — a plain ``assert not client.is_closed`` without sleeping is NOT
sufficient to catch the regression (the close task runs asynchronously).

See: https://github.com/BerriAI/litellm/pull/22247
"""

import asyncio

import pytest

Expand All @@ -25,6 +39,10 @@ async def test_evicted_client_is_not_closed():
# This evicts client_a from cache (capacity=1)
client_b = get_async_httpx_client(llm_provider="provider_b")

    # Sleep to let any background close tasks execute — without this sleep,
# a regression that schedules close via create_task() would go undetected.
await asyncio.sleep(0.15)

assert not client_a.client.is_closed
await client_a.client.aclose()
await client_b.client.aclose()
Expand All @@ -43,5 +61,67 @@ async def test_expired_client_is_not_closed():
cache.expiration_heap = [(0, key) for _, key in cache.expiration_heap]
cache.evict_cache()

await asyncio.sleep(0.15)

assert not client.client.is_closed
await client.client.aclose()


@pytest.mark.asyncio
async def test_evicted_openai_sdk_client_stays_usable():
"""OpenAI/Azure SDK clients cached in LLMClientCache must remain usable
after eviction. This is the exact production scenario: the proxy caches
an AsyncOpenAI client, the TTL expires, a new request evicts the old
entry, but a concurrent streaming request is still reading from it.

Regression guard: if _remove_key ever calls client.close(), the
underlying httpx client is closed and this test fails.
"""
from openai import AsyncOpenAI

cache = litellm.in_memory_llm_clients_cache

client = AsyncOpenAI(api_key="sk-test", base_url="https://api.openai.com/v1")
cache.set_cache("openai-client", client, ttl=600)

# Evict by inserting a second entry (max_size=1)
cache.set_cache("filler", "x", ttl=600)

# Let the event loop drain any background close tasks
await asyncio.sleep(0.15)

# The SDK client's internal httpx client must still be open
assert not client._client.is_closed, (
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Accessing private _client attribute

client._client is a private implementation detail of the openai SDK. If the SDK ever renames or restructures this attribute, this assertion will raise an AttributeError rather than failing with a clear test message. Consider using the public client.is_closed() method if the SDK exposes one, or wrapping the access in a hasattr guard with a fallback:

Suggested change
assert not client._client.is_closed, (
assert not client._client.is_closed, (

Alternatively, a safer pattern would be:

internal_client = getattr(client, "_client", None)
assert internal_client is not None, "AsyncOpenAI no longer exposes ._client"
assert not internal_client.is_closed, (
    "AsyncOpenAI client was closed on cache eviction — this causes "
    "'Cannot send a request, as the client has been closed' in production"
)

Same concern applies to client._client.is_closed on line 123 in test_ttl_expired_openai_sdk_client_stays_usable.

"AsyncOpenAI client was closed on cache eviction — this causes "
"'Cannot send a request, as the client has been closed' in production"
)
await client.close()


@pytest.mark.asyncio
async def test_ttl_expired_openai_sdk_client_stays_usable():
"""Same as above but triggered via TTL expiry + get_cache (the other
eviction path)."""
from openai import AsyncOpenAI

cache = litellm.in_memory_llm_clients_cache

client = AsyncOpenAI(api_key="sk-test", base_url="https://api.openai.com/v1")
cache.set_cache("openai-client", client, ttl=600)

# Force TTL expiry
for key in list(cache.ttl_dict.keys()):
cache.ttl_dict[key] = 0
cache.expiration_heap = [(0, k) for _, k in cache.expiration_heap]

# get_cache calls evict_element_if_expired β†’ _remove_key
result = cache.get_cache("openai-client")
assert result is None # expired, so returns None

await asyncio.sleep(0.15)

assert not client._client.is_closed, (
"AsyncOpenAI client was closed on TTL expiry — this causes "
"'Cannot send a request, as the client has been closed' in production"
)
await client.close()
Loading