diff --git a/AGENTS.md b/AGENTS.md index 546f2997bf..d330961793 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -212,6 +212,8 @@ When opening issues or pull requests, follow these templates: Using helpers like `supports_reasoning` (which read from `model_prices_and_context_window.json` / `get_model_info`) allows future model updates to "just work" without code changes. +9. **Never close HTTP/SDK clients on cache eviction**: Do not add `close()`, `aclose()`, or `create_task(close_fn())` inside `LLMClientCache._remove_key()` or any cache eviction path. Evicted clients may still be held by in-flight requests; closing them causes `RuntimeError: Cannot send a request, as the client has been closed.` in production after the cache TTL (1 hour) expires. Connection cleanup is handled at shutdown by `close_litellm_async_clients()`. See PR #22247 for the full incident history. + ## HELPFUL RESOURCES - Main documentation: https://docs.litellm.ai/ diff --git a/CLAUDE.md b/CLAUDE.md index 5b36c2be8a..104a751eca 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -116,6 +116,9 @@ LiteLLM is a unified interface for 100+ LLM providers with two main components: - Optional features enabled via environment variables - Separate licensing and authentication for enterprise features +### HTTP Client Cache Safety +- **Never close HTTP/SDK clients on cache eviction.** `LLMClientCache._remove_key()` must not call `close()`/`aclose()` on evicted clients — they may still be used by in-flight requests. Doing so causes `RuntimeError: Cannot send a request, as the client has been closed.` after the 1-hour TTL expires. Cleanup happens at shutdown via `close_litellm_async_clients()`. + ### Troubleshooting: DB schema out of sync after proxy restart `litellm-proxy-extras` runs `prisma migrate deploy` on startup using **its own** bundled migration files, which may lag behind schema changes in the current worktree. Symptoms: `Unknown column`, `Invalid prisma invocation`, or missing data on new fields. diff --git a/litellm/caching/llm_caching_handler.py b/litellm/caching/llm_caching_handler.py index 331aa8f51c..c2274713bb 100644 --- a/litellm/caching/llm_caching_handler.py +++ b/litellm/caching/llm_caching_handler.py @@ -3,36 +3,21 @@ """ import asyncio -from typing import Set from .in_memory_cache import InMemoryCache class LLMClientCache(InMemoryCache): - # Background tasks must be stored to prevent garbage collection, which would - # trigger "coroutine was never awaited" warnings. See: - # https://docs.python.org/3/library/asyncio-task.html#creating-tasks - # Intentionally shared across all instances as a global task registry. - _background_tasks: Set[asyncio.Task] = set() + """Cache for LLM HTTP clients (OpenAI, Azure, httpx, etc.). - def _remove_key(self, key: str) -> None: - """Close async clients before evicting them to prevent connection pool leaks.""" - value = self.cache_dict.get(key) - super()._remove_key(key) - if value is not None: - close_fn = getattr(value, "aclose", None) or getattr(value, "close", None) - if close_fn and asyncio.iscoroutinefunction(close_fn): - try: - task = asyncio.get_running_loop().create_task(close_fn()) - self._background_tasks.add(task) - task.add_done_callback(self._background_tasks.discard) - except RuntimeError: - pass - elif close_fn and callable(close_fn): - try: - close_fn() - except Exception: - pass + IMPORTANT: This cache intentionally does NOT close clients on eviction. + Evicted clients may still be in use by in-flight requests. Closing them + eagerly causes ``RuntimeError: Cannot send a request, as the client has + been closed.`` errors in production after the TTL (1 hour) expires. + + Clients that are no longer referenced will be garbage-collected normally. + For explicit shutdown cleanup, use ``close_litellm_async_clients()``. + """ def update_cache_key_with_event_loop(self, key): """ diff --git a/tests/test_litellm/caching/test_llm_caching_handler.py b/tests/test_litellm/caching/test_llm_caching_handler.py index 0ac4ac5de7..8e6a94945b 100644 --- a/tests/test_litellm/caching/test_llm_caching_handler.py +++ b/tests/test_litellm/caching/test_llm_caching_handler.py @@ -1,3 +1,13 @@ +""" +Tests for LLMClientCache. + +The cache intentionally does NOT close clients on eviction because evicted +clients may still be referenced by in-flight requests. Closing them eagerly +causes ``RuntimeError: Cannot send a request, as the client has been closed.`` + +See: https://github.com/BerriAI/litellm/pull/22247 +""" + import asyncio import os import sys @@ -33,12 +43,12 @@ def close(self): @pytest.mark.asyncio -async def test_remove_key_no_unawaited_coroutine_warning(): +async def test_remove_key_does_not_close_async_client(): """ - Test that evicting an async client from LLMClientCache does not produce - 'coroutine was never awaited' warnings. + Evicting an async client from LLMClientCache must NOT close it because + an in-flight request may still hold a reference to the client. - Regression test for https://github.com/BerriAI/litellm/issues/22128 + Regression test for production 'client has been closed' crashes. """ cache = LLMClientCache(max_size_in_memory=2) @@ -46,43 +56,19 @@ async def test_remove_key_no_unawaited_coroutine_warning(): cache.cache_dict["test-key"] = mock_client cache.ttl_dict["test-key"] = 0 # expired - with warnings.catch_warnings(record=True) as caught_warnings: - warnings.simplefilter("always") - cache._remove_key("test-key") - # Let the event loop process the close task - await asyncio.sleep(0.1) - - coroutine_warnings = [ - w for w in caught_warnings if "coroutine" in str(w.message).lower() - ] - assert ( - len(coroutine_warnings) == 0 - ), f"Got unawaited coroutine warnings: {coroutine_warnings}" - - -@pytest.mark.asyncio -async def test_remove_key_closes_async_client(): - """ - Test that evicting an async client from the cache properly closes it. - """ - cache = LLMClientCache(max_size_in_memory=2) - - mock_client = MockAsyncClient() - cache.cache_dict["test-key"] = mock_client - cache.ttl_dict["test-key"] = 0 - cache._remove_key("test-key") - # Let the event loop process the close task + # Give the event loop a chance to run any background tasks await asyncio.sleep(0.1) - assert mock_client.closed is True + # Client must NOT be closed — it may still be in use + assert mock_client.closed is False assert "test-key" not in cache.cache_dict assert "test-key" not in cache.ttl_dict -def test_remove_key_closes_sync_client(): +def test_remove_key_does_not_close_sync_client(): """ - Test that evicting a sync client from the cache properly closes it. + Evicting a sync client from the cache must NOT close it. """ cache = LLMClientCache(max_size_in_memory=2) @@ -92,15 +78,15 @@ def test_remove_key_closes_sync_client(): cache._remove_key("test-key") - assert mock_client.closed is True + assert mock_client.closed is False assert "test-key" not in cache.cache_dict @pytest.mark.asyncio -async def test_eviction_closes_async_clients(): +async def test_eviction_does_not_close_async_clients(): """ - Test that cache eviction (when cache is full) properly closes async clients - without producing warnings. + When the cache is full and an entry is evicted, the evicted async client + must remain open and must not produce 'coroutine was never awaited' warnings. """ cache = LLMClientCache(max_size_in_memory=2, default_ttl=1) @@ -123,27 +109,41 @@ async def test_eviction_closes_async_clients(): len(coroutine_warnings) == 0 ), f"Got unawaited coroutine warnings: {coroutine_warnings}" + # Evicted clients must NOT be closed + for client in clients: + assert client.closed is False -def test_remove_key_no_event_loop(): + +@pytest.mark.asyncio +async def test_eviction_no_unawaited_coroutine_warning(): """ - Test that _remove_key doesn't raise when there's no running event loop - (falls through to the RuntimeError except branch). + Evicting an async client from LLMClientCache must not produce + 'coroutine was never awaited' warnings. + + Regression test for https://github.com/BerriAI/litellm/issues/22128 """ cache = LLMClientCache(max_size_in_memory=2) mock_client = MockAsyncClient() cache.cache_dict["test-key"] = mock_client - cache.ttl_dict["test-key"] = 0 + cache.ttl_dict["test-key"] = 0 # expired - # Should not raise even though there's no running event loop - cache._remove_key("test-key") - assert "test-key" not in cache.cache_dict + with warnings.catch_warnings(record=True) as caught_warnings: + warnings.simplefilter("always") + cache._remove_key("test-key") + await asyncio.sleep(0.1) + + coroutine_warnings = [ + w for w in caught_warnings if "coroutine" in str(w.message).lower() + ] + assert ( + len(coroutine_warnings) == 0 + ), f"Got unawaited coroutine warnings: {coroutine_warnings}" -@pytest.mark.asyncio -async def test_background_tasks_cleaned_up_after_completion(): +def test_remove_key_no_event_loop(): """ - Test that completed close tasks are removed from the _background_tasks set. + _remove_key works correctly even when there's no running event loop. """ cache = LLMClientCache(max_size_in_memory=2) @@ -151,8 +151,24 @@ async def test_background_tasks_cleaned_up_after_completion(): cache.cache_dict["test-key"] = mock_client cache.ttl_dict["test-key"] = 0 + # Should not raise even though there's no running event loop cache._remove_key("test-key") - # Let the task complete - await asyncio.sleep(0.1) + assert "test-key" not in cache.cache_dict + + +def test_remove_key_removes_plain_values(): + """ + _remove_key correctly removes non-client values (strings, dicts, etc.). + """ + cache = LLMClientCache(max_size_in_memory=5) + + cache.cache_dict["str-key"] = "hello" + cache.ttl_dict["str-key"] = 0 + cache.cache_dict["dict-key"] = {"foo": "bar"} + cache.ttl_dict["dict-key"] = 0 + + cache._remove_key("str-key") + cache._remove_key("dict-key") - assert len(cache._background_tasks) == 0 + assert "str-key" not in cache.cache_dict + assert "dict-key" not in cache.cache_dict diff --git a/tests/test_litellm/caching/test_llm_client_cache_e2e.py b/tests/test_litellm/caching/test_llm_client_cache_e2e.py index a7d012d226..061e780e5d 100644 --- a/tests/test_litellm/caching/test_llm_client_cache_e2e.py +++ b/tests/test_litellm/caching/test_llm_client_cache_e2e.py @@ -1,5 +1,19 @@ """e2e tests: httpx clients obtained via get_async_httpx_client must remain -usable after LLMClientCache evicts their cache entry.""" +usable after LLMClientCache evicts their cache entry. + +These tests exist to prevent a recurring production crash: + RuntimeError: Cannot send a request, as the client has been closed. + +The bug occurs when LLMClientCache._remove_key() eagerly closes evicted +clients that are still referenced by in-flight requests. Every test here +sleeps after eviction to let the event loop drain any background close +tasks — a plain ``assert not client.is_closed`` without sleeping is NOT +sufficient to catch the regression (the close task runs asynchronously). + +See: https://github.com/BerriAI/litellm/pull/22247 +""" + +import asyncio import pytest @@ -25,6 +39,10 @@ async def test_evicted_client_is_not_closed(): # This evicts client_a from cache (capacity=1) client_b = get_async_httpx_client(llm_provider="provider_b") + # Sleep to let any background close tasks execute — without this sleep, + # a regression that schedules close via create_task() would go undetected. + await asyncio.sleep(0.15) + assert not client_a.client.is_closed await client_a.client.aclose() await client_b.client.aclose() @@ -43,5 +61,67 @@ async def test_expired_client_is_not_closed(): cache.expiration_heap = [(0, key) for _, key in cache.expiration_heap] cache.evict_cache() + await asyncio.sleep(0.15) + assert not client.client.is_closed await client.client.aclose() + + +@pytest.mark.asyncio +async def test_evicted_openai_sdk_client_stays_usable(): + """OpenAI/Azure SDK clients cached in LLMClientCache must remain usable + after eviction. This is the exact production scenario: the proxy caches + an AsyncOpenAI client, the TTL expires, a new request evicts the old + entry, but a concurrent streaming request is still reading from it. + + Regression guard: if _remove_key ever calls client.close(), the + underlying httpx client is closed and this test fails. + """ + from openai import AsyncOpenAI + + cache = litellm.in_memory_llm_clients_cache + + client = AsyncOpenAI(api_key="sk-test", base_url="https://api.openai.com/v1") + cache.set_cache("openai-client", client, ttl=600) + + # Evict by inserting a second entry (max_size=1) + cache.set_cache("filler", "x", ttl=600) + + # Let the event loop drain any background close tasks + await asyncio.sleep(0.15) + + # The SDK client's internal httpx client must still be open + assert not client._client.is_closed, ( + "AsyncOpenAI client was closed on cache eviction — this causes " + "'Cannot send a request, as the client has been closed' in production" + ) + await client.close() + + +@pytest.mark.asyncio +async def test_ttl_expired_openai_sdk_client_stays_usable(): + """Same as above but triggered via TTL expiry + get_cache (the other + eviction path).""" + from openai import AsyncOpenAI + + cache = litellm.in_memory_llm_clients_cache + + client = AsyncOpenAI(api_key="sk-test", base_url="https://api.openai.com/v1") + cache.set_cache("openai-client", client, ttl=600) + + # Force TTL expiry + for key in list(cache.ttl_dict.keys()): + cache.ttl_dict[key] = 0 + cache.expiration_heap = [(0, k) for _, k in cache.expiration_heap] + + # get_cache calls evict_element_if_expired → _remove_key + result = cache.get_cache("openai-client") + assert result is None # expired, so returns None + + await asyncio.sleep(0.15) + + assert not client._client.is_closed, ( + "AsyncOpenAI client was closed on TTL expiry — this causes " + "'Cannot send a request, as the client has been closed' in production" + ) + await client.close()