Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ When opening issues or pull requests, follow these templates:

Using helpers like `supports_reasoning` (which read from `model_prices_and_context_window.json` / `get_model_info`) allows future model updates to "just work" without code changes.

9. **Never close HTTP/SDK clients on cache eviction**: Do not add `close()`, `aclose()`, or `create_task(close_fn())` inside `LLMClientCache._remove_key()` or any cache eviction path. Evicted clients may still be held by in-flight requests; closing them causes `RuntimeError: Cannot send a request, as the client has been closed.` in production after the cache TTL (1 hour) expires. Connection cleanup is handled at shutdown by `close_litellm_async_clients()`. See PR #22247 for the full incident history.

## HELPFUL RESOURCES

- Main documentation: https://docs.litellm.ai/
Expand Down
3 changes: 3 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ LiteLLM is a unified interface for 100+ LLM providers with two main components:
- Optional features enabled via environment variables
- Separate licensing and authentication for enterprise features

### HTTP Client Cache Safety
- **Never close HTTP/SDK clients on cache eviction.** `LLMClientCache._remove_key()` must not call `close()`/`aclose()` on evicted clients — they may still be used by in-flight requests. Doing so causes `RuntimeError: Cannot send a request, as the client has been closed.` after the 1-hour TTL expires. Cleanup happens at shutdown via `close_litellm_async_clients()`.

### Troubleshooting: DB schema out of sync after proxy restart
`litellm-proxy-extras` runs `prisma migrate deploy` on startup using **its own** bundled migration files, which may lag behind schema changes in the current worktree. Symptoms: `Unknown column`, `Invalid prisma invocation`, or missing data on new fields.

Expand Down
33 changes: 9 additions & 24 deletions litellm/caching/llm_caching_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,21 @@
"""

import asyncio
from typing import Set

from .in_memory_cache import InMemoryCache


class LLMClientCache(InMemoryCache):
# Background tasks must be stored to prevent garbage collection, which would
# trigger "coroutine was never awaited" warnings. See:
# https://docs.python.org/3/library/asyncio-task.html#creating-tasks
# Intentionally shared across all instances as a global task registry.
_background_tasks: Set[asyncio.Task] = set()
"""Cache for LLM HTTP clients (OpenAI, Azure, httpx, etc.).

def _remove_key(self, key: str) -> None:
"""Close async clients before evicting them to prevent connection pool leaks."""
value = self.cache_dict.get(key)
super()._remove_key(key)
if value is not None:
close_fn = getattr(value, "aclose", None) or getattr(value, "close", None)
if close_fn and asyncio.iscoroutinefunction(close_fn):
try:
task = asyncio.get_running_loop().create_task(close_fn())
self._background_tasks.add(task)
task.add_done_callback(self._background_tasks.discard)
except RuntimeError:
pass
elif close_fn and callable(close_fn):
try:
close_fn()
except Exception:
pass
IMPORTANT: This cache intentionally does NOT close clients on eviction.
Evicted clients may still be in use by in-flight requests. Closing them
eagerly causes ``RuntimeError: Cannot send a request, as the client has
been closed.`` errors in production after the TTL (1 hour) expires.

Clients that are no longer referenced will be garbage-collected normally.
For explicit shutdown cleanup, use ``close_litellm_async_clients()``.
"""

def update_cache_key_with_event_loop(self, key):
"""
Expand Down
116 changes: 66 additions & 50 deletions tests/test_litellm/caching/test_llm_caching_handler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
"""
Tests for LLMClientCache.

The cache intentionally does NOT close clients on eviction because evicted
clients may still be referenced by in-flight requests. Closing them eagerly
causes ``RuntimeError: Cannot send a request, as the client has been closed.``

See: https://github.com/BerriAI/litellm/pull/22247
"""

import asyncio
import os
import sys
Expand Down Expand Up @@ -33,56 +43,32 @@ def close(self):


@pytest.mark.asyncio
async def test_remove_key_no_unawaited_coroutine_warning():
async def test_remove_key_does_not_close_async_client():
"""
Test that evicting an async client from LLMClientCache does not produce
'coroutine was never awaited' warnings.
Evicting an async client from LLMClientCache must NOT close it because
an in-flight request may still hold a reference to the client.

Regression test for https://github.com/BerriAI/litellm/issues/22128
Regression test for production 'client has been closed' crashes.
"""
cache = LLMClientCache(max_size_in_memory=2)

mock_client = MockAsyncClient()
cache.cache_dict["test-key"] = mock_client
cache.ttl_dict["test-key"] = 0 # expired

with warnings.catch_warnings(record=True) as caught_warnings:
warnings.simplefilter("always")
cache._remove_key("test-key")
# Let the event loop process the close task
await asyncio.sleep(0.1)

coroutine_warnings = [
w for w in caught_warnings if "coroutine" in str(w.message).lower()
]
assert (
len(coroutine_warnings) == 0
), f"Got unawaited coroutine warnings: {coroutine_warnings}"


@pytest.mark.asyncio
async def test_remove_key_closes_async_client():
"""
Test that evicting an async client from the cache properly closes it.
"""
cache = LLMClientCache(max_size_in_memory=2)

mock_client = MockAsyncClient()
cache.cache_dict["test-key"] = mock_client
cache.ttl_dict["test-key"] = 0

cache._remove_key("test-key")
# Let the event loop process the close task
# Give the event loop a chance to run any background tasks
await asyncio.sleep(0.1)

assert mock_client.closed is True
    # Client must NOT be closed — it may still be in use
assert mock_client.closed is False
assert "test-key" not in cache.cache_dict
assert "test-key" not in cache.ttl_dict


def test_remove_key_closes_sync_client():
def test_remove_key_does_not_close_sync_client():
"""
Test that evicting a sync client from the cache properly closes it.
Evicting a sync client from the cache must NOT close it.
"""
cache = LLMClientCache(max_size_in_memory=2)

Expand All @@ -92,15 +78,15 @@ def test_remove_key_closes_sync_client():

cache._remove_key("test-key")

assert mock_client.closed is True
assert mock_client.closed is False
assert "test-key" not in cache.cache_dict


@pytest.mark.asyncio
async def test_eviction_closes_async_clients():
async def test_eviction_does_not_close_async_clients():
"""
Test that cache eviction (when cache is full) properly closes async clients
without producing warnings.
When the cache is full and an entry is evicted, the evicted async client
must remain open and must not produce 'coroutine was never awaited' warnings.
"""
cache = LLMClientCache(max_size_in_memory=2, default_ttl=1)

Expand All @@ -123,36 +109,66 @@ async def test_eviction_closes_async_clients():
len(coroutine_warnings) == 0
), f"Got unawaited coroutine warnings: {coroutine_warnings}"

# Evicted clients must NOT be closed
for client in clients:
assert client.closed is False

def test_remove_key_no_event_loop():

@pytest.mark.asyncio
async def test_eviction_no_unawaited_coroutine_warning():
"""
Test that _remove_key doesn't raise when there's no running event loop
(falls through to the RuntimeError except branch).
Evicting an async client from LLMClientCache must not produce
'coroutine was never awaited' warnings.

Regression test for https://github.com/BerriAI/litellm/issues/22128
"""
cache = LLMClientCache(max_size_in_memory=2)

mock_client = MockAsyncClient()
cache.cache_dict["test-key"] = mock_client
cache.ttl_dict["test-key"] = 0
cache.ttl_dict["test-key"] = 0 # expired

# Should not raise even though there's no running event loop
cache._remove_key("test-key")
assert "test-key" not in cache.cache_dict
with warnings.catch_warnings(record=True) as caught_warnings:
warnings.simplefilter("always")
cache._remove_key("test-key")
await asyncio.sleep(0.1)

coroutine_warnings = [
w for w in caught_warnings if "coroutine" in str(w.message).lower()
]
assert (
len(coroutine_warnings) == 0
), f"Got unawaited coroutine warnings: {coroutine_warnings}"


@pytest.mark.asyncio
async def test_background_tasks_cleaned_up_after_completion():
def test_remove_key_no_event_loop():
"""
Test that completed close tasks are removed from the _background_tasks set.
_remove_key works correctly even when there's no running event loop.
"""
cache = LLMClientCache(max_size_in_memory=2)

mock_client = MockAsyncClient()
cache.cache_dict["test-key"] = mock_client
cache.ttl_dict["test-key"] = 0

# Should not raise even though there's no running event loop
cache._remove_key("test-key")
# Let the task complete
await asyncio.sleep(0.1)
assert "test-key" not in cache.cache_dict


def test_remove_key_removes_plain_values():
"""
_remove_key correctly removes non-client values (strings, dicts, etc.).
"""
cache = LLMClientCache(max_size_in_memory=5)

cache.cache_dict["str-key"] = "hello"
cache.ttl_dict["str-key"] = 0
cache.cache_dict["dict-key"] = {"foo": "bar"}
cache.ttl_dict["dict-key"] = 0

cache._remove_key("str-key")
cache._remove_key("dict-key")

assert len(cache._background_tasks) == 0
assert "str-key" not in cache.cache_dict
assert "dict-key" not in cache.cache_dict
82 changes: 81 additions & 1 deletion tests/test_litellm/caching/test_llm_client_cache_e2e.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
"""e2e tests: httpx clients obtained via get_async_httpx_client must remain
usable after LLMClientCache evicts their cache entry."""
usable after LLMClientCache evicts their cache entry.

These tests exist to prevent a recurring production crash:
RuntimeError: Cannot send a request, as the client has been closed.

The bug occurs when LLMClientCache._remove_key() eagerly closes evicted
clients that are still referenced by in-flight requests. Every test here
sleeps after eviction to let the event loop drain any background close
tasks — a plain ``assert not client.is_closed`` without sleeping is NOT
sufficient to catch the regression (the close task runs asynchronously).

See: https://github.com/BerriAI/litellm/pull/22247
"""

import asyncio

import pytest

Expand All @@ -25,6 +39,10 @@ async def test_evicted_client_is_not_closed():
# This evicts client_a from cache (capacity=1)
client_b = get_async_httpx_client(llm_provider="provider_b")

    # Sleep to let any background close tasks execute — without this sleep,
# a regression that schedules close via create_task() would go undetected.
await asyncio.sleep(0.15)

assert not client_a.client.is_closed
await client_a.client.aclose()
await client_b.client.aclose()
Expand All @@ -43,5 +61,67 @@ async def test_expired_client_is_not_closed():
cache.expiration_heap = [(0, key) for _, key in cache.expiration_heap]
cache.evict_cache()

await asyncio.sleep(0.15)

assert not client.client.is_closed
await client.client.aclose()


@pytest.mark.asyncio
async def test_evicted_openai_sdk_client_stays_usable():
"""OpenAI/Azure SDK clients cached in LLMClientCache must remain usable
after eviction. This is the exact production scenario: the proxy caches
an AsyncOpenAI client, the TTL expires, a new request evicts the old
entry, but a concurrent streaming request is still reading from it.

Regression guard: if _remove_key ever calls client.close(), the
underlying httpx client is closed and this test fails.
"""
from openai import AsyncOpenAI

cache = litellm.in_memory_llm_clients_cache

client = AsyncOpenAI(api_key="sk-test", base_url="https://api.openai.com/v1")
cache.set_cache("openai-client", client, ttl=600)

# Evict by inserting a second entry (max_size=1)
cache.set_cache("filler", "x", ttl=600)

# Let the event loop drain any background close tasks
await asyncio.sleep(0.15)

# The SDK client's internal httpx client must still be open
assert not client._client.is_closed, (
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Accessing private _client attribute

client._client is a private implementation detail of the openai SDK. If the SDK ever renames or restructures this attribute, this assertion will raise an AttributeError rather than failing with a clear test message. Consider using the public client.is_closed() method if the SDK exposes one, or wrapping the access in a hasattr guard with a fallback:

Suggested change
assert not client._client.is_closed, (
assert not client._client.is_closed, (

Alternatively, a safer pattern would be:

internal_client = getattr(client, "_client", None)
assert internal_client is not None, "AsyncOpenAI no longer exposes ._client"
assert not internal_client.is_closed, (
    "AsyncOpenAI client was closed on cache eviction — this causes "
    "'Cannot send a request, as the client has been closed' in production"
)

Same concern applies to client._client.is_closed on line 123 in test_ttl_expired_openai_sdk_client_stays_usable.

"AsyncOpenAI client was closed on cache eviction — this causes "
"'Cannot send a request, as the client has been closed' in production"
)
await client.close()


@pytest.mark.asyncio
async def test_ttl_expired_openai_sdk_client_stays_usable():
"""Same as above but triggered via TTL expiry + get_cache (the other
eviction path)."""
from openai import AsyncOpenAI

cache = litellm.in_memory_llm_clients_cache

client = AsyncOpenAI(api_key="sk-test", base_url="https://api.openai.com/v1")
cache.set_cache("openai-client", client, ttl=600)

# Force TTL expiry
for key in list(cache.ttl_dict.keys()):
cache.ttl_dict[key] = 0
cache.expiration_heap = [(0, k) for _, k in cache.expiration_heap]

# get_cache calls evict_element_if_expired β†’ _remove_key
result = cache.get_cache("openai-client")
assert result is None # expired, so returns None

await asyncio.sleep(0.15)

assert not client._client.is_closed, (
"AsyncOpenAI client was closed on TTL expiry — this causes "
"'Cannot send a request, as the client has been closed' in production"
)
await client.close()
Loading