Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions nemo_skills/inference/model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@

import httpx
import litellm
import litellm.constants
import litellm.llms.custom_httpx.http_handler
import litellm.llms.openai.common_utils
import openai

from nemo_skills.inference.patch_litellm_logging import patch_litellm_logging_worker
Expand All @@ -28,6 +31,17 @@
from .context_retry import ContextLimitRetryConfig, with_context_retry
from .utils import ServerTokenizer, WrapperAutoTokenizer, trim_after_stop_phrases

# Raise litellm's cached-httpx-client TTL from the default 3600s (1hr) to 4hrs.
# When a cached client's TTL expires it is garbage-collected, which closes its
# httpx.AsyncClient and aborts every in-flight request with "Cannot send a
# request, as the client has been closed". That is harmless for short API calls
# but fatal for long-running generation jobs with high concurrency. litellm
# copies the constant into submodules via `from litellm.constants import ...`,
# so each importing module must be patched in addition to litellm.constants.
_EXTENDED_CLIENT_TTL = 14400
for _patched_module in (
    litellm.constants,
    litellm.llms.custom_httpx.http_handler,
    litellm.llms.openai.common_utils,
):
    _patched_module._DEFAULT_TTL_FOR_HTTPX_CLIENTS = _EXTENDED_CLIENT_TTL
del _patched_module

# Module-level logger, named via the project's logger-name helper for this file.
LOG = logging.getLogger(get_logger_name(__file__))

# The logging worker sometimes does not stop. We patch it to disable its functionality.
Expand Down
Loading