diff --git a/nemo_skills/inference/model/base.py b/nemo_skills/inference/model/base.py
index 834b9273aa..5330e9266b 100644
--- a/nemo_skills/inference/model/base.py
+++ b/nemo_skills/inference/model/base.py
@@ -20,6 +20,9 @@
 import httpx
 import litellm
+import litellm.constants
+import litellm.llms.custom_httpx.http_handler
+import litellm.llms.openai.common_utils
 import openai
 
 from nemo_skills.inference.patch_litellm_logging import patch_litellm_logging_worker
 
@@ -28,6 +31,17 @@ from .context_retry import ContextLimitRetryConfig, with_context_retry
 from .utils import ServerTokenizer, WrapperAutoTokenizer, trim_after_stop_phrases
 
+# litellm caches OpenAI/httpx clients with a 3600s (1hr) TTL. When the cached
+# client expires it is garbage-collected, closing its httpx.AsyncClient and
+# killing every in-flight request with "Cannot send a request, as the client
+# has been closed". This is fine for short API calls but fatal for long-running
+# generation jobs with high concurrency. The constant is copied into submodules
+# via `from litellm.constants import ...`, so all three locations must be patched.
+_EXTENDED_CLIENT_TTL = 14400
+litellm.constants._DEFAULT_TTL_FOR_HTTPX_CLIENTS = _EXTENDED_CLIENT_TTL
+litellm.llms.custom_httpx.http_handler._DEFAULT_TTL_FOR_HTTPX_CLIENTS = _EXTENDED_CLIENT_TTL
+litellm.llms.openai.common_utils._DEFAULT_TTL_FOR_HTTPX_CLIENTS = _EXTENDED_CLIENT_TTL
+
 LOG = logging.getLogger(get_logger_name(__file__))
 
 # The logging worker sometimes does not stop. We patch it to disable its functionality.