Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions nemo_skills/inference/model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@

import httpx
import litellm
import litellm.constants
import litellm.llms.custom_httpx.http_handler
import litellm.llms.openai.common_utils
import openai

from nemo_skills.inference.patch_litellm_logging import patch_litellm_logging_worker
Expand All @@ -28,6 +31,17 @@
from .context_retry import ContextLimitRetryConfig, with_context_retry
from .utils import ServerTokenizer, WrapperAutoTokenizer, trim_after_stop_phrases

# Raise litellm's cached-httpx-client TTL from the default 3600s (1hr) to 4hrs.
# When a cached client's TTL expires it is garbage-collected, which closes its
# httpx.AsyncClient and aborts every in-flight request with "Cannot send a
# request, as the client has been closed". That is harmless for short API calls
# but fatal for long-running generation jobs with high concurrency. litellm
# copies the constant into submodules via `from litellm.constants import ...`,
# so each importing module must be patched in addition to litellm.constants.
_EXTENDED_CLIENT_TTL = 14400
for _patched_module in (
    litellm.constants,
    litellm.llms.custom_httpx.http_handler,
    litellm.llms.openai.common_utils,
):
    _patched_module._DEFAULT_TTL_FOR_HTTPX_CLIENTS = _EXTENDED_CLIENT_TTL
del _patched_module

# Module-level logger, named via the project's logger-name helper for this file.
LOG = logging.getLogger(get_logger_name(__file__))

# The logging worker sometimes does not stop. We patch it to disable its functionality.
Expand Down
Loading