Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions litellm/proxy/prometheus_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,20 @@ def wipe_directory(directory: str) -> None:
verbose_proxy_logger.info(
f"Prometheus cleanup: wiped {deleted} stale .db files from {directory}"
)


def mark_worker_exit(worker_pid: int) -> None:
    """Report an exited worker process to prometheus_client.

    Meant to be invoked from gunicorn's ``child_exit`` hook. The call is a
    no-op unless ``PROMETHEUS_MULTIPROC_DIR`` is set to a non-empty value.
    Cleanup is delegated to ``prometheus_client.multiprocess.mark_process_dead``;
    any exception raised along the way is logged as a warning rather than
    propagated, so a failed cleanup never breaks the caller.

    Args:
        worker_pid: PID of the worker process that exited.
    """
    multiproc_dir = os.environ.get("PROMETHEUS_MULTIPROC_DIR")
    if multiproc_dir is None or multiproc_dir == "":
        # Multiprocess metrics are not configured, so there is nothing to do.
        return

    try:
        # Imported lazily and inside the try so that an import failure is
        # reported through the same warning path as a cleanup failure.
        from prometheus_client import multiprocess as prom_multiprocess

        prom_multiprocess.mark_process_dead(worker_pid)
        verbose_proxy_logger.info(
            f"Prometheus cleanup: marked worker {worker_pid} as dead"
        )
    except Exception as exc:
        verbose_proxy_logger.warning(
            f"Failed to mark prometheus worker {worker_pid} as dead: {exc}"
        )
9 changes: 9 additions & 0 deletions litellm/proxy/proxy_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,15 @@ def load(self):
if max_requests_before_restart is not None:
gunicorn_options["max_requests"] = max_requests_before_restart

# Clean up prometheus .db files when a worker exits (prevents ghost gauge values)
if os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
from litellm.proxy.prometheus_cleanup import mark_worker_exit

def child_exit(server, worker):
mark_worker_exit(worker.pid)

gunicorn_options["child_exit"] = child_exit

if ssl_certfile_path is not None and ssl_keyfile_path is not None:
print( # noqa
f"\033[1;32mLiteLLM Proxy: Using SSL with certfile: {ssl_certfile_path} and keyfile: {ssl_keyfile_path}\033[0m\n" # noqa
Expand Down
31 changes: 30 additions & 1 deletion tests/test_litellm/proxy/test_prometheus_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import pytest

from litellm.proxy.prometheus_cleanup import wipe_directory
from litellm.proxy.prometheus_cleanup import mark_worker_exit, wipe_directory
from litellm.proxy.proxy_cli import ProxyInitializationHelpers


Expand All @@ -23,6 +23,35 @@ def test_deletes_all_db_files(self, tmp_path):
assert not list(tmp_path.glob("*.db"))


class TestMarkWorkerExit:
    """Unit tests for ``mark_worker_exit``."""

    # Dotted path of the prometheus_client function replaced by a mock.
    _MARK_DEAD = "prometheus_client.multiprocess.mark_process_dead"

    def test_calls_mark_process_dead_when_env_set(self, tmp_path):
        """With the multiproc dir configured, the worker PID is forwarded."""
        env = {"PROMETHEUS_MULTIPROC_DIR": str(tmp_path)}
        with patch.dict(os.environ, env), patch(self._MARK_DEAD) as mark_dead:
            mark_worker_exit(12345)
            mark_dead.assert_called_once_with(12345)

    def test_noop_when_env_not_set(self):
        """Without the env var, prometheus_client is never called."""
        with patch.dict(os.environ, {}, clear=False):
            # patch.dict restores the variable on exit if it was set before.
            os.environ.pop("PROMETHEUS_MULTIPROC_DIR", None)
            with patch(self._MARK_DEAD) as mark_dead:
                mark_worker_exit(12345)
                mark_dead.assert_not_called()

    def test_exception_is_caught_and_logged(self, tmp_path):
        """A failure inside mark_process_dead must not reach the caller."""
        env = {"PROMETHEUS_MULTIPROC_DIR": str(tmp_path)}
        failing = patch(self._MARK_DEAD, side_effect=FileNotFoundError("gone"))
        with patch.dict(os.environ, env), failing as mark_dead:
            # Should not raise
            mark_worker_exit(99)
            mark_dead.assert_called_once_with(99)


class TestMaybeSetupPrometheusMultiprocDir:
def test_respects_existing_env_var(self, tmp_path):
"""When PROMETHEUS_MULTIPROC_DIR is already set, don't override it."""
Expand Down
Loading