def mark_worker_exit(worker_pid: int) -> None:
    """Tell prometheus_client that a multiprocess worker has exited.

    Meant to be invoked from gunicorn's ``child_exit`` server hook with the
    PID of the worker that just exited.

    The call is a no-op unless ``PROMETHEUS_MULTIPROC_DIR`` is set to a
    non-empty value. Otherwise it delegates to
    ``prometheus_client.multiprocess.mark_process_dead``. Per the
    prometheus_client documentation, that helper removes only the dead
    worker's *live* gauge files, so stale "live" gauge values stop being
    reported; it does not delete every ``.db`` file the worker wrote.

    Args:
        worker_pid: PID of the worker process that exited.

    Returns:
        None. Failures are logged as warnings and never raised.
    """
    if not os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
        return

    # This runs inside a server hook, so cleanup is strictly best-effort:
    # any failure (including prometheus_client not being installed) is logged
    # and swallowed rather than propagated to the caller.
    try:
        # Imported lazily so this module can be imported without
        # prometheus_client being installed.
        from prometheus_client import multiprocess

        multiprocess.mark_process_dead(worker_pid)
        verbose_proxy_logger.info(
            f"Prometheus cleanup: marked worker {worker_pid} as dead"
        )
    except Exception as e:
        verbose_proxy_logger.warning(
            f"Failed to mark prometheus worker {worker_pid} as dead: {e}"
        )
class TestMarkWorkerExit:
    """Unit tests for ``mark_worker_exit``.

    ``prometheus_client.multiprocess.mark_process_dead`` is always patched,
    so no real metric files are touched.
    """

    # Patch targets shared by the tests below.
    MARK_DEAD = "prometheus_client.multiprocess.mark_process_dead"
    LOGGER = "litellm.proxy.prometheus_cleanup.verbose_proxy_logger"

    def test_calls_mark_process_dead_when_env_set(self, tmp_path):
        """With the multiproc dir configured, the PID is forwarded as-is."""
        env = {"PROMETHEUS_MULTIPROC_DIR": str(tmp_path)}
        with patch.dict(os.environ, env), patch(self.MARK_DEAD) as mock_mark:
            mark_worker_exit(12345)
        mock_mark.assert_called_once_with(12345)

    def test_noop_when_env_not_set(self):
        """Without PROMETHEUS_MULTIPROC_DIR nothing is marked dead."""
        # patch.dict snapshots os.environ and restores it on exit, so the
        # pop below cannot leak into other tests.
        with patch.dict(os.environ), patch(self.MARK_DEAD) as mock_mark:
            os.environ.pop("PROMETHEUS_MULTIPROC_DIR", None)
            mark_worker_exit(12345)
        mock_mark.assert_not_called()

    def test_exception_is_caught_and_logged(self, tmp_path):
        """A failing cleanup must not raise and must emit a warning."""
        env = {"PROMETHEUS_MULTIPROC_DIR": str(tmp_path)}
        with patch.dict(os.environ, env), patch(
            self.MARK_DEAD, side_effect=FileNotFoundError("gone")
        ) as mock_mark, patch(self.LOGGER) as mock_logger:
            # Should not raise
            mark_worker_exit(99)
        mock_mark.assert_called_once_with(99)
        # The test name promises logging, so verify the warning was emitted
        # and that it carries the underlying error text.
        mock_logger.warning.assert_called_once()
        assert "gone" in mock_logger.warning.call_args[0][0]