From 18ad0f7cfa1365d396f824385a1eec6b0f2d9238 Mon Sep 17 00:00:00 2001 From: Wojciech Wais Date: Fri, 6 Mar 2026 16:35:00 +0100 Subject: [PATCH 1/2] [Feature] Add /live liveness probe and k8s shutdown docs - Add /live endpoint that returns 200 during graceful drain (unlike /health which returns 503). This lets Kubernetes distinguish between "draining" and "dead" via separate liveness and readiness probes. - Add is_engine_dead property to EngineClient/AsyncLLM so the /live endpoint only fails on fatal engine errors, not graceful shutdown. - Exempt /live and /metrics from ScalingMiddleware 503 blocking. - Add comprehensive "Graceful Shutdown" section to k8s deployment docs with probe configuration examples and terminationGracePeriodSeconds. Part of RFC #24885 Signed-off-by: Wojciech Wais --- docs/deployment/k8s.md | 83 ++++++++++++++++++- vllm/engine/protocol.py | 7 ++ vllm/entrypoints/launcher.py | 3 + .../serve/elastic_ep/middleware.py | 23 +++-- .../serve/instrumentator/health.py | 19 ++++- .../serve/instrumentator/metrics.py | 1 + vllm/v1/engine/async_llm.py | 6 ++ 7 files changed, 132 insertions(+), 10 deletions(-) diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index dbcb277278c9..fb13e2faa216 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -4,6 +4,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le - [Deployment with CPUs](#deployment-with-cpus) - [Deployment with GPUs](#deployment-with-gpus) +- [Graceful Shutdown](#graceful-shutdown) - [Troubleshooting](#troubleshooting) - [Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"](#startup-probe-or-readiness-probe-failure-container-log-contains-keyboardinterrupt-terminated) - [Conclusion](#conclusion) @@ -242,7 +243,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) mountPath: /dev/shm livenessProbe: httpGet: - path: /health + path: /live port: 8000 initialDelaySeconds: 60 periodSeconds: 10 @@ -387,6 +388,86 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) If the service is correctly deployed, you should receive a response from the vLLM model. +## Graceful Shutdown + +When the vLLM server receives a `SIGTERM` signal (e.g. from Kubernetes +during pod termination), it marks itself as draining and begins +shutting down. During this period, the `/health` readiness probe +returns `503` so that load balancers stop routing new traffic, while +the `/live` liveness probe continues returning `200` so that +Kubernetes does not restart the pod mid-shutdown. + +### Probe endpoints + +vLLM exposes two probe endpoints with different semantics: + +| State | `/health` (readiness) | `/live` (liveness) | +|--------------|-----------------------|--------------------| +| Running | 200 | 200 | +| Paused | 503 | 200 | +| Draining | 503 | 200 | +| Dead/crashed | 503 | 503 | + +- **`/health`** is a *readiness* probe — it returns `503` during shutdown + drain so that the load balancer stops routing new traffic to the pod. +- **`/live`** is a *liveness* probe — it returns `200` as long as the + process is alive (even while draining), so Kubernetes does **not** + restart the pod mid-drain. It only returns `503` when the engine has + encountered a fatal error. + +### Kubernetes deployment with graceful shutdown + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-server +spec: + replicas: 3 + selector: + matchLabels: + app: vllm + template: + metadata: + labels: + app: vllm + spec: + terminationGracePeriodSeconds: 60 + containers: + - name: vllm + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: + - "vllm serve mistralai/Mistral-7B-Instruct-v0.3" + ports: + - containerPort: 8000 + livenessProbe: + httpGet: + path: /live + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 5 +``` + +!!! tip "Optional preStop hook" + You can add a `preStop` hook to sleep for a few seconds before + the `SIGTERM` is sent. This gives the Kubernetes endpoints + controller time to remove the pod from the Service, preventing + new connections from arriving during the first moments of drain: + + ```yaml + lifecycle: + preStop: + exec: + command: ["sleep", "5"] + ``` + ## Troubleshooting ### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated" diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 0b3b29cd6c1f..f9d105be9af3 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -120,6 +120,13 @@ async def check_health(self) -> None: """Raise if unhealthy""" ... + @property + def is_engine_dead(self) -> bool: + """Return True only when the engine has encountered a fatal error. + This is distinct from ``errored`` which also returns True during + graceful shutdown/drain.""" + return self.errored + @abstractmethod async def start_profile(self) -> None: """Start profiling the engine""" diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 8caeb80836f9..d3d55de72647 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -95,6 +95,9 @@ async def serve_http( shutdown_event = asyncio.Event() def signal_handler() -> None: + # Mark as draining so /health (readiness) returns 503 + # while /live (liveness) continues to return 200. + app.state.draining = True shutdown_event.set() async def dummy_shutdown() -> None: diff --git a/vllm/entrypoints/serve/elastic_ep/middleware.py b/vllm/entrypoints/serve/elastic_ep/middleware.py index 23f45eafeaa0..e7f61ed10551 100644 --- a/vllm/entrypoints/serve/elastic_ep/middleware.py +++ b/vllm/entrypoints/serve/elastic_ep/middleware.py @@ -31,19 +31,26 @@ class ScalingMiddleware: def __init__(self, app: ASGIApp) -> None: self.app = app + # Paths that should never be blocked by the scaling middleware. + _EXEMPT_PATHS = {"/live", "/metrics"} + def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]: if scope["type"] != "http": return self.app(scope, receive, send) # Check global scaling state if get_scaling_elastic_ep(): - # Return 503 Service Unavailable response - response = JSONResponse( - content={ - "error": "The model is currently scaling. Please try again later." - }, - status_code=503, - ) - return response(scope, receive, send) + # Allow liveness probe and metrics through even while scaling. + path = scope.get("path", "") + if path not in self._EXEMPT_PATHS: + # Return 503 Service Unavailable response + response = JSONResponse( + content={ + "error": "The model is currently scaling. " + "Please try again later." + }, + status_code=503, + ) + return response(scope, receive, send) return self.app(scope, receive, send) diff --git a/vllm/entrypoints/serve/instrumentator/health.py b/vllm/entrypoints/serve/instrumentator/health.py index 5c0b2d1855d9..cb1b0db96c9e 100644 --- a/vllm/entrypoints/serve/instrumentator/health.py +++ b/vllm/entrypoints/serve/instrumentator/health.py @@ -21,7 +21,10 @@ def engine_client(request: Request) -> EngineClient: @router.get("/health", response_class=Response) async def health(raw_request: Request) -> Response: - """Health check.""" + """Readiness probe. Returns 503 during shutdown/drain so that + load balancers stop sending new traffic.""" + if getattr(raw_request.app.state, "draining", False): + return Response(status_code=503) client = engine_client(raw_request) if client is None: # Render-only servers have no engine; they are always healthy. @@ -31,3 +34,17 @@ async def health(raw_request: Request) -> Response: return Response(status_code=200) except EngineDeadError: return Response(status_code=503) + + +@router.get("/live", response_class=Response) +async def live(raw_request: Request) -> Response: + """Liveness probe. Returns 200 as long as the process is alive, + even during graceful shutdown/drain. Only returns 503 when the + engine has encountered a fatal error.""" + client = engine_client(raw_request) + if client is None: + # Render-only servers have no engine; they are always alive. + return Response(status_code=200) + if client.is_engine_dead: + return Response(status_code=503) + return Response(status_code=200) diff --git a/vllm/entrypoints/serve/instrumentator/metrics.py b/vllm/entrypoints/serve/instrumentator/metrics.py index 5231451383a2..dfd29a5dabb4 100644 --- a/vllm/entrypoints/serve/instrumentator/metrics.py +++ b/vllm/entrypoints/serve/instrumentator/metrics.py @@ -29,6 +29,7 @@ def attach_router(app: FastAPI): excluded_handlers=[ "/metrics", "/health", + "/live", "/load", "/ping", "/version", diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a9c42e78e53b..fb5ee476920f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1026,6 +1026,12 @@ def is_stopped(self) -> bool: def errored(self) -> bool: return self.engine_core.resources.engine_dead or not self.is_running + @property + def is_engine_dead(self) -> bool: + """True only when the engine has encountered a fatal error, + not during graceful shutdown/drain.""" + return self.engine_core.resources.engine_dead + @property def dead_error(self) -> BaseException: return EngineDeadError() From e3b380457c18bccb8dae9dd886220319ace35d85 Mon Sep 17 00:00:00 2001 From: Wojciech Wais Date: Wed, 11 Mar 2026 07:54:45 +0100 Subject: [PATCH 2/2] [Feature] Add unit tests for /live probe, /health draining, and ScalingMiddleware - TestLiveEndpoint: test /live returns 200 for healthy/draining engines, 503 for dead engines, 200 for render-only servers. - TestHealthDraining: test /health returns 503 when draining (skipping check_health), 200 when healthy, 200 for render-only servers. - TestScalingMiddlewareExemptions: test /live and /metrics are exempt from 503 during scaling, other paths are blocked. Signed-off-by: Wojciech Wais --- .../entrypoints/instrumentator/test_basic.py | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) diff --git a/tests/entrypoints/instrumentator/test_basic.py b/tests/entrypoints/instrumentator/test_basic.py index 9c2986ebe6c9..8b2bea65841b 100644 --- a/tests/entrypoints/instrumentator/test_basic.py +++ b/tests/entrypoints/instrumentator/test_basic.py @@ -249,3 +249,209 @@ async def test_health_check_engine_dead_error(): # Assert that it returns 503 Service Unavailable assert response.status_code == 503 + + +# --------------------------------------------------------------------------- +# Unit tests for /live liveness probe and /health draining behavior +# --------------------------------------------------------------------------- + + +def _make_mock_request( + engine_client=None, + draining=False, +): + """Create a mock FastAPI Request with configurable app state.""" + mock_request = Mock(spec=Request) + mock_app_state = Mock() + mock_app_state.engine_client = engine_client + mock_app_state.draining = draining + mock_request.app.state = mock_app_state + return mock_request + + +class TestLiveEndpoint: + """Tests for the /live liveness probe.""" + + @pytest.mark.asyncio + async def test_live_healthy_engine(self): + from vllm.entrypoints.serve.instrumentator.health import live + + mock_client = Mock() + mock_client.is_engine_dead = False + request = _make_mock_request(engine_client=mock_client) + + response = await live(request) + assert response.status_code == 200 + + @pytest.mark.asyncio + async def test_live_dead_engine(self): + from vllm.entrypoints.serve.instrumentator.health import live + + mock_client = Mock() + mock_client.is_engine_dead = True + request = _make_mock_request(engine_client=mock_client) + + response = await live(request) + assert response.status_code == 503 + + @pytest.mark.asyncio + async def test_live_during_drain(self): + """Liveness probe returns 200 during graceful drain.""" + from vllm.entrypoints.serve.instrumentator.health import live + + mock_client = Mock() + mock_client.is_engine_dead = False + request = _make_mock_request(engine_client=mock_client, draining=True) + + response = await live(request) + assert response.status_code == 200 + + @pytest.mark.asyncio + async def test_live_render_only_server(self): + """Render-only servers have no engine client.""" + from vllm.entrypoints.serve.instrumentator.health import live + + request = _make_mock_request(engine_client=None) + + response = await live(request) + assert response.status_code == 200 + + +class TestHealthDraining: + """Tests for /health readiness probe draining behavior.""" + + @pytest.mark.asyncio + async def test_health_returns_503_when_draining(self): + from vllm.entrypoints.serve.instrumentator.health import health + + mock_client = AsyncMock() + request = _make_mock_request(engine_client=mock_client, draining=True) + + response = await health(request) + assert response.status_code == 503 + # check_health should NOT be called when draining + mock_client.check_health.assert_not_called() + + @pytest.mark.asyncio + async def test_health_returns_200_when_not_draining(self): + from vllm.entrypoints.serve.instrumentator.health import health + + mock_client = AsyncMock() + request = _make_mock_request(engine_client=mock_client, draining=False) + + response = await health(request) + assert response.status_code == 200 + mock_client.check_health.assert_called_once() + + @pytest.mark.asyncio + async def test_health_render_only_server(self): + """Render-only servers have no engine; always healthy.""" + from vllm.entrypoints.serve.instrumentator.health import health + + request = _make_mock_request(engine_client=None) + + response = await health(request) + assert response.status_code == 200 + + +class TestScalingMiddlewareExemptions: + """Tests for ScalingMiddleware exempt paths (/live, /metrics).""" + + @pytest.mark.asyncio + async def test_live_exempt_during_scaling(self): + from vllm.entrypoints.serve.elastic_ep.middleware import ( + ScalingMiddleware, + set_scaling_elastic_ep, + ) + + received_scopes = [] + + async def mock_app(scope, receive, send): + received_scopes.append(scope) + + middleware = ScalingMiddleware(mock_app) + scope = {"type": "http", "path": "/live"} + + try: + set_scaling_elastic_ep(True) + await middleware(scope, None, None) + finally: + set_scaling_elastic_ep(False) + + # /live should pass through to the app + assert len(received_scopes) == 1 + + @pytest.mark.asyncio + async def test_metrics_exempt_during_scaling(self): + from vllm.entrypoints.serve.elastic_ep.middleware import ( + ScalingMiddleware, + set_scaling_elastic_ep, + ) + + received_scopes = [] + + async def mock_app(scope, receive, send): + received_scopes.append(scope) + + middleware = ScalingMiddleware(mock_app) + scope = {"type": "http", "path": "/metrics"} + + try: + set_scaling_elastic_ep(True) + await middleware(scope, None, None) + finally: + set_scaling_elastic_ep(False) + + assert len(received_scopes) == 1 + + @pytest.mark.asyncio + async def test_other_paths_blocked_during_scaling(self): + from vllm.entrypoints.serve.elastic_ep.middleware import ( + ScalingMiddleware, + set_scaling_elastic_ep, + ) + + received_scopes = [] + sent_responses = [] + + async def mock_app(scope, receive, send): + received_scopes.append(scope) + + async def mock_send(message): + sent_responses.append(message) + + middleware = ScalingMiddleware(mock_app) + scope = {"type": "http", "path": "/v1/completions"} + + try: + set_scaling_elastic_ep(True) + await middleware(scope, None, mock_send) + finally: + set_scaling_elastic_ep(False) + + # Should NOT pass through to the app + assert len(received_scopes) == 0 + # Should have sent a 503 response + assert any( + r.get("status") == 503 for r in sent_responses if isinstance(r, dict) + ) + + @pytest.mark.asyncio + async def test_all_paths_pass_when_not_scaling(self): + from vllm.entrypoints.serve.elastic_ep.middleware import ( + ScalingMiddleware, + set_scaling_elastic_ep, + ) + + received_scopes = [] + + async def mock_app(scope, receive, send): + received_scopes.append(scope) + + middleware = ScalingMiddleware(mock_app) + set_scaling_elastic_ep(False) + + for path in ["/live", "/health", "/v1/completions", "/metrics"]: + await middleware({"type": "http", "path": path}, None, None) + + assert len(received_scopes) == 4