From 18ad0f7cfa1365d396f824385a1eec6b0f2d9238 Mon Sep 17 00:00:00 2001
From: Wojciech Wais <wojciech.wais@gmail.com>
Date: Fri, 6 Mar 2026 16:35:00 +0100
Subject: [PATCH 1/2] [Feature] Add /live liveness probe and k8s shutdown docs

- Add /live endpoint that returns 200 during graceful drain (unlike
  /health which returns 503). This lets Kubernetes distinguish between
  "draining" and "dead" via separate liveness and readiness probes.

- Add is_engine_dead property to EngineClient/AsyncLLM so the /live
  endpoint only fails on fatal engine errors, not graceful shutdown.

- Exempt /live and /metrics from ScalingMiddleware 503 blocking.

- Add comprehensive "Graceful Shutdown" section to k8s deployment docs
  with probe configuration examples and terminationGracePeriodSeconds.

Part of RFC #24885

Signed-off-by: Wojciech Wais <wojciech.wais@gmail.com>
---
 docs/deployment/k8s.md                        | 83 ++++++++++++++++++-
 vllm/engine/protocol.py                       |  7 ++
 vllm/entrypoints/launcher.py                  |  3 +
 .../serve/elastic_ep/middleware.py            | 23 +++--
 .../serve/instrumentator/health.py            | 19 ++++-
 .../serve/instrumentator/metrics.py           |  1 +
 vllm/v1/engine/async_llm.py                   |  6 ++
 7 files changed, 132 insertions(+), 10 deletions(-)

diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md
index dbcb277278c9..fb13e2faa216 100644
--- a/docs/deployment/k8s.md
+++ b/docs/deployment/k8s.md
@@ -4,6 +4,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le
 
 - [Deployment with CPUs](#deployment-with-cpus)
 - [Deployment with GPUs](#deployment-with-gpus)
+- [Graceful Shutdown](#graceful-shutdown)
 - [Troubleshooting](#troubleshooting)
     - [Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"](#startup-probe-or-readiness-probe-failure-container-log-contains-keyboardinterrupt-terminated)
 - [Conclusion](#conclusion)
@@ -242,7 +243,7 @@ INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
                 mountPath: /dev/shm
               livenessProbe:
                 httpGet:
-                  path: /health
+                  path: /live
                   port: 8000
                 initialDelaySeconds: 60
                 periodSeconds: 10
@@ -387,6 +388,86 @@ INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
 
       If the service is correctly deployed, you should receive a response from the vLLM model.
 
+## Graceful Shutdown
+
+When the vLLM server receives a `SIGTERM` signal (e.g. from Kubernetes
+during pod termination), it marks itself as draining and begins
+shutting down. During this period, the `/health` readiness probe
+returns `503` so that load balancers stop routing new traffic, while
+the `/live` liveness probe continues returning `200` so that
+Kubernetes does not restart the pod mid-shutdown.
+
+### Probe endpoints
+
+vLLM exposes two probe endpoints with different semantics:
+
+| State        | `/health` (readiness) | `/live` (liveness) |
+|--------------|-----------------------|--------------------|
+| Running      | 200                   | 200                |
+| Paused       | 503                   | 200                |
+| Draining     | 503                   | 200                |
+| Dead/crashed | 503                   | 503                |
+
+- **`/health`** is a *readiness* probe — it returns `503` during shutdown
+  drain so that the load balancer stops routing new traffic to the pod.
+- **`/live`** is a *liveness* probe — it returns `200` as long as the
+  process is alive (even while draining), so Kubernetes does **not**
+  restart the pod mid-drain. It only returns `503` when the engine has
+  encountered a fatal error.
+
+### Kubernetes deployment with graceful shutdown
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: vllm
+  template:
+    metadata:
+      labels:
+        app: vllm
+    spec:
+      terminationGracePeriodSeconds: 60
+      containers:
+      - name: vllm
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        args:
+        - "vllm serve mistralai/Mistral-7B-Instruct-v0.3"
+        ports:
+        - containerPort: 8000
+        livenessProbe:
+          httpGet:
+            path: /live
+            port: 8000
+          initialDelaySeconds: 60
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 60
+          periodSeconds: 5
+```
+
+!!! tip "Optional preStop hook"
+    You can add a `preStop` hook that sleeps for a few seconds before
+    `SIGTERM` is sent, giving the Kubernetes endpoints controller time to
+    remove the pod from the Service so no new connections arrive early in
+    the drain. The sleep counts toward `terminationGracePeriodSeconds`:
+
+    ```yaml
+    lifecycle:
+      preStop:
+        exec:
+          command: ["sleep", "5"]
+    ```
+
 ## Troubleshooting
 
 ### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 0b3b29cd6c1f..f9d105be9af3 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -120,6 +120,13 @@ async def check_health(self) -> None:
         """Raise if unhealthy"""
         ...
 
+    @property
+    def is_engine_dead(self) -> bool:
+        """Return True only when the engine has encountered a fatal error.
+        The default falls back to ``errored``, which may also be True during
+        graceful shutdown/drain; override it to report fatal errors only."""
+        return self.errored
+
     @abstractmethod
     async def start_profile(self) -> None:
         """Start profiling the engine"""
diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py
index 8caeb80836f9..d3d55de72647 100644
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -95,6 +95,9 @@ async def serve_http(
     shutdown_event = asyncio.Event()
 
     def signal_handler() -> None:
+        # Mark as draining so /health (readiness) returns 503
+        # while /live (liveness) continues to return 200.
+        app.state.draining = True
         shutdown_event.set()
 
     async def dummy_shutdown() -> None:
diff --git a/vllm/entrypoints/serve/elastic_ep/middleware.py b/vllm/entrypoints/serve/elastic_ep/middleware.py
index 23f45eafeaa0..e7f61ed10551 100644
--- a/vllm/entrypoints/serve/elastic_ep/middleware.py
+++ b/vllm/entrypoints/serve/elastic_ep/middleware.py
@@ -31,19 +31,26 @@ class ScalingMiddleware:
     def __init__(self, app: ASGIApp) -> None:
         self.app = app
 
+    # Paths that should never be blocked by the scaling middleware.
+    _EXEMPT_PATHS = {"/live", "/metrics"}
+
     def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
         if scope["type"] != "http":
             return self.app(scope, receive, send)
 
         # Check global scaling state
         if get_scaling_elastic_ep():
-            # Return 503 Service Unavailable response
-            response = JSONResponse(
-                content={
-                    "error": "The model is currently scaling. Please try again later."
-                },
-                status_code=503,
-            )
-            return response(scope, receive, send)
+            # Allow liveness probe and metrics through even while scaling.
+            path = scope.get("path", "")
+            if path not in self._EXEMPT_PATHS:
+                # Return 503 Service Unavailable response
+                response = JSONResponse(
+                    content={
+                        "error": "The model is currently scaling. "
+                        "Please try again later."
+                    },
+                    status_code=503,
+                )
+                return response(scope, receive, send)
 
         return self.app(scope, receive, send)
diff --git a/vllm/entrypoints/serve/instrumentator/health.py b/vllm/entrypoints/serve/instrumentator/health.py
index 5c0b2d1855d9..cb1b0db96c9e 100644
--- a/vllm/entrypoints/serve/instrumentator/health.py
+++ b/vllm/entrypoints/serve/instrumentator/health.py
@@ -21,7 +21,10 @@ def engine_client(request: Request) -> EngineClient:
 
 @router.get("/health", response_class=Response)
 async def health(raw_request: Request) -> Response:
-    """Health check."""
+    """Readiness probe. Returns 503 during shutdown/drain so that
+    load balancers stop sending new traffic. Also 503 if the engine is dead."""
+    if getattr(raw_request.app.state, "draining", False):
+        return Response(status_code=503)
     client = engine_client(raw_request)
     if client is None:
         # Render-only servers have no engine; they are always healthy.
@@ -31,3 +34,17 @@ async def health(raw_request: Request) -> Response:
         return Response(status_code=200)
     except EngineDeadError:
         return Response(status_code=503)
+
+
+@router.get("/live", response_class=Response)
+async def live(raw_request: Request) -> Response:
+    """Liveness probe. Returns 200 as long as the process is alive,
+    even during graceful shutdown/drain. Only returns 503 when the
+    engine has encountered a fatal error."""
+    client = engine_client(raw_request)
+    if client is None:
+        # Render-only servers have no engine; they are always alive.
+        return Response(status_code=200)
+    if client.is_engine_dead:
+        return Response(status_code=503)
+    return Response(status_code=200)
diff --git a/vllm/entrypoints/serve/instrumentator/metrics.py b/vllm/entrypoints/serve/instrumentator/metrics.py
index 5231451383a2..dfd29a5dabb4 100644
--- a/vllm/entrypoints/serve/instrumentator/metrics.py
+++ b/vllm/entrypoints/serve/instrumentator/metrics.py
@@ -29,6 +29,7 @@ def attach_router(app: FastAPI):
         excluded_handlers=[
             "/metrics",
             "/health",
+            "/live",
             "/load",
             "/ping",
             "/version",
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index a9c42e78e53b..fb5ee476920f 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1026,6 +1026,12 @@ def is_stopped(self) -> bool:
     def errored(self) -> bool:
         return self.engine_core.resources.engine_dead or not self.is_running
 
+    @property
+    def is_engine_dead(self) -> bool:
+        """True only when the engine has encountered a fatal error,
+        not during graceful shutdown/drain."""
+        return self.engine_core.resources.engine_dead
+
     @property
     def dead_error(self) -> BaseException:
         return EngineDeadError()

From e3b380457c18bccb8dae9dd886220319ace35d85 Mon Sep 17 00:00:00 2001
From: Wojciech Wais <wojciech.wais@gmail.com>
Date: Wed, 11 Mar 2026 07:54:45 +0100
Subject: [PATCH 2/2] [Feature] Add unit tests for /live probe, /health
 draining, and ScalingMiddleware

- TestLiveEndpoint: test /live returns 200 for healthy/draining engines,
  503 for dead engines, 200 for render-only servers.
- TestHealthDraining: test /health returns 503 when draining (skipping
  check_health), 200 when healthy, 200 for render-only servers.
- TestScalingMiddlewareExemptions: test /live and /metrics are exempt
  from 503 during scaling, other paths are blocked.

Signed-off-by: Wojciech Wais <wojciech.wais@gmail.com>
---
 .../entrypoints/instrumentator/test_basic.py  | 206 ++++++++++++++++++
 1 file changed, 206 insertions(+)

diff --git a/tests/entrypoints/instrumentator/test_basic.py b/tests/entrypoints/instrumentator/test_basic.py
index 9c2986ebe6c9..8b2bea65841b 100644
--- a/tests/entrypoints/instrumentator/test_basic.py
+++ b/tests/entrypoints/instrumentator/test_basic.py
@@ -249,3 +249,209 @@ async def test_health_check_engine_dead_error():
 
     # Assert that it returns 503 Service Unavailable
     assert response.status_code == 503
+
+
+# ---------------------------------------------------------------------------
+# Unit tests for /live liveness probe and /health draining behavior
+# ---------------------------------------------------------------------------
+
+
+def _make_mock_request(
+    engine_client=None,
+    draining=False,
+):
+    """Create a mock FastAPI Request with configurable app state."""
+    mock_request = Mock(spec=Request)
+    mock_app_state = Mock()
+    mock_app_state.engine_client = engine_client
+    mock_app_state.draining = draining
+    mock_request.app.state = mock_app_state
+    return mock_request
+
+
+class TestLiveEndpoint:
+    """Tests for the /live liveness probe."""
+
+    @pytest.mark.asyncio
+    async def test_live_healthy_engine(self):
+        from vllm.entrypoints.serve.instrumentator.health import live
+
+        mock_client = Mock()
+        mock_client.is_engine_dead = False
+        request = _make_mock_request(engine_client=mock_client)
+
+        response = await live(request)
+        assert response.status_code == 200
+
+    @pytest.mark.asyncio
+    async def test_live_dead_engine(self):
+        from vllm.entrypoints.serve.instrumentator.health import live
+
+        mock_client = Mock()
+        mock_client.is_engine_dead = True
+        request = _make_mock_request(engine_client=mock_client)
+
+        response = await live(request)
+        assert response.status_code == 503
+
+    @pytest.mark.asyncio
+    async def test_live_during_drain(self):
+        """Liveness probe returns 200 during graceful drain."""
+        from vllm.entrypoints.serve.instrumentator.health import live
+
+        mock_client = Mock()
+        mock_client.is_engine_dead = False
+        request = _make_mock_request(engine_client=mock_client, draining=True)
+
+        response = await live(request)
+        assert response.status_code == 200
+
+    @pytest.mark.asyncio
+    async def test_live_render_only_server(self):
+        """Render-only servers have no engine client."""
+        from vllm.entrypoints.serve.instrumentator.health import live
+
+        request = _make_mock_request(engine_client=None)
+
+        response = await live(request)
+        assert response.status_code == 200
+
+
+class TestHealthDraining:
+    """Tests for /health readiness probe draining behavior."""
+
+    @pytest.mark.asyncio
+    async def test_health_returns_503_when_draining(self):
+        from vllm.entrypoints.serve.instrumentator.health import health
+
+        mock_client = AsyncMock()
+        request = _make_mock_request(engine_client=mock_client, draining=True)
+
+        response = await health(request)
+        assert response.status_code == 503
+        # check_health should NOT be called when draining
+        mock_client.check_health.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_health_returns_200_when_not_draining(self):
+        from vllm.entrypoints.serve.instrumentator.health import health
+
+        mock_client = AsyncMock()
+        request = _make_mock_request(engine_client=mock_client, draining=False)
+
+        response = await health(request)
+        assert response.status_code == 200
+        mock_client.check_health.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_health_render_only_server(self):
+        """Render-only servers have no engine; always healthy."""
+        from vllm.entrypoints.serve.instrumentator.health import health
+
+        request = _make_mock_request(engine_client=None)
+
+        response = await health(request)
+        assert response.status_code == 200
+
+
+class TestScalingMiddlewareExemptions:
+    """Tests for ScalingMiddleware exempt paths (/live, /metrics)."""
+
+    @pytest.mark.asyncio
+    async def test_live_exempt_during_scaling(self):
+        from vllm.entrypoints.serve.elastic_ep.middleware import (
+            ScalingMiddleware,
+            set_scaling_elastic_ep,
+        )
+
+        received_scopes = []
+
+        async def mock_app(scope, receive, send):
+            received_scopes.append(scope)
+
+        middleware = ScalingMiddleware(mock_app)
+        scope = {"type": "http", "path": "/live"}
+
+        try:
+            set_scaling_elastic_ep(True)
+            await middleware(scope, None, None)
+        finally:
+            set_scaling_elastic_ep(False)
+
+        # /live should pass through to the app
+        assert len(received_scopes) == 1
+
+    @pytest.mark.asyncio
+    async def test_metrics_exempt_during_scaling(self):
+        from vllm.entrypoints.serve.elastic_ep.middleware import (
+            ScalingMiddleware,
+            set_scaling_elastic_ep,
+        )
+
+        received_scopes = []
+
+        async def mock_app(scope, receive, send):
+            received_scopes.append(scope)
+
+        middleware = ScalingMiddleware(mock_app)
+        scope = {"type": "http", "path": "/metrics"}
+
+        try:
+            set_scaling_elastic_ep(True)
+            await middleware(scope, None, None)
+        finally:
+            set_scaling_elastic_ep(False)
+
+        assert len(received_scopes) == 1
+
+    @pytest.mark.asyncio
+    async def test_other_paths_blocked_during_scaling(self):
+        from vllm.entrypoints.serve.elastic_ep.middleware import (
+            ScalingMiddleware,
+            set_scaling_elastic_ep,
+        )
+
+        received_scopes = []
+        sent_responses = []
+
+        async def mock_app(scope, receive, send):
+            received_scopes.append(scope)
+
+        async def mock_send(message):
+            sent_responses.append(message)
+
+        middleware = ScalingMiddleware(mock_app)
+        scope = {"type": "http", "path": "/v1/completions"}
+
+        try:
+            set_scaling_elastic_ep(True)
+            await middleware(scope, None, mock_send)
+        finally:
+            set_scaling_elastic_ep(False)
+
+        # Should NOT pass through to the app
+        assert len(received_scopes) == 0
+        # Should have sent a 503 response
+        assert any(
+            r.get("status") == 503 for r in sent_responses if isinstance(r, dict)
+        )
+
+    @pytest.mark.asyncio
+    async def test_all_paths_pass_when_not_scaling(self):
+        from vllm.entrypoints.serve.elastic_ep.middleware import (
+            ScalingMiddleware,
+            set_scaling_elastic_ep,
+        )
+
+        received_scopes = []
+
+        async def mock_app(scope, receive, send):
+            received_scopes.append(scope)
+
+        middleware = ScalingMiddleware(mock_app)
+        set_scaling_elastic_ep(False)
+
+        for path in ["/live", "/health", "/v1/completions", "/metrics"]:
+            await middleware({"type": "http", "path": path}, None, None)
+
+        assert len(received_scopes) == 4