83 changes: 82 additions & 1 deletion docs/deployment/k8s.md
@@ -4,6 +4,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le

- [Deployment with CPUs](#deployment-with-cpus)
- [Deployment with GPUs](#deployment-with-gpus)
- [Graceful Shutdown](#graceful-shutdown)
- [Troubleshooting](#troubleshooting)
    - [Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"](#startup-probe-or-readiness-probe-failure-container-log-contains-keyboardinterrupt-terminated)
- [Conclusion](#conclusion)
@@ -242,7 +243,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
            mountPath: /dev/shm
        livenessProbe:
          httpGet:
-            path: /health
+            path: /live
            port: 8000
          initialDelaySeconds: 60
          periodSeconds: 10
@@ -387,6 +388,86 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)

If the service is correctly deployed, you should receive a response from the vLLM model.

## Graceful Shutdown

When the vLLM server receives a `SIGTERM` signal (e.g. from Kubernetes
during pod termination), it marks itself as draining and begins
shutting down. During this period, the `/health` readiness probe
returns `503` so that load balancers stop routing new traffic, while
the `/live` liveness probe continues returning `200` so that
Kubernetes does not restart the pod mid-shutdown.

### Probe endpoints

vLLM exposes two probe endpoints with different semantics:

| State | `/health` (readiness) | `/live` (liveness) |
|--------------|-----------------------|--------------------|
| Running | 200 | 200 |
| Paused | 503 | 200 |
| Draining | 503 | 200 |
| Dead/crashed | 503 | 503 |

- **`/health`** is a *readiness* probe — it returns `503` during shutdown
drain so that the load balancer stops routing new traffic to the pod.
- **`/live`** is a *liveness* probe — it returns `200` as long as the
process is alive (even while draining), so Kubernetes does **not**
restart the pod mid-drain. It only returns `503` when the engine has
encountered a fatal error.
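
As a minimal illustration of these semantics, the two probes can be modeled as pure functions of the server state. This is a sketch only, not vLLM's actual handlers; `health_status` and `live_status` are hypothetical names, and both "Paused" and "Draining" are treated as not-ready:

```python
# Sketch of the probe semantics in the table above (hypothetical helpers,
# not vLLM's real handlers). "Paused" and "Draining" both count as not ready.
def health_status(ready: bool, engine_dead: bool) -> int:
    """/health (readiness): 200 only while running normally."""
    return 200 if ready and not engine_dead else 503


def live_status(engine_dead: bool) -> int:
    """/live (liveness): 503 only on a fatal engine error."""
    return 503 if engine_dead else 200


# Rows of the table: Running, Paused/Draining, Dead
assert (health_status(True, False), live_status(False)) == (200, 200)
assert (health_status(False, False), live_status(False)) == (503, 200)
assert (health_status(False, True), live_status(True)) == (503, 503)
```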

### Kubernetes deployment with graceful shutdown

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-server
spec:
  replicas: 3
  selector:
    matchLabels:
      app: vllm
  template:
    metadata:
      labels:
        app: vllm
    spec:
      terminationGracePeriodSeconds: 60
      containers:
      - name: vllm
        image: vllm/vllm-openai:latest
        command: ["/bin/sh", "-c"]
        args:
        - "vllm serve mistralai/Mistral-7B-Instruct-v0.3"
        ports:
        - containerPort: 8000
        livenessProbe:
          httpGet:
            path: /live
            port: 8000
          initialDelaySeconds: 60
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 60
          periodSeconds: 5
```

!!! tip "Optional preStop hook"
You can add a `preStop` hook to sleep for a few seconds before
the `SIGTERM` is sent. This gives the Kubernetes endpoints
controller time to remove the pod from the Service, preventing
new connections from arriving during the first moments of drain:

```yaml
lifecycle:
  preStop:
    exec:
      command: ["sleep", "5"]
```

## Troubleshooting

### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"
206 changes: 206 additions & 0 deletions tests/entrypoints/instrumentator/test_basic.py
@@ -249,3 +249,209 @@ async def test_health_check_engine_dead_error():

    # Assert that it returns 503 Service Unavailable
    assert response.status_code == 503


# ---------------------------------------------------------------------------
# Unit tests for /live liveness probe and /health draining behavior
# ---------------------------------------------------------------------------


def _make_mock_request(
    engine_client=None,
    draining=False,
):
    """Create a mock FastAPI Request with configurable app state."""
    mock_request = Mock(spec=Request)
    mock_app_state = Mock()
    mock_app_state.engine_client = engine_client
    mock_app_state.draining = draining
    mock_request.app.state = mock_app_state
    return mock_request


class TestLiveEndpoint:
    """Tests for the /live liveness probe."""

    @pytest.mark.asyncio
    async def test_live_healthy_engine(self):
        from vllm.entrypoints.serve.instrumentator.health import live

        mock_client = Mock()
        mock_client.is_engine_dead = False
        request = _make_mock_request(engine_client=mock_client)

        response = await live(request)
        assert response.status_code == 200

    @pytest.mark.asyncio
    async def test_live_dead_engine(self):
        from vllm.entrypoints.serve.instrumentator.health import live

        mock_client = Mock()
        mock_client.is_engine_dead = True
        request = _make_mock_request(engine_client=mock_client)

        response = await live(request)
        assert response.status_code == 503

    @pytest.mark.asyncio
    async def test_live_during_drain(self):
        """Liveness probe returns 200 during graceful drain."""
        from vllm.entrypoints.serve.instrumentator.health import live

        mock_client = Mock()
        mock_client.is_engine_dead = False
        request = _make_mock_request(engine_client=mock_client, draining=True)

        response = await live(request)
        assert response.status_code == 200

    @pytest.mark.asyncio
    async def test_live_render_only_server(self):
        """Render-only servers have no engine client."""
        from vllm.entrypoints.serve.instrumentator.health import live

        request = _make_mock_request(engine_client=None)

        response = await live(request)
        assert response.status_code == 200


class TestHealthDraining:
    """Tests for /health readiness probe draining behavior."""

    @pytest.mark.asyncio
    async def test_health_returns_503_when_draining(self):
        from vllm.entrypoints.serve.instrumentator.health import health

        mock_client = AsyncMock()
        request = _make_mock_request(engine_client=mock_client, draining=True)

        response = await health(request)
        assert response.status_code == 503
        # check_health should NOT be called when draining
        mock_client.check_health.assert_not_called()

    @pytest.mark.asyncio
    async def test_health_returns_200_when_not_draining(self):
        from vllm.entrypoints.serve.instrumentator.health import health

        mock_client = AsyncMock()
        request = _make_mock_request(engine_client=mock_client, draining=False)

        response = await health(request)
        assert response.status_code == 200
        mock_client.check_health.assert_called_once()

    @pytest.mark.asyncio
    async def test_health_render_only_server(self):
        """Render-only servers have no engine; always healthy."""
        from vllm.entrypoints.serve.instrumentator.health import health

        request = _make_mock_request(engine_client=None)

        response = await health(request)
        assert response.status_code == 200


class TestScalingMiddlewareExemptions:
    """Tests for ScalingMiddleware exempt paths (/live, /metrics)."""

    @pytest.mark.asyncio
    async def test_live_exempt_during_scaling(self):
        from vllm.entrypoints.serve.elastic_ep.middleware import (
            ScalingMiddleware,
            set_scaling_elastic_ep,
        )

        received_scopes = []

        async def mock_app(scope, receive, send):
            received_scopes.append(scope)

        middleware = ScalingMiddleware(mock_app)
        scope = {"type": "http", "path": "/live"}

        try:
            set_scaling_elastic_ep(True)
            await middleware(scope, None, None)
        finally:
            set_scaling_elastic_ep(False)

        # /live should pass through to the app
        assert len(received_scopes) == 1

    @pytest.mark.asyncio
    async def test_metrics_exempt_during_scaling(self):
        from vllm.entrypoints.serve.elastic_ep.middleware import (
            ScalingMiddleware,
            set_scaling_elastic_ep,
        )

        received_scopes = []

        async def mock_app(scope, receive, send):
            received_scopes.append(scope)

        middleware = ScalingMiddleware(mock_app)
        scope = {"type": "http", "path": "/metrics"}

        try:
            set_scaling_elastic_ep(True)
            await middleware(scope, None, None)
        finally:
            set_scaling_elastic_ep(False)

        assert len(received_scopes) == 1

    @pytest.mark.asyncio
    async def test_other_paths_blocked_during_scaling(self):
        from vllm.entrypoints.serve.elastic_ep.middleware import (
            ScalingMiddleware,
            set_scaling_elastic_ep,
        )

        received_scopes = []
        sent_responses = []

        async def mock_app(scope, receive, send):
            received_scopes.append(scope)

        async def mock_send(message):
            sent_responses.append(message)

        middleware = ScalingMiddleware(mock_app)
        scope = {"type": "http", "path": "/v1/completions"}

        try:
            set_scaling_elastic_ep(True)
            await middleware(scope, None, mock_send)
        finally:
            set_scaling_elastic_ep(False)

        # Should NOT pass through to the app
        assert len(received_scopes) == 0
        # Should have sent a 503 response
        assert any(
            r.get("status") == 503 for r in sent_responses if isinstance(r, dict)
        )

    @pytest.mark.asyncio
    async def test_all_paths_pass_when_not_scaling(self):
        from vllm.entrypoints.serve.elastic_ep.middleware import (
            ScalingMiddleware,
            set_scaling_elastic_ep,
        )

        received_scopes = []

        async def mock_app(scope, receive, send):
            received_scopes.append(scope)

        middleware = ScalingMiddleware(mock_app)
        set_scaling_elastic_ep(False)

        for path in ["/live", "/health", "/v1/completions", "/metrics"]:
            await middleware({"type": "http", "path": path}, None, None)

        assert len(received_scopes) == 4
7 changes: 7 additions & 0 deletions vllm/engine/protocol.py
@@ -120,6 +120,13 @@ async def check_health(self) -> None:
        """Raise if unhealthy"""
        ...

    @property
    def is_engine_dead(self) -> bool:
        """Return True only when the engine has encountered a fatal error.
        This is distinct from ``errored`` which also returns True during
        graceful shutdown/drain."""
        return self.errored
Comment on lines +124 to +128
critical

The implementation return self.errored contradicts the docstring, which states that is_engine_dead is distinct from errored. The errored property is expected to be true during graceful shutdown, while is_engine_dead should only be true on fatal errors. This default implementation will cause incorrect behavior for consumers of the EngineClient protocol if they don't override it, potentially leading Kubernetes to kill pods that are gracefully shutting down.

To enforce the correct behavior in subclasses, this property should be an abstract method.

Suggested change
-    def is_engine_dead(self) -> bool:
-        """Return True only when the engine has encountered a fatal error.
-        This is distinct from ``errored`` which also returns True during
-        graceful shutdown/drain."""
-        return self.errored
+    @abstractmethod
+    def is_engine_dead(self) -> bool:
+        """Return True only when the engine has encountered a fatal error.
+        This is distinct from ``errored`` which also returns True during
+        graceful shutdown/drain."""
+        ...


    @abstractmethod
    async def start_profile(self) -> None:
        """Start profiling the engine"""
3 changes: 3 additions & 0 deletions vllm/entrypoints/launcher.py
@@ -95,6 +95,9 @@ async def serve_http(
    shutdown_event = asyncio.Event()

    def signal_handler() -> None:
        # Mark as draining so /health (readiness) returns 503
        # while /live (liveness) continues to return 200.
        app.state.draining = True
        shutdown_event.set()

    async def dummy_shutdown() -> None:
23 changes: 15 additions & 8 deletions vllm/entrypoints/serve/elastic_ep/middleware.py
@@ -31,19 +31,26 @@ class ScalingMiddleware:
    def __init__(self, app: ASGIApp) -> None:
        self.app = app

+    # Paths that should never be blocked by the scaling middleware.
+    _EXEMPT_PATHS = {"/live", "/metrics"}
+
    def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
        if scope["type"] != "http":
            return self.app(scope, receive, send)

        # Check global scaling state
        if get_scaling_elastic_ep():
-            # Return 503 Service Unavailable response
-            response = JSONResponse(
-                content={
-                    "error": "The model is currently scaling. Please try again later."
-                },
-                status_code=503,
-            )
-            return response(scope, receive, send)
+            # Allow liveness probe and metrics through even while scaling.
+            path = scope.get("path", "")
+            if path not in self._EXEMPT_PATHS:
+                # Return 503 Service Unavailable response
+                response = JSONResponse(
+                    content={
+                        "error": "The model is currently scaling. "
+                        "Please try again later."
+                    },
+                    status_code=503,
+                )
+                return response(scope, receive, send)

        return self.app(scope, receive, send)
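
The exemption check this hunk introduces reduces to a small pure predicate. The sketch below is for illustration only: `_EXEMPT_PATHS` mirrors the diff, while `should_block` is a hypothetical helper name, not part of vLLM:

```python
# Standalone sketch of the exemption logic in the middleware diff above.
# _EXEMPT_PATHS mirrors the diff; should_block is a hypothetical helper.
_EXEMPT_PATHS = {"/live", "/metrics"}


def should_block(path: str, scaling: bool) -> bool:
    """Return True when a request should receive 503 during elastic scaling."""
    return scaling and path not in _EXEMPT_PATHS


# While scaling, inference traffic is rejected but probes/metrics pass through.
assert should_block("/v1/completions", True)
assert not should_block("/live", True)
assert not should_block("/metrics", True)
# When not scaling, every path passes through.
assert not should_block("/v1/completions", False)
```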