-
-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[Feature] Abort in-flight requests and drain outputs on shutdown #36964
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
wojciech-wais
wants to merge
1
commit into
vllm-project:main
Choose a base branch
from
wojciech-wais:feat/shutdown-abort-drain
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
| """Unit tests for abort-and-drain shutdown behavior in EngineCoreProc. | ||
|
|
||
| These tests validate that: | ||
| 1. In-flight requests are aborted during graceful shutdown. | ||
| 2. Abort outputs are sent to clients before the engine core exits. | ||
| 3. The output thread is drained (joined) before shutdown completes. | ||
| """ | ||
|
|
||
| import queue | ||
| import threading | ||
| from unittest.mock import MagicMock | ||
|
|
||
| import pytest | ||
|
|
||
| from vllm.v1.engine import EngineCoreOutputs, FinishReason | ||
| from vllm.v1.engine.core import EngineCoreProc | ||
| from vllm.v1.request import RequestStatus | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def mock_engine_core(): | ||
| """Create a minimal mock of EngineCoreProc for testing shutdown.""" | ||
| engine = MagicMock(spec=EngineCoreProc) | ||
| engine.output_queue = queue.Queue() | ||
|
|
||
| # Use real methods for the ones we're testing. | ||
| engine._abort_and_drain_outputs = EngineCoreProc._abort_and_drain_outputs.__get__( | ||
| engine | ||
| ) | ||
| engine._send_abort_outputs = EngineCoreProc._send_abort_outputs.__get__(engine) | ||
|
|
||
| return engine | ||
|
|
||
|
|
||
| class TestAbortAndDrainOutputs: | ||
| def test_aborts_all_in_flight_requests(self, mock_engine_core): | ||
| """Verify all in-flight requests are aborted via scheduler.""" | ||
| mock_engine_core.scheduler.finish_requests.return_value = [ | ||
| ("req-1", 0), | ||
| ("req-2", 0), | ||
| ] | ||
| # No output thread. | ||
| del mock_engine_core.output_thread | ||
|
|
||
| mock_engine_core._abort_and_drain_outputs() | ||
|
|
||
| mock_engine_core.scheduler.finish_requests.assert_called_once_with( | ||
| None, RequestStatus.FINISHED_ABORTED | ||
| ) | ||
|
|
||
| def test_sends_abort_outputs_to_clients(self, mock_engine_core): | ||
| """Verify abort outputs are placed in the output queue.""" | ||
| mock_engine_core.scheduler.finish_requests.return_value = [ | ||
| ("req-1", 0), | ||
| ("req-2", 1), | ||
| ] | ||
| del mock_engine_core.output_thread | ||
|
|
||
| mock_engine_core._abort_and_drain_outputs() | ||
|
|
||
| # Should have 2 output entries (one per client_index). | ||
| items = [] | ||
| while not mock_engine_core.output_queue.empty(): | ||
| items.append(mock_engine_core.output_queue.get_nowait()) | ||
|
|
||
| # Filter out ENGINE_CORE_DEAD sentinel if present. | ||
| output_items = [i for i in items if not isinstance(i, bytes)] | ||
| assert len(output_items) == 2 | ||
|
|
||
| # Check that abort outputs have the right finish reason. | ||
| for client_index, eco in output_items: | ||
| assert isinstance(eco, EngineCoreOutputs) | ||
| for output in eco.outputs: | ||
| assert output.finish_reason == FinishReason.ABORT | ||
|
|
||
| def test_drains_output_thread(self, mock_engine_core): | ||
| """Verify output thread is signaled and joined.""" | ||
| mock_engine_core.scheduler.finish_requests.return_value = [] | ||
| mock_thread = MagicMock(spec=threading.Thread) | ||
| mock_thread.is_alive.return_value = False | ||
| mock_engine_core.output_thread = mock_thread | ||
|
|
||
| mock_engine_core._abort_and_drain_outputs() | ||
|
|
||
| # ENGINE_CORE_DEAD sentinel should be in the queue. | ||
| sentinel = mock_engine_core.output_queue.get_nowait() | ||
| assert sentinel == EngineCoreProc.ENGINE_CORE_DEAD | ||
|
|
||
| # Output thread should be joined. | ||
| mock_thread.join.assert_called_once_with(timeout=5.0) | ||
|
|
||
| def test_no_output_thread_still_works(self, mock_engine_core): | ||
| """Verify graceful handling when output_thread doesn't exist.""" | ||
| mock_engine_core.scheduler.finish_requests.return_value = [ | ||
| ("req-1", 0), | ||
| ] | ||
| del mock_engine_core.output_thread | ||
|
|
||
| # Should not raise. | ||
| mock_engine_core._abort_and_drain_outputs() | ||
|
|
||
| def test_no_requests_to_abort(self, mock_engine_core): | ||
| """Verify clean shutdown when no requests are in flight.""" | ||
| mock_engine_core.scheduler.finish_requests.return_value = [] | ||
| mock_thread = MagicMock(spec=threading.Thread) | ||
| mock_thread.is_alive.return_value = False | ||
| mock_engine_core.output_thread = mock_thread | ||
|
|
||
| mock_engine_core._abort_and_drain_outputs() | ||
|
|
||
| # Should still signal the output thread to drain. | ||
| sentinel = mock_engine_core.output_queue.get_nowait() | ||
| assert sentinel == EngineCoreProc.ENGINE_CORE_DEAD | ||
| mock_thread.join.assert_called_once_with(timeout=5.0) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The 5-second timeout for draining the output thread is hardcoded. In scenarios with high load or a slow network, this might not be enough time to send all pending messages, including the abort responses. This could result in some clients not receiving an abort notification, which is the problem this PR aims to solve. This same hardcoded value is also used in
_send_engine_dead.To improve robustness and maintainability, this timeout should be extracted into a constant. A future improvement could be to make this value configurable, for example, via an environment variable.