-
-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[Frontend] Add server load limit with --max-server-load parameter #22805
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
scratch-ml
wants to merge
8
commits into
vllm-project:main
from
scratch-ml:feature/server-load-limit
Closed
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
c8917c2
[Feature] Add server load limit with --max-server-load parameter
b6635e1
Add precommit check
scratch-ml fdcb763
fix format error
scratch-ml 9c13609
upgrade version of precommit and re-run precommit
scratch-ml 95e205f
validate max_server_load parameter
scratch-ml 1dd742f
refactor the load_aware_call() function code
scratch-ml b63c5b2
limit the frequency of log output
scratch-ml 85fb832
create a constant JSONResponse for server overload error
scratch-ml File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,207 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
| """Tests for server load limit functionality.""" | ||
|
|
||
| from unittest.mock import MagicMock | ||
|
|
||
| import pytest | ||
| from fastapi.responses import JSONResponse | ||
|
|
||
| from vllm.entrypoints.utils import load_aware_call | ||
|
|
||
|
|
||
class TestServerLoadLimit:
    """Test suite for server load limiting functionality.

    The ``load_aware_call`` decorator reads its configuration from
    ``raw_request.app.state`` (tracking flag, max load, current load
    counter).  Each test builds a ``MagicMock`` request carrying that
    state and asserts whether the wrapped handler is invoked or the
    request is rejected with a 503 response.
    """

    @staticmethod
    def _make_mock_request(*, tracking_enabled=True, max_load=10,
                           current_load=5):
        """Build a mock request exposing the app-state fields that
        ``load_aware_call`` inspects.

        Args:
            tracking_enabled: value for ``enable_server_load_tracking``.
            max_load: value for ``max_server_load`` (``None`` = no limit).
            current_load: starting value of ``server_load_metrics``.

        Returns:
            A ``MagicMock`` usable as the ``raw_request`` argument.
        """
        mock_request = MagicMock()
        state = mock_request.app.state
        state.enable_server_load_tracking = tracking_enabled
        state.max_server_load = max_load
        state.server_load_metrics = current_load
        state.server_overload_rejections_since_last_log = 0
        return mock_request

    @staticmethod
    def _assert_overloaded(response):
        """Assert that *response* is the 503 server-overloaded rejection."""
        assert isinstance(response, JSONResponse)
        assert response.status_code == 503

    @pytest.mark.asyncio
    async def test_load_aware_call_max_load_exceeded(self):
        """Test that requests are rejected when max load is exceeded."""

        @load_aware_call
        async def dummy_handler(raw_request):
            return {"message": "success"}

        # Current load (15) exceeds the configured limit (10).
        mock_request = self._make_mock_request(max_load=10, current_load=15)

        response = await dummy_handler(raw_request=mock_request)

        self._assert_overloaded(response)

        # Verify the structured error payload returned to the client.
        import json
        content = json.loads(response.body.decode('utf-8'))
        assert content["error"]["type"] == "server_overloaded"
        assert "Server is currently overloaded" in content["error"]["message"]
        assert "Please try again later" in content["error"]["message"]

    @pytest.mark.asyncio
    async def test_load_aware_call_max_load_at_limit(self):
        """Test that requests are rejected when load equals limit."""

        @load_aware_call
        async def dummy_handler(raw_request):
            return {"message": "success"}

        # Load exactly at the limit must also be rejected (>= semantics).
        mock_request = self._make_mock_request(max_load=10, current_load=10)

        response = await dummy_handler(raw_request=mock_request)

        self._assert_overloaded(response)

    @pytest.mark.asyncio
    async def test_load_aware_call_max_load_under_limit(self):
        """Test that requests proceed normally when under limit."""

        @load_aware_call
        async def dummy_handler(raw_request):
            return {"message": "success"}

        mock_request = self._make_mock_request(max_load=10, current_load=5)

        response = await dummy_handler(raw_request=mock_request)

        # Should proceed normally and return the handler's result.
        assert response == {"message": "success"}

    @pytest.mark.asyncio
    async def test_load_aware_call_max_load_not_set(self):
        """Test that requests proceed normally when max_server_load is None."""

        @load_aware_call
        async def dummy_handler(raw_request):
            return {"message": "success"}

        # No limit configured: even a high current load must be accepted.
        mock_request = self._make_mock_request(max_load=None,
                                               current_load=100)

        response = await dummy_handler(raw_request=mock_request)

        # Should proceed normally despite high load.
        assert response == {"message": "success"}

    @pytest.mark.asyncio
    async def test_load_aware_call_tracking_disabled(self):
        """Test that load limiting is bypassed when tracking is disabled."""

        @load_aware_call
        async def dummy_handler(raw_request):
            return {"message": "success"}

        # Tracking off: the limit (5) must be ignored even at load 100.
        mock_request = self._make_mock_request(tracking_enabled=False,
                                               max_load=5,
                                               current_load=100)

        response = await dummy_handler(raw_request=mock_request)

        # Should proceed normally when tracking is disabled.
        assert response == {"message": "success"}

    @pytest.mark.asyncio
    async def test_load_aware_call_with_exception(self):
        """Test that load counter is properly decremented on exception."""

        @load_aware_call
        async def failing_handler(raw_request):
            raise ValueError("Test exception")

        mock_request = self._make_mock_request(max_load=10, current_load=5)

        # The decorator must propagate the handler's original exception.
        with pytest.raises(ValueError, match="Test exception"):
            await failing_handler(raw_request=mock_request)

        # Load counter should be decremented back to 5 despite the error.
        assert mock_request.app.state.server_load_metrics == 5

    @pytest.mark.asyncio
    async def test_load_aware_call_increments_counter(self):
        """Test that load counter is properly incremented."""

        @load_aware_call
        async def dummy_handler(raw_request):
            # Inside the handler the counter must already be bumped (5 -> 6).
            assert raw_request.app.state.server_load_metrics == 6
            return {"message": "success"}

        mock_request = self._make_mock_request(max_load=10, current_load=5)

        response = await dummy_handler(raw_request=mock_request)

        assert response == {"message": "success"}

    @pytest.mark.asyncio
    async def test_load_aware_call_zero_max_load(self):
        """Test behavior when max_server_load is set to 0."""

        @load_aware_call
        async def dummy_handler(raw_request):
            return {"message": "success"}

        mock_request = self._make_mock_request(max_load=0, current_load=0)

        response = await dummy_handler(raw_request=mock_request)

        # Should be rejected since 0 >= 0.
        self._assert_overloaded(response)

    def test_max_server_load_parameter_exists(self):
        """Test that max_server_load parameter is properly defined."""
        from vllm.entrypoints.openai.cli_args import FrontendArgs

        # Check that the parameter exists in FrontendArgs with a None default.
        frontend_args = FrontendArgs()
        assert hasattr(frontend_args, 'max_server_load')
        assert frontend_args.max_server_load is None  # Default value

    def test_frontend_args_annotation(self):
        """Test that max_server_load has proper type annotation."""
        from vllm.entrypoints.openai.cli_args import FrontendArgs

        # Get the raw class annotations.
        annotations = FrontendArgs.__annotations__
        assert 'max_server_load' in annotations

        # Should be Optional[int].
        # NOTE(review): this compares the raw annotation object; it would
        # fail if cli_args ever adopts `from __future__ import annotations`
        # (annotations become strings) — confirm against that module.
        import typing
        expected_type = typing.Optional[int]
        assert annotations['max_server_load'] == expected_type
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.