From c8917c2d80c992153bd221a04f1d673e18c72965 Mon Sep 17 00:00:00 2001 From: limingliang Date: Wed, 13 Aug 2025 17:13:23 +0800 Subject: [PATCH 1/8] [Feature] Add server load limit with --max-server-load parameter - Add max_server_load parameter to FrontendArgs for setting concurrent request limit - Initialize max_server_load state in init_app_state function - Add load checking logic in load_aware_call decorator - Return HTTP 503 with detailed error message when server is overloaded - Only effective when --enable-server-load-tracking is enabled - Add comprehensive tests for new functionality This feature prevents server overload in production deployments by allowing administrators to set a maximum number of concurrent requests. When the limit is exceeded, new requests receive HTTP 503 responses with clear error messages. Signed-off-by: scratch-ml --- .../openai/test_server_load_limit.py | 197 ++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 1 + vllm/entrypoints/openai/cli_args.py | 3 + vllm/entrypoints/utils.py | 20 ++ 4 files changed, 221 insertions(+) create mode 100644 tests/entrypoints/openai/test_server_load_limit.py diff --git a/tests/entrypoints/openai/test_server_load_limit.py b/tests/entrypoints/openai/test_server_load_limit.py new file mode 100644 index 000000000000..c1826097bb6d --- /dev/null +++ b/tests/entrypoints/openai/test_server_load_limit.py @@ -0,0 +1,197 @@ +"""Tests for server load limit functionality.""" + +import pytest +from unittest.mock import MagicMock, AsyncMock +from fastapi.responses import JSONResponse + +from vllm.entrypoints.utils import load_aware_call + + +class TestServerLoadLimit: + """Test suite for server load limiting functionality.""" + + @pytest.mark.asyncio + async def test_load_aware_call_max_load_exceeded(self): + """Test that requests are rejected when max load is exceeded.""" + + @load_aware_call + async def dummy_handler(raw_request): + return {"message": "success"} + + # Mock request with load exceeding limit + mock_request = MagicMock() + mock_request.app.state.enable_server_load_tracking = True + mock_request.app.state.max_server_load = 10 + mock_request.app.state.server_load_metrics = 15 # Exceeds limit + + response = await dummy_handler(raw_request=mock_request) + + assert isinstance(response, JSONResponse) + assert response.status_code == 503 + + # Verify error content + import json + content = json.loads(response.body.decode('utf-8')) + assert content["error"]["type"] == "server_overloaded" + assert "Server is currently overloaded" in content["error"]["message"] + assert "Current load: 15" in content["error"]["message"] + assert "Max load: 10" in content["error"]["message"] + + @pytest.mark.asyncio + async def test_load_aware_call_max_load_at_limit(self): + """Test that requests are rejected when load equals limit.""" + + @load_aware_call + async def dummy_handler(raw_request): + return {"message": "success"} + + # Mock request with load exactly at limit + mock_request = MagicMock() + mock_request.app.state.enable_server_load_tracking = True + mock_request.app.state.max_server_load = 10 + mock_request.app.state.server_load_metrics = 10 # At limit + + response = await dummy_handler(raw_request=mock_request) + + assert isinstance(response, JSONResponse) + assert response.status_code == 503 + + @pytest.mark.asyncio + async def test_load_aware_call_max_load_under_limit(self): + """Test that requests proceed normally when under limit.""" + + @load_aware_call + async def dummy_handler(raw_request): + return {"message": "success"} + + # Mock request with load under limit + mock_request = MagicMock() + mock_request.app.state.enable_server_load_tracking = True + mock_request.app.state.max_server_load = 10 + mock_request.app.state.server_load_metrics = 5 # Under limit + + response = await dummy_handler(raw_request=mock_request) + + # Should proceed normally + assert response == {"message": "success"} + + @pytest.mark.asyncio + async def test_load_aware_call_max_load_not_set(self): + """Test that requests proceed normally when max_server_load is None.""" + + @load_aware_call + async def dummy_handler(raw_request): + return {"message": "success"} + + # Mock request with no max load set + mock_request = MagicMock() + mock_request.app.state.enable_server_load_tracking = True + mock_request.app.state.max_server_load = None # No limit + mock_request.app.state.server_load_metrics = 100 # High load + + response = await dummy_handler(raw_request=mock_request) + + # Should proceed normally despite high load + assert response == {"message": "success"} + + @pytest.mark.asyncio + async def test_load_aware_call_tracking_disabled(self): + """Test that load limiting is bypassed when tracking is disabled.""" + + @load_aware_call + async def dummy_handler(raw_request): + return {"message": "success"} + + # Mock request with tracking disabled + mock_request = MagicMock() + mock_request.app.state.enable_server_load_tracking = False + mock_request.app.state.max_server_load = 5 + mock_request.app.state.server_load_metrics = 100 # High load + + response = await dummy_handler(raw_request=mock_request) + + # Should proceed normally when tracking is disabled + assert response == {"message": "success"} + + @pytest.mark.asyncio + async def test_load_aware_call_with_exception(self): + """Test that load counter is properly decremented on exception.""" + + @load_aware_call + async def failing_handler(raw_request): + raise ValueError("Test exception") + + # Mock request under limit + mock_request = MagicMock() + mock_request.app.state.enable_server_load_tracking = True + mock_request.app.state.max_server_load = 10 + mock_request.app.state.server_load_metrics = 5 + + # Should raise the original exception + with pytest.raises(ValueError, match="Test exception"): + await failing_handler(raw_request=mock_request) + + # Load counter should be decremented back to 5 + assert mock_request.app.state.server_load_metrics == 5 + + @pytest.mark.asyncio + async def test_load_aware_call_increments_counter(self): + """Test that load counter is properly incremented.""" + + @load_aware_call + async def dummy_handler(raw_request): + # Verify counter was incremented + assert raw_request.app.state.server_load_metrics == 6 + return {"message": "success"} + + # Mock request under limit + mock_request = MagicMock() + mock_request.app.state.enable_server_load_tracking = True + mock_request.app.state.max_server_load = 10 + mock_request.app.state.server_load_metrics = 5 + + response = await dummy_handler(raw_request=mock_request) + + assert response == {"message": "success"} + + @pytest.mark.asyncio + async def test_load_aware_call_zero_max_load(self): + """Test behavior when max_server_load is set to 0.""" + + @load_aware_call + async def dummy_handler(raw_request): + return {"message": "success"} + + # Mock request with zero max load + mock_request = MagicMock() + mock_request.app.state.enable_server_load_tracking = True + mock_request.app.state.max_server_load = 0 + mock_request.app.state.server_load_metrics = 0 + + response = await dummy_handler(raw_request=mock_request) + + # Should be rejected since 0 >= 0 + assert isinstance(response, JSONResponse) + assert response.status_code == 503 + + def test_max_server_load_parameter_exists(self): + """Test that max_server_load parameter is properly defined.""" + from vllm.entrypoints.openai.cli_args import FrontendArgs + + # Check that the parameter exists in FrontendArgs + frontend_args = FrontendArgs() + assert hasattr(frontend_args, 'max_server_load') + assert frontend_args.max_server_load is None # Default value + + def test_frontend_args_annotation(self): + """Test that max_server_load has proper type annotation.""" + from vllm.entrypoints.openai.cli_args import FrontendArgs + + # Get type hints + annotations = FrontendArgs.__annotations__ + assert 'max_server_load' in annotations + + # Should be Optional[int] + import typing + expected_type = typing.Union[int, type(None)] # Optional[int] + assert annotations['max_server_load'] == expected_type \ No newline at end of file diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e5d31c1fd03f..39873c2a1950 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1762,6 +1762,7 @@ async def init_app_state( state.enable_server_load_tracking = args.enable_server_load_tracking state.server_load_metrics = 0 + state.max_server_load = args.max_server_load def create_server_socket(addr: tuple[str, int]) -> socket.socket: diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index e15f65b43082..93dba3f512cf 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -164,6 +164,9 @@ class FrontendArgs: """If set to True, enable prompt_tokens_details in usage.""" enable_server_load_tracking: bool = False """If set to True, enable tracking server_load_metrics in the app state.""" + max_server_load: Optional[int] = None + """Maximum number of concurrent requests allowed. When exceeded, new requests will be rejected with HTTP 503. + Only effective when --enable-server-load-tracking is enabled.""" enable_force_include_usage: bool = False """If set to True, including usage on every request.""" enable_tokenizer_info_endpoint: bool = False diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index d8905fc14124..c31014863393 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -113,6 +113,26 @@ async def wrapper(*args, **kwargs): False): return await func(*args, **kwargs) + # Check if max load limit is configured and exceeded + max_load = getattr(raw_request.app.state, 'max_server_load', None) + if (max_load is not None and + raw_request.app.state.server_load_metrics >= max_load): + logger.warning( + f"Server overloaded: current load {raw_request.app.state.server_load_metrics} " + f">= max load {max_load}. Rejecting request." + ) + return JSONResponse( + content={ + "error": { + "type": "server_overloaded", + "message": f"Server is currently overloaded. " + f"Current load: {raw_request.app.state.server_load_metrics}, " + f"Max load: {max_load}. Please try again later." + } + }, + status_code=503 + ) + # ensure the counter exists if not hasattr(raw_request.app.state, "server_load_metrics"): raw_request.app.state.server_load_metrics = 0 From b6635e17a1474d67cff49ad51f0521abf1415e97 Mon Sep 17 00:00:00 2001 From: scratch-ml Date: Wed, 13 Aug 2025 18:15:08 +0800 Subject: [PATCH 2/8] Add precommit check Signed-off-by: scratch-ml --- .../openai/test_server_load_limit.py | 79 ++++++++++--------- vllm/entrypoints/utils.py | 29 ++++--- 2 files changed, 55 insertions(+), 53 deletions(-) diff --git a/tests/entrypoints/openai/test_server_load_limit.py b/tests/entrypoints/openai/test_server_load_limit.py index c1826097bb6d..59138fec64b9 100644 --- a/tests/entrypoints/openai/test_server_load_limit.py +++ b/tests/entrypoints/openai/test_server_load_limit.py @@ -1,7 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for server load limit functionality.""" +from unittest.mock import MagicMock + import pytest -from unittest.mock import MagicMock, AsyncMock from fastapi.responses import JSONResponse from vllm.entrypoints.utils import load_aware_call @@ -13,22 +16,22 @@ class TestServerLoadLimit: @pytest.mark.asyncio async def test_load_aware_call_max_load_exceeded(self): """Test that requests are rejected when max load is exceeded.""" - + @load_aware_call async def dummy_handler(raw_request): return {"message": "success"} - + # Mock request with load exceeding limit mock_request = MagicMock() mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 10 mock_request.app.state.server_load_metrics = 15 # Exceeds limit - + response = await dummy_handler(raw_request=mock_request) - + assert isinstance(response, JSONResponse) assert response.status_code == 503 - + # Verify error content import json content = json.loads(response.body.decode('utf-8')) @@ -40,136 +43,136 @@ async def dummy_handler(raw_request): @pytest.mark.asyncio async def test_load_aware_call_max_load_at_limit(self): """Test that requests are rejected when load equals limit.""" - + @load_aware_call async def dummy_handler(raw_request): return {"message": "success"} - + # Mock request with load exactly at limit mock_request = MagicMock() mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 10 mock_request.app.state.server_load_metrics = 10 # At limit - + response = await dummy_handler(raw_request=mock_request) - + assert isinstance(response, JSONResponse) assert response.status_code == 503 @pytest.mark.asyncio async def test_load_aware_call_max_load_under_limit(self): """Test that requests proceed normally when under limit.""" - + @load_aware_call async def dummy_handler(raw_request): return {"message": "success"} - + # Mock request with load under limit mock_request = MagicMock() mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 10 mock_request.app.state.server_load_metrics = 5 # Under limit - + response = await dummy_handler(raw_request=mock_request) - + # Should proceed normally assert response == {"message": "success"} @pytest.mark.asyncio async def test_load_aware_call_max_load_not_set(self): """Test that requests proceed normally when max_server_load is None.""" - + @load_aware_call async def dummy_handler(raw_request): return {"message": "success"} - + # Mock request with no max load set mock_request = MagicMock() mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = None # No limit mock_request.app.state.server_load_metrics = 100 # High load - + response = await dummy_handler(raw_request=mock_request) - + # Should proceed normally despite high load assert response == {"message": "success"} @pytest.mark.asyncio async def test_load_aware_call_tracking_disabled(self): """Test that load limiting is bypassed when tracking is disabled.""" - + @load_aware_call async def dummy_handler(raw_request): return {"message": "success"} - + # Mock request with tracking disabled mock_request = MagicMock() mock_request.app.state.enable_server_load_tracking = False mock_request.app.state.max_server_load = 5 mock_request.app.state.server_load_metrics = 100 # High load - + response = await dummy_handler(raw_request=mock_request) - + # Should proceed normally when tracking is disabled assert response == {"message": "success"} @pytest.mark.asyncio async def test_load_aware_call_with_exception(self): """Test that load counter is properly decremented on exception.""" - + @load_aware_call async def failing_handler(raw_request): raise ValueError("Test exception") - + # Mock request under limit mock_request = MagicMock() mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 10 mock_request.app.state.server_load_metrics = 5 - + # Should raise the original exception with pytest.raises(ValueError, match="Test exception"): await failing_handler(raw_request=mock_request) - + # Load counter should be decremented back to 5 assert mock_request.app.state.server_load_metrics == 5 @pytest.mark.asyncio async def test_load_aware_call_increments_counter(self): """Test that load counter is properly incremented.""" - + @load_aware_call async def dummy_handler(raw_request): # Verify counter was incremented assert raw_request.app.state.server_load_metrics == 6 return {"message": "success"} - + # Mock request under limit mock_request = MagicMock() mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 10 mock_request.app.state.server_load_metrics = 5 - + response = await dummy_handler(raw_request=mock_request) - + assert response == {"message": "success"} @pytest.mark.asyncio async def test_load_aware_call_zero_max_load(self): """Test behavior when max_server_load is set to 0.""" - + @load_aware_call async def dummy_handler(raw_request): return {"message": "success"} - + # Mock request with zero max load mock_request = MagicMock() mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 0 mock_request.app.state.server_load_metrics = 0 - + response = await dummy_handler(raw_request=mock_request) - + # Should be rejected since 0 >= 0 assert isinstance(response, JSONResponse) assert response.status_code == 503 @@ -177,7 +180,7 @@ async def dummy_handler(raw_request): def test_max_server_load_parameter_exists(self): """Test that max_server_load parameter is properly defined.""" from vllm.entrypoints.openai.cli_args import FrontendArgs - + # Check that the parameter exists in FrontendArgs frontend_args = FrontendArgs() assert hasattr(frontend_args, 'max_server_load') @@ -186,12 +189,12 @@ def test_max_server_load_parameter_exists(self): def test_frontend_args_annotation(self): """Test that max_server_load has proper type annotation.""" from vllm.entrypoints.openai.cli_args import FrontendArgs - + # Get type hints annotations = FrontendArgs.__annotations__ assert 'max_server_load' in annotations - + # Should be Optional[int] import typing expected_type = typing.Union[int, type(None)] # Optional[int] - assert annotations['max_server_load'] == expected_type \ No newline at end of file + assert annotations['max_server_load'] == expected_type diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index c31014863393..a11480f5dcd2 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -115,23 +115,22 @@ async def wrapper(*args, **kwargs): # Check if max load limit is configured and exceeded max_load = getattr(raw_request.app.state, 'max_server_load', None) - if (max_load is not None and - raw_request.app.state.server_load_metrics >= max_load): + if (max_load is not None + and raw_request.app.state.server_load_metrics >= max_load): logger.warning( f"Server overloaded: current load {raw_request.app.state.server_load_metrics} " - f">= max load {max_load}. Rejecting request." - ) - return JSONResponse( - content={ - "error": { - "type": "server_overloaded", - "message": f"Server is currently overloaded. " - f"Current load: {raw_request.app.state.server_load_metrics}, " - f"Max load: {max_load}. Please try again later." - } - }, - status_code=503 - ) + f">= max load {max_load}. Rejecting request.") + return JSONResponse(content={ + "error": { + "type": + "server_overloaded", + "message": + f"Server is currently overloaded. " + f"Current load: {raw_request.app.state.server_load_metrics}, " + f"Max load: {max_load}. Please try again later." + } + }, + status_code=503) # ensure the counter exists if not hasattr(raw_request.app.state, "server_load_metrics"): From fdcb763d22dfadc55ae7e575a6e4db14d92ca54c Mon Sep 17 00:00:00 2001 From: scratch-ml Date: Wed, 13 Aug 2025 19:15:52 +0800 Subject: [PATCH 3/8] fix format error Signed-off-by: scratch-ml --- vllm/entrypoints/openai/cli_args.py | 5 +++-- vllm/entrypoints/utils.py | 11 ++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 93dba3f512cf..d0e9cbb628a4 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -165,8 +165,9 @@ class FrontendArgs: enable_server_load_tracking: bool = False """If set to True, enable tracking server_load_metrics in the app state.""" max_server_load: Optional[int] = None - """Maximum number of concurrent requests allowed. When exceeded, new requests will be rejected with HTTP 503. - Only effective when --enable-server-load-tracking is enabled.""" + """Maximum number of concurrent requests allowed. When exceeded, new + requests will be rejected with HTTP 503. Only effective when + --enable-server-load-tracking is enabled.""" enable_force_include_usage: bool = False """If set to True, including usage on every request.""" enable_tokenizer_info_endpoint: bool = False diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index a11480f5dcd2..48c7f1f39cfc 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -118,16 +118,17 @@ async def wrapper(*args, **kwargs): if (max_load is not None and raw_request.app.state.server_load_metrics >= max_load): logger.warning( - f"Server overloaded: current load {raw_request.app.state.server_load_metrics} " - f">= max load {max_load}. Rejecting request.") + "Server overloaded: current load %s >= max load %s. " + "Rejecting request.", + raw_request.app.state.server_load_metrics, max_load) return JSONResponse(content={ "error": { "type": "server_overloaded", "message": - f"Server is currently overloaded. " - f"Current load: {raw_request.app.state.server_load_metrics}, " - f"Max load: {max_load}. Please try again later." + f"Server is currently overloaded. Current load: " + f"{raw_request.app.state.server_load_metrics}, Max load: " + f"{max_load}. Please try again later." } }, status_code=503) From 9c136098a33e9dad27baf3e45133ee9297d29524 Mon Sep 17 00:00:00 2001 From: scratch-ml Date: Thu, 14 Aug 2025 01:22:02 +0800 Subject: [PATCH 4/8] upgrade version of precommit and re-run precommit Signed-off-by: scratch-ml --- tests/entrypoints/openai/test_server_load_limit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_server_load_limit.py b/tests/entrypoints/openai/test_server_load_limit.py index 59138fec64b9..cae1e0758393 100644 --- a/tests/entrypoints/openai/test_server_load_limit.py +++ b/tests/entrypoints/openai/test_server_load_limit.py @@ -196,5 +196,5 @@ def test_frontend_args_annotation(self): # Should be Optional[int] import typing - expected_type = typing.Union[int, type(None)] # Optional[int] + expected_type = typing.Optional[int] assert annotations['max_server_load'] == expected_type From 95e205fd36423aa1eef715d554b665d3c0112b7d Mon Sep 17 00:00:00 2001 From: scratch-ml Date: Tue, 16 Sep 2025 22:47:57 +0800 Subject: [PATCH 5/8] validate max_server_load parameter Signed-off-by: scratch-ml --- vllm/entrypoints/openai/cli_args.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index d0e9cbb628a4..b0960cb0af1c 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -269,6 +269,14 @@ def validate_parsed_serve_args(args: argparse.Namespace): raise TypeError("Error: --enable-auto-tool-choice requires " "--tool-call-parser") + # Validate max_server_load + if args.max_server_load is not None: + if not args.enable_server_load_tracking: + raise TypeError("Error: --max-server-load requires " + "--enable-server-load-tracking to be enabled") + if not isinstance(args.max_server_load, int) or args.max_server_load <= 0: + raise TypeError("Error: --max-server-load must be a positive integer") + def create_parser_for_docs() -> FlexibleArgumentParser: parser_for_docs = FlexibleArgumentParser( From 1dd742fd53197ecbe9eff053c40bf37ca943324d Mon Sep 17 00:00:00 2001 From: scratch-ml Date: Wed, 17 Sep 2025 00:19:36 +0800 Subject: [PATCH 6/8] refactor the load_aware_call() function code Signed-off-by: scratch-ml --- vllm/entrypoints/openai/cli_args.py | 6 ++++-- vllm/entrypoints/utils.py | 27 +++++++++++++-------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index b0960cb0af1c..01d795e82adb 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -274,8 +274,10 @@ def validate_parsed_serve_args(args: argparse.Namespace): if not args.enable_server_load_tracking: raise TypeError("Error: --max-server-load requires " "--enable-server-load-tracking to be enabled") - if not isinstance(args.max_server_load, int) or args.max_server_load <= 0: - raise TypeError("Error: --max-server-load must be a positive integer") + if not isinstance(args.max_server_load, + int) or args.max_server_load <= 0: + raise TypeError( + "Error: --max-server-load must be a positive integer") def create_parser_for_docs() -> FlexibleArgumentParser: diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 48c7f1f39cfc..2a4ffeb980f4 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -109,39 +109,38 @@ async def wrapper(*args, **kwargs): raise ValueError( "raw_request required when server load tracking is enabled") - if not getattr(raw_request.app.state, "enable_server_load_tracking", - False): + app_state = raw_request.app.state + if not getattr(app_state, "enable_server_load_tracking", False): return await func(*args, **kwargs) + # ensure the counter exists + if not hasattr(app_state, "server_load_metrics"): + app_state.server_load_metrics = 0 + # Check if max load limit is configured and exceeded - max_load = getattr(raw_request.app.state, 'max_server_load', None) + max_load = getattr(app_state, "max_server_load", None) if (max_load is not None - and raw_request.app.state.server_load_metrics >= max_load): + and app_state.server_load_metrics >= max_load): logger.warning( "Server overloaded: current load %s >= max load %s. " - "Rejecting request.", - raw_request.app.state.server_load_metrics, max_load) + "Rejecting request.", app_state.server_load_metrics, max_load) return JSONResponse(content={ "error": { "type": "server_overloaded", "message": f"Server is currently overloaded. Current load: " - f"{raw_request.app.state.server_load_metrics}, Max load: " + f"{app_state.server_load_metrics}, Max load: " f"{max_load}. Please try again later." } }, status_code=503) - # ensure the counter exists - if not hasattr(raw_request.app.state, "server_load_metrics"): - raw_request.app.state.server_load_metrics = 0 - - raw_request.app.state.server_load_metrics += 1 + app_state.server_load_metrics += 1 try: response = await func(*args, **kwargs) except Exception: - raw_request.app.state.server_load_metrics -= 1 + app_state.server_load_metrics -= 1 raise if isinstance(response, (JSONResponse, StreamingResponse)): @@ -161,7 +160,7 @@ async def wrapper(*args, **kwargs): tasks.add_task(decrement_server_load, raw_request) response.background = tasks else: - raw_request.app.state.server_load_metrics -= 1 + app_state.server_load_metrics -= 1 return response From b63c5b298ba28c80bbc0da72afee38f7fea114e8 Mon Sep 17 00:00:00 2001 From: scratch-ml Date: Wed, 17 Sep 2025 14:39:07 +0800 Subject: [PATCH 7/8] limit the frequency of log output Signed-off-by: scratch-ml --- .../openai/test_server_load_limit.py | 8 +++ vllm/entrypoints/utils.py | 49 ++++++++++++++++--- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/openai/test_server_load_limit.py b/tests/entrypoints/openai/test_server_load_limit.py index cae1e0758393..e443133ad69c 100644 --- a/tests/entrypoints/openai/test_server_load_limit.py +++ b/tests/entrypoints/openai/test_server_load_limit.py @@ -26,6 +26,7 @@ async def dummy_handler(raw_request): mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 10 mock_request.app.state.server_load_metrics = 15 # Exceeds limit + mock_request.app.state.server_overload_rejections_since_last_log = 0 response = await dummy_handler(raw_request=mock_request) @@ -53,6 +54,7 @@ async def dummy_handler(raw_request): mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 10 mock_request.app.state.server_load_metrics = 10 # At limit + mock_request.app.state.server_overload_rejections_since_last_log = 0 response = await dummy_handler(raw_request=mock_request) @@ -72,6 +74,7 @@ async def dummy_handler(raw_request): mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 10 mock_request.app.state.server_load_metrics = 5 # Under limit + mock_request.app.state.server_overload_rejections_since_last_log = 0 response = await dummy_handler(raw_request=mock_request) @@ -91,6 +94,7 @@ async def dummy_handler(raw_request): mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = None # No limit mock_request.app.state.server_load_metrics = 100 # High load + mock_request.app.state.server_overload_rejections_since_last_log = 0 response = await dummy_handler(raw_request=mock_request) @@ -110,6 +114,7 @@ async def dummy_handler(raw_request): mock_request.app.state.enable_server_load_tracking = False mock_request.app.state.max_server_load = 5 mock_request.app.state.server_load_metrics = 100 # High load + mock_request.app.state.server_overload_rejections_since_last_log = 0 response = await dummy_handler(raw_request=mock_request) @@ -129,6 +134,7 @@ async def failing_handler(raw_request): mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 10 mock_request.app.state.server_load_metrics = 5 + mock_request.app.state.server_overload_rejections_since_last_log = 0 # Should raise the original exception with pytest.raises(ValueError, match="Test exception"): @@ -152,6 +158,7 @@ async def dummy_handler(raw_request): mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 10 mock_request.app.state.server_load_metrics = 5 + mock_request.app.state.server_overload_rejections_since_last_log = 0 response = await dummy_handler(raw_request=mock_request) @@ -170,6 +177,7 @@ async def dummy_handler(raw_request): mock_request.app.state.enable_server_load_tracking = True mock_request.app.state.max_server_load = 0 mock_request.app.state.server_load_metrics = 0 + mock_request.app.state.server_overload_rejections_since_last_log = 0 response = await dummy_handler(raw_request=mock_request) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 2a4ffeb980f4..d0f6b361cd57 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -8,6 +8,7 @@ import os import subprocess import sys +import time from typing import Any, Optional, Union from fastapi import Request @@ -98,6 +99,39 @@ def decrement_server_load(request: Request): request.app.state.server_load_metrics -= 1 +def _flush_pending_overload_warnings(app_state): + """Flush pending aggregated overload warnings if interval elapsed.""" + now = time.monotonic() + pending = getattr(app_state, "server_overload_rejections_since_last_log", + 0) + if pending > 0: + last_log_time = getattr(app_state, "server_overload_last_log_time", + now) + log_interval = getattr(app_state, "server_overload_log_interval", 60.0) + if (now - last_log_time) >= log_interval: + max_load_snapshot = getattr(app_state, "max_server_load", None) + try: + logger.warning( + "Server overloaded: current load %s >= max load %s. " + "Rejected %d requests since last log.", + app_state.server_load_metrics, max_load_snapshot, pending) + except Exception: + logger.exception("Failed to log server overload warning") + else: + app_state.server_overload_rejections_since_last_log = 0 + app_state.server_overload_last_log_time = now + + +def _aggregate_rejection_stats(app_state): + """Aggregate rejections since last log.""" + now = time.monotonic() + if not hasattr(app_state, "server_overload_last_log_time"): + app_state.server_overload_last_log_time = now + if not hasattr(app_state, "server_overload_rejections_since_last_log"): + app_state.server_overload_rejections_since_last_log = 0 + app_state.server_overload_rejections_since_last_log += 1 + + def load_aware_call(func): @functools.wraps(func) @@ -117,13 +151,13 @@ async def wrapper(*args, **kwargs): if not hasattr(app_state, "server_load_metrics"): app_state.server_load_metrics = 0 - # Check if max load limit is configured and exceeded + # Flush pending aggregated overload warnings if interval elapsed. + _flush_pending_overload_warnings(app_state) + max_load = getattr(app_state, "max_server_load", None) - if (max_load is not None - and app_state.server_load_metrics >= max_load): - logger.warning( - "Server overloaded: current load %s >= max load %s. " - "Rejecting request.", app_state.server_load_metrics, max_load) + if max_load is not None and app_state.server_load_metrics >= max_load: + # Aggregate rejections since last log + _aggregate_rejection_stats(app_state) return JSONResponse(content={ "error": { "type": @@ -131,7 +165,8 @@ async def wrapper(*args, **kwargs): "message": f"Server is currently overloaded. Current load: " f"{app_state.server_load_metrics}, Max load: " - f"{max_load}. Please try again later." + f"{getattr(app_state, 'max_server_load', None)}. " + "Please try again later." } }, status_code=503) From 85fb83225bffb58760692972e8834614a150135d Mon Sep 17 00:00:00 2001 From: scratch-ml Date: Wed, 17 Sep 2025 15:05:18 +0800 Subject: [PATCH 8/8] create a constant JSONResponse for server overload error Signed-off-by: scratch-ml --- .../openai/test_server_load_limit.py | 3 +-- vllm/entrypoints/utils.py | 24 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/openai/test_server_load_limit.py b/tests/entrypoints/openai/test_server_load_limit.py index e443133ad69c..02ce867c7b00 100644 --- a/tests/entrypoints/openai/test_server_load_limit.py +++ b/tests/entrypoints/openai/test_server_load_limit.py @@ -38,8 +38,7 @@ async def dummy_handler(raw_request): content = json.loads(response.body.decode('utf-8')) assert content["error"]["type"] == "server_overloaded" assert "Server is currently overloaded" in content["error"]["message"] - assert "Current load: 15" in content["error"]["message"] - assert "Max load: 10" in content["error"]["message"] + assert "Please try again later" in content["error"]["message"] @pytest.mark.asyncio async def test_load_aware_call_max_load_at_limit(self): diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index d0f6b361cd57..e1a6caeabd15 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -9,6 +9,7 @@ import subprocess import sys import time +from http import HTTPStatus from typing import Any, Optional, Union from fastapi import Request @@ -25,6 +26,16 @@ logger = init_logger(__name__) +SERVER_OVERLOADED_RESPONSE = JSONResponse( + content={ + "error": { + "type": "server_overloaded", + "message": + "Server is currently overloaded. Please try again later." + } + }, + status_code=HTTPStatus.SERVICE_UNAVAILABLE) + VLLM_SUBCMD_PARSER_EPILOG = ( "Tip: Use `vllm [serve|run-batch|bench ] " "--help=` to explore arguments from help.\n" @@ -158,18 +169,7 @@ async def wrapper(*args, **kwargs): if max_load is not None and app_state.server_load_metrics >= max_load: # Aggregate rejections since last log _aggregate_rejection_stats(app_state) - return JSONResponse(content={ - "error": { - "type": - "server_overloaded", - "message": - f"Server is currently overloaded. Current load: " - f"{app_state.server_load_metrics}, Max load: " - f"{getattr(app_state, 'max_server_load', None)}. " - "Please try again later." - } - }, - status_code=503) + return SERVER_OVERLOADED_RESPONSE app_state.server_load_metrics += 1 try: