Commit f7e3b0c
[Bugfix][Frontend] Fix Issues Under High Load With zeromq Frontend (#7394)
Co-authored-by: Nick Hill <[email protected]>
1 parent d3c002e commit f7e3b0c

File tree: 9 files changed, +322 -141 lines changed

.buildkite/test-pipeline.yaml (+1)

@@ -86,6 +86,7 @@ steps:
   - vllm/
   commands:
   - pip install -e ./plugins/vllm_add_dummy_model
+  - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
   - pytest -v -s entrypoints/llm
   - pytest -v -s entrypoints/openai

+55

@@ -0,0 +1,55 @@
+"""
+This file tests the accuracy of the vLLM server via LMEval.
+It uses local-completions, which interacts with vLLM
+through the OAI API with N concurrent connections.
+This simulates real-world usage of the API and makes
+sure that the zmq frontend mp RPC message passing and
+AsyncLLMEngine are working correctly.
+"""
+
+import lm_eval
+import pytest
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
+NUM_CONCURRENT = 500
+TASK = "gsm8k"
+FILTER = "exact_match,strict-match"
+RTOL = 0.03
+EXPECTED_VALUE = 0.58
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--max-model-len", "4096", "--enable-chunked-prefill",
+        "--disable-log-requests", "--enforce-eager"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.fixture(scope="module")
+def server_data(server):
+    return {
+        "url": f"{server.url_for('v1')}/completions",
+    }
+
+
+def test_lm_eval_accuracy(server_data):
+    model_args = (f"model={MODEL_NAME},"
+                  f"base_url={server_data['url']},"
+                  f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")
+
+    results = lm_eval.simple_evaluate(
+        model="local-completions",
+        model_args=model_args,
+        tasks=TASK,
+    )
+
+    measured_value = results["results"][TASK][FILTER]
+    assert (measured_value - RTOL < EXPECTED_VALUE
+            and measured_value + RTOL > EXPECTED_VALUE
+            ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"

vllm/engine/async_llm_engine.py (+5)

@@ -766,6 +766,11 @@ def is_stopped(self) -> bool:
     def errored(self) -> bool:
         return self._errored_with is not None
 
+    @property
+    def limit_concurrency(self) -> Optional[int]:
+        """Maximum number of concurrently running requests."""
+        return None
+
     def set_errored(self, exc: Exception) -> None:
         self._errored_with = exc

vllm/engine/protocol.py (+4)

@@ -29,6 +29,10 @@ def is_stopped(self) -> bool:
     def errored(self) -> bool:
         ...
 
+    @property
+    def limit_concurrency(self) -> Optional[int]:
+        """Maximum number of concurrently running requests."""
+
     def generate(
         self,
         inputs: PromptInputs,
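
The new property gives every AsyncEngineClient a way to advertise a request cap: the in-process AsyncLLMEngine above returns None (no cap), while a client that funnels requests over a bounded transport can return a finite number. Below is a minimal sketch of that contract; FixedCapClient and the 32768 figure are illustrative assumptions, not code from this commit.

from typing import Optional


class FixedCapClient:
    """Hypothetical engine client that advertises a finite request cap."""

    @property
    def limit_concurrency(self) -> Optional[int]:
        # A bounded transport (e.g. a fixed budget of zmq sockets) means the
        # frontend should not accept unbounded concurrent requests.
        return 32768


def apply_concurrency_limit(client, uvicorn_kwargs: dict) -> None:
    # Same shape as the launcher change below: only cap uvicorn when the
    # client actually advertises a limit.
    if client.limit_concurrency is not None:
        uvicorn_kwargs["limit_concurrency"] = client.limit_concurrency


kwargs: dict = {}
apply_concurrency_limit(FixedCapClient(), kwargs)
print(kwargs)  # {'limit_concurrency': 32768}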

vllm/entrypoints/launcher.py (+9)

@@ -27,6 +27,15 @@ async def serve_http(app: FastAPI, engine: AsyncEngineClient,
 
         logger.info("Route: %s, Methods: %s", path, ', '.join(methods))
 
+    # Set concurrency limits in uvicorn if running in multiprocessing mode
+    # since zmq has maximum socket limit of zmq.constants.SOCKET_LIMIT (65536).
+    if engine.limit_concurrency is not None:
+        logger.info(
+            "Launching Uvicorn with --limit_concurrency %s. To avoid this "
+            "limit at the expense of performance run with "
+            "--disable-frontend-multiprocessing", engine.limit_concurrency)
+        uvicorn_kwargs["limit_concurrency"] = engine.limit_concurrency
+
     config = uvicorn.Config(app, **uvicorn_kwargs)
     server = uvicorn.Server(config)
     _add_shutdown_handlers(app, server, engine)
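
When uvicorn's limit_concurrency is reached, extra connections are not queued; uvicorn answers them with HTTP 503. A rough probe of that behavior is sketched below, assuming a vLLM server already running at http://localhost:8000 (the URL, endpoint, and the aiohttp dependency are assumptions for illustration, not part of this commit).

import asyncio

import aiohttp  # assumed client library for this probe only


async def count_rejections(url: str, n: int) -> int:
    # Open n requests at once; anything above uvicorn's limit_concurrency
    # should come back as 503 Service Unavailable rather than being queued.
    async with aiohttp.ClientSession() as session:

        async def one() -> int:
            async with session.get(url) as resp:
                return resp.status

        statuses = await asyncio.gather(*(one() for _ in range(n)))
        return sum(status == 503 for status in statuses)


# Example (server URL is an assumption):
# print(asyncio.run(count_rejections("http://localhost:8000/health", 2000)))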

vllm/entrypoints/openai/api_server.py (+6 -5)

@@ -135,6 +135,12 @@ async def build_async_engine_client(
         logger.info("Multiprocessing frontend to use %s for RPC Path.",
                     rpc_path)
 
+        # Build RPCClient, which conforms to AsyncEngineClient Protocol.
+        # NOTE: Actually, this is not true yet. We still need to support
+        # embedding models via RPC (see TODO above)
+        rpc_client = AsyncEngineRPCClient(rpc_path)
+        async_engine_client = rpc_client  # type: ignore
+
         # Start RPCServer in separate process (holds the AsyncLLMEngine).
         context = multiprocessing.get_context("spawn")
         # the current process might have CUDA context,
@@ -145,11 +151,6 @@ async def build_async_engine_client(
         rpc_server_process.start()
         logger.info("Started engine process with PID %d",
                     rpc_server_process.pid)
-        # Build RPCClient, which conforms to AsyncEngineClient Protocol.
-        # NOTE: Actually, this is not true yet. We still need to support
-        # embedding models via RPC (see TODO above)
-        rpc_client = AsyncEngineRPCClient(rpc_path)
-        async_engine_client = rpc_client  # type: ignore
 
         try:
             while True:

vllm/entrypoints/openai/rpc/__init__.py (+12 -2)

@@ -7,8 +7,18 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 
+# Success string used for RPC instructions.
 VLLM_RPC_SUCCESS_STR = "SUCCESS"
-VLLM_RPC_HEALTHY_STR = "HEALTHY"
+
+# Timeouts.
+VLLM_RPC_SERVER_START_TIMEOUT_MS = 1000
+VLLM_RPC_HEALTH_TIMEOUT_MS = 10000
+
+# Minimum value of ZMQ.SOCKET_LIMIT to run mp.
+VLLM_RPC_SOCKET_LIMIT_CUTOFF = 2000
+
+# HWM is set to Infinity.
+VLLM_RPC_ZMQ_HWM = 0
 
 
 @dataclass
@@ -34,7 +44,7 @@ class RPCUtilityRequest(Enum):
     GET_SCHEDULER_CONFIG = 5
     GET_LORA_CONFIG = 6
     DO_LOG_STATS = 7
-    CHECK_HEALTH = 8
+    IS_SERVER_HEALTHY = 8
     IS_TRACING_ENABLED = 9
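For context, here is a plausible way the new constants get used on the zmq client side. This is a hedged sketch, not the actual AsyncEngineRPCClient code from this commit: the derivation formula, the error message, and the derive_limit_concurrency name are assumptions.

import zmq

VLLM_RPC_SOCKET_LIMIT_CUTOFF = 2000  # from the diff above
VLLM_RPC_ZMQ_HWM = 0                 # 0 = unbounded send/receive queues in zmq


def derive_limit_concurrency(ctx: zmq.Context) -> int:
    # SOCKET_LIMIT is the hard cap on sockets a zmq context can hold
    # (typically 65536); each in-flight request consumes sockets, so the
    # frontend must stay comfortably below it.
    socket_limit = ctx.get(zmq.constants.SOCKET_LIMIT)
    if socket_limit < VLLM_RPC_SOCKET_LIMIT_CUTOFF:
        raise ValueError(
            f"ZMQ SOCKET_LIMIT ({socket_limit}) is too low to run the "
            "multiprocessing frontend; use --disable-frontend-multiprocessing.")
    # Keep a safety margin; the exact formula here is an assumption.
    return socket_limit - VLLM_RPC_SOCKET_LIMIT_CUTOFF


ctx = zmq.Context()
sock = ctx.socket(zmq.constants.DEALER)
sock.set_hwm(VLLM_RPC_ZMQ_HWM)  # no high-water mark: never drop queued messages
print(derive_limit_concurrency(ctx))
sock.close()
ctx.term()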