Merged

Commits (65)

9ca44ce
[V1] AsyncLLM data parallel WIP
njhill Feb 26, 2025
3f51611
Handle pausing loop
njhill Feb 27, 2025
d8c591e
More single-node updates
njhill Feb 27, 2025
65e225d
some cleanup
njhill Feb 27, 2025
5ce57b6
fix up utility methods
njhill Feb 27, 2025
a3f1102
revert config check
njhill Feb 27, 2025
a66fb01
fixes
njhill Feb 27, 2025
67672c2
cleanup
njhill Feb 27, 2025
cf52fbf
fixes
njhill Feb 27, 2025
a4ec81b
reconcile with LLMEngine DP in decoupled engine case
njhill Feb 27, 2025
292aa00
minor simplification
njhill Feb 27, 2025
4b62ffd
rework
njhill Feb 28, 2025
407c72e
class refactor
njhill Mar 1, 2025
31bf7ea
fix
njhill Mar 1, 2025
fde51ce
adjust core engine init
njhill Mar 1, 2025
d5a3e68
Merge remote-tracking branch 'refs/remotes/origin/main' into multi-en…
njhill Mar 3, 2025
6d89a1b
fix new typing
njhill Mar 3, 2025
448abd9
fix :facepalm:
njhill Mar 3, 2025
a1e513e
bind socket first
njhill Mar 3, 2025
50cf64c
do you have to let it linger
njhill Mar 3, 2025
f365998
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 3, 2025
b2571f0
add comments
njhill Mar 4, 2025
32c6f24
aggregate stats
njhill Mar 4, 2025
9c30cd7
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 4, 2025
672d07e
Fix test
njhill Mar 4, 2025
dea382b
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 5, 2025
d24a626
fix and minor cleanup
njhill Mar 5, 2025
cd03c80
Add CI test
njhill Mar 6, 2025
f1004b7
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 6, 2025
d3298fa
Some simplification and fixes
njhill Mar 6, 2025
74dde48
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 6, 2025
5fe1b75
address @markmc's stats suggestion
njhill Mar 6, 2025
648659f
address @tms's arg comment
njhill Mar 6, 2025
119d1ec
fix utility method breakage
njhill Mar 6, 2025
55328ee
rename AsyncMPClient output_processor to output_handler
njhill Mar 6, 2025
4f5330e
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 6, 2025
48770ec
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 7, 2025
d229f4d
Fix
njhill Mar 7, 2025
2f91cc4
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 15, 2025
518047a
Remove redundant logic related to removed stats aggregation
njhill Mar 13, 2025
cb2b099
Fixes
njhill Mar 15, 2025
ff1137a
Merge remote-tracking branch 'refs/remotes/origin/main' into multi-en…
njhill Mar 16, 2025
61f4fcb
fix issue from main merge
njhill Mar 16, 2025
44874c2
remove leftover unused field
njhill Mar 17, 2025
66fc582
Fix offline DP compatibility
njhill Mar 17, 2025
7764466
Add timeout to data_parallel.py
njhill Mar 17, 2025
51e8bf0
Merge remote-tracking branch 'refs/remotes/origin/main' into multi-en…
njhill Mar 17, 2025
f692c12
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 19, 2025
47b5e1c
Enable less-frequent all-reduce optimization
njhill Mar 20, 2025
f226139
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 20, 2025
af47920
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 20, 2025
693c521
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 20, 2025
6e131e3
clean distributed shutdown
njhill Mar 20, 2025
d9ac856
address misc loose-ends
njhill Mar 20, 2025
3abbdef
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 21, 2025
b18417e
further tweaks
njhill Mar 21, 2025
56b2b78
Merge remote-tracking branch 'refs/remotes/origin/main' into multi-en…
njhill Mar 25, 2025
05ab310
Additional debug
njhill Mar 25, 2025
5295c34
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 27, 2025
4f897b8
Address review comments on tests
njhill Mar 27, 2025
62f32ed
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 27, 2025
771ccf1
Fix env var fallback
njhill Mar 27, 2025
05a0e83
Fix test supports_v1 check
njhill Mar 27, 2025
bc41b13
Fix yapf :facepalm:
njhill Mar 27, 2025
ccecb42
Merge remote-tracking branch 'origin/main' into multi-engine
njhill Mar 27, 2025
5 changes: 5 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -137,8 +137,10 @@ steps:
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- tests/v1/test_async_llm_dp.py
commands:
- VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
@@ -505,7 +507,10 @@ steps:
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- vllm/v1/engine/
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
8 changes: 4 additions & 4 deletions tests/v1/engine/test_engine_core_client.py
@@ -165,11 +165,11 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):

core_client: SyncMPClient = client

result = core_client._call_utility("echo", "testarg")
result = core_client.call_utility("echo", "testarg")
assert result == "testarg"

with pytest.raises(Exception) as e_info:
core_client._call_utility("echo", None, "help!")
core_client.call_utility("echo", None, "help!")

assert str(e_info.value) == "Call to echo method failed: help!"

@@ -236,10 +236,10 @@ async def test_engine_core_client_asyncio(monkeypatch):

core_client: AsyncMPClient = client

result = await core_client._call_utility_async("echo", "testarg")
result = await core_client.call_utility_async("echo", "testarg")
assert result == "testarg"

with pytest.raises(Exception) as e_info:
await core_client._call_utility_async("echo", None, "help!")
await core_client.call_utility_async("echo", None, "help!")

assert str(e_info.value) == "Call to echo method failed: help!"
102 changes: 102 additions & 0 deletions tests/v1/test_async_llm_dp.py
@@ -0,0 +1,102 @@
# SPDX-License-Identifier: Apache-2.0

import asyncio
import os
from contextlib import ExitStack
from typing import Optional

import pytest

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs import PromptType
from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core_client import DPAsyncMPClient

if not current_platform.is_cuda():
pytest.skip(reason="V1 currently only supported on CUDA.",
allow_module_level=True)

Review comment (Member):
Not sure if DP works on TPU or AMD GPUs, but modify this reason string since V1 works there at least experimentally?

vllm/vllm/engine/arg_utils.py, lines 1669 to 1675 in d0cfec7:
# No support for device type other than CUDA, AMD (experiemntal) or
# TPU (experimental) so far.
if not (current_platform.is_cuda_alike() or current_platform.is_tpu()):
_raise_or_fallback(
feature_name=f"device type={current_platform.device_type}",
recommend_to_remove=False)
return False

Review comment (Member):
We could actually use supports_v1 now that this PR has landed (probably only want to turn tests on for CUDA and ROCm though)

#15417



async def generate(engine: AsyncLLM,
request_id: str,
prompt: PromptType,
output_kind: RequestOutputKind,
max_tokens: int,
prompt_logprobs: Optional[int] = None) -> tuple[int, str]:
# Ensure generate doesn't complete too fast for cancellation test.
await asyncio.sleep(0.2)

count = 0
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True,
output_kind=output_kind,
temperature=0,
prompt_logprobs=prompt_logprobs)
async for out in engine.generate(request_id=request_id,
prompt=prompt,
sampling_params=sampling_params):

num_tokens = len(out.outputs[0].token_ids)
if output_kind == RequestOutputKind.DELTA:
count += num_tokens
else:
count = num_tokens

await asyncio.sleep(0.)

return count, request_id


@pytest.mark.parametrize(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.asyncio
async def test_load(monkeypatch, output_kind: RequestOutputKind):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")

Review comment (Member):
Remove this now that V1 is on by default?


engine_args = AsyncEngineArgs(
model="ibm-research/PowerMoE-3b",
enforce_eager=True,
disable_log_requests=True,
tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
data_parallel_size=int(os.getenv("DP_SIZE", 2)),
)

prompt = "This is a test of data parallel"

engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)

NUM_REQUESTS = 100
NUM_EXPECTED_TOKENS = 10

request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]

# Create concurrent requests.
tasks = []
for request_id in request_ids:
tasks.append(
asyncio.create_task(
generate(engine, request_id, prompt, output_kind,
NUM_EXPECTED_TOKENS)))

# Confirm that we got all the EXPECTED tokens from the requests.
done, pending = await asyncio.wait(tasks,
return_when=asyncio.FIRST_EXCEPTION)
for task in pending:
task.cancel()
for task in done:
num_generated_tokens, request_id = await task
assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
f"{request_id} generated {num_generated_tokens} but "
f"expected {NUM_EXPECTED_TOKENS}")

assert not engine.output_processor.has_unfinished_requests()

# testing internals here which may break
core_client: DPAsyncMPClient = engine.engine_core
assert core_client.num_engines_running == 0
assert not core_client.reqs_in_flight
16 changes: 11 additions & 5 deletions vllm/config.py
@@ -38,7 +38,8 @@
from vllm.transformers_utils.s3_utils import S3Model
from vllm.transformers_utils.utils import is_s3
from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
get_cpu_memory, random_uuid, resolve_obj_by_qualname)
get_cpu_memory, get_open_port, random_uuid,
resolve_obj_by_qualname)

if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
@@ -1435,10 +1436,15 @@ def __post_init__(self) -> None:
self.world_size = self.pipeline_parallel_size * \
self.tensor_parallel_size

self.data_parallel_size = envs.VLLM_DP_SIZE
self.data_parallel_rank = envs.VLLM_DP_RANK
self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT
if self.data_parallel_size > 1:
self.data_parallel_master_port = get_open_port()
# TODO multi-node
else:
self.data_parallel_size = envs.VLLM_DP_SIZE
self.data_parallel_rank = envs.VLLM_DP_RANK
self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT

self.world_size_across_dp = self.world_size * self.data_parallel_size

if self.distributed_executor_backend == "external_launcher":
10 changes: 10 additions & 0 deletions vllm/engine/arg_utils.py
@@ -114,6 +114,7 @@ class EngineArgs:
# number of P/D disaggregation (or other disaggregation) workers
pipeline_parallel_size: int = 1
tensor_parallel_size: int = 1
data_parallel_size: int = 1
enable_expert_parallel: bool = False
max_parallel_loading_workers: Optional[int] = None
block_size: Optional[int] = None
@@ -441,6 +442,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
type=int,
default=EngineArgs.tensor_parallel_size,
help='Number of tensor parallel replicas.')
parser.add_argument('--data-parallel-size',
'-dp',
type=int,
default=EngineArgs.data_parallel_size,
help='Number of data parallel replicas. '
'MoE layers will be sharded according to the '
'product of the tensor-parallel-size and '
'data-parallel-size.')
parser.add_argument(
'--enable-expert-parallel',
action='store_true',
@@ -1213,6 +1222,7 @@ def create_engine_config(self,
parallel_config = ParallelConfig(
pipeline_parallel_size=self.pipeline_parallel_size,
tensor_parallel_size=self.tensor_parallel_size,
data_parallel_size=self.data_parallel_size,
enable_expert_parallel=self.enable_expert_parallel,
max_parallel_loading_workers=self.max_parallel_loading_workers,
disable_custom_all_reduce=self.disable_custom_all_reduce,
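
The new --data-parallel-size / -dp flag flows through EngineArgs into ParallelConfig above. A minimal programmatic sketch of the same knob, mirroring tests/v1/test_async_llm_dp.py (the model name and env-var defaults are illustrative, copied from that test):

    import os

    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.v1.engine.async_llm import AsyncLLM

    # TP/DP sizes come from the environment so the same script works under the
    # CI commands above (e.g. TP_SIZE=1 DP_SIZE=2).
    engine_args = AsyncEngineArgs(
        model="ibm-research/PowerMoE-3b",  # illustrative MoE model
        enforce_eager=True,
        tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
        data_parallel_size=int(os.getenv("DP_SIZE", "2")),
    )
    engine = AsyncLLM.from_engine_args(engine_args)
    # ... submit requests, then call engine.shutdown() when finished.
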
14 changes: 9 additions & 5 deletions vllm/utils.py
@@ -517,7 +517,7 @@ def get_open_port() -> int:
dp_port = envs.VLLM_DP_MASTER_PORT
while True:
port = _get_open_port()
if port >= dp_port and port < dp_port + 10:
if dp_port <= port < dp_port + 10:
continue
return port
return _get_open_port()
@@ -2134,19 +2134,23 @@ def make_zmq_socket(
if socket_type == zmq.constants.PULL:
socket.setsockopt(zmq.constants.RCVHWM, 0)
socket.setsockopt(zmq.constants.RCVBUF, buf_size)
socket.connect(path)
socket.bind(path)
elif socket_type == zmq.constants.PUSH:
socket.setsockopt(zmq.constants.SNDHWM, 0)
socket.setsockopt(zmq.constants.SNDBUF, buf_size)
socket.bind(path)
socket.connect(path)
else:
raise ValueError(f"Unknown Socket Type: {socket_type}")

return socket


@contextlib.contextmanager
def zmq_socket_ctx(path: str, socket_type: Any) -> Iterator[zmq.Socket]:
def zmq_socket_ctx(
path: str,
socket_type: Any,
linger: int = 0,
) -> Iterator[zmq.Socket]:
"""Context manager for a ZMQ socket"""

ctx = zmq.Context() # type: ignore[attr-defined]
@@ -2157,7 +2161,7 @@ def zmq_socket_ctx(path: str, socket_type: Any) -> Iterator[zmq.Socket]:
logger.debug("Got Keyboard Interrupt.")

finally:
ctx.destroy(linger=0)
ctx.destroy(linger=linger)


def _check_multiproc_method():
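
Two behavioural changes sit in the vllm/utils.py hunks above: make_zmq_socket now binds on the PULL side and connects on the PUSH side (the receiver owns the address and can come up first), and zmq_socket_ctx exposes a linger argument instead of hard-coding linger=0. A standalone pyzmq sketch of that bind/connect arrangement, not vLLM's helper itself (the ipc path is illustrative):

    import zmq

    ctx = zmq.Context()

    # Receiver owns the endpoint: PULL binds first, with no receive HWM.
    pull = ctx.socket(zmq.PULL)
    pull.setsockopt(zmq.RCVHWM, 0)
    pull.bind("ipc:///tmp/dp_example.sock")

    # Senders come and go: PUSH connects to the already-bound endpoint.
    push = ctx.socket(zmq.PUSH)
    push.setsockopt(zmq.SNDHWM, 0)
    push.connect("ipc:///tmp/dp_example.sock")

    push.send(b"hello")
    assert pull.recv() == b"hello"
    ctx.destroy(linger=0)  # the linger value is what zmq_socket_ctx now takes
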
12 changes: 9 additions & 3 deletions vllm/v1/core/scheduler.py
@@ -31,12 +31,14 @@ def __init__(
cache_config: CacheConfig,
lora_config: Optional[LoRAConfig],
speculative_config: Optional[SpeculativeConfig],
log_stats: bool,
include_finished_set: bool = False,
log_stats: bool = False,
) -> None:
self.scheduler_config = scheduler_config
self.cache_config = cache_config
self.lora_config = lora_config
self.speculative_config = speculative_config
self.include_finished_set = include_finished_set
self.log_stats = log_stats

# Scheduling constraints.
@@ -583,10 +585,14 @@ def update_from_output(
new_running.append(request)

self.running = new_running
return EngineCoreOutputs(
engine_core_outputs = EngineCoreOutputs(
outputs=outputs,
scheduler_stats=self.make_stats(),
)
if self.include_finished_set:
engine_core_outputs.finished_requests = (
scheduler_output.finished_req_ids)

Review comment (Member):
Are these different from the outputs with a non-None finish_reason ? If so, we're duplicating this information because we don't want the core client to loop over all outputs (we do that at the AsyncLLM level)?

Looks like finished_req_ids is intended for the model runner? If we extend their use, we should update the comment describing their purpose

        # The request IDs that are finished in between the previous and the
        # current steps. This is used to notify the workers about the finished
        # requests so that they can free the cached states for those requests.
        # This is flushed at the end of each scheduling step.
        self.finished_req_ids: set[str] = set()

Reply (Member, author):
Are these different from the outputs with a non-None finish_reason ? If so, we're duplicating this information because we don't want the core client to loop over all outputs (we do that at the AsyncLLM level)?

That's correct, and these finished req ids will be "sparse" i.e. only one per request, so the overhead should be minimal. But the alternative would mean looping through all requests in the batch on every step (in the client).

return engine_core_outputs

def _check_stop(self, request: Request) -> bool:
if (request.num_tokens >= self.max_model_len
@@ -655,7 +661,7 @@ def get_num_unfinished_requests(self) -> int:
return len(self.waiting) + len(self.running)

def has_unfinished_requests(self) -> bool:
return self.get_num_unfinished_requests() > 0
return len(self.running) > 0 or len(self.waiting) > 0

def get_num_unscheduled_requests(self) -> int:
"""Number of requests that are not being processed by the executor."""
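
Per the review thread above, finished_requests carries at most one id per finished request, so the client can drop per-request state without scanning every output for a non-None finish_reason (only AsyncLLM walks the full output list). A hypothetical consumer-side sketch; handle_outputs and reqs_in_flight are invented names for illustration, not the actual DPAsyncMPClient code:

    from typing import Any

    def handle_outputs(outputs, reqs_in_flight: dict[str, Any]) -> None:
        # Free bookkeeping for finished requests by id rather than inspecting
        # each RequestOutput for a finish_reason.
        for req_id in (outputs.finished_requests or ()):
            reqs_in_flight.pop(req_id, None)
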
12 changes: 11 additions & 1 deletion vllm/v1/engine/__init__.py
@@ -134,6 +134,15 @@ class EngineCoreOutputs(
timestamp: float = 0.0

utility_output: Optional[UtilityOutput] = None
finished_requests: Optional[set[str]] = None

# In DP case, used to signal that the engine is paused.
engine_paused: bool = False

# Set to False to indicate stats should be accumulated rather than
# recorded, when there are remaining outputs from other engines
# still to come for this iteration.
final_outputs_for_step: bool = True

Review comment (Member):
This is only ever set to false on the client side right? Even though this is a struct for core->client comms?

It's basically a signal from DPAsyncMPClient to AsyncLLM output handler?

(Seems nasty, but not 100% sure I've got that right)

Reply (Member, author):
Yes you're right and I also don't like it and had been thinking about alternatives. For this first PR I had been trying to avoid changing too much.

It's a struct from core->client comm but also what's returned from the client. Agree that we could / probably should separate these things. But TBH I'm not set on this mechanism for aggregating the metrics so was expecting it may be reworked anyhow.


def __post_init__(self):
if self.timestamp == 0.0:
@@ -147,4 +156,5 @@ class EngineCoreRequestType(enum.Enum):
"""
ADD = b'\x00'
ABORT = b'\x01'
UTILITY = b'\x02'
START_DP = b'\x02'
UTILITY = b'\x03'
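
A hypothetical sketch of what the comment on final_outputs_for_step describes — accumulate per-engine stats until the final batch of outputs for a step arrives — with record() and on_outputs() invented for illustration, not the actual AsyncLLM output handler:

    pending_stats: list = []

    def record(stats_batch: list) -> None:
        # Stand-in stats sink; a real handler would log or export these.
        print(f"recording {len(stats_batch)} per-engine stats for this step")

    def on_outputs(outputs) -> None:
        pending_stats.append(outputs.scheduler_stats)
        if outputs.final_outputs_for_step:
            # Only record once outputs from all DP engines for this step have
            # arrived; otherwise keep accumulating.
            record(pending_stats.copy())
            pending_stats.clear()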