Fix offline DP compatibility

njhill · njhill · commit a6621696742f · 2025-04-04T17:04:34.000-07:00
Signed-off-by: Nick Hill <nhill@redhat.com>
diff --git a/vllm/config.py b/vllm/config.py
@@ -1546,7 +1546,6 @@ def __post_init__(self) -> None:
         if self.data_parallel_size > 1:
             # Data parallel was specified in the engine args.
             self.data_parallel_master_port = get_open_port()
-            # TODO multi-node
         else:
             # Otherwise fall back to env vars (e.g. for offline SPMD case).
             self.data_parallel_size = envs.VLLM_DP_SIZE
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
@@ -105,6 +105,7 @@ def run_headless(args: argparse.Namespace):
         target_fn=EngineCoreProc.run_engine_core,
         local_engine_count=local_engine_count,
         start_index=engine_args.data_parallel_start_rank,
+        local_start_index=0,
         vllm_config=vllm_config,
         on_head_node=False,
         input_address=input_address,
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
@@ -363,11 +363,28 @@ def sigusr1_handler(signum, frame):
         self._finalizer = weakref.finalize(self, self.resources)
 
         parallel_config = vllm_config.parallel_config
-        dp_size = parallel_config.data_parallel_size
         local_engine_count = parallel_config.data_parallel_size_local
+        start_index = parallel_config.data_parallel_rank
+        local_start_index = parallel_config.data_parallel_rank_local
+
+        # SPMD mode is where there is an LLM instance per DP rank and one
+        # core engine per LLM, see examples/offline_inference/data_parallel.py.
+        spmd_mode = local_start_index is not None
+        if spmd_mode:
+            assert local_engine_count == 1
+            self.core_engines = [
+                CoreEngine(index=local_start_index, local=True)
+            ]
+        else:
+            assert start_index == 0
+            local_start_index = 0
+            self.core_engines = [
+                CoreEngine(index=i, local=(i < local_engine_count))
+                for i in range(parallel_config.data_parallel_size)
+            ]
 
         input_address, output_address = self._get_zmq_addresses(
-            parallel_config)
+            parallel_config, spmd_mode)
 
         # Create input and output sockets.
         self.input_socket = self.resources.input_socket = make_zmq_socket(
@@ -378,6 +395,7 @@ def sigusr1_handler(signum, frame):
                                                        zmq.constants.PULL)
         # Start local engines.
         if local_engine_count:
+            # In server mode, start_index and local_start_index will both be 0.
             self.resources.local_engine_manager = CoreEngineProcManager(
                 EngineCoreProc.run_engine_core,
                 vllm_config=vllm_config,
@@ -386,12 +404,9 @@ def sigusr1_handler(signum, frame):
                 input_address=input_address,
                 on_head_node=True,
                 local_engine_count=local_engine_count,
-                start_index=0)
+                start_index=start_index,
+                local_start_index=local_start_index)
 
-        self.core_engines = [
-            CoreEngine(index=i, local=(i < local_engine_count))
-            for i in range(dp_size)
-        ]
         self.core_engine = self.core_engines[0]
 
         # Wait for engine core process(es) to start.
@@ -400,12 +415,13 @@ def sigusr1_handler(signum, frame):
         self.utility_results: dict[int, AnyFuture] = {}
 
     @staticmethod
-    def _get_zmq_addresses(parallel_config: ParallelConfig) -> tuple[str, str]:
+    def _get_zmq_addresses(parallel_config: ParallelConfig,
+                           spmd_mode: bool) -> tuple[str, str]:
         """Returns (input_address, output_address)."""
         dp_size = parallel_config.data_parallel_size
         local_engine_count = parallel_config.data_parallel_size_local
 
-        if local_engine_count == dp_size:
+        if local_engine_count == dp_size or spmd_mode:
             input_address = get_open_zmq_ipc_path()
             output_address = get_open_zmq_ipc_path()
         else:
@@ -422,8 +438,6 @@ def _wait_for_engine_startup(self, output_address: str,
         # Get a sync handle to the socket which can be sync or async.
         sync_input_socket = zmq.Socket.shadow(self.input_socket)
 
-        # TODO offline case compatibility
-
         # Wait for engine core process(es) to send ready messages.
         local_count = parallel_config.data_parallel_size_local
         remote_count = len(self.core_engines) - local_count
@@ -444,18 +458,20 @@ def _wait_for_engine_startup(self, output_address: str,
             # Receive HELLO and READY messages from the input socket.
             eng_identity, ready_msg_bytes = sync_input_socket.recv_multipart()
             eng_index = int.from_bytes(eng_identity, byteorder="little")
-            if eng_index > len(self.core_engines):
-                raise RuntimeError(
-                    f"Message from engine rank larger than "
-                    f"configured data parallel size: {eng_index}")
-            engine = self.core_engines[eng_index]
+            engine = next(
+                (e for e in self.core_engines if e.identity == eng_identity),
+                None)
+            if engine is None:
+                raise RuntimeError(f"Message from engine with unexpected data "
+                                   f"parallel rank: {eng_index}")
             msg = msgspec.msgpack.decode(ready_msg_bytes)
             status, local = msg["status"], msg["local"]
             if local != engine.local:
                 raise RuntimeError(f"{status} message from "
                                    f"{'local' if local else 'remote'} "
-                                   f" engine {eng_index}, expected it to be "
+                                   f"engine {eng_index}, expected it to be "
                                    f"{'local' if engine.local else 'remote'}")
+
             if status == "HELLO" and engine.state == CoreEngineState.NEW:
 
                 # Send init message with DP config info.
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
@@ -105,6 +105,7 @@ def __init__(
         target_fn: Callable,
         local_engine_count: int,
         start_index: int,
+        local_start_index: int,
         vllm_config: VllmConfig,
         on_head_node: bool,
         input_address: str,
@@ -121,14 +122,15 @@ def __init__(
         }
 
         self.processes = []
-        for local_index in range(local_engine_count):
-            index = local_index + start_index
+        for index in range(local_engine_count):
+            local_index = local_start_index + index
+            global_index = start_index + index
             # Start EngineCore in background process.
             self.processes.append(
                 context.Process(target=target_fn,
-                                name=f"EngineCore_{index}",
+                                name=f"EngineCore_{global_index}",
                                 kwargs=common_kwargs | {
-                                    "dp_rank": index,
+                                    "dp_rank": global_index,
                                     "local_dp_rank": local_index,
                                 }))
 
@@ -172,7 +174,8 @@ def shutdown(procs: list[multiprocessing.Process], input_address: str):
         remaining = deadline - time.monotonic()
         if remaining <= 0:
             break
-        proc.join(remaining)
+        if proc.is_alive():
+            proc.join(remaining)
 
     for proc in procs:
         if proc.is_alive():