Skip to content

Commit 71a2f40

Browse files
authored
[serve][llm] Add TP*PP spacing to port offset for multi-replica deployments (#58073)
Signed-off-by: Nikhil Ghosh <[email protected]>
1 parent 168cdc6 commit 71a2f40

File tree

1 file changed

+12
-1
lines changed
  • python/ray/llm/_internal/serve/engines/vllm/kv_transfer/base.py

1 file changed

+12
-1
lines changed

python/ray/llm/_internal/serve/engines/vllm/kv_transfer/base.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,19 +43,30 @@ def _compute_port_offset(self) -> int:
4343
Uses data_parallel_rank if DP case, otherwise falls back to
4444
the replica rank assigned by Ray Serve (TP/PP case).
4545
46+
For TP/PP cases, multiply by num_devices (tp × pp) to reserve
47+
sufficient port space, since each worker needs a unique port.
48+
Each TP worker adds its tp_rank (0, 1, ..., tp_size-1) to the
49+
base port at bind time, and PP stages also need separate ports.
50+
4651
Returns:
4752
Non-negative integer offset to add to a base port.
4853
"""
4954
# Prefer explicit DP rank when available
5055
dp_rank = self.llm_config.engine_kwargs.get("data_parallel_rank")
5156
if isinstance(dp_rank, int) and dp_rank >= 0:
57+
# vLLM already accounts for TP spacing in DP offset calculation
58+
# (data_parallel_rank × tp_size), don't multiply here
5259
return dp_rank
5360

5461
# Fall back to Serve replica rank for TP/PP cases
5562
try:
5663
rc = serve.get_replica_context()
5764
if rc and hasattr(rc, "rank"):
58-
return rc.rank
65+
# Use num_devices (tp × pp) to reserve ports for all workers
66+
# Each replica spawns num_devices workers, each needing a unique port
67+
engine_config = self.llm_config.get_engine_config()
68+
num_devices = engine_config.num_devices
69+
return rc.rank * num_devices
5970
except Exception:
6071
# Best-effort fallback; avoid introducing failures in setup paths
6172
pass

0 commit comments

Comments (0)