Skip to content

Commit 71a2f40

Browse files
authored
[serve][llm] Add TP*PP spacing to port offset for multi-replica deployments (#58073)
Signed-off-by: Nikhil Ghosh <[email protected]>
1 parent 168cdc6 commit 71a2f40

File tree

1 file changed

+12
-1
lines changed
  • python/ray/llm/_internal/serve/engines/vllm/kv_transfer/base.py

1 file changed

+12
-1
lines changed

python/ray/llm/_internal/serve/engines/vllm/kv_transfer/base.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,19 +43,30 @@ def _compute_port_offset(self) -> int:
4343
Uses data_parallel_rank if DP case, otherwise falls back to
4444
the replica rank assigned by Ray Serve (TP/PP case).
4545
46+
For TP/PP cases, multiply by num_devices (tp × pp) to reserve
47+
sufficient port space, since each worker needs a unique port.
48+
Each TP worker adds its tp_rank (0, 1, ..., tp_size-1) to the
49+
base port at bind time, and PP stages also need separate ports.
50+
4651
Returns:
4752
Non-negative integer offset to add to a base port.
4853
"""
4954
# Prefer explicit DP rank when available
5055
dp_rank = self.llm_config.engine_kwargs.get("data_parallel_rank")
5156
if isinstance(dp_rank, int) and dp_rank >= 0:
57+
# vLLM already accounts for TP spacing in DP offset calculation
58+
# (data_parallel_rank × tp_size), don't multiply here
5259
return dp_rank
5360

5461
# Fall back to Serve replica rank for TP/PP cases
5562
try:
5663
rc = serve.get_replica_context()
5764
if rc and hasattr(rc, "rank"):
58-
return rc.rank
65+
# Use num_devices (tp × pp) to reserve ports for all workers
66+
# Each replica spawns num_devices workers, each needing a unique port
67+
engine_config = self.llm_config.get_engine_config()
68+
num_devices = engine_config.num_devices
69+
return rc.rank * num_devices
5970
except Exception:
6071
# Best-effort fallback; avoid introducing failures in setup paths
6172
pass

0 commit comments

Comments (0)