Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .buildkite/test_areas/distributed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ steps:
- tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
- tests/distributed/test_multiproc_executor.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
Expand Down Expand Up @@ -95,6 +96,8 @@ steps:
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# test multi-node TP with multiproc executor (simulated on single node)
- pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
# OLD rlhf examples
Expand Down
6 changes: 4 additions & 2 deletions tests/distributed/test_multiproc_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@

import multiprocessing
import os
import socket

from tests.utils import multi_gpu_test
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import get_open_port
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.executor.multiproc_executor import MultiprocExecutor

Expand Down Expand Up @@ -333,7 +333,9 @@ def test_multiproc_executor_multi_node():
- Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
Total world_size = 4, nnodes = 2
"""
port = get_open_port()
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("", 0))
port = s.getsockname()[1]
    # symm_mem does not work when simulating multiple instances on a single node
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"

Expand Down
5 changes: 4 additions & 1 deletion vllm/v1/executor/multiproc_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,6 @@ def __init__(
)

# Load model
self._init_message_queues(input_shm_handle, vllm_config)
is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
if not is_eep_new_worker:
self.worker.init_device()
Expand All @@ -618,6 +617,10 @@ def __init__(
)
self.worker.load_model()

# Initialize message queues after init_device() since multi-node setups
# (nnodes_within_dp > 1) require distributed groups to be initialized
self._init_message_queues(input_shm_handle, vllm_config)

# Enable environment variable cache (e.g. assume no more
# environment variable overrides after this point)
enable_envs_cache()
Expand Down