Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
777e113
Add PyNcclEplbCommunicator using send/recv
ilmarkov Jan 27, 2026
0c22968
Add tests for PyNcclEplbCommunicator
ilmarkov Jan 27, 2026
c6f8a6b
Add eplb communicator config
ilmarkov Jan 27, 2026
3e616af
Fix precommit
ilmarkov Jan 27, 2026
cb6671a
Handle unsupported datatypes
ilmarkov Jan 27, 2026
95cdaca
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Feb 6, 2026
89481ac
Fix bug
ilmarkov Feb 6, 2026
a251827
Fix docs
ilmarkov Feb 10, 2026
92f3a50
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Feb 11, 2026
0c75914
Remove test case from profile mode
ilmarkov Feb 11, 2026
ca0c265
Fix eplb communicator creation
ilmarkov Feb 11, 2026
23d4b72
Wip
ilmarkov Feb 18, 2026
63fc983
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Feb 19, 2026
0287e53
Add torch.symm_mem EPLB communicator
ilmarkov Feb 23, 2026
20d6209
Cleanup
ilmarkov Feb 25, 2026
bb25650
Fix test
ilmarkov Feb 25, 2026
e832056
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Feb 25, 2026
522994b
Device capability check
ilmarkov Feb 26, 2026
8d83c7a
Gloo communicator
ilmarkov Feb 26, 2026
0c24b1e
Add nixl communicator
ilmarkov Feb 27, 2026
e3e951c
Update
ilmarkov Mar 2, 2026
0b31b39
Fix potential deadlock in nixl
ilmarkov Mar 2, 2026
cd35309
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Mar 2, 2026
3e5fb89
Update execute test
ilmarkov Mar 3, 2026
b093f9f
Adjust configs
ilmarkov Mar 3, 2026
b8229c9
Update skipping logic in tests
ilmarkov Mar 3, 2026
85c496c
Fixes
ilmarkov Mar 4, 2026
74f98a2
local moves on stream
ilmarkov Mar 4, 2026
d9394e9
Get rid of rank_to_global
ilmarkov Mar 4, 2026
f88f196
Fix consumed_event sync
ilmarkov Mar 4, 2026
b53b2a2
Remove NIXL and symm_mem from base EPLB communicator branch.
ilmarkov Mar 6, 2026
d8ade4e
Update naming and config
ilmarkov Mar 6, 2026
bb89e85
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Mar 6, 2026
b9a4031
Finish cleanup
ilmarkov Mar 6, 2026
64ccfb3
Fix precommit
ilmarkov Mar 6, 2026
c211cba
Fix precommit
ilmarkov Mar 6, 2026
2ebfbce
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Mar 16, 2026
247c795
Update after comments
ilmarkov Mar 18, 2026
bb4ecb3
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Mar 20, 2026
ab46ffa
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Mar 23, 2026
5c3985b
Fix elastic ep
ilmarkov Mar 24, 2026
1c26ef1
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Mar 24, 2026
b9b30b6
Update after comments
ilmarkov Mar 26, 2026
684bf19
Address comments
ilmarkov Mar 26, 2026
84b43eb
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Mar 26, 2026
a633c6d
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Mar 27, 2026
95b8f7d
Merge branch 'main' into imarkov/refactor-eplb-comminication
ilmarkov Mar 30, 2026
86fbd1e
Merge branch 'main' into imarkov/refactor-eplb-comminication
Mar 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .buildkite/test_areas/expert_parallelism.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ steps:
- pytest -v -s distributed/test_eplb_algo.py
- pytest -v -s distributed/test_eplb_utils.py

- label: EPLB Execution
timeout_in_minutes: 20
- label: EPLB Execution # 17min
timeout_in_minutes: 27
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
Expand Down
41 changes: 39 additions & 2 deletions tests/distributed/eplb_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import atexit
import os
import random

import pytest
import torch
import torch.multiprocessing as mp

Expand All @@ -16,9 +18,20 @@
mp.set_start_method("spawn", force=True)


def _distributed_worker_wrapper(fn, env, world_size, args, rank, skip_queue):
    """Run ``fn`` in a worker process and report pytest skips to the parent.

    ``fn`` is invoked as ``fn(env, world_size, *args)``. If it raises
    pytest's skip exception, the pair ``(rank, reason)`` is put on
    ``skip_queue`` and the wrapper returns normally. Any other exception
    propagates unchanged.
    """
    try:
        fn(env, world_size, *args)
    except pytest.skip.Exception as skip_exc:
        # Hand the skip reason to whoever reads the queue instead of letting
        # the skip escape this call as an error.
        skip_queue.put((rank, str(skip_exc)))


def distributed_run(fn, world_size, *args):
number_of_processes = world_size
processes: list[mp.Process] = []
skip_queue: mp.SimpleQueue = mp.SimpleQueue()
for i in range(number_of_processes):
env: dict[str, str] = {}
env["RANK"] = str(i)
Expand All @@ -27,13 +40,32 @@ def distributed_run(fn, world_size, *args):
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
env["MASTER_ADDR"] = "localhost"
env["MASTER_PORT"] = "12345"
p = mp.Process(target=fn, args=(env, world_size, *args))
p = mp.Process(
target=_distributed_worker_wrapper,
args=(fn, env, world_size, args, i, skip_queue),
)
processes.append(p)
p.start()

for p in processes:
p.join()

skipped: list[tuple[int, str]] = []
while not skip_queue.empty():
rank, reason = skip_queue.get()
skipped.append((rank, reason))

if len(skipped) == number_of_processes:
reason = skipped[0][1]
pytest.skip(reason)
if 0 < len(skipped) < number_of_processes:
skipped_ranks = sorted(rank for rank, _ in skipped)
raise AssertionError(
"Distributed test had partial skips; expected either all ranks "
f"to skip or none. Skipped ranks: {skipped_ranks}, "
f"total ranks: {number_of_processes}"
)

for p in processes:
assert p.exitcode == 0

Expand All @@ -48,7 +80,12 @@ def set_env_vars_and_device(env: dict[str, str]) -> None:
vllm_config = VllmConfig()
with set_current_vllm_config(vllm_config):
init_distributed_environment()

atexit.register(_destroy_process_group_if_initialized)
# Ensure each worker process has the same random seed
random.seed(42)
torch.manual_seed(42)


def _destroy_process_group_if_initialized() -> None:
    """Destroy the torch.distributed process group, if there is one.

    Does nothing when torch.distributed reports itself unavailable or
    not initialized.
    """
    dist = torch.distributed
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    dist.destroy_process_group()
Loading
Loading