Merged
Commits (changes from 56 of 58 commits)
75503d3
[0.9.1][Feature]Moe alltoallv communication optimization for unquanti…
weijinqian0 Jul 14, 2025
63cb062
[v0.9.1][Feature] add Moe alltoallv.
Jul 29, 2025
715e6f1
[v0.9.1][Feature] add Moe alltoallv.
Jul 29, 2025
7c7e4e9
[v0.9.1][Feature] add moe alltoallv.
Jul 29, 2025
7106d77
[v0.9.1][Feature] add moe alltoallv.
Jul 29, 2025
1b53047
[v0.9.1][Feature] add moe alltoallv.
Jul 29, 2025
a863507
[v0.9.1][Feature] add moe alltoallv.
Jul 29, 2025
a702414
[v0.9.1][Feature] add moe alltoallv.
Jul 29, 2025
5841dc8
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
3542670
[Perf] Avoid performing index selection of sin/cos cache every layer …
whx-sjtu Jul 29, 2025
05f2ff2
[Feature] Enable inference support for Deepseekr1-w8a8-MTP (#1994)
Irving11-BKN Jul 29, 2025
5255de2
[CI] Enable linux-aarch64-a2 (64GB) and tp2 * 2 max-parallel to speed…
Potabk Jul 29, 2025
d33cf65
bump default python version to 3.11 (#2072)
ChenTaoyu-SJTU Jul 29, 2025
9ded27a
Add Custom Kernels For LoRA Performance (#1884)
taoxudonghaha Jul 29, 2025
1bcfe57
[Doc] Add performance tuning doc to main (#1392)
shen-shanshan Jul 29, 2025
18adb9d
[e2e]Fixed the issue that pyhccl e2e cannot run continuously with oth…
leo-pony Jul 29, 2025
adb37d0
[Perf][MoE] Improve MoE multistream parallel performance. (#1891)
whx-sjtu Jul 29, 2025
8c28c2b
[Refactor]Refactor sampler (#2050)
wangxiyuan Jul 30, 2025
55c2138
[CI] Fix test on pyhccl to 2 cards (#2094)
MengqingCao Jul 30, 2025
868aa2f
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
71bc50b
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
b118bbd
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
978f430
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
c7cc22a
Merge branch 'main' into main_merge_qwen3
weijinqian0 Jul 30, 2025
2922d9e
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
85a70fd
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
8363a8f
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
1b4eaf6
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
a819a33
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
6fb8ae0
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
bedb8d3
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
288edf4
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
509fe5c
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
000dbcc
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
ecd33b1
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
c23a6bf
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
31615e5
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
07e8dd8
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
41f6a36
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
fe081b4
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
1096789
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
442e26f
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
9130e58
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
544c007
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
6649ad6
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
f71847a
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
402c006
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
d3c188d
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
54dbf76
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
cfff17a
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
1336eb3
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
a35f812
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
f8aa32b
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
01ebd07
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
57b5378
[v0.9.1][Feature] add moe alltoallv.
Aug 1, 2025
aa26b19
[v0.9.1][Feature] add moe alltoallv.
Aug 1, 2025
c4993df
[v0.9.1][Feature] add moe alltoallv.
Aug 1, 2025
5932033
Merge branch 'main' into main_merge_qwen3
weijinqian0 Aug 1, 2025
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -17,3 +17,4 @@ ray>=2.47.1
protobuf==4.25.6
librosa
soundfile
pytest_mock
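pytest_mock is added because the new unit tests below rely on pytest-mock's mocker fixture to patch torch.distributed and torch.npu calls. A minimal sketch of the pattern (a hypothetical test, not part of this PR):

import torch.distributed as dist
from pytest_mock import MockerFixture


def test_world_size_is_patched(mocker: MockerFixture):
    # Patch the module attribute so collective helpers can be exercised
    # without initializing a real process group.
    mocker.patch("torch.distributed.get_world_size", return_value=4)
    assert dist.get_world_size() == 4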
22 changes: 22 additions & 0 deletions tests/e2e/multicard/test_offline_inference_distributed.py
@@ -157,6 +157,28 @@ def test_models_distributed_topk() -> None:
        vllm_model.generate(example_prompts, sampling_params)


@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": "1"})
def test_models_distributed_alltoallv() -> None:
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
    ]
    dtype = "half"
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner(
            "deepseek-ai/DeepSeek-V2-Lite",
            dtype=dtype,
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
    ) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)


def test_models_distributed_Qwen3_W8A8():
    example_prompts = [
        "Hello, my name is",
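The test above toggles the new path through the VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ environment variable. Outside the test harness, a minimal offline-inference sketch would look like the following (assuming a 2-card Ascend node with vllm-ascend installed; model and sampling parameters mirror the test, and this is not an exact reproduction of the CI case):

import os

# Enable the alltoallv (All2AllSeq) MoE path before vLLM spins up workers.
os.environ["VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="deepseek-ai/DeepSeek-V2-Lite",
          dtype="half",
          tensor_parallel_size=2)
outputs = llm.generate(
    ["vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."],
    SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9))
print(outputs[0].outputs[0].text)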
139 changes: 139 additions & 0 deletions tests/ut/test_distributed_tensor_parallel.py
@@ -0,0 +1,139 @@
#
Review comment (Collaborator): plz move this to tests/distributed/test_tensor_paralle.py
Reply (Author): ok

# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.

import importlib

import pytest
import torch
from pytest_mock import MockerFixture

from tests.ut.base import PytestBase
from vllm_ascend.distributed.tensor_parallel import (
    _gather_along_first_dim, _gather_along_last_dim,
    _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim,
    all_to_all_hp2sp, all_to_all_sp2hp)


class TestDistributedCommunication(PytestBase):

    @pytest.fixture(autouse=True)
    def context(self, mocker: MockerFixture):
        mocker.patch("torch.npu.current_device", return_value="cpu")
        mocker.patch("torch.distributed.get_world_size", return_value=4)

        mocker.patch("torch.distributed.get_rank", return_value=0)

    @pytest.mark.parametrize("world_size, test_tensor, expected",
                             [(1, torch.randn(8, 16), (8, 16)),
                              (4, torch.randn(8, 16), (32, 16))])
    def test_gather_along_first_dim(self, test_tensor, expected, world_size,
                                    mocker: MockerFixture):
        """test _gather_along_first_dim"""
        mocker.patch("torch.distributed.get_world_size",
                     return_value=world_size)

        result = _gather_along_first_dim(test_tensor, mocker.MagicMock())

        assert result.shape == expected

    @pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [
        (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)),
    ])
    def test_gather_along_first_dim_unequal_split(self, test_tensor, expected,
                                                  output_split_sizes,
                                                  mocker: MockerFixture):
        """test _gather_along_first_dim"""

        result = _gather_along_first_dim(test_tensor, mocker.MagicMock(),
                                         output_split_sizes)

        assert result.shape == expected

    @pytest.mark.parametrize("world_size, test_tensor, expected",
                             [(1, torch.randn(8, 16, 32), (8, 16, 32)),
                              (4, torch.randn(8, 16, 32), (8, 16, 32 * 4))])
    def test_gather_along_last_dim(self, test_tensor, expected, world_size,
                                   mocker: MockerFixture):
        """test _gather_along_last_dim"""
        mocker.patch("torch.distributed.get_world_size",
                     return_value=world_size)

        result = _gather_along_last_dim(test_tensor, mocker.MagicMock())

        assert result.shape == expected

    @pytest.mark.parametrize("input_shape,expected_shape", [
        ((32, 16), (8, 16)),
        ((40, 10), (10, 10)),
    ])
    def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape,
                                            mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = _reduce_scatter_along_first_dim(input_tensor,
                                                 mocker.MagicMock())
        assert result.shape == expected_shape

    @pytest.mark.parametrize("input_shape,expected_shape", [
        ((8, 16, 32), (8, 16, 8)),
    ])
    def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape,
                                           mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = _reduce_scatter_along_last_dim(input_tensor,
                                                mocker.MagicMock())
        assert result.shape == expected_shape

    @pytest.mark.parametrize("func,input_shape,expected_shape", [
        ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32),
         (8, 16, 128)),
        ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)),
        ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32),
         (8, 16, 8)),
        ("gather_from_sequence_parallel_region", (8, 16), (32, 16)),
    ])
    def test_wrapper_functions(self, func, input_shape, expected_shape,
                               mocker: MockerFixture):
        """test wrapper funcs"""
        mod = importlib.import_module(
            'vllm_ascend.distributed.tensor_parallel')
        globals = mod.__dict__
        test_func = globals[func]
        input_tensor = torch.randn(*input_shape)
        result = test_func(input_tensor, mocker.MagicMock())
        assert result.shape == expected_shape

    @pytest.mark.parametrize(
        "input_shape,output_shape",
        [
            ((8, 16), (32, 4)),  # [num_tokens/TP, H] -> [num_tokens, H/TP]
        ])
    def test_all_to_all_sp2hp(self, input_shape, output_shape,
                              mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = all_to_all_sp2hp(input_tensor, mocker.MagicMock())
        assert result.shape == output_shape

    @pytest.mark.parametrize(
        "input_shape,output_shape",
        [
            ((32, 4), (8, 16)),  # [num_tokens, H/TP] -> [num_tokens/TP, H]
        ])
    def test_all_to_all_hp2sp(self, input_shape, output_shape,
                              mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = all_to_all_hp2sp(input_tensor, mocker.MagicMock())
        assert result.shape == output_shape
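For readers unfamiliar with the sp2hp/hp2sp naming, the shape comments in the last two tests above pin down the contract. The sketch below restates it as plain bookkeeping arithmetic (illustrative only, with TP size 4 as in the mocked world size; it does not call the real collectives):

# Hypothetical restatement of the all_to_all_sp2hp / all_to_all_hp2sp shape
# contract exercised above; the real functions use torch.distributed all-to-all.
TP = 4

def sp2hp_shape(num_tokens_per_rank: int, hidden: int) -> tuple[int, int]:
    # [num_tokens / TP, H] -> [num_tokens, H / TP]
    return (num_tokens_per_rank * TP, hidden // TP)

def hp2sp_shape(num_tokens: int, hidden_per_rank: int) -> tuple[int, int]:
    # [num_tokens, H / TP] -> [num_tokens / TP, H]
    return (num_tokens // TP, hidden_per_rank * TP)

assert sp2hp_shape(8, 16) == (32, 4)   # matches test_all_to_all_sp2hp
assert hp2sp_shape(32, 4) == (8, 16)   # matches test_all_to_all_hp2sp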
65 changes: 65 additions & 0 deletions tests/ut/test_token_dispatcher.py
@@ -0,0 +1,65 @@
#
Review comment (Collaborator): plz move this to tests/moe_dispatcher/test_token_dispatcher.py
Reply (Author): ok

# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.

import pytest
from pytest_mock import MockerFixture

from tests.ut.base import PytestBase
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
    MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig)
from vllm_ascend.utils import adapt_patch # noqa E402


class TestMoEAlltoAllSeqOverLapDispatcher(PytestBase):

    @pytest.fixture
    def config(self):
        config = MoEDispatcherConfig()
        config.set_num_local_experts(2)
        config.set_num_moe_experts(4)
        config.set_moe_pad_expert_input_to_capacity(False)
        config.set_moe_expert_capacity_factor(None)
        config.set_moe_router_topk(2)
        config.set_moe_grouped_gemm(False)
        config.set_group_topk(0)
        config.set_num_groups(1)
        config.set_is_fused(False)
        return config.build()

    def mock_ep_group(self, mocker):
        mock_group = mocker.MagicMock()
        mock_group.rank_in_group = 0
        mock_group.world_size = 2
        mock_group.device_group = "mock_group"
        return mock_group

    @pytest.fixture
    def dispatcher(self, config, mocker: MockerFixture):
        mocker.patch(
            "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ep_group",
            return_value=self.mock_ep_group(mocker))
        mocker.patch("torch.npu.current_device", return_value="cpu")
        mocker.patch("torch.npu.Stream", return_value=mocker.MagicMock)
        return MoEAlltoAllSeqOverLapDispatcher(config)

    def test_initialization(self, dispatcher, config):
        assert dispatcher.num_local_experts == config.num_local_experts
        assert dispatcher.num_experts == config.num_moe_experts
        assert dispatcher.local_expert_indices == [0, 1]
        assert dispatcher.ep_rank == 0
        assert dispatcher.ep_size == 2
        assert dispatcher.overlap_stream is not None
5 changes: 5 additions & 0 deletions vllm_ascend/ascend_forward_context.py
@@ -18,6 +18,7 @@ class FusedMoEState(Enum):
    MC2 = 2
    AllGatherEP = 3
    NaiveMulticast = 4
    All2AllSeq = 5


# TODO(zzzzwwjj): add soc_version to choose branch
@@ -33,6 +34,10 @@ def get_fused_moe_state(ep_size: int, with_prefill: bool,
            return FusedMoEState.NaiveMulticast
        else:
            return FusedMoEState.AllGather
    elif envs.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ:
        # MC2 Dispatch/Combine performs better than alltoall_seq in decoding stage.
        return (FusedMoEState.All2AllSeq if
                (ep_size < 16 or with_prefill) else FusedMoEState.MC2)
    # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
    elif ep_size < 16 or with_prefill:
        return FusedMoEState.All2All
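The hunk above wires the new All2AllSeq state into get_fused_moe_state: when the env switch is set, small EP groups or prefill batches take the alltoall_seq path, while large-EP decode still prefers MC2. A hedged restatement of just this branch (a hypothetical standalone helper, not the module's actual function; the real logic, including the AllGather/NaiveMulticast cases above it, lives in vllm_ascend/ascend_forward_context.py):

# Illustrative, self-contained restatement of the branch added above.
def pick_moe_comm_path(ep_size: int, with_prefill: bool,
                       all2all_seq_enabled: bool) -> str:
    if all2all_seq_enabled and (ep_size < 16 or with_prefill):
        return "All2AllSeq"  # new path: small EP group or prefill
    if ep_size < 16 or with_prefill:
        return "All2All"     # MC2 needs ep_size >= 16
    return "MC2"             # large-EP decode: MC2 dispatch/combine wins

assert pick_moe_comm_path(8, False, True) == "All2AllSeq"
assert pick_moe_comm_path(32, False, True) == "MC2"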