Merged
Commits (changes from 56 of 58 commits)
75503d3
[0.9.1][Feature]Moe alltoallv communication optimization for unquanti…
weijinqian0 Jul 14, 2025
63cb062
[v0.9.1][Feature] add Moe alltoallv.
Jul 29, 2025
715e6f1
[v0.9.1][Feature] add Moe alltoallv.
Jul 29, 2025
7c7e4e9
[v0.9.1][Feature] add moe alltoallv.
Jul 29, 2025
7106d77
[v0.9.1][Feature] add moe alltoallv.
Jul 29, 2025
1b53047
[v0.9.1][Feature] add moe alltoallv.
Jul 29, 2025
a863507
[v0.9.1][Feature] add moe alltoallv.
Jul 29, 2025
a702414
[v0.9.1][Feature] add moe alltoallv.
Jul 29, 2025
5841dc8
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
3542670
[Perf] Avoid performing index selection of sin/cos cache every layer …
whx-sjtu Jul 29, 2025
05f2ff2
[Feature] Enable inference support for Deepseekr1-w8a8-MTP (#1994)
Irving11-BKN Jul 29, 2025
5255de2
[CI] Enable linux-aarch64-a2 (64GB) and tp2 * 2 max-parallel to speed…
Potabk Jul 29, 2025
d33cf65
bump default python version to 3.11 (#2072)
ChenTaoyu-SJTU Jul 29, 2025
9ded27a
Add Custom Kernels For LoRA Performance (#1884)
taoxudonghaha Jul 29, 2025
1bcfe57
[Doc] Add performance tuning doc to main (#1392)
shen-shanshan Jul 29, 2025
18adb9d
[e2e]Fixed the issue that pyhccl e2e cannot run continuously with oth…
leo-pony Jul 29, 2025
adb37d0
[Perf][MoE] Improve MoE multistream parallel performance. (#1891)
whx-sjtu Jul 29, 2025
8c28c2b
[Refactor]Refactor sampler (#2050)
wangxiyuan Jul 30, 2025
55c2138
[CI] Fix test on pyhccl to 2 cards (#2094)
MengqingCao Jul 30, 2025
868aa2f
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
71bc50b
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
b118bbd
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
978f430
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
c7cc22a
Merge branch 'main' into main_merge_qwen3
weijinqian0 Jul 30, 2025
2922d9e
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
85a70fd
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
8363a8f
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
1b4eaf6
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
a819a33
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
6fb8ae0
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
bedb8d3
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
288edf4
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
509fe5c
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
000dbcc
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
ecd33b1
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
c23a6bf
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
31615e5
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
07e8dd8
[v0.9.1][Feature] add moe alltoallv.
Jul 30, 2025
41f6a36
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
fe081b4
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
1096789
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
442e26f
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
9130e58
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
544c007
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
6649ad6
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
f71847a
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
402c006
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
d3c188d
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
54dbf76
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
cfff17a
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
1336eb3
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
a35f812
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
f8aa32b
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
01ebd07
[v0.9.1][Feature] add moe alltoallv.
Jul 31, 2025
57b5378
[v0.9.1][Feature] add moe alltoallv.
Aug 1, 2025
aa26b19
[v0.9.1][Feature] add moe alltoallv.
Aug 1, 2025
c4993df
[v0.9.1][Feature] add moe alltoallv.
Aug 1, 2025
5932033
Merge branch 'main' into main_merge_qwen3
weijinqian0 Aug 1, 2025
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -17,3 +17,4 @@ ray>=2.47.1
protobuf==4.25.6
librosa
soundfile
pytest_mock
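pytest_mock is added because the new unit tests below rely on pytest-mock's mocker fixture to patch torch.distributed and torch.npu calls. A minimal sketch of the pattern (a hypothetical test, not part of this PR):

import torch.distributed as dist
from pytest_mock import MockerFixture


def test_world_size_is_patched(mocker: MockerFixture):
    # Patch the module attribute so collective helpers can be exercised
    # without initializing a real process group.
    mocker.patch("torch.distributed.get_world_size", return_value=4)
    assert dist.get_world_size() == 4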
22 changes: 22 additions & 0 deletions tests/e2e/multicard/test_offline_inference_distributed.py
@@ -157,6 +157,28 @@ def test_models_distributed_topk() -> None:
        vllm_model.generate(example_prompts, sampling_params)


@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": "1"})
def test_models_distributed_alltoallv() -> None:
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
    ]
    dtype = "half"
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner(
            "deepseek-ai/DeepSeek-V2-Lite",
            dtype=dtype,
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
    ) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)


def test_models_distributed_Qwen3_W8A8():
    example_prompts = [
        "Hello, my name is",
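The test above toggles the new path through the VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ environment variable. Outside the test harness, a minimal offline-inference sketch would look like the following (assuming a 2-card Ascend node with vllm-ascend installed; model and sampling parameters mirror the test, and this is not an exact reproduction of the CI case):

import os

# Enable the alltoallv (All2AllSeq) MoE path before vLLM spins up workers.
os.environ["VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="deepseek-ai/DeepSeek-V2-Lite",
          dtype="half",
          tensor_parallel_size=2)
outputs = llm.generate(
    ["vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."],
    SamplingParams(max_tokens=5, temperature=0.0, top_k=50, top_p=0.9))
print(outputs[0].outputs[0].text)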
139 changes: 139 additions & 0 deletions tests/ut/test_distributed_tensor_parallel.py
@@ -0,0 +1,139 @@
#
Review comment (Collaborator): plz move this to tests/distributed/test_tensor_paralle.py
Reply (Author): ok

# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.

import importlib

import pytest
import torch
from pytest_mock import MockerFixture

from tests.ut.base import PytestBase
from vllm_ascend.distributed.tensor_parallel import (
    _gather_along_first_dim, _gather_along_last_dim,
    _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim,
    all_to_all_hp2sp, all_to_all_sp2hp)


class TestDistributedCommunication(PytestBase):

    @pytest.fixture(autouse=True)
    def context(self, mocker: MockerFixture):
        mocker.patch("torch.npu.current_device", return_value="cpu")
        mocker.patch("torch.distributed.get_world_size", return_value=4)

        mocker.patch("torch.distributed.get_rank", return_value=0)

    @pytest.mark.parametrize("world_size, test_tensor, expected",
                             [(1, torch.randn(8, 16), (8, 16)),
                              (4, torch.randn(8, 16), (32, 16))])
    def test_gather_along_first_dim(self, test_tensor, expected, world_size,
                                    mocker: MockerFixture):
        """test _gather_along_first_dim"""
        mocker.patch("torch.distributed.get_world_size",
                     return_value=world_size)

        result = _gather_along_first_dim(test_tensor, mocker.MagicMock())

        assert result.shape == expected

    @pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [
        (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)),
    ])
    def test_gather_along_first_dim_unequal_split(self, test_tensor, expected,
                                                  output_split_sizes,
                                                  mocker: MockerFixture):
        """test _gather_along_first_dim"""

        result = _gather_along_first_dim(test_tensor, mocker.MagicMock(),
                                         output_split_sizes)

        assert result.shape == expected

    @pytest.mark.parametrize("world_size, test_tensor, expected",
                             [(1, torch.randn(8, 16, 32), (8, 16, 32)),
                              (4, torch.randn(8, 16, 32), (8, 16, 32 * 4))])
    def test_gather_along_last_dim(self, test_tensor, expected, world_size,
                                   mocker: MockerFixture):
        """test _gather_along_last_dim"""
        mocker.patch("torch.distributed.get_world_size",
                     return_value=world_size)

        result = _gather_along_last_dim(test_tensor, mocker.MagicMock())

        assert result.shape == expected

    @pytest.mark.parametrize("input_shape,expected_shape", [
        ((32, 16), (8, 16)),
        ((40, 10), (10, 10)),
    ])
    def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape,
                                            mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = _reduce_scatter_along_first_dim(input_tensor,
                                                 mocker.MagicMock())
        assert result.shape == expected_shape

    @pytest.mark.parametrize("input_shape,expected_shape", [
        ((8, 16, 32), (8, 16, 8)),
    ])
    def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape,
                                           mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = _reduce_scatter_along_last_dim(input_tensor,
                                                mocker.MagicMock())
        assert result.shape == expected_shape

    @pytest.mark.parametrize("func,input_shape,expected_shape", [
        ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32),
         (8, 16, 128)),
        ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)),
        ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32),
         (8, 16, 8)),
        ("gather_from_sequence_parallel_region", (8, 16), (32, 16)),
    ])
    def test_wrapper_functions(self, func, input_shape, expected_shape,
                               mocker: MockerFixture):
        """test wrapper funcs"""
        mod = importlib.import_module(
            'vllm_ascend.distributed.tensor_parallel')
        globals = mod.__dict__
        test_func = globals[func]
        input_tensor = torch.randn(*input_shape)
        result = test_func(input_tensor, mocker.MagicMock())
        assert result.shape == expected_shape

    @pytest.mark.parametrize(
        "input_shape,output_shape",
        [
            ((8, 16), (32, 4)),  # [num_tokens/TP, H] -> [num_tokens, H/TP]
        ])
    def test_all_to_all_sp2hp(self, input_shape, output_shape,
                              mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = all_to_all_sp2hp(input_tensor, mocker.MagicMock())
        assert result.shape == output_shape

    @pytest.mark.parametrize(
        "input_shape,output_shape",
        [
            ((32, 4), (8, 16)),  # [num_tokens, H/TP] -> [num_tokens/TP, H]
        ])
    def test_all_to_all_hp2sp(self, input_shape, output_shape,
                              mocker: MockerFixture):
        input_tensor = torch.randn(*input_shape)
        result = all_to_all_hp2sp(input_tensor, mocker.MagicMock())
        assert result.shape == output_shape
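For readers unfamiliar with the sp2hp/hp2sp naming, the shape comments in the last two tests above pin down the contract. The sketch below restates it as plain bookkeeping arithmetic (illustrative only, with TP size 4 as in the mocked world size; it does not call the real collectives):

# Hypothetical restatement of the all_to_all_sp2hp / all_to_all_hp2sp shape
# contract exercised above; the real functions use torch.distributed all-to-all.
TP = 4

def sp2hp_shape(num_tokens_per_rank: int, hidden: int) -> tuple[int, int]:
    # [num_tokens / TP, H] -> [num_tokens, H / TP]
    return (num_tokens_per_rank * TP, hidden // TP)

def hp2sp_shape(num_tokens: int, hidden_per_rank: int) -> tuple[int, int]:
    # [num_tokens, H / TP] -> [num_tokens / TP, H]
    return (num_tokens // TP, hidden_per_rank * TP)

assert sp2hp_shape(8, 16) == (32, 4)   # matches test_all_to_all_sp2hp
assert hp2sp_shape(32, 4) == (8, 16)   # matches test_all_to_all_hp2sp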
65 changes: 65 additions & 0 deletions tests/ut/test_token_dispatcher.py
@@ -0,0 +1,65 @@
#
Review comment (Collaborator): plz move this to tests/moe_dispatcher/test_token_dispatcher.py
Reply (Author): ok

# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.

import pytest
from pytest_mock import MockerFixture

from tests.ut.base import PytestBase
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
    MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig)
from vllm_ascend.utils import adapt_patch # noqa E402


class TestMoEAlltoAllSeqOverLapDispatcher(PytestBase):

    @pytest.fixture
    def config(self):
        config = MoEDispatcherConfig()
        config.set_num_local_experts(2)
        config.set_num_moe_experts(4)
        config.set_moe_pad_expert_input_to_capacity(False)
        config.set_moe_expert_capacity_factor(None)
        config.set_moe_router_topk(2)
        config.set_moe_grouped_gemm(False)
        config.set_group_topk(0)
        config.set_num_groups(1)
        config.set_is_fused(False)
        return config.build()

    def mock_ep_group(self, mocker):
        mock_group = mocker.MagicMock()
        mock_group.rank_in_group = 0
        mock_group.world_size = 2
        mock_group.device_group = "mock_group"
        return mock_group

    @pytest.fixture
    def dispatcher(self, config, mocker: MockerFixture):
        mocker.patch(
            "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ep_group",
            return_value=self.mock_ep_group(mocker))
        mocker.patch("torch.npu.current_device", return_value="cpu")
        mocker.patch("torch.npu.Stream", return_value=mocker.MagicMock)
        return MoEAlltoAllSeqOverLapDispatcher(config)

    def test_initialization(self, dispatcher, config):
        assert dispatcher.num_local_experts == config.num_local_experts
        assert dispatcher.num_experts == config.num_moe_experts
        assert dispatcher.local_expert_indices == [0, 1]
        assert dispatcher.ep_rank == 0
        assert dispatcher.ep_size == 2
        assert dispatcher.overlap_stream is not None
5 changes: 5 additions & 0 deletions vllm_ascend/ascend_forward_context.py
@@ -18,6 +18,7 @@ class FusedMoEState(Enum):
    MC2 = 2
    AllGatherEP = 3
    NaiveMulticast = 4
    All2AllSeq = 5


# TODO(zzzzwwjj): add soc_version to choose branch
@@ -33,6 +34,10 @@ def get_fused_moe_state(ep_size: int, with_prefill: bool,
            return FusedMoEState.NaiveMulticast
        else:
            return FusedMoEState.AllGather
    elif envs.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ:
        # MC2 Dispatch/Combine performs better than alltoall_seq in decoding stage.
        return (FusedMoEState.All2AllSeq if
                (ep_size < 16 or with_prefill) else FusedMoEState.MC2)
    # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
    elif ep_size < 16 or with_prefill:
        return FusedMoEState.All2All
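The hunk above wires the new All2AllSeq state into get_fused_moe_state: when the env switch is set, small EP groups or prefill batches take the alltoall_seq path, while large-EP decode still prefers MC2. A hedged restatement of just this branch (a hypothetical standalone helper, not the module's actual function; the real logic, including the AllGather/NaiveMulticast cases above it, lives in vllm_ascend/ascend_forward_context.py):

# Illustrative, self-contained restatement of the branch added above.
def pick_moe_comm_path(ep_size: int, with_prefill: bool,
                       all2all_seq_enabled: bool) -> str:
    if all2all_seq_enabled and (ep_size < 16 or with_prefill):
        return "All2AllSeq"  # new path: small EP group or prefill
    if ep_size < 16 or with_prefill:
        return "All2All"     # MC2 needs ep_size >= 16
    return "MC2"             # large-EP decode: MC2 dispatch/combine wins

assert pick_moe_comm_path(8, False, True) == "All2AllSeq"
assert pick_moe_comm_path(32, False, True) == "MC2"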